Compare commits

...

59 Commits

Author SHA1 Message Date
Jeffrey Morgan
bd933c24bc testing new cmake script 2024-03-03 00:51:07 -08:00
Jeffrey Morgan
21347e1ed6 update llama.cpp submodule to c29af7e (#2868) 2024-03-01 15:26:04 -08:00
Jeffrey Morgan
3b4bab3dc5 Fix embeddings load model behavior (#2848) 2024-02-29 17:40:56 -08:00
Daniel Hiltgen
cbd6e3b38e Merge pull request #2838 from dhiltgen/opensuse
Add ollama user to video group
2024-02-29 15:47:56 -08:00
Daniel Hiltgen
b830afa716 Merge pull request #2837 from dhiltgen/podman_image_support
Add env var so podman will map cuda GPUs
2024-02-29 15:47:37 -08:00
Daniel Hiltgen
bd1d8b0d14 Merge pull request #2836 from bmwiedemann/gzip
Omit build date from gzip headers
2024-02-29 15:46:46 -08:00
fred-bf
25c2912120 Add Community Integration: NextChat (#2780) 2024-02-29 12:12:13 -08:00
Michael Yang
0e19476b56 prepend image tags (#2789)
instead of appending image tags, prepend them - this generally produces better results
2024-02-29 11:30:14 -08:00
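
A minimal Go sketch of the idea behind this commit (the helper name and tag layout are illustrative, not code from the change): multimodal prompts reference attached images with markers such as `[img-0]`, and "prepend" means those markers now go in front of the user text rather than after it.

```go
package main

import (
	"fmt"
	"strings"
)

// prependImageTags is a hypothetical helper: emit one [img-N] marker per
// attached image ahead of the prompt text instead of appending them.
func prependImageTags(prompt string, numImages int) string {
	var sb strings.Builder
	for i := 0; i < numImages; i++ {
		fmt.Fprintf(&sb, "[img-%d] ", i)
	}
	sb.WriteString(prompt)
	return sb.String()
}

func main() {
	fmt.Println(prependImageTags("describe this picture", 1))
	// Output: [img-0] describe this picture
}
```
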
tylinux
fa2f2b3563 fix: print usedMemory size right (#2827) 2024-02-29 11:11:04 -08:00
Jeffrey Morgan
cbf4970e0f bump submodule to 87c91c07663b707e831c59ec373b5e665ff9d64a (#2828) 2024-02-29 09:42:08 -08:00
Daniel Hiltgen
74468513bd Add ollama user to video group
On OpenSUSE, ollama needs to be a member of the video group
to access the GPU
2024-02-29 08:50:10 -08:00
Daniel Hiltgen
794a916a72 Add env var so podman will map cuda GPUs
Without this env var, podman's GPU logic doesn't map the GPU through
2024-02-29 08:43:08 -08:00
Bernhard M. Wiedemann
76e5d9ec88 Omit build date from gzip headers
See https://reproducible-builds.org/ for why this is good.

This patch was done while working on reproducible builds for openSUSE.
2024-02-29 16:48:19 +01:00
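
The reproducible-builds concern can be illustrated with Go's compress/gzip (a sketch of the principle, not the patch itself): a gzip stream's header carries an MTIME field, and leaving it zeroed makes repeated builds of the same input byte-identical, the same effect `gzip -n` has on the command line.

```go
package main

import (
	"compress/gzip"
	"log"
	"os"
	"time"
)

func main() {
	out, err := os.Create("payload.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	zw := gzip.NewWriter(out)
	// A zero ModTime is written as MTIME=0 in the gzip header, so rebuilding
	// the same input yields a byte-identical archive (the effect of `gzip -n`).
	zw.ModTime = time.Time{}
	zw.Name = "" // keep the original file name out of the header too
	if _, err := zw.Write([]byte("reproducible payload")); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
}
```
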
Daniel Hiltgen
076237b8ea Merge pull request #2771 from dhiltgen/toggle_models
Bump llama.cpp to b2276
2024-02-27 11:29:53 -08:00
Daniel Hiltgen
53d694c67f Merge pull request #2772 from dhiltgen/container_image
Refine container image build script
2024-02-27 11:29:08 -08:00
Daniel Hiltgen
5aa6bfea94 Merge pull request #2785 from dhiltgen/win_download
Log unexpected server errors checking for update
2024-02-27 10:43:14 -08:00
Daniel Hiltgen
1cde63dd64 Log unexpected server errors checking for update
This should unmask some failure modes that likely
show up in app logs as unmarshal errors
2024-02-27 09:17:04 -08:00
Daniel Hiltgen
98e0b7e94f Refine container image build script
Allow overriding the platform, image name, and tag latest for
standard and rocm images.
2024-02-26 17:26:49 -08:00
Daniel Hiltgen
061e8f6abc Bump llama.cpp to b2276 2024-02-26 16:49:24 -08:00
peanut256
a189810df6 Determine max VRAM on macOS using recommendedMaxWorkingSetSize (#2354)
* read iogpu.wired_limit_mb on macOS

Fix for https://github.com/ollama/ollama/issues/1826

* improved determination of available vram on macOS

read the recommended maximal vram on macOS via Metal API

* Removed macOS-specific logging

* Remove logging from gpu_darwin.go

* release Core Foundation object

fixes a possible memory leak
2024-02-25 18:16:45 -05:00
Ikko Eltociear Ashimine
e95b896790 Update types.go (#2744)
specfied -> specified
2024-02-25 13:41:25 -05:00
elthommy
1f087c4d26 Update langchain python tutorial (#2737)
Remove unused GPT4all
Use nomic-embed-text as embedded model
Fix a deprecation warning (__call__)
2024-02-25 00:31:36 -05:00
Jeffrey Morgan
5d7ea6616f no extra disk space for windows installation (#2739) 2024-02-25 00:20:35 -05:00
Michael Yang
2a4b128ae3 Merge pull request #2719 from ollama/mxyng/format-private-key
remove format private key
2024-02-23 17:15:14 -08:00
Michael Yang
fc483274ad clean up go.mod 2024-02-23 16:53:36 -08:00
Michael Yang
fd10a2ad4b remove format/openssh.go
this is unnecessary now that x/crypto/ssh.MarshalPrivateKey has been
added
2024-02-23 16:52:23 -08:00
Benn Huang
b291f63188 Add Community Integration: Chatbox
Co-authored-by: bennhuang <bennhuang@tencent.com>
2024-02-23 07:17:28 -05:00
Jeffrey Morgan
f58856bf6f better directory cleanup in ollama.iss 2024-02-23 07:14:59 -05:00
Jeffrey Morgan
275ea01587 restore windows build flags and compression 2024-02-22 18:07:18 -05:00
Jeffrey Morgan
8782dd5628 fix build_windows.ps1 script to run go build with the correct flags 2024-02-22 17:41:43 -05:00
Jeffrey Morgan
11bfff8ee1 update llama.cpp submodule to 96633eeca1265ed03e57230de54032041c58f9cd 2024-02-22 16:44:26 -05:00
Logan Yang
7c0167a8f6 Add copilot for obsidian plugin to community integration (#1918) 2024-02-22 14:17:20 -05:00
LangChain4j
74d898e37d Added LangChain4j links (#1690) 2024-02-22 14:09:08 -05:00
Yuan-Man
c6e8b00718 Add README.md (#2249) 2024-02-22 14:03:44 -05:00
B-Tocs.org Community
be9980ef13 Update README.md - Ollama for SAP ABAP (#2510) 2024-02-22 13:12:27 -05:00
Augustinas Malinauskas
646a0dedb9 Update README.md (#2504)
- Enchanted is now supported for desktop on macOS
2024-02-22 13:09:29 -05:00
Azhar Khan
7f964d938c update README to add Gemma 2B, 7B model in Model Library Table (#2686) 2024-02-22 13:07:47 -05:00
Pavel Frankov
e6b8a139ff Update README.md (#2138) 2024-02-22 10:52:36 -05:00
Jeffrey Morgan
bdc0ea1ba5 Update import.md 2024-02-22 02:08:03 -05:00
Jeffrey Morgan
7fab7918cc Update import.md 2024-02-22 02:06:24 -05:00
Michael Yang
74c1bdba0d Merge pull request #2657 from joshyan1/patch-1
Update install.sh success message
2024-02-21 15:55:20 -08:00
Josh
f983ef7f5f Update install.sh success message 2024-02-21 18:30:01 -05:00
Jeffrey Morgan
1ae1c33651 Windows build + installer adjustments (#2656)
* remove `-w -s` linker flags on windows

* use `zip` for windows installer compression
2024-02-21 18:21:26 -05:00
Jeffrey Morgan
efe040f8c0 reset with init_vars ahead of each cpu build in gen_windows.ps1 (#2654) 2024-02-21 16:35:34 -05:00
Jeffrey Morgan
2a7553ce09 update llama.cpp submodule to c14f72d 2024-02-21 09:03:14 -05:00
Sun Bo
10af6070a9 Update big-AGI config file link (#2626)
Co-authored-by: bo.sun <bo.sun@cotticoffee.com>
2024-02-21 01:24:48 -05:00
Jeffrey Morgan
92423b0600 add dist directory in build_windows.ps 2024-02-21 00:05:05 -05:00
Jeffrey Morgan
b3eac61cac update llama.cpp submodule to f0d1fafc029a056cd765bdae58dcaa12312e9879 2024-02-20 22:56:51 -05:00
Jeffrey Morgan
287ba11500 better error message when calling /api/generate or /api/chat with embedding models 2024-02-20 21:53:45 -05:00
Jeffrey Morgan
63861f58cc Support for bert and nomic-bert embedding models 2024-02-20 21:37:29 -05:00
Jeffrey Morgan
f0425d3de9 Update faq.md 2024-02-20 20:44:45 -05:00
Michael Yang
210b65268e replace strings buffer with hasher (#2437)
the buffered value is going into the hasher eventually so write directly
to the hasher instead
2024-02-20 19:07:50 -05:00
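
In Go terms the reasoning reads as follows (a sketch with made-up metadata strings, not the commit's code): `hash.Hash` implements `io.Writer`, so buffering everything first only adds an allocation; the digest is the same either way.

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"strings"
)

func main() {
	parts := []string{"general.architecture=llama", "general.file_type=2"}

	// Before: accumulate everything in a buffer, then hash the result.
	var sb strings.Builder
	for _, p := range parts {
		sb.WriteString(p)
	}
	buffered := sha256.Sum256([]byte(sb.String()))

	// After: stream each piece straight into the hasher.
	h := sha256.New()
	for _, p := range parts {
		io.WriteString(h, p)
	}

	fmt.Printf("%x\n%x\n", buffered, h.Sum(nil)) // identical digests
}
```
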
Michael Yang
949d7b1c48 add gguf file types (#2532) 2024-02-20 19:06:29 -05:00
Michael Yang
897b213468 use http.DefaultClient (#2530)
default client already handles proxy
2024-02-20 18:34:47 -05:00
Jeffrey Morgan
4613a080e7 update llama.cpp submodule to 66c1968f7 (#2618) 2024-02-20 17:42:31 -05:00
Muhammed Nazeem
ace2cdf1c6 Add Page Assist to the community integrations (#2447) 2024-02-20 14:03:58 -05:00
Nikesh Parajuli
eed92bc19a docs: add Msty app in readme (#1775)
* docs: add Msty app in readme

* docs: update msty url
2024-02-20 14:03:33 -05:00
Michael Edoror
e0a2f46466 Update README.md to include Elixir LangChain Library (#2180)
The Elixir LangChain Library now supports Ollama Chat with this [PR](https://github.com/brainlid/langchain/pull/70)
2024-02-20 14:03:02 -05:00
Taras Tsugrii
01ff2e14db [nit] Remove unused msg local var. (#2511) 2024-02-20 14:02:34 -05:00
58 changed files with 320 additions and 1951 deletions

Dockerfile

@@ -132,6 +132,7 @@ ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_VISIBLE_DEVICES=all
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

README.md

@@ -62,6 +62,8 @@ Here are some example models that can be downloaded:
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
| Vicuna | 7B | 3.8GB | `ollama run vicuna` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -258,19 +260,23 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Web & Desktop
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Open WebUI](https://github.com/open-webui/open-webui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
- [big-AGI](https://github.com/enricoros/big-AGI/blob/main/docs/config-local-ollama.md)
- [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
- [Amica](https://github.com/semperai/amica)
- [chatd](https://github.com/BruceMacD/chatd)
- [Ollama-SwiftUI](https://github.com/kghandour/Ollama-SwiftUI)
- [MindMac](https://mindmac.app)
- [NextJS Web Interface for Ollama](https://github.com/jakobhoeg/nextjs-ollama-llm-ui)
- [Msty](https://msty.app)
- [Chatbox](https://github.com/Bin-Huang/Chatbox)
- [NextChat](https://github.com/ChatGPTNextWeb/ChatGPT-Next-Web) with [Get Started Doc](https://docs.nextchat.dev/models/ollama)
### Terminal
@@ -301,6 +307,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LangChain4j](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-ollama)
- [LiteLLM](https://github.com/BerriAI/litellm)
@@ -315,8 +322,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChainDart](https://github.com/davidmigloz/langchain_dart)
- [Semantic Kernel - Python](https://github.com/microsoft/semantic-kernel/tree/main/python/semantic_kernel/connectors/ai/ollama)
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
### Mobile
@@ -337,6 +346,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)

api/client.go

@@ -21,7 +21,7 @@ import (
type Client struct {
base *url.URL
http http.Client
http *http.Client
}
func checkError(resp *http.Response, body []byte) error {
@@ -66,30 +66,13 @@ func ClientFromEnvironment() (*Client, error) {
}
}
client := Client{
return &Client{
base: &url.URL{
Scheme: scheme,
Host: net.JoinHostPort(host, port),
},
}
mockRequest, err := http.NewRequest(http.MethodHead, client.base.String(), nil)
if err != nil {
return nil, err
}
proxyURL, err := http.ProxyFromEnvironment(mockRequest)
if err != nil {
return nil, err
}
client.http = http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyURL),
},
}
return &client, nil
http: http.DefaultClient,
}, nil
}
func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
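
The deleted proxy plumbing duplicated what the standard library already does. A small standalone sketch of why `http.DefaultClient` suffices:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// http.DefaultClient has a nil Transport, so requests go through
	// http.DefaultTransport, which is constructed with
	// Proxy: http.ProxyFromEnvironment. HTTP_PROXY, HTTPS_PROXY and
	// NO_PROXY are therefore honored with no extra wiring.
	t := http.DefaultTransport.(*http.Transport)
	fmt.Println("proxy func set:", t.Proxy != nil) // true

	resp, err := http.DefaultClient.Get("https://example.com")
	if err != nil {
		fmt.Println(err)
		return
	}
	resp.Body.Close()
	fmt.Println(resp.Status)
}
```
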

api/types.go

@@ -83,7 +83,7 @@ type Metrics struct {
EvalDuration time.Duration `json:"eval_duration,omitempty"`
}
// Options specfied in GenerateRequest, if you add a new option here add it to the API docs also
// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
type Options struct {
Runner
@@ -121,7 +121,6 @@ type Runner struct {
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
EmbeddingOnly bool `json:"embedding_only,omitempty"`
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
NumThread int `json:"num_thread,omitempty"`
@@ -395,7 +394,6 @@ func DefaultOptions() Options {
UseMLock: false,
UseMMap: true,
UseNUMA: false,
EmbeddingOnly: true,
},
}
}

app/lifecycle/updater.go

@@ -34,20 +34,6 @@ type UpdateResponse struct {
UpdateVersion string `json:"version"`
}
func getClient(req *http.Request) http.Client {
proxyURL, err := http.ProxyFromEnvironment(req)
if err != nil {
slog.Warn(fmt.Sprintf("failed to handle proxy: %s", err))
return http.Client{}
}
return http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyURL),
},
}
}
func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
var updateResp UpdateResponse
@@ -83,10 +69,9 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
}
req.Header.Set("Authorization", signature)
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
client := getClient(req)
slog.Debug("checking for available update", "requestURL", requestURL)
resp, err := client.Do(req)
resp, err := http.DefaultClient.Do(req)
if err != nil {
slog.Warn(fmt.Sprintf("failed to check for update: %s", err))
return false, updateResp
@@ -101,6 +86,11 @@ func IsNewReleaseAvailable(ctx context.Context) (bool, UpdateResponse) {
if err != nil {
slog.Warn(fmt.Sprintf("failed to read body response: %s", err))
}
if resp.StatusCode != 200 {
slog.Info(fmt.Sprintf("check update error %d - %.96s", resp.StatusCode, string(body)))
return false, updateResp
}
err = json.Unmarshal(body, &updateResp)
if err != nil {
slog.Warn(fmt.Sprintf("malformed response checking for update: %s", err))
@@ -119,8 +109,8 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
if err != nil {
return err
}
client := getClient(req)
resp, err := client.Do(req)
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("error checking update: %w", err)
}
@@ -151,7 +141,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
cleanupOldDownloads()
req.Method = http.MethodGet
resp, err = client.Do(req)
resp, err = http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("error checking update: %w", err)
}
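
Condensed, the new update-check flow follows the pattern sketched below (the `updateResponse` fields are illustrative): failing fast on a non-200 status logs the real server error instead of letting an HTML error page reach `json.Unmarshal`.

```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

type updateResponse struct {
	UpdateURL string `json:"url"`     // illustrative field
	Version   string `json:"version"` // illustrative field
}

func checkForUpdate(url string) (*updateResponse, error) {
	resp, err := http.DefaultClient.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	// Guard before unmarshalling: a 500 carrying an HTML error page should
	// read as "check update error 500", not "invalid character '<'".
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("check update error %d - %.96s", resp.StatusCode, body)
	}

	var ur updateResponse
	if err := json.Unmarshal(body, &ur); err != nil {
		return nil, err
	}
	return &ur, nil
}

func main() {
	if _, err := checkForUpdate("https://example.com/api/update"); err != nil {
		fmt.Println(err)
	}
}
```
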

ollama.iss

@@ -49,9 +49,6 @@ SetupLogging=yes
CloseApplications=yes
RestartApplications=no
; Make sure they can at least download llama2 as a minimum
ExtraDiskSpaceRequired=3826806784
; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
WizardSmallImageFile=.\assets\setup.bmp
@@ -116,7 +113,8 @@ Filename: "{cmd}"; Parameters: "/c timeout 5"; Flags: runhidden
Type: filesandordirs; Name: "{%TEMP}\ollama*"
Type: filesandordirs; Name: "{%LOCALAPPDATA}\Ollama"
Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
Type: filesandordirs; Name: "{%USERPROFILE}\.ollama"
Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\models"
Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history"
; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved
[Messages]

cmd/cmd.go

@@ -718,39 +718,36 @@ func initializeKeypair() error {
_, err = os.Stat(privKeyPath)
if os.IsNotExist(err) {
fmt.Printf("Couldn't find '%s'. Generating new private key.\n", privKeyPath)
_, privKey, err := ed25519.GenerateKey(rand.Reader)
cryptoPublicKey, cryptoPrivateKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
return err
}
privKeyBytes, err := format.OpenSSHPrivateKey(privKey, "")
privateKeyBytes, err := ssh.MarshalPrivateKey(cryptoPrivateKey, "")
if err != nil {
return err
}
err = os.MkdirAll(filepath.Dir(privKeyPath), 0o755)
if err != nil {
if err := os.MkdirAll(filepath.Dir(privKeyPath), 0o755); err != nil {
return fmt.Errorf("could not create directory %w", err)
}
err = os.WriteFile(privKeyPath, pem.EncodeToMemory(privKeyBytes), 0o600)
if err := os.WriteFile(privKeyPath, pem.EncodeToMemory(privateKeyBytes), 0o600); err != nil {
return err
}
sshPublicKey, err := ssh.NewPublicKey(cryptoPublicKey)
if err != nil {
return err
}
sshPrivateKey, err := ssh.NewSignerFromKey(privKey)
if err != nil {
publicKeyBytes := ssh.MarshalAuthorizedKey(sshPublicKey)
if err := os.WriteFile(pubKeyPath, publicKeyBytes, 0o644); err != nil {
return err
}
pubKeyData := ssh.MarshalAuthorizedKey(sshPrivateKey.PublicKey())
err = os.WriteFile(pubKeyPath, pubKeyData, 0o644)
if err != nil {
return err
}
fmt.Printf("Your new public key is: \n\n%s\n", string(pubKeyData))
fmt.Printf("Your new public key is: \n\n%s\n", publicKeyBytes)
}
return nil
}
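
The rewrite relies on `ssh.MarshalPrivateKey`, available in recent golang.org/x/crypto releases, which is what makes the vendored `format` package (deleted further down) unnecessary. A minimal standalone sketch of the same generate-and-marshal flow:

```go
package main

import (
	"crypto/ed25519"
	"crypto/rand"
	"encoding/pem"
	"fmt"

	"golang.org/x/crypto/ssh"
)

func main() {
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}

	// x/crypto/ssh can now marshal OpenSSH private keys directly.
	block, err := ssh.MarshalPrivateKey(priv, "")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", pem.EncodeToMemory(block))

	// The authorized_keys form of the public half, as written to id_ed25519.pub.
	sshPub, err := ssh.NewPublicKey(pub)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s", ssh.MarshalAuthorizedKey(sshPub))
}
```
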

docs/api.md

@@ -321,7 +321,6 @@ curl http://localhost:11434/api/generate -d '{
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
"embedding_only": false,
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8

docs/faq.md

@@ -113,9 +113,9 @@ If a different directory needs to be used, set the environment variable `OLLAMA_
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
## Does Ollama send my prompts and answers back to ollama.com?
No, Ollama runs entirely locally, and conversation data will never leave your machine.
No. Ollama runs locally, and conversation data does not leave your machine.
## How can I use Ollama in Visual Studio Code?

docs/import.md

@@ -124,7 +124,10 @@ ollama run example "What is your favourite condiment?"
Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:
1. Create [an account](https://ollama.com/signup)
2. Run `cat ~/.ollama/id_ed25519.pub` (or `type %USERPROFILE%\.ollama\id_ed25519.pub` on Windows) to view your Ollama public key. Copy this to the clipboard.
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
Next, copy your model to your username's namespace:

docs/tutorials/langchainpy.md

@@ -42,12 +42,12 @@ text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
```
It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install GPT4All chromadb`
It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`
```python
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="llama2")
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
```
@@ -66,7 +66,7 @@ The next thing is to send the question and the relevant parts of the docs to the
```python
from langchain.chains import RetrievalQA
qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
qachain({"query": question})
qachain.invoke({"query": question})
```
The answer received from this chain was:


@@ -0,0 +1,21 @@
# Ollama Chat App
Build a Llama2 chat app using Streamlit and Ollama.
## Running the Example
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama2
```
2. Install the Python Requirements.
```bash
pip install -r requirements.txt
```
3. Run the example:
```bash
python main.py
```

format/openssh.go

@@ -1,102 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code originally from https://go-review.googlesource.com/c/crypto/+/218620
// TODO: replace with upstream once the above change is merged and released.
package format
import (
"crypto"
"crypto/ed25519"
"crypto/rand"
"encoding/binary"
"encoding/pem"
"fmt"
"golang.org/x/crypto/ssh"
)
const privateKeyAuthMagic = "openssh-key-v1\x00"
type openSSHEncryptedPrivateKey struct {
CipherName string
KDFName string
KDFOptions string
KeysCount uint32
PubKey []byte
KeyBlocks []byte
}
type openSSHPrivateKey struct {
Check1 uint32
Check2 uint32
Keytype string
Rest []byte `ssh:"rest"`
}
type openSSHEd25519PrivateKey struct {
Pub []byte
Priv []byte
Comment string
Pad []byte `ssh:"rest"`
}
func OpenSSHPrivateKey(key crypto.PrivateKey, comment string) (*pem.Block, error) {
var check uint32
if err := binary.Read(rand.Reader, binary.BigEndian, &check); err != nil {
return nil, err
}
var pk1 openSSHPrivateKey
pk1.Check1 = check
pk1.Check2 = check
var w openSSHEncryptedPrivateKey
w.KeysCount = 1
if k, ok := key.(*ed25519.PrivateKey); ok {
key = *k
}
switch k := key.(type) {
case ed25519.PrivateKey:
pub, priv := k[32:], k
key := openSSHEd25519PrivateKey{
Pub: pub,
Priv: priv,
Comment: comment,
}
pk1.Keytype = ssh.KeyAlgoED25519
pk1.Rest = ssh.Marshal(key)
w.PubKey = ssh.Marshal(struct {
KeyType string
Pub []byte
}{
ssh.KeyAlgoED25519, pub,
})
default:
return nil, fmt.Errorf("ssh: unknown key type %T", k)
}
w.KeyBlocks = openSSHPadding(ssh.Marshal(pk1), 8)
w.CipherName, w.KDFName, w.KDFOptions = "none", "none", ""
return &pem.Block{
Type: "OPENSSH PRIVATE KEY",
Bytes: append([]byte(privateKeyAuthMagic), ssh.Marshal(w)...),
}, nil
}
func openSSHPadding(block []byte, blocksize int) []byte {
for i, j := 0, len(block); (j+i)%blocksize != 0; i++ {
block = append(block, byte(i+1))
}
return block
}

go.mod

@@ -3,8 +3,10 @@ module github.com/jmorganca/ollama
go 1.21
require (
github.com/containerd/console v1.0.3
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.9.1
github.com/google/uuid v1.0.0
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
@@ -12,26 +14,12 @@ require (
)
require (
github.com/containerd/console v1.0.3 // indirect
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 // indirect
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 // indirect
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 // indirect
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 // indirect
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 // indirect
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f // indirect
github.com/getlantern/systray v1.2.2 // indirect
github.com/go-stack/stack v1.8.0 // indirect
github.com/google/uuid v1.0.0 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
github.com/pborman/uuid v1.2.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
)
require (
github.com/bytedance/sonic v1.9.1 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect

go.sum

@@ -7,8 +7,6 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583j
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa h1:Wg+722vs7a2zQH5lR9QWYsVbplKeffaQFIs5FTdfNNo=
github.com/cratonica/2goarray v0.0.0-20190331194516-514510793eaa/go.mod h1:6Arca19mRx58CA7OWEd7Wu1NpC1rd3uDnNs6s1pj/DI=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@@ -17,20 +15,6 @@ github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520 h1:NRUJuo3v3WGC/g5YiyF790gut6oQr5f3FBI88Wv0dx4=
github.com/getlantern/context v0.0.0-20190109183933-c447772a6520/go.mod h1:L+mq6/vvYHKjCX2oez0CgEAJmbq1fbb/oNJIWQkBybY=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7 h1:6uJ+sZ/e03gkbqZ0kUG6mfKoqDb4XMAzMIwlajq19So=
github.com/getlantern/errors v0.0.0-20190325191628-abdb3e3e36f7/go.mod h1:l+xpFBrCtDLpK9qNjxs+cHU6+BAdlBaxHqikB6Lku3A=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7 h1:guBYzEaLz0Vfc/jv0czrr2z7qyzTOGC9hiQ0VC+hKjk=
github.com/getlantern/golog v0.0.0-20190830074920-4ef2e798c2d7/go.mod h1:zx/1xUUeYPy3Pcmet8OSXLbF47l+3y6hIPpyLWoR9oc=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7 h1:micT5vkcr9tOVk1FiH8SWKID8ultN44Z+yzd2y/Vyb0=
github.com/getlantern/hex v0.0.0-20190417191902-c6586a6fe0b7/go.mod h1:dD3CgOrwlzca8ed61CsZouQS5h5jIzkK9ZWrTcf0s+o=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55 h1:XYzSdCbkzOC0FDNrgJqGRo8PCMFOBFL9py72DRs7bmc=
github.com/getlantern/hidden v0.0.0-20190325191715-f02dbb02be55/go.mod h1:6mmzY2kW1TOOrVy+r41Za2MxXM+hhqTtY3oBKd2AgFA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f h1:wrYrQttPS8FHIRSlsrcuKazukx/xqO/PpLZzZXsF+EA=
github.com/getlantern/ops v0.0.0-20190325191751-d70cb0d6f85f/go.mod h1:D5ao98qkA6pxftxoqzibIBBrLSUli+kYnJqrgBf9cIA=
github.com/getlantern/systray v1.2.2 h1:dCEHtfmvkJG7HZ8lS/sLklTH4RKUcIsKrAD9sThoEBE=
github.com/getlantern/systray v1.2.2/go.mod h1:pXFOI1wwqwYXEhLPm9ZGjS2u/vVELeIgNMY5HvhHhcE=
github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
github.com/gin-contrib/cors v1.4.0/go.mod h1:bs9pNM0x/UsmHPBWT2xZz9ROh8xYjYkiURUfmBoMlcs=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
@@ -40,6 +24,7 @@ github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
@@ -49,8 +34,6 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91
github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos=
github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
@@ -79,8 +62,6 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY=
github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
github.com/lxn/walk v0.0.0-20210112085537-c389da54e794/go.mod h1:E23UucZGqpuUANJooIbHWCufXvOcT6E7Stq81gU+CSQ=
github.com/lxn/win v0.0.0-20210218163916-a377121e959e/go.mod h1:KxxjdtRkfNoYDCUP5ryK7XJJNTnpC8atvtmTheChOtk=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
@@ -94,12 +75,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw=
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pborman/uuid v1.2.1 h1:+ZZIw58t/ozdjRaXh/3awHfmWRbzYxJoAdNJxe/3pvw=
github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
@@ -112,7 +89,6 @@ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE
github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
@@ -149,14 +125,12 @@ golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -173,7 +147,6 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/Knetic/govaluate.v3 v3.0.0/go.mod h1:csKLBORsPbafmSCGTEh3U7Ozmsuq8ZSIlKk1bcqph0E=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=

gpu/gpu_darwin.go

@@ -1,12 +1,14 @@
//go:build darwin
package gpu
/*
#cgo CFLAGS: -x objective-c
#cgo LDFLAGS: -framework Foundation -framework CoreGraphics -framework Metal
#include "gpu_info_darwin.h"
*/
import "C"
import (
"runtime"
"github.com/pbnjay/memory"
)
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
@@ -15,19 +17,8 @@ func CheckVRAM() (int64, error) {
// gpu not supported, this may not be metal
return 0, nil
}
// on macOS, there's already buffer for available vram (see below) so just return the total
systemMemory := int64(memory.TotalMemory())
// macOS limits how much memory is available to the GPU based on the amount of system memory
// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
if systemMemory <= 36*1024*1024*1024 {
systemMemory = systemMemory * 2 / 3
} else {
systemMemory = systemMemory * 3 / 4
}
return systemMemory, nil
recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
return recommendedMaxVRAM, nil
}
func GetGPUInfo() GpuInfo {

gpu/gpu_info_cuda.c

@@ -156,7 +156,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
}
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.used);
resp->total += memInfo.total;
resp->free += memInfo.free;

gpu/gpu_info_darwin.h (new file)

@@ -0,0 +1,3 @@
#import <Metal/Metal.h>
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();

gpu/gpu_info_darwin.m (new file)

@@ -0,0 +1,11 @@
//go:build darwin
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM()
{
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
}

llm/dyn_ext_server.c

@@ -1,145 +0,0 @@
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
inline char *LOAD_ERR() {
LPSTR messageBuffer = NULL;
size_t size = FormatMessageA(
FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPSTR)&messageBuffer, 0, NULL);
char *resp = strdup(messageBuffer);
LocalFree(messageBuffer);
return resp;
}
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

llm/dyn_ext_server.go

@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
sparams.numa = C.bool(opts.UseNUMA)
if opts.UseNUMA {
sparams.numa = C.int(1)
} else {
sparams.numa = C.int(0)
}
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
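
The `numa` field is now an integer on the C side (the llama.cpp bump reworked NUMA handling from a plain on/off flag, as far as this diff shows), and cgo performs no implicit bool-to-int conversion, hence the explicit branch. A reduced sketch with a hypothetical one-field struct:

```go
package main

/*
// Hypothetical one-field stand-in for the real ext_server_params_t; the
// actual struct lives in the C headers this file binds against.
typedef struct { int numa; } ext_server_params_t;
*/
import "C"

import "fmt"

func main() {
	useNUMA := true

	var sparams C.ext_server_params_t
	// cgo has no implicit bool-to-int conversion, so branch explicitly,
	// mirroring the diff above.
	if useNUMA {
		sparams.numa = C.int(1)
	} else {
		sparams.numa = C.int(0)
	}
	fmt.Println("numa =", sparams.numa)
}
```
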

llm/dyn_ext_server.h

@@ -1,74 +0,0 @@
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
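
The comment in this deleted header, "No good way to call C function pointers from Go so inline the indirection", names a standard cgo workaround: Go cannot invoke a C function pointer directly, so each pointer gets a plain C wrapper function. A self-contained sketch of the pattern (toy symbols, unrelated to the llama server API):

```go
package main

/*
typedef int (*adder_t)(int, int);

static int add_impl(int a, int b) { return a + b; }

// Stands in for a symbol resolved at runtime with dlsym/GetProcAddress.
static adder_t get_adder(void) { return add_impl; }

// Go cannot call a C function pointer directly, so "inline the indirection":
// wrap the call in an ordinary C function that Go can call.
static int call_adder(adder_t fn, int a, int b) { return fn(a, b); }
*/
import "C"

import "fmt"

func main() {
	fn := C.get_adder()                 // opaque function pointer on the Go side
	fmt.Println(C.call_adder(fn, 2, 3)) // prints 5
}
```
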

llm/ext_server/CMakeLists.txt

@@ -1,25 +0,0 @@
# Ollama specific CMakefile to include in llama.cpp/examples/server
set(TARGET ext_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
if (WIN32)
add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
else()
add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
endif()
target_include_directories(${TARGET} PRIVATE ../../common)
target_include_directories(${TARGET} PRIVATE ../..)
target_include_directories(${TARGET} PRIVATE ../../..)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()

llm/ext_server/README.md

@@ -1,18 +0,0 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```

llm/ext_server/ext_server.cpp

@@ -1,381 +0,0 @@
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init(params.numa);
// load the model
if (!llama->load_model(params)) {
// TODO - consider modifying the logging logic or patching load_model so
// we can capture more detailed error messages and pass them back to the
// caller for better UX
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s",
params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_all_tasks_finished(std::bind(
&llama_server_context::run_on_all_tasks_finished, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
std::string msg;
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}

llm/ext_server/ext_server.h

@@ -1,95 +0,0 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg;
} ext_server_resp_t;
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decode
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 = use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
bool numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
typedef struct ext_server_task_result {
int id;
bool stop;
bool error;
char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = '\0'
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);
// Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY
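A hedged sketch of the call sequence the comments above describe — init once per process, start the loop, submit a completion, drain results by task ID, then stop. Compiled against the header above; the prompt and buffer size are illustrative, and the JSON request shape follows llama.cpp's completion endpoint:

void example_completion(ext_server_params_t *sparams) {
  char buf[256];
  ext_server_resp_t resp = {0, sizeof(buf), buf};

  llama_server_init(sparams, &resp);
  if (resp.id != 0) return; // resp.msg holds the failure reason
  llama_server_start();

  llama_server_completion("{\"prompt\": \"Why is the sky blue?\"}", &resp);
  if (resp.id >= 0) { // resp.id is the task ID on success
    ext_server_task_result_t result;
    do {
      llama_server_completion_next_result(resp.id, &result);
      // stream result.json_resp to the caller here...
      llama_server_release_task_result(&result);
    } while (!result.stop && !result.error);
  }

  llama_server_stop();
}

Cancellation uses the same task-ID handle: llama_server_completion_cancel(resp.id, &resp).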

View File

@@ -1,125 +0,0 @@
# common logic across linux and darwin
init_vars() {
case "${GOARCH}" in
"amd64")
ARCH="x86_64"
;;
"arm64")
ARCH="arm64"
;;
*)
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
esac
LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else
# TODO - add additional optimization flags...
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
fi
case $(uname -s) in
"Darwin")
LIB_EXT="dylib"
WHOLE_ARCHIVE="-Wl,-force_load"
NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}"
;;
"Linux")
LIB_EXT="so"
WHOLE_ARCHIVE="-Wl,--whole-archive"
NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
# Cross compiling not supported on linux - Use docker
GCC_ARCH=""
;;
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
git_module_setup() {
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
echo "Skipping submodule initialization"
return
fi
# Make sure the tree is clean after the directory moves
if [ -d "${LLAMACPP_DIR}/gguf" ]; then
echo "Cleaning up old submodule"
rm -rf ${LLAMACPP_DIR}
fi
git submodule init
git submodule update --force ${LLAMACPP_DIR}
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
}
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
}
compress_libs() {
echo "Compressing payloads to reduce overall binary size..."
pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
gzip --best -f ${lib} &
pids+=" $!"
done
echo
for pid in ${pids}; do
wait $pid
done
echo "Finished compression"
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
if [ -n "$(ls -A ../patches/*.diff)" ]; then
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
fi
}

View File

@@ -1,77 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/
# TODO - add hardening to detect missing tools (cmake, etc.)
set -ex
set -o pipefail
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
sign() {
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
fi
}
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
case "${GOARCH}" in
"amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
echo "Building LCD CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
compress_libs
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
compress_libs
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
compress_libs
;;
"arm64")
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
compress_libs
;;
*)
echo "GOARCH must be set"
echo "this script is meant to be run from within go generate"
exit 1
;;
esac
cleanup

View File

@@ -1,196 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/
# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them as payload
set -ex
set -o pipefail
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
if [ -n "${AMDGPU_TARGETS}" ]; then
echo "${AMDGPU_TARGETS}"
return
fi
GPU_LIST=(
"gfx900"
"gfx906:xnack-"
"gfx908:xnack-"
"gfx90a:xnack+"
"gfx90a:xnack-"
"gfx1010"
"gfx1012"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
)
(
IFS=$';'
echo "'${GPU_LIST[*]}'"
)
}
echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
if [ -x /usr/local/cuda/bin/nvcc ]; then
export CUDACXX=/usr/local/cuda/bin/nvcc
else
# Try the default location in case it exists
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building custom CPU"
build
compress_libs
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
echo "Building LCD CPU"
build
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU"
build
compress_libs
fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU"
build
compress_libs
fi
fi
else
echo "Skipping CPU generation step as requested"
fi
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
if [ -d "${CUDA_LIB_DIR}" ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
# Carry the CUDA libs as payloads to help reduce dependency burden on users
#
# TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
fi
done
compress_libs
fi
if [ -z "${ROCM_PATH}" ]; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build
# Note: the ROCM libs and runtime library files are too large to embed, so we depend on
# them being present at runtime on the host
compress_libs
fi
cleanup

View File

@@ -1,206 +0,0 @@
#!powershell
$ErrorActionPreference = "Stop"
function init_vars {
$script:SRC_DIR = $(resolve-path "..\..\")
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64")
$script:cmakeTargets = @("ext_server")
$script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
$script:config = "RelWithDebInfo"
} else {
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
$script:config = "Release"
}
# Try to find the CUDA dir
if ($env:CUDA_LIB_DIR -eq $null) {
$d=(get-command -ea 'silentlycontinue' nvcc).path
if ($d -ne $null) {
$script:CUDA_LIB_DIR=($d| split-path -parent)
$script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
}
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
# Note: the Windows 10 Kit signtool crashes with GCP's plugin
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
if ("${env:KEY_CONTAINER}") {
${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
}
}
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force "${script:llamacppDir}"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
}
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
}
function build {
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
function install {
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
}
}
function sign {
if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
}
function compress_libs {
if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files"
return
}
write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll"
foreach ($file in $libs) {
& "$script:GZIP" --best -f $file
}
}
function cleanup {
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) {
git checkout $file
}
}
Set-Location "${script:llamacppDir}/examples/server"
git checkout CMakeLists.txt server.cpp
}
init_vars
git_module_setup
apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU"
build
install
sign
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU"
build
install
sign
compress_libs
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
sign
compress_libs
if ($null -ne $script:CUDA_LIB_DIR) {
# Then build cuda as a dynamically loaded library
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
if ($null -ne $script:CUDA_VERSION) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
build
install
sign
compress_libs
}
# TODO - actually implement ROCm support on windows
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
md "${script:buildDir}/lib" -ea 0 > $null
echo $null >> "${script:buildDir}/lib/.generated"
cleanup
write-host "`ngo generate completed"

View File

@@ -1,3 +0,0 @@
package generate
//go:generate sh ./gen_darwin.sh

View File

@@ -1,3 +0,0 @@
package generate
//go:generate bash ./gen_linux.sh

View File

@@ -1,3 +0,0 @@
package generate
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1

View File

@@ -31,6 +31,11 @@ const (
 	fileTypeQ5_K_S
 	fileTypeQ5_K_M
 	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeQ3_K_XS
+	fileTypeIQ3_XXS
 )

 func fileType(fileType uint32) string {
@@ -69,6 +74,16 @@ func fileType(fileType uint32) string {
 		return "Q5_K_M"
 	case fileTypeQ6_K:
 		return "Q6_K"
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
+		return "Q2_K_S"
+	case fileTypeQ3_K_XS:
+		return "Q3_K_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
 	default:
 		return "unknown"
 	}
View File

@@ -115,6 +115,14 @@ func (t tensor) typeSize() uint64 {
 		return 2 + 2 + 12 + blockSize/8 + blockSize/2
 	case 14: // Q6_K
 		return blockSize/2 + blockSize/4 + blockSize/16 + 2
+	case 15: // Q8_K
+		return 2 + blockSize + 2*blockSize/16
+	case 16: // IQ2_XXS
+		return 2 + 2*blockSize/8
+	case 17: // IQ2_XS
+		return 2 + 2*blockSize/8 + blockSize/32
+	case 18: // IQ3_XXS
+		return 2 + 3*blockSize/8
 	default:
 		return 0
 	}
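For a sense of scale, assuming the k-quant block size of 256 (QK_K in ggml), the per-block byte counts above work out to:

Q6_K:    256/2 + 256/4 + 256/16 + 2 = 210 bytes per 256 weights, about 6.56 bits/weight
Q8_K:    2 + 256 + 2*256/16 = 290 bytes per 256 weights, about 9.06 bits/weight
IQ2_XXS: 2 + 2*256/8 = 66 bytes per 256 weights, about 2.06 bits/weight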

Submodule llm/llama.cpp deleted from 6c00a06692

View File

@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime"
+	"time"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
@@ -165,3 +166,12 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto
 	return nil, err2
 }
+
+func parseDurationMs(ms float64) time.Duration {
+	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
+	if err != nil {
+		panic(err)
+	}
+	return dur
+}

llm/llm_darwin_amd64.go Normal file
View File

@@ -0,0 +1,14 @@
//go:generate cmake -S server -B server/build/cpu -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off
//go:generate cmake -S server -B server/build/cpu_avx -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on
//go:generate cmake -S server -B server/build/cpu_avx2 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=on
//go:generate cmake --build server/build/cpu --target server -- -j4
//go:generate cmake --build server/build/cpu_avx --target server -- -j4
//go:generate cmake --build server/build/cpu_avx2 --target server -- -j4
package llm
import "embed"
//go:embed server/build/cpu/server
//go:embed server/build/cpu_avx/server
//go:embed server/build/cpu_avx2/server
var libEmbed embed.FS

llm/llm_darwin_arm64.go Normal file
View File

@@ -0,0 +1,8 @@
//go:generate cmake -S server -B server/build/metal -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64
//go:generate cmake --build server/build/metal --target server -- -j4
package llm
import "embed"
//go:embed server/build/metal/ggml-metal.metal server/build/metal/server
var libEmbed embed.FS

View File

@@ -1,96 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
+#include <signal.h>
using json = nlohmann::json;
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
- llama.queue_tasks.start_loop();
+ shutdown_handler = [&](int) {
+ llama.queue_tasks.terminate();
+ };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+ struct sigaction sigint_action;
+ sigint_action.sa_handler = signal_handler;
+ sigemptyset (&sigint_action.sa_mask);
+ sigint_action.sa_flags = 0;
+ sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+ return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+ };
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+ llama.queue_tasks.start_loop();
+ svr.stop();
t.join();
llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
+ bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
- // Start the main loop. This call is blocking
- [[noreturn]]
+ // end the start_loop routine
+ void terminate() {
+ {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ running = false;
+ }
+ condition_tasks.notify_all();
+ }
+
+ // Start the main loop.
void start_loop() {
+ running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
+ if (!running) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
condition_tasks.wait(lock, [&]{
- return !queue_tasks.empty();
+ return (!queue_tasks.empty() || !running);
});
}
}
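The pattern this patch introduces, distilled into a standalone sketch (not the llama.cpp code itself): a worker loop that blocks on a condition variable and is woken either by new work or by terminate() flipping a running flag under the same mutex:

#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

struct stoppable_queue {
  std::mutex m;
  std::condition_variable cv;
  std::deque<std::function<void()>> tasks;
  bool running = false;

  void terminate() {
    { std::lock_guard<std::mutex> lock(m); running = false; }
    cv.notify_all(); // wake the loop so it observes running == false
  }

  void start_loop() {
    running = true;
    while (true) {
      std::unique_lock<std::mutex> lock(m);
      cv.wait(lock, [&] { return !tasks.empty() || !running; });
      if (!running) return; // terminate() was called
      auto task = std::move(tasks.front());
      tasks.pop_front();
      lock.unlock(); // run the task without holding the queue lock
      task();
    }
  }
};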

View File

@@ -1,116 +0,0 @@
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}
bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
- if (!initialized) {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#endif
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -20,6 +20,9 @@ extern "C" {
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
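The essence of this change, as a standalone sketch rather than the ggml source: promoting the function-local initialized flag to a global lets an explicit free routine reset it, so initialization and teardown can be paired across the library boundary:

static bool g_initialized = false;

void backend_init(void) {
  if (g_initialized) return;
  // ... create handles, query devices ...
  g_initialized = true;
}

void backend_free(void) {
  if (!g_initialized) return;
  // ... destroy handles, unmap memory pools ...
  g_initialized = false; // allows a later backend_init to run again
}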

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/linux/*/*/lib/*.so*
var libEmbed embed.FS

View File

@@ -1,58 +0,0 @@
package llm
import (
"testing"
"github.com/jmorganca/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}

View File

@@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
var libEmbed embed.FS

llm/server/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
build

llm/server/CMakeLists.txt Normal file
View File

@@ -0,0 +1,93 @@
cmake_minimum_required(VERSION 3.14)
project(llm)
include(FetchContent)
set(add_token_patch
git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/add_token.patch
)
set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/build/llama.cpp")
FetchContent_Declare(
llama_cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG c29af7e2252d288f2ea58a7d437c1cb7c0abf160
# this could be risky if the patch doesn't apply
PATCH_COMMAND ${add_token_patch} || true
)
FetchContent_MakeAvailable(llama_cpp)
add_subdirectory(${llama_cpp_SOURCE_DIR}/examples/llava)
# code signing
function(sign target)
if(APPLE)
if(DEFINED ENV{APPLE_IDENTITY})
add_custom_command(TARGET ${target} POST_BUILD
COMMAND codesign
-f
--timestamp
--deep
--options=runtime
--sign "$ENV{APPLE_IDENTITY}"
--identifier ai.ollama.ollama
$<TARGET_FILE:${target}>
COMMENT "Signing macOS binary: ${target}"
)
endif()
elseif(WIN32)
find_program(SIGNTOOL_EXE NAMES signtool PATHS "C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x64" NO_DEFAULT_PATH)
set(KEY_CONTAINER "$ENV{KEY_CONTAINER}")
set(OLLAMA_CERT "$ENV{OLLAMA_CERT}")
if(SIGNTOOL_EXE AND KEY_CONTAINER AND OLLAMA_CERT)
add_custom_command(TARGET ${target} POST_BUILD
COMMAND "${SIGNTOOL_EXE}"
"sign"
"/v"
"/fd" "sha256"
"/t" "http://timestamp.digicert.com"
"/f" "${OLLAMA_CERT}"
"/csp" "Google Cloud KMS Provider"
"/kc" "${KEY_CONTAINER}"
"$<TARGET_FILE:${target}>"
COMMENT "Signing Windows binary: ${target}"
)
endif()
endif()
endfunction()
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75;80")
function(gzip target)
set(gzip_target "gzip_${target}")
add_custom_target(${gzip_target} ALL
COMMAND gzip -k -f $<TARGET_FILE:${target}>
COMMENT "Gzipping ${target}"
VERBATIM
)
add_dependencies(${gzip_target} ${target})
endfunction()
function(link_windows_libraries target)
if (WIN32)
target_link_libraries(${target} PRIVATE ws2_32)
endif()
endfunction()
add_executable(server ${llama_cpp_SOURCE_DIR}/examples/server/server.cpp)
target_compile_definitions(server PRIVATE)
target_link_libraries(server PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(server PRIVATE cxx_std_17)
link_windows_libraries(server)
sign(server)
gzip(server)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_BINARY_DIR}/ggml-metal.metal COPYONLY)
endif()
# TODO: ROCm

View File

@@ -1,9 +1,9 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..2694e92e 100644
+index 2b2f4a0f..afac49af 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -901,13 +901,15 @@ struct llama_server_context
-             slot.sent_count += result.text_to_send.size();
+@@ -997,13 +997,15 @@ struct llama_server_context
+             slot.n_sent_text += result.text_to_send.size();
              // add the token to slot queue and cache
          }
 -            slot.add_token_string(result);

View File

@@ -1,15 +0,0 @@
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

View File

@@ -5,13 +5,15 @@ set -eu
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
+IMAGE_NAME=${IMAGE_NAME:-"ollama/ollama"}
+BUILD_PLATFORM=${BUILD_PLATFORM:-"linux/arm64,linux/amd64"}
 docker build \
     --load \
-    --platform=linux/arm64,linux/amd64 \
+    --platform=${BUILD_PLATFORM} \
     --build-arg=VERSION \
     --build-arg=GOFLAGS \
     -f Dockerfile \
-    -t ollama/ollama:$VERSION \
+    -t ${IMAGE_NAME}:$VERSION \
     .
 docker build \
@@ -21,5 +23,12 @@ docker build \
     --build-arg=GOFLAGS \
     --target runtime-rocm \
     -f Dockerfile \
-    -t ollama/ollama:$VERSION-rocm \
+    -t ${IMAGE_NAME}:$VERSION-rocm \
     .
+
+docker tag ${IMAGE_NAME}:$VERSION ${IMAGE_NAME}:latest
+docker tag ${IMAGE_NAME}:$VERSION-rocm ${IMAGE_NAME}:rocm
+
+echo "To release, run:"
+echo "  docker push ${IMAGE_NAME}:$VERSION && docker push ${IMAGE_NAME}:latest"
+echo "  docker push ${IMAGE_NAME}:$VERSION-rocm && docker push ${IMAGE_NAME}:rocm"

View File

@@ -53,13 +53,14 @@ function buildOllama() {
     write-host "Building ollama CLI"
     & go generate ./...
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & go build "-ldflags=-w -s ""-X=github.com/jmorganca/ollama/version.Version=$script:VERSION"" ""-X=github.com/jmorganca/ollama/server.mode=release""" .
+    & go build -ldflags "-s -w -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ("${env:KEY_CONTAINER}") {
         & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
             /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
+    New-Item -ItemType Directory -Path .\dist -Force
     cp .\ollama.exe .\dist\ollama-windows-amd64.exe
 }
@@ -67,7 +68,7 @@ function buildApp() {
     write-host "Building Ollama App"
     cd "${script:SRC_DIR}\app"
     & windres -l 0 -o ollama.syso ollama.rc
-    & go build "-ldflags=-H windowsgui -w -s ""-X=github.com/jmorganca/ollama/version.Version=$script:VERSION"" ""-X=github.com/jmorganca/ollama/server.mode=release""" .
+    & go build -ldflags "-s -w -H windowsgui -X=github.com/jmorganca/ollama/version.Version=$script:VERSION -X=github.com/jmorganca/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     if ("${env:KEY_CONTAINER}") {
         & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
@@ -129,4 +130,4 @@ try {
 } finally {
     set-location $script:SRC_DIR
     $env:PKG_VERSION=""
-}
+}

View File

@@ -72,7 +72,7 @@ $SUDO install -o0 -g0 -m755 -d $BINDIR
 $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
 install_success() {
-    status 'The Ollama API is now available at 0.0.0.0:11434.'
+    status 'The Ollama API is now available at 127.0.0.1:11434.'
     status 'Install complete. Run "ollama" from the command line.'
 }
 trap install_success EXIT
@@ -88,6 +88,10 @@ configure_systemd() {
         status "Adding ollama user to render group..."
         $SUDO usermod -a -G render ollama
     fi
+    if getent group video >/dev/null 2>&1; then
+        status "Adding ollama user to video group..."
+        $SUDO usermod -a -G video ollama
+    fi
     status "Adding current user to ollama group..."
     $SUDO usermod -a -G ollama $(whoami)

View File

@@ -52,6 +52,10 @@ type Model struct {
 	Messages []Message
 }

+func (m *Model) IsEmbedding() bool {
+	return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
+}
+
 type Message struct {
 	Role    string `json:"role"`
 	Content string `json:"content"`
@@ -1103,18 +1107,7 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}

-	proxyURL, err := http.ProxyFromEnvironment(req)
-	if err != nil {
-		return nil, err
-	}
-
-	client := http.Client{
-		Transport: &http.Transport{
-			Proxy: http.ProxyURL(proxyURL),
-		},
-	}
-
-	resp, err := client.Do(req)
+	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		return nil, err
 	}

View File

@@ -121,13 +121,15 @@ func ChatPrompt(tmpl string, messages []api.Message, window int, encode func(str
 				p = prompt{}
 			}

-			p.Prompt = msg.Content
+			var sb strings.Builder
 			for range msg.Images {
-				p.Prompt += fmt.Sprintf(" [img-%d]", imgId)
+				fmt.Fprintf(&sb, "[img-%d] ", imgId)
 				p.images = append(p.images, imgId)
 				imgId += 1
 			}
+
+			sb.WriteString(msg.Content)
+			p.Prompt = sb.String()
 		case "assistant":
 			if p.Response != "" {
 				prompts = append(prompts, p)

View File

@@ -155,7 +155,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("base64")}},
 			},
 			window: 1024,
-			want:   "You are a Wizard. Hello [img-0]",
+			want:   "You are a Wizard. [img-0] Hello",
 		},
 		{
 			name: "images truncated",
@@ -165,7 +165,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("img1"), []byte("img2")}},
 			},
 			window: 1024,
-			want:   "You are a Wizard. Hello [img-1]",
+			want:   "You are a Wizard. [img-0] [img-1] Hello",
 		},
 		{
 			name: "empty list",
@@ -198,7 +198,7 @@ func TestChatPrompt(t *testing.T) {
 			}
 			if got != tc.want {
-				t.Errorf("got = %v, want %v", got, tc.want)
+				t.Errorf("got: %q, want: %q", got, tc.want)
 			}
 		})
 	}

View File

@@ -191,6 +191,11 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}

+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support generate"})
+		return
+	}
+
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {
@@ -245,6 +250,19 @@ func GenerateHandler(c *gin.Context) {
 		slog.Debug("generate handler", "system", req.System)

+		var sb strings.Builder
+		for i := range req.Images {
+			fmt.Fprintf(&sb, "[img-%d] ", i)
+		}
+
+		sb.WriteString(req.Prompt)
+
+		p, err := Prompt(req.Template, req.System, sb.String(), "", true)
+		if err != nil {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+
+		sb.Reset()
 		if req.Context != nil {
 			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
 			if err != nil {
@@ -255,18 +273,6 @@ func GenerateHandler(c *gin.Context) {
 			sb.WriteString(prev)
 		}

-		// write image tags
-		// TODO: limit the number of images to fit in the context similar to the chat endpoint
-		for i := range req.Images {
-			req.Prompt += fmt.Sprintf(" [img-%d]", i)
-		}
-
-		p, err := Prompt(req.Template, req.System, req.Prompt, "", true)
-		if err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
-		}
-
 		sb.WriteString(p)
 		prompt = sb.String()
@@ -379,7 +385,7 @@ func GenerateHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }

-func EmbeddingHandler(c *gin.Context) {
+func EmbeddingsHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -432,8 +438,9 @@ func EmbeddingHandler(c *gin.Context) {
 		return
 	}

-	if !loaded.Options.EmbeddingOnly {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "embedding option must be set to true"})
+	// an empty request loads the model
+	if req.Prompt == "" {
+		c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: []float64{}})
 		return
 	}
@@ -942,7 +949,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/pull", PullModelHandler)
 	r.POST("/api/generate", GenerateHandler)
 	r.POST("/api/chat", ChatHandler)
-	r.POST("/api/embeddings", EmbeddingHandler)
+	r.POST("/api/embeddings", EmbeddingsHandler)
 	r.POST("/api/create", CreateModelHandler)
 	r.POST("/api/push", PushModelHandler)
 	r.POST("/api/copy", CopyModelHandler)
@@ -1143,6 +1150,11 @@ func ChatHandler(c *gin.Context) {
 		return
 	}

+	if model.IsEmbedding() {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "embedding models do not support chat"})
+		return
+	}
+
 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
 		if errors.Is(err, api.ErrInvalidOpts) {

View File

@@ -12,7 +12,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -177,16 +176,14 @@ func (b *blobUpload) Run(ctx context.Context, opts *registryOptions) {
 	requestURL := <-b.nextURL

 	// calculate md5 checksum and add it to the commit request
-	var sb strings.Builder
+	md5sum := md5.New()
 	for _, part := range b.Parts {
-		sb.Write(part.Sum(nil))
+		md5sum.Write(part.Sum(nil))
 	}
-	md5sum := md5.Sum([]byte(sb.String()))

 	values := requestURL.Query()
 	values.Add("digest", b.Digest)
-	values.Add("etag", fmt.Sprintf("%x-%d", md5sum, len(b.Parts)))
+	values.Add("etag", fmt.Sprintf("%x-%d", md5sum.Sum(nil), len(b.Parts)))
 	requestURL.RawQuery = values.Encode()

 	headers := make(http.Header)