Compare commits

royh-opena...timeout

88 commits:

d77a174eb4, 2cc7d05012, 123a722a6f, 4d311eb731, cb42e607c5, 2aa91a937b, ccef9431c8, 9a9e7d83c4, 189a43caa2, e835ef1836, 7e7749224c, c7c2f3bc22, 54a79d6a8a, 5bf5aeec01, e01e535cbb, 0195d6a2f8, 8e0641a9bf, 662568d453, 4ebb66c662, 23e899f32d, fedf71635e, 97c59be653, 9d8a4988e8, 1ae0750a21, 9d91e5e587, 96624aa412, 10f33b8537, 4a633cc295, d34d88e417, 52ce350b7a, 2abebb2cbe, 380e06e5be, badf975e45, 755b4e4fc2, 1a1c99e334, 21adf8b6d2, e873841cbb, 26d0bf9236, 359b15a597, b55958a587, 7784ca33ce, c9c8c98bf6, 171796791f, 176d0f7075, 8ed51cac37, c9e6f0542d, b0930626c5, e890be4814, b2799f111b, 152fc202f5, 4ad0d4d6d3, 163cd3e77c, 4c2c8f93dd, fd1e6e0590, 89c79bec8c, c7b77004e3, 07d143f412, a12283e2ff, 4b0050cf0e, 0577af98f4, 17ce203a26, d76555ffb5, 2786dff5d3, 225f0d1219, 532db58311, 6be309e1bd, da3bf23354, 26ab67732b, 45cacbaf05, 17df6520c8, 6f351bf586, ff4f0cbd1d, fc37c192ae, 434dfe30c5, 4e2b7e181d, 48702dd149, 68dfc6236a, 5e8ff556cb, 6fd04ca922, 206797bda4, 43ed358f9a, b32ebb4f29, fb9cdfa723, efac488675, 6b800aa7b7, dd7c9ebeaf, 4dc7fb9525, 157f09acdf
**.github/workflows/release.yaml** (vendored, 30 changed lines)

````diff
@@ -437,6 +437,7 @@ jobs:
     env:
       OLLAMA_SKIP_IMAGE_BUILD: '1'
       PUSH: '1'
+      GH_TOKEN: ${{ github.token }}
     steps:
       - uses: actions/checkout@v4
       - name: Set Version
@@ -460,15 +461,20 @@ jobs:
           ls -lh dist/
           (cd dist; sha256sum * > sha256sum.txt)
           cat dist/sha256sum.txt
-      - uses: ncipollo/release-action@v1
-        with:
-          name: ${{ env.RELEASE_VERSION }}
-          allowUpdates: true
-          artifacts: 'dist/*'
-          draft: true
-          prerelease: true
-          omitBodyDuringUpdate: true
-          generateReleaseNotes: true
-          omitDraftDuringUpdate: true
-          omitPrereleaseDuringUpdate: true
-          replacesArtifacts: true
+      - name: Create or update Release
+        run: |
+          echo "Looking for existing release for ${{ env.RELEASE_VERSION }}"
+          OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName")
+          if [ -n "$OLD_TAG" ]; then
+            echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}"
+            gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME}
+          else
+            echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}"
+            gh release create ${GITHUB_REF_NAME} \
+              --title ${{ env.RELEASE_VERSION }} \
+              --draft \
+              --generate-notes \
+              --prerelease
+          fi
+          echo "Uploading artifacts for tag ${GITHUB_REF_NAME}"
+          gh release upload ${GITHUB_REF_NAME} dist/* --clobber
````
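The rewritten step drives releases directly with the `gh` CLI instead of the `ncipollo/release-action` wrapper, which is why the first hunk adds `GH_TOKEN: ${{ github.token }}` to the job environment: `gh release ls`, `edit`, `create`, and `upload` all authenticate through that variable.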
**.github/workflows/test.yaml** (vendored, 2 changed lines)

````diff
@@ -124,7 +124,7 @@ jobs:
     strategy:
       matrix:
         rocm-version:
-          - '6.0.2'
+          - '6.1.1'
     runs-on: linux
     container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
     steps:
````
@@ -2,7 +2,7 @@ ARG GOLANG_VERSION=1.22.1
|
|||||||
ARG CMAKE_VERSION=3.22.1
|
ARG CMAKE_VERSION=3.22.1
|
||||||
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
|
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
|
||||||
ARG CUDA_VERSION=11.3.1
|
ARG CUDA_VERSION=11.3.1
|
||||||
ARG ROCM_VERSION=6.0.2
|
ARG ROCM_VERSION=6.1.1
|
||||||
|
|
||||||
# Copy the minimal context we need to run the generate scripts
|
# Copy the minimal context we need to run the generate scripts
|
||||||
FROM scratch AS llm-code
|
FROM scratch AS llm-code
|
||||||
|
**README.md** (10 changed lines)

````diff
@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
 | Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
 | Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
 | Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
-| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
-| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
+| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
+| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
 | Mistral | 7B | 4.1GB | `ollama run mistral` |
 | Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
 | Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
 Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
 
+### Show model information
+
+```
+ollama show llama3
+```
+
 ### List models on your computer
 
 ```
````
**api/types.go** (101 changed lines)

````diff
@@ -159,18 +159,49 @@ type Options struct {
 
 // Runner options which must be set when the model is loaded into memory
 type Runner struct {
 	UseNUMA   bool `json:"numa,omitempty"`
 	NumCtx    int  `json:"num_ctx,omitempty"`
 	NumBatch  int  `json:"num_batch,omitempty"`
 	NumGPU    int  `json:"num_gpu,omitempty"`
 	MainGPU   int  `json:"main_gpu,omitempty"`
 	LowVRAM   bool `json:"low_vram,omitempty"`
 	F16KV     bool `json:"f16_kv,omitempty"`
 	LogitsAll bool `json:"logits_all,omitempty"`
 	VocabOnly bool `json:"vocab_only,omitempty"`
-	UseMMap   bool `json:"use_mmap,omitempty"`
+	UseMMap   TriState `json:"use_mmap,omitempty"`
 	UseMLock  bool `json:"use_mlock,omitempty"`
 	NumThread int  `json:"num_thread,omitempty"`
+}
+
+type TriState int
+
+const (
+	TriStateUndefined TriState = -1
+	TriStateFalse     TriState = 0
+	TriStateTrue      TriState = 1
+)
+
+func (b *TriState) UnmarshalJSON(data []byte) error {
+	var v bool
+	if err := json.Unmarshal(data, &v); err != nil {
+		return err
+	}
+	if v {
+		*b = TriStateTrue
+	}
+	*b = TriStateFalse
+	return nil
+}
+
+func (b *TriState) MarshalJSON() ([]byte, error) {
+	if *b == TriStateUndefined {
+		return nil, nil
+	}
+	var v bool
+	if *b == TriStateTrue {
+		v = true
+	}
+	return json.Marshal(v)
 }
 
 // EmbeddingRequest is the request passed to [Client.Embeddings].
@@ -222,6 +253,7 @@ type ShowRequest struct {
 	Model    string `json:"model"`
 	System   string `json:"system"`
 	Template string `json:"template"`
+	Verbose  bool   `json:"verbose"`
 
 	Options map[string]interface{} `json:"options"`
@@ -231,13 +263,16 @@ type ShowRequest struct {
 
 // ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
 	License    string       `json:"license,omitempty"`
 	Modelfile  string       `json:"modelfile,omitempty"`
 	Parameters string       `json:"parameters,omitempty"`
 	Template   string       `json:"template,omitempty"`
 	System     string       `json:"system,omitempty"`
 	Details    ModelDetails `json:"details,omitempty"`
 	Messages   []Message    `json:"messages,omitempty"`
+	ModelInfo     map[string]any `json:"model_info,omitempty"`
+	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
 }
 
 // CopyRequest is the request passed to [Client.Copy].
@@ -402,6 +437,19 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 			continue
 		}
 
+		if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+			val, ok := val.(bool)
+			if !ok {
+				return fmt.Errorf("option %q must be of type boolean", key)
+			}
+			if val {
+				field.SetInt(int64(TriStateTrue))
+			} else {
+				field.SetInt(int64(TriStateFalse))
+			}
+			continue
+		}
+
 		switch field.Kind() {
 		case reflect.Int:
 			switch t := val.(type) {
@@ -490,7 +538,7 @@ func DefaultOptions() Options {
 			LowVRAM:   false,
 			F16KV:     true,
 			UseMLock:  false,
-			UseMMap:   true,
+			UseMMap:   TriStateUndefined,
 			UseNUMA:   false,
 		},
 	}
@@ -560,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
 		} else {
 			field := valueOpts.FieldByName(opt.Name)
 			if field.IsValid() && field.CanSet() {
+				if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
+					boolVal, err := strconv.ParseBool(vals[0])
+					if err != nil {
+						return nil, fmt.Errorf("invalid bool value %s", vals)
+					}
+					if boolVal {
+						out[key] = TriStateTrue
+					} else {
+						out[key] = TriStateFalse
+					}
+					continue
+				}
+
 				switch field.Kind() {
 				case reflect.Float32:
 					floatVal, err := strconv.ParseFloat(vals[0], 32)
````
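The `TriState` type exists so that an omitted `use_mmap` can be told apart from an explicit `false` (a plain Go `bool` collapses both to `false`). Below is a minimal, self-contained sketch of the same pattern, with illustrative names rather than the repository's exact code, and with an explicit `else` branch in the unmarshaler:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// TriBool distinguishes "unset" from an explicit true/false.
type TriBool int

const (
	TriUnset TriBool = -1
	TriFalse TriBool = 0
	TriTrue  TriBool = 1
)

// UnmarshalJSON only runs when the key is present in the input,
// so a TriUnset default survives documents that omit the field.
func (b *TriBool) UnmarshalJSON(data []byte) error {
	var v bool
	if err := json.Unmarshal(data, &v); err != nil {
		return err
	}
	if v {
		*b = TriTrue
	} else {
		*b = TriFalse
	}
	return nil
}

type opts struct {
	UseMMap TriBool `json:"use_mmap"`
}

func main() {
	for _, doc := range []string{`{}`, `{"use_mmap":true}`, `{"use_mmap":false}`} {
		o := opts{UseMMap: TriUnset} // default is "unset"
		if err := json.Unmarshal([]byte(doc), &o); err != nil {
			panic(err)
		}
		fmt.Printf("%-20s -> %d\n", doc, o.UseMMap) // {} -> -1, true -> 1, false -> 0
	}
}
```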
**api/types_test.go**

````diff
@@ -2,6 +2,7 @@ package api
 
 import (
 	"encoding/json"
+	"fmt"
 	"math"
 	"testing"
 	"time"
@@ -105,3 +106,101 @@ func TestDurationMarshalUnmarshal(t *testing.T) {
 		})
 	}
 }
+
+func TestUseMmapParsingFromJSON(t *testing.T) {
+	tests := []struct {
+		name string
+		req  string
+		exp  TriState
+	}{
+		{
+			name: "Undefined",
+			req:  `{ }`,
+			exp:  TriStateUndefined,
+		},
+		{
+			name: "True",
+			req:  `{ "use_mmap": true }`,
+			exp:  TriStateTrue,
+		},
+		{
+			name: "False",
+			req:  `{ "use_mmap": false }`,
+			exp:  TriStateFalse,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var oMap map[string]interface{}
+			err := json.Unmarshal([]byte(test.req), &oMap)
+			require.NoError(t, err)
+			opts := DefaultOptions()
+			err = opts.FromMap(oMap)
+			require.NoError(t, err)
+			assert.Equal(t, test.exp, opts.UseMMap)
+		})
+	}
+}
+
+func TestUseMmapFormatParams(t *testing.T) {
+	tests := []struct {
+		name string
+		req  map[string][]string
+		exp  TriState
+		err  error
+	}{
+		{
+			name: "True",
+			req: map[string][]string{
+				"use_mmap": []string{"true"},
+			},
+			exp: TriStateTrue,
+			err: nil,
+		},
+		{
+			name: "False",
+			req: map[string][]string{
+				"use_mmap": []string{"false"},
+			},
+			exp: TriStateFalse,
+			err: nil,
+		},
+		{
+			name: "Numeric True",
+			req: map[string][]string{
+				"use_mmap": []string{"1"},
+			},
+			exp: TriStateTrue,
+			err: nil,
+		},
+		{
+			name: "Numeric False",
+			req: map[string][]string{
+				"use_mmap": []string{"0"},
+			},
+			exp: TriStateFalse,
+			err: nil,
+		},
+		{
+			name: "invalid string",
+			req: map[string][]string{
+				"use_mmap": []string{"foo"},
+			},
+			exp: TriStateUndefined,
+			err: fmt.Errorf("invalid bool value [foo]"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			resp, err := FormatParams(test.req)
+			require.Equal(t, err, test.err)
+			respVal, ok := resp["use_mmap"]
+			if test.exp != TriStateUndefined {
+				assert.True(t, ok, "resp: %v", resp)
+				assert.Equal(t, test.exp, respVal)
+			}
+		})
+	}
+}
````
**app/lifecycle/logging.go**

````diff
@@ -5,6 +5,8 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/envconfig"
 )
@@ -24,6 +26,7 @@ func InitLogging() {
 		logFile = os.Stderr
 		// TODO - write one-line to the app.log file saying we're running in console mode to help avoid confusion
 	} else {
+		rotateLogs(AppLogFile)
 		logFile, err = os.OpenFile(AppLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 		if err != nil {
 			slog.Error(fmt.Sprintf("failed to create server log %v", err))
@@ -46,3 +49,32 @@ func InitLogging() {
 
 	slog.Info("ollama app started")
 }
+
+func rotateLogs(logFile string) {
+	if _, err := os.Stat(logFile); os.IsNotExist(err) {
+		return
+	}
+	index := strings.LastIndex(logFile, ".")
+	pre := logFile[:index]
+	post := "." + logFile[index+1:]
+	for i := LogRotationCount; i > 0; i-- {
+		older := pre + "-" + strconv.Itoa(i) + post
+		newer := pre + "-" + strconv.Itoa(i-1) + post
+		if i == 1 {
+			newer = pre + post
+		}
+		if _, err := os.Stat(newer); err == nil {
+			if _, err := os.Stat(older); err == nil {
+				err := os.Remove(older)
+				if err != nil {
+					slog.Warn("Failed to remove older log", "older", older, "error", err)
+					continue
+				}
+			}
+			err := os.Rename(newer, older)
+			if err != nil {
+				slog.Warn("Failed to rotate log", "older", older, "newer", newer, "error", err)
+			}
+		}
+	}
+}
````
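The rotation scheme shifts `name.log` to `name-1.log`, `name-1.log` to `name-2.log`, and so on up to `LogRotationCount`, dropping the oldest file. A small sketch of just the name arithmetic, assuming the same split on the final dot:

```go
package main

import (
	"fmt"
	"strings"
)

// rotatedName returns the i-th rotated variant of logFile,
// splitting on the final "." the way rotateLogs does; i == 0
// is the live log itself.
func rotatedName(logFile string, i int) string {
	idx := strings.LastIndex(logFile, ".")
	pre, post := logFile[:idx], "."+logFile[idx+1:]
	if i == 0 {
		return pre + post
	}
	return fmt.Sprintf("%s-%d%s", pre, i, post)
}

func main() {
	for i := 0; i <= 3; i++ {
		fmt.Println(rotatedName("/tmp/ollama_app.log", i))
	}
	// /tmp/ollama_app.log
	// /tmp/ollama_app-1.log
	// /tmp/ollama_app-2.log
	// /tmp/ollama_app-3.log
}
```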
**app/lifecycle/logging_test.go** (new file, 44 lines)

````diff
@@ -0,0 +1,44 @@
+package lifecycle
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestRotateLogs(t *testing.T) {
+	logDir := t.TempDir()
+	logFile := filepath.Join(logDir, "testlog.log")
+
+	// No log exists
+	rotateLogs(logFile)
+
+	require.NoError(t, os.WriteFile(logFile, []byte("1"), 0644))
+	assert.FileExists(t, logFile)
+	// First rotation
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	// Should be a no-op without a new log
+	rotateLogs(logFile)
+	assert.FileExists(t, filepath.Join(logDir, "testlog-1.log"))
+	assert.NoFileExists(t, filepath.Join(logDir, "testlog-2.log"))
+	assert.NoFileExists(t, logFile)
+
+	for i := 2; i <= LogRotationCount+1; i++ {
+		require.NoError(t, os.WriteFile(logFile, []byte(strconv.Itoa(i)), 0644))
+		assert.FileExists(t, logFile)
+		rotateLogs(logFile)
+		assert.NoFileExists(t, logFile)
+		for j := 1; j < i; j++ {
+			assert.FileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(j)+".log"))
+		}
+		assert.NoFileExists(t, filepath.Join(logDir, "testlog-"+strconv.Itoa(i+1)+".log"))
+	}
+}
````
**app/lifecycle/paths.go**

````diff
@@ -16,11 +16,12 @@ var (
 	AppDir     = "/opt/Ollama"
 	AppDataDir = "/opt/Ollama"
 	// TODO - should there be a distinct log dir?
 	UpdateStageDir = "/tmp"
 	AppLogFile     = "/tmp/ollama_app.log"
 	ServerLogFile  = "/tmp/ollama.log"
 	UpgradeLogFile = "/tmp/ollama_update.log"
 	Installer      = "OllamaSetup.exe"
+	LogRotationCount = 5
 )
 
 func init() {
````
**app/lifecycle/server.go**

````diff
@@ -54,7 +54,7 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
 		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
 	}
 
-	// TODO - rotation
+	rotateLogs(ServerLogFile)
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create server log: %w", err)
````
**app/ollama.iss**

````diff
@@ -88,10 +88,15 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
+#if DirExists("..\dist\windows-amd64\cuda")
+  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
 #if DirExists("..\dist\windows-amd64\rocm")
   Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
 #endif
````
**cmd/cmd.go** (214 changed lines)

````diff
@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
 	}
 	defer tempfile.Close()
 
-	zipfile := zip.NewWriter(tempfile)
-	defer zipfile.Close()
-
 	detectContentType := func(path string) (string, error) {
 		f, err := os.Open(path)
 		if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
 		files = append(files, tks...)
 	}
 
+	zipfile := zip.NewWriter(tempfile)
+	defer zipfile.Close()
+
 	for _, file := range files {
 		f, err := os.Open(file)
 		if err != nil {
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
 }
 
 func RunHandler(cmd *cobra.Command, args []string) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	name := args[0]
-
-	// check if the model exists on the server
-	show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
-	var statusError api.StatusError
-	switch {
-	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
-		if err := PullHandler(cmd, []string{name}); err != nil {
-			return err
-		}
-
-		show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
-		if err != nil {
-			return err
-		}
-	case err != nil:
-		return err
-	}
-
 	interactive := true
 
 	opts := runOptions{
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]interface{}{},
-		MultiModal:  slices.Contains(show.Details.Families, "clip"),
-		ParentModel: show.Details.ParentModel,
 	}
 
 	format, err := cmd.Flags().GetString("format")
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.WordWrap = !nowrap
 
-	if !interactive {
-		return generate(cmd, opts)
+	// Fill out the rest of the options based on information about the
+	// model.
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
 	}
 
-	return generateInteractive(cmd, opts)
+	name := args[0]
+	info, err := func() (*api.ShowResponse, error) {
+		showReq := &api.ShowRequest{Name: name}
+		info, err := client.Show(cmd.Context(), showReq)
+		var se api.StatusError
+		if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+			if err := PullHandler(cmd, []string{name}); err != nil {
+				return nil, err
+			}
+			return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+		}
+		return info, err
+	}()
+	if err != nil {
+		return err
+	}
+
+	opts.MultiModal = slices.Contains(info.Details.Families, "clip")
+	opts.ParentModel = info.Details.ParentModel
+	opts.Messages = append(opts.Messages, info.Messages...)
+
+	if interactive {
+		return generateInteractive(cmd, opts)
+	}
+	return generate(cmd, opts)
 }
 
 func errFromUnknownKey(unknownKeyErr error) error {
@@ -579,10 +580,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	if len(args) != 1 {
-		return errors.New("missing model name")
-	}
-
 	license, errLicense := cmd.Flags().GetBool("license")
 	modelfile, errModelfile := cmd.Flags().GetBool("modelfile")
 	parameters, errParams := cmd.Flags().GetBool("parameters")
@@ -625,8 +622,29 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 
 	if flagsSet > 1 {
 		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
-	} else if flagsSet == 0 {
-		return errors.New("one of '--license', '--modelfile', '--parameters', '--system', or '--template' must be specified")
+	}
+
+	if flagsSet == 1 {
+		req := api.ShowRequest{Name: args[0]}
+		resp, err := client.Show(cmd.Context(), &req)
+		if err != nil {
+			return err
+		}
+
+		switch showType {
+		case "license":
+			fmt.Println(resp.License)
+		case "modelfile":
+			fmt.Println(resp.Modelfile)
+		case "parameters":
+			fmt.Println(resp.Parameters)
+		case "system":
+			fmt.Println(resp.System)
+		case "template":
+			fmt.Println(resp.Template)
+		}
+
+		return nil
 	}
 
 	req := api.ShowRequest{Name: args[0]}
@@ -635,22 +653,114 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	switch showType {
-	case "license":
-		fmt.Println(resp.License)
-	case "modelfile":
-		fmt.Println(resp.Modelfile)
-	case "parameters":
-		fmt.Println(resp.Parameters)
-	case "system":
-		fmt.Println(resp.System)
-	case "template":
-		fmt.Println(resp.Template)
+	arch := resp.ModelInfo["general.architecture"].(string)
+
+	modelData := [][]string{
+		{"arch", arch},
+		{"parameters", resp.Details.ParameterSize},
+		{"quantization", resp.Details.QuantizationLevel},
+		{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
+		{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
 	}
 
+	mainTableData := [][]string{
+		{"Model"},
+		{renderSubTable(modelData, false)},
+	}
+
+	if resp.ProjectorInfo != nil {
+		projectorData := [][]string{
+			{"arch", "clip"},
+			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
+			{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
+			{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+			{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+		}
+
+		mainTableData = append(mainTableData,
+			[]string{"Projector"},
+			[]string{renderSubTable(projectorData, false)},
+		)
+	}
+
+	if resp.Parameters != "" {
+		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+	}
+
+	if resp.System != "" {
+		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+	}
+
+	if resp.License != "" {
+		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
+	}
+
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetAutoWrapText(false)
+	table.SetBorder(false)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range mainTableData {
+		table.Append(v)
+	}
+
+	table.Render()
+
 	return nil
 }
+
+func renderSubTable(data [][]string, file bool) string {
+	var buf bytes.Buffer
+	table := tablewriter.NewWriter(&buf)
+	table.SetAutoWrapText(!file)
+	table.SetBorder(false)
+	table.SetNoWhiteSpace(true)
+	table.SetTablePadding("\t")
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+
+	for _, v := range data {
+		table.Append(v)
+	}
+
+	table.Render()
+
+	renderedTable := buf.String()
+	lines := strings.Split(renderedTable, "\n")
+	for i, line := range lines {
+		lines[i] = "\t" + line
+	}
+
+	return strings.Join(lines, "\n")
+}
+
+func twoLines(s string) [][]string {
+	lines := strings.Split(s, "\n")
+	res := [][]string{}
+
+	count := 0
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if line != "" {
+			count++
+			res = append(res, []string{line})
+			if count == 2 {
+				return res
+			}
+		}
+	}
+	return res
+}
+
+func formatParams(s string) string {
+	lines := strings.Split(s, "\n")
+	table := [][]string{}
+
+	for _, line := range lines {
+		table = append(table, strings.Fields(line))
+	}
+	return renderSubTable(table, false)
+}
 
 func CopyHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
````
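The new `ollama show` output is built by rendering each section into a buffer with `tablewriter` and then appending the rendered string as a single cell of an outer table. A stripped-down sketch of that nesting, using the same `github.com/olekukonko/tablewriter` calls the diff relies on:

```go
package main

import (
	"bytes"
	"os"

	"github.com/olekukonko/tablewriter"
)

// subTable renders rows into a string so the result can be
// embedded as one cell of an outer table.
func subTable(rows [][]string) string {
	var buf bytes.Buffer
	t := tablewriter.NewWriter(&buf)
	t.SetBorder(false)
	t.SetNoWhiteSpace(true)
	t.SetTablePadding("\t")
	t.SetAlignment(tablewriter.ALIGN_LEFT)
	for _, r := range rows {
		t.Append(r)
	}
	t.Render()
	return buf.String()
}

func main() {
	outer := tablewriter.NewWriter(os.Stdout)
	outer.SetAutoWrapText(false) // keep the pre-rendered cell intact
	outer.SetBorder(false)
	outer.SetAlignment(tablewriter.ALIGN_LEFT)
	outer.Append([]string{"Model"})
	outer.Append([]string{subTable([][]string{
		{"arch", "llama"},
		{"parameters", "8.0B"},
	})})
	outer.Render()
}
```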
**cmd/interactive.go**

````diff
@@ -31,65 +31,40 @@ const (
 )
 
 func loadModel(cmd *cobra.Command, opts *runOptions) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
 	p := progress.NewProgress(os.Stderr)
 	defer p.StopAndClear()
 
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)
 
-	showReq := api.ShowRequest{Name: opts.Model}
-	showResp, err := client.Show(cmd.Context(), &showReq)
+	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
-	opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
-	opts.ParentModel = showResp.Details.ParentModel
-
-	if len(showResp.Messages) > 0 {
-		opts.Messages = append(opts.Messages, showResp.Messages...)
-	}
-
 	chatReq := &api.ChatRequest{
 		Model:    opts.Model,
-		Messages: []api.Message{},
+		KeepAlive: opts.KeepAlive,
 	}
 
-	if opts.KeepAlive != nil {
-		chatReq.KeepAlive = opts.KeepAlive
-	}
-
-	err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
+	return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
 		p.StopAndClear()
-		if len(opts.Messages) > 0 {
-			for _, msg := range opts.Messages {
-				switch msg.Role {
-				case "user":
-					fmt.Printf(">>> %s\n", msg.Content)
-				case "assistant":
-					state := &displayResponseState{}
-					displayResponse(msg.Content, opts.WordWrap, state)
-					fmt.Println()
-					fmt.Println()
-				}
+		for _, msg := range opts.Messages {
+			switch msg.Role {
+			case "user":
+				fmt.Printf(">>> %s\n", msg.Content)
+			case "assistant":
+				state := &displayResponseState{}
+				displayResponse(msg.Content, opts.WordWrap, state)
+				fmt.Println()
+				fmt.Println()
 			}
 		}
 		return nil
 	})
-	if err != nil {
-		return err
-	}
-
-	return nil
 }
 
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
-	opts.Messages = make([]api.Message, 0)
-
 	err := loadModel(cmd, &opts)
 	if err != nil {
 		return err
````
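With this change `loadModel` no longer queries `/api/show` itself: `RunHandler` (see the cmd/cmd.go hunks above) now fills `opts.MultiModal`, `opts.ParentModel`, and `opts.Messages` before interactive mode starts, so `loadModel` only issues an empty chat request to warm the model and replays any prior messages.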
**docs/api.md** (37 changed lines)

````diff
@@ -777,11 +777,12 @@ A single JSON object will be returned.
 POST /api/show
 ```
 
-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show information about a model including details, modelfile, template, parameters, license, system prompt.
 
 ### Parameters
 
 - `name`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
 
 ### Examples
@@ -798,14 +799,40 @@ curl http://localhost:11434/api/show -d '{
 ```json
 {
   "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
-  "parameters": "num_ctx 4096\nstop \u003c/s\u003e\nstop USER:\nstop ASSISTANT:",
-  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
+  "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
+  "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
   "details": {
+    "parent_model": "",
     "format": "gguf",
     "family": "llama",
-    "families": ["llama", "clip"],
-    "parameter_size": "7B",
+    "families": [
+      "llama"
+    ],
+    "parameter_size": "8.0B",
     "quantization_level": "Q4_0"
-  }
+  },
+  "model_info": {
+    "general.architecture": "llama",
+    "general.file_type": 2,
+    "general.parameter_count": 8030261248,
+    "general.quantization_version": 2,
+    "llama.attention.head_count": 32,
+    "llama.attention.head_count_kv": 8,
+    "llama.attention.layer_norm_rms_epsilon": 0.00001,
+    "llama.block_count": 32,
+    "llama.context_length": 8192,
+    "llama.embedding_length": 4096,
+    "llama.feed_forward_length": 14336,
+    "llama.rope.dimension_count": 128,
+    "llama.rope.freq_base": 500000,
+    "llama.vocab_size": 128256,
+    "tokenizer.ggml.bos_token_id": 128000,
+    "tokenizer.ggml.eos_token_id": 128009,
+    "tokenizer.ggml.merges": [],            // populates if `verbose=true`
+    "tokenizer.ggml.model": "gpt2",
+    "tokenizer.ggml.pre": "llama-bpe",
+    "tokenizer.ggml.token_type": [],        // populates if `verbose=true`
+    "tokenizer.ggml.tokens": []             // populates if `verbose=true`
+  }
 }
 ```
````
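From Go, the same endpoint is reachable through the repository's `api` client package. A minimal sketch, assuming the package at the state of this diff (where `ShowRequest` carries `Name` and the new `Verbose` flag, as the cmd/cmd.go hunks use it):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST
	if err != nil {
		log.Fatal(err)
	}

	resp, err := client.Show(context.Background(), &api.ShowRequest{
		Name:    "llama3",
		Verbose: true, // fills the large tokenizer fields in model_info
	})
	if err != nil {
		log.Fatal(err)
	}

	// model_info keys are namespaced by architecture, e.g. llama.context_length
	fmt.Println("architecture:", resp.ModelInfo["general.architecture"])
}
```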
**docs/development.md**

````diff
@@ -114,15 +114,18 @@ If you have Docker available, you can build linux binaries with `./scripts/build
 
 ### Windows
 
-Note: The windows build for Ollama is still under development.
+Note: The Windows build for Ollama is still under development.
 
-Install required tools:
+First, install required tools:
 
 - MSVC toolchain - C/C++ and cmake as minimal requirements
 - Go version 1.22 or higher
 - MinGW (pick one variant) with GCC.
   - [MinGW-w64](https://www.mingw-w64.org/)
   - [MSYS2](https://www.msys2.org/)
+- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
+
+Then, build the `ollama` binary:
 
 ```powershell
 $env:CGO_ENABLED="1"
````
**docs/gpu.md**

````diff
@@ -8,7 +8,7 @@ Check your compute compatibility to see if your card is supported:
 | Compute Capability | Family              | Cards |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
 | 9.0                | NVIDIA              | `H100` |
-| 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080` `RTX 4070 Ti` `RTX 4060 Ti` |
+| 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000` |
 | 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
````
**docs/import.md**

````diff
@@ -47,19 +47,13 @@ success
 
 ### Supported Quantizations
 
-<details>
-<summary>Legacy Quantization</summary>
-
 - `Q4_0`
 - `Q4_1`
 - `Q5_0`
 - `Q5_1`
 - `Q8_0`
 
-</details>
+#### K-means Quantizations
 
-<details>
-<summary>K-means Quantization</summary>`
-
 - `Q3_K_S`
 - `Q3_K_M`
@@ -70,11 +64,6 @@ success
 - `Q5_K_M`
 - `Q6_K`
 
-</details>
-
-> [!NOTE]
-> Activation-aware Weight Quantization (i.e. IQ) are not currently supported for automatic quantization however you can still import the quantized model into Ollama, see [Import GGUF](#import-gguf).
-
 ## Template Detection
 
 > [!NOTE]
````
**docs/openai.md**

````diff
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
 
 #### Notes
 
-- Setting `seed` will always set `temperature` to `0`
 - `finish_reason` will always be `stop`
 - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
 
````
**docs/troubleshooting.md**

````diff
@@ -22,7 +22,7 @@ docker logs <container-name>
 If manually running `ollama serve` in a terminal, the logs will be on that terminal.
 
 When you run Ollama on **Windows**, there are a few different locations. You can view them in the explorer window by hitting `<cmd>+R` and type in:
-- `explorer %LOCALAPPDATA%\Ollama` to view logs
+- `explorer %LOCALAPPDATA%\Ollama` to view logs. The most recent server logs will be in `server.log` and older logs will be in `server-#.log`
 - `explorer %LOCALAPPDATA%\Programs\Ollama` to browse the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` to browse where models and configuration is stored
 - `explorer %TEMP%` where temporary executable files are stored in one or more `ollama*` directories
@@ -39,8 +39,8 @@ server.
 Ollama on Windows stores files in a few different locations. You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains logs from the GUI application
-    - *server.log* contains the server logs
+    - *app.log* contains the most recent logs from the GUI application
+    - *server.log* contains the most recent server logs
     - *upgrade.log* contains log output for upgrades
 - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
````
**envconfig/config.go**

````diff
@@ -53,8 +53,23 @@ var (
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
+	// Set via OLLAMA_SCHED_SPREAD in the environment
+	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
+	// Set via OLLAMA_INTEL_GPU in the environment
+	IntelGpu bool
+
+	// Set via CUDA_VISIBLE_DEVICES in the environment
+	CudaVisibleDevices string
+	// Set via HIP_VISIBLE_DEVICES in the environment
+	HipVisibleDevices string
+	// Set via ROCR_VISIBLE_DEVICES in the environment
+	RocrVisibleDevices string
+	// Set via GPU_DEVICE_ORDINAL in the environment
+	GpuDeviceOrdinal string
+	// Set via HSA_OVERRIDE_GFX_VERSION in the environment
+	HsaOverrideGfxVersion string
 )
 
 type EnvVar struct {
@@ -64,7 +79,7 @@ type EnvVar struct {
 }
 
 func AsMap() map[string]EnvVar {
-	return map[string]EnvVar{
+	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":           {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
 		"OLLAMA_HOST":            {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
@@ -79,8 +94,18 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":    {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":     {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":          {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
+	if runtime.GOOS != "darwin" {
+		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
+		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
+		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
+		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
+		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+	}
+	return ret
 }
 
 func Values() map[string]string {
@@ -191,6 +216,15 @@ func LoadConfig() {
 		NoHistory = true
 	}
 
+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+		s, err := strconv.ParseBool(spread)
+		if err == nil {
+			SchedSpread = s
+		} else {
+			SchedSpread = true
+		}
+	}
+
 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
 		NoPrune = true
 	}
@@ -244,6 +278,16 @@ func LoadConfig() {
 	if err != nil {
 		slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
 	}
+
+	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
+		IntelGpu = set
+	}
+
+	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
 }
 
 func getModelsDir() (string, error) {
````
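Note the permissive boolean handling for `OLLAMA_SCHED_SPREAD`: any non-empty value that fails `strconv.ParseBool` still enables the flag. A small sketch of that semantic in isolation:

```go
package main

import (
	"fmt"
	"strconv"
)

// parseFlag mirrors the OLLAMA_SCHED_SPREAD handling above:
// empty -> false; parsable -> the parsed value; anything else -> true.
func parseFlag(raw string) bool {
	if raw == "" {
		return false
	}
	if v, err := strconv.ParseBool(raw); err == nil {
		return v
	}
	return true
}

func main() {
	for _, s := range []string{"", "0", "false", "1", "yes"} {
		fmt.Printf("%q -> %v\n", s, parseFlag(s)) // "yes" doesn't parse, so it enables the flag
	}
}
```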
220
gpu/amd_linux.go
220
gpu/amd_linux.go
@@ -13,6 +13,7 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -25,7 +26,16 @@ const (
|
|||||||
|
|
||||||
// Prefix with the node dir
|
// Prefix with the node dir
|
||||||
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
||||||
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
|
||||||
|
// Direct Rendering Manager sysfs location
|
||||||
|
DRMDeviceDirGlob = "/sys/class/drm/card*/device"
|
||||||
|
DRMTotalMemoryFile = "mem_info_vram_total"
|
||||||
|
DRMUsedMemoryFile = "mem_info_vram_used"
|
||||||
|
|
||||||
|
// In hex; properties file is in decimal
|
||||||
|
DRMUniqueIDFile = "unique_id"
|
||||||
|
DRMVendorFile = "vendor"
|
||||||
|
DRMDeviceFile = "device"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -35,8 +45,8 @@ var (
 )
 
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
 		return resp
 	}
@@ -50,9 +60,9 @@ func AMDGetGPUInfo() []GpuInfo {
 
 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
-	hipVD := os.Getenv("HIP_VISIBLE_DEVICES")   // zero based index only
-	rocrVD := os.Getenv("ROCR_VISIBLE_DEVICES") // zero based index or UUID, but consumer cards seem to not support UUID
-	gpuDO := os.Getenv("GPU_DEVICE_ORDINAL")    // zero based index
+	hipVD := envconfig.HipVisibleDevices   // zero based index only
+	rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
+	gpuDO := envconfig.GpuDeviceOrdinal    // zero based index
 	switch {
 	// TODO is this priorty order right?
 	case hipVD != "":
@@ -65,7 +75,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		visibleDevices = strings.Split(gpuDO, ",")
 	}
 
-	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+	gfxOverride := envconfig.HsaOverrideGfxVersion
 	var supported []string
 	libDir := ""
 
@@ -90,7 +100,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		scanner := bufio.NewScanner(fp)
 		isCPU := false
 		var major, minor, patch uint64
-		var vendor, device uint64
+		var vendor, device, uniqueID uint64
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -121,30 +131,43 @@ func AMDGetGPUInfo() []GpuInfo {
 			} else if strings.HasPrefix(line, "vendor_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
-					slog.Debug("malformed vendor_id", "vendor_id", line)
+					slog.Debug("malformed", "vendor_id", line)
 					continue
 				}
-				vendor, err = strconv.ParseUint(ver[1], 10, 32)
+				vendor, err = strconv.ParseUint(ver[1], 10, 64)
 				if err != nil {
-					slog.Debug("malformed vendor_id" + line)
+					slog.Debug("malformed", "vendor_id", line, "error", err)
 				}
 			} else if strings.HasPrefix(line, "device_id") {
 				ver := strings.Fields(line)
 				if len(ver) != 2 {
-					slog.Debug("malformed device_id", "device_id", line)
+					slog.Debug("malformed", "device_id", line)
 					continue
 				}
-				device, err = strconv.ParseUint(ver[1], 10, 32)
+				device, err = strconv.ParseUint(ver[1], 10, 64)
 				if err != nil {
-					slog.Debug("malformed device_id" + line)
+					slog.Debug("malformed", "device_id", line, "error", err)
+				}
+			} else if strings.HasPrefix(line, "unique_id") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 {
+					slog.Debug("malformed", "unique_id", line)
+					continue
+				}
+				uniqueID, err = strconv.ParseUint(ver[1], 10, 64)
+				if err != nil {
+					slog.Debug("malformed", "unique_id", line, "error", err)
 				}
 			}
 
 			// TODO - any other properties we want to extract and record?
 			// vendor_id + device_id -> pci lookup for "Name"
 			// Other metrics that may help us understand relative performance between multiple GPUs
 		}
 
+		// Note: while ./mem_banks/*/used_memory exists, it doesn't appear to take other VRAM consumers
+		// into consideration, so we instead map the device over to the DRM driver sysfs nodes which
+		// do reliably report VRAM usage.
+
 		if isCPU {
 			cpuCount++
 			continue
@@ -156,7 +179,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []GpuInfo{}
+			return nil
 		}
 
 		if int(major) < RocmComputeMin {
@@ -167,65 +190,68 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
-		propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUTotalMemoryFileGlob)
-		propFiles, err := filepath.Glob(propGlob)
-		if err != nil {
-			slog.Warn("error looking up total GPU memory", "glob", propGlob, "error", err)
-		}
-		// 1 or more memory banks - sum the values of all of them
-		for _, propFile := range propFiles {
-			fp, err := os.Open(propFile)
-			if err != nil {
-				slog.Warn("failed to open sysfs node", "file", propFile, "erroir", err)
-				continue
-			}
-			defer fp.Close()
-			scanner := bufio.NewScanner(fp)
-			for scanner.Scan() {
-				line := strings.TrimSpace(scanner.Text())
-				if strings.HasPrefix(line, "size_in_bytes") {
-					ver := strings.Fields(line)
-					if len(ver) != 2 {
-						slog.Warn("malformed " + line)
-						continue
-					}
-					bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64)
-					if err != nil {
-						slog.Warn("malformed int " + line)
-						continue
-					}
-					totalMemory += bankSizeInBytes
-				}
-			}
-		}
-		if totalMemory == 0 {
-			slog.Warn("amdgpu reports zero total memory", "gpu", gpuID)
-			continue
-		}
-		usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(nodeID), GPUUsedMemoryFileGlob)
-		usedFiles, err := filepath.Glob(usedGlob)
-		if err != nil {
-			slog.Warn("error looking up used GPU memory", "glob", usedGlob, "error", err)
-			continue
-		}
-		for _, usedFile := range usedFiles {
-			fp, err := os.Open(usedFile)
-			if err != nil {
-				slog.Warn("failed to open sysfs node", "file", usedFile, "error", err)
-				continue
-			}
-			defer fp.Close()
-			data, err := io.ReadAll(fp)
-			if err != nil {
-				slog.Warn("failed to read sysfs node", "file", usedFile, "error", err)
-				continue
-			}
-			used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
-			if err != nil {
-				slog.Warn("malformed used memory", "data", string(data), "error", err)
-				continue
-			}
-			usedMemory += used
+		var usedFile string
+		mapping := []struct {
+			id       uint64
+			filename string
+		}{
+			{vendor, DRMVendorFile},
+			{device, DRMDeviceFile},
+			{uniqueID, DRMUniqueIDFile}, // Not all devices will report this
+		}
+		slog.Debug("mapping amdgpu to drm sysfs nodes", "amdgpu", match, "vendor", vendor, "device", device, "unique_id", uniqueID)
+		// Map over to DRM location to find the total/free memory
+		drmMatches, _ := filepath.Glob(DRMDeviceDirGlob)
+		for _, devDir := range drmMatches {
+			matched := true
+			for _, m := range mapping {
+				if m.id == 0 {
+					// Null ID means it didn't populate, so we can't use it to match
+					continue
+				}
+				filename := filepath.Join(devDir, m.filename)
+				buf, err := os.ReadFile(filename)
+				if err != nil {
+					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
+					matched = false
+					break
+				}
+				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
+				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
+				if err != nil {
+					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
+					matched = false
+					break
+				}
+				if cmp != m.id {
+					matched = false
+					break
+				}
+			}
+			if !matched {
+				continue
+			}
+
+			// Found the matching DRM directory
+			slog.Debug("matched", "amdgpu", match, "drm", devDir)
+			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
+			buf, err := os.ReadFile(totalFile)
+			if err != nil {
+				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
+				break
+			}
+			totalMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+			if err != nil {
+				slog.Debug("failed to parse sysfs node", "file", totalFile, "error", err)
+				break
+			}
+			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
+			usedMemory, err = getFreeMemory(usedFile)
+			if err != nil {
+				slog.Debug("failed to update used memory", "error", err)
+			}
+			break
 		}
 
 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
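Once a DRM directory is matched, free VRAM is derived rather than read directly: the driver exposes mem_info_vram_total and mem_info_vram_used as decimal byte counts, and free is the difference. A standalone sketch under those assumptions (the card0 path is illustrative; the patch globs card*):

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// readSysfsUint reads a decimal sysfs value such as mem_info_vram_total.
func readSysfsUint(path string) (uint64, error) {
	buf, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
}

func main() {
	devDir := "/sys/class/drm/card0/device" // illustrative fixed path
	total, err := readSysfsUint(filepath.Join(devDir, "mem_info_vram_total"))
	if err != nil {
		fmt.Println("no amdgpu DRM node:", err)
		return
	}
	used, _ := readSysfsUint(filepath.Join(devDir, "mem_info_vram_used"))
	fmt.Printf("VRAM free: %d bytes\n", total-used)
}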
@@ -241,18 +267,21 @@ func AMDGetGPUInfo() []GpuInfo {
 
 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-		gpuInfo := GpuInfo{
-			Library: "rocm",
-			memInfo: memInfo{
-				TotalMemory: totalMemory,
-				FreeMemory:  (totalMemory - usedMemory),
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "rocm",
+				memInfo: memInfo{
+					TotalMemory: totalMemory,
+					FreeMemory:  (totalMemory - usedMemory),
+				},
+				ID:            strconv.Itoa(gpuID),
+				Name:          name,
+				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
+				MinimumMemory: rocmMinimumMemory,
+				DriverMajor:   driverMajor,
+				DriverMinor:   driverMinor,
 			},
-			ID:            fmt.Sprintf("%d", gpuID),
-			Name:          name,
-			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
-			MinimumMemory: rocmMinimumMemory,
-			DriverMajor:   driverMajor,
-			DriverMinor:   driverMinor,
+			usedFilepath: usedFile,
 		}
 
 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -276,7 +305,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []GpuInfo{}
+				return nil
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -287,7 +316,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			supported, err = GetSupportedGFX(libDir)
 			if err != nil {
 				slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-				return []GpuInfo{}
+				return nil
 			}
 			slog.Debug("rocm supported GPUs", "types", supported)
 		}
@@ -304,6 +333,11 @@ func AMDGetGPUInfo() []GpuInfo {
 			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 		}
 
+		// Check for env var workarounds
+		if name == "1002:687f" { // Vega RX 56
+			gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
+		}
+
 		// The GPU has passed all the verification steps and is supported
 		resp = append(resp, gpuInfo)
 	}
@@ -378,3 +412,31 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 	return driverMajor, driverMinor, nil
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	for i := range gpus {
+		usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
+		if err != nil {
+			return err
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
+		gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
+	}
+	return nil
+}
+
+func getFreeMemory(usedFile string) (uint64, error) {
+	buf, err := os.ReadFile(usedFile)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
+	}
+	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+	if err != nil {
+		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
+		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
+	}
+	return usedMemory, nil
+}
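The reason getFreeMemory and the saved usedFilepath exist is cost: after the one-time discovery walk, each refresh is a single file read per GPU. A hedged usage sketch, assuming this patch's gpu package layout and export set are reachable from a caller:

package main

import (
	"log/slog"

	"github.com/ollama/ollama/gpu" // assumption: importable as laid out in this series
)

func main() {
	gpus := gpu.AMDGetGPUInfo() // one-time discovery: walks KFD sysfs, maps DRM nodes
	// Later, right before scheduling a model: one os.ReadFile per GPU.
	if err := gpu.RocmGPUInfoList(gpus).RefreshFreeMemory(); err != nil {
		slog.Debug("problem refreshing ROCm free memory", "error", err)
	}
	for _, g := range gpus {
		slog.Info("rocm gpu", "id", g.ID, "free", g.FreeMemory)
	}
}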
gpu/amd_windows.go
@@ -7,8 +7,10 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strconv"
 	"strings"
 
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 )
 
@@ -24,8 +26,8 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )
 
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
@@ -52,7 +54,7 @@ func AMDGetGPUInfo() []GpuInfo {
 	}
 
 	var supported []string
-	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
+	gfxOverride := envconfig.HsaOverrideGfxVersion
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
@@ -117,21 +119,24 @@ func AMDGetGPUInfo() []GpuInfo {
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := GpuInfo{
-			Library: "rocm",
-			memInfo: memInfo{
-				TotalMemory: totalMemory,
-				FreeMemory:  freeMemory,
-			},
-			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: libDir,
-			MinimumMemory:  rocmMinimumMemory,
-			Name:           name,
-			Compute:        gfx,
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "rocm",
+				memInfo: memInfo{
+					TotalMemory: totalMemory,
+					FreeMemory:  freeMemory,
+				},
+				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
+				DependencyPath: libDir,
+				MinimumMemory:  rocmMinimumMemory,
+				Name:           name,
+				Compute:        gfx,
 
 			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
 			// DriverMajor:    driverMajor,
 			// DriverMinor:    driverMinor,
+			},
+			index: i,
 		}
 
 		resp = append(resp, gpuInfo)
@@ -159,3 +164,30 @@ func AMDValidateLibDir() (string, error) {
 	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	hl, err := NewHipLib()
+	if err != nil {
+		slog.Debug(err.Error())
+		return nil
+	}
+	defer hl.Release()
+
+	for i := range gpus {
+		err := hl.HipSetDevice(gpus[i].index)
+		if err != nil {
+			return err
+		}
+		freeMemory, _, err := hl.HipMemGetInfo()
+		if err != nil {
+			slog.Warn("get mem info", "id", i, "error", err)
+			continue
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
+		gpus[i].FreeMemory = freeMemory
+	}
+	return nil
+}
gpu/assets.go
@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
 			continue
 		}
 		raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
-		if err == nil {
-			pid, err := strconv.Atoi(string(raw))
-			if err == nil {
-				if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-					// Another running ollama, ignore this tmpdir
-					continue
-				}
-			}
-		} else {
-			slog.Debug("failed to open ollama.pid", "path", d, "error", err)
-		}
-		err = os.RemoveAll(d)
 		if err != nil {
-			slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
+			slog.Warn("failed to read ollama.pid", "path", d, "error", err)
+			// No pid, ignore this tmpdir
+			continue
+		}
+
+		pid, err := strconv.Atoi(string(raw))
+		if err != nil {
+			slog.Warn("failed to parse pid", "path", d, "error", err)
+			continue
+		}
+
+		proc, err := os.FindProcess(pid)
+		if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+			slog.Warn("found running ollama", "pid", pid, "path", d)
+			// Another running ollama, ignore this tmpdir
+			continue
+		}
+
+		if err := os.Remove(d); err != nil {
+			slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
 		}
 	}
 }
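The rewritten cleanup leans on the classic signal-0 liveness probe: Signal(syscall.Signal(0)) delivers nothing, but on Unix it fails with os.ErrProcessDone once the target has exited. A standalone sketch of just the probe:

package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

// alive reports whether pid refers to a running process, using the same
// signal-0 probe as the cleanup path above. On Unix, os.FindProcess always
// succeeds and the Signal call does the real check.
func alive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}
	return !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone)
}

func main() {
	fmt.Println(alive(os.Getpid())) // true: we are running
	fmt.Println(alive(1 << 22))     // almost certainly false on a default pid_max
}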
gpu/cpu_common.go
@@ -1,21 +1,16 @@
 package gpu
 
 import (
-	"log/slog"
-
 	"golang.org/x/sys/cpu"
 )
 
-func GetCPUVariant() string {
+func GetCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
-		slog.Debug("CPU has AVX2")
-		return "avx2"
+		return CPUCapabilityAVX2
 	}
 	if cpu.X86.HasAVX {
-		slog.Debug("CPU has AVX")
-		return "avx"
+		return CPUCapabilityAVX
 	}
-	slog.Debug("CPU does not have vector extensions")
 	// else LCD
-	return ""
+	return CPUCapabilityNone
 }
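GetCPUCapability now returns a comparable value instead of a string, which is what makes checks like cpuCapability < GPURunnerCPUCapability in gpu.go possible. The type itself is defined outside this excerpt, in the gpu package's types file; a plausible minimal shape, assuming an ordered integer enum, would be:

package main

import "fmt"

// A plausible shape for CPUCapability: an ordered enum, so capabilities
// compare with < and >=. The real definitions are not shown in this excerpt.
type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

// GPU runners are built with AVX enabled in this series.
var GPURunnerCPUCapability = CPUCapabilityAVX

func main() {
	detected := CPUCapabilityNone
	fmt.Println(detected < GPURunnerCPUCapability) // true: disable GPU inference
}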
gpu/gpu.go
@@ -24,19 +24,37 @@ import (
 	"github.com/ollama/ollama/format"
 )
 
-type handles struct {
+type cudaHandles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	nvml        *C.nvml_handle_t
+}
+
+type oneapiHandles struct {
 	oneapi      *C.oneapi_handle_t
+	deviceCount int
 }
 
 const (
 	cudaMinimumMemory = 457 * format.MebiByte
 	rocmMinimumMemory = 457 * format.MebiByte
+	// TODO OneAPI minimum memory
 )
 
-var gpuMutex sync.Mutex
+var (
+	gpuMutex      sync.Mutex
+	bootstrapped  bool
+	cpuCapability CPUCapability
+	cpus          []CPUInfo
+	cudaGPUs      []CudaGPUInfo
+	nvcudaLibPath string
+	cudartLibPath string
+	oneapiLibPath string
+	nvmlLibPath   string
+	rocmGPUs      []RocmGPUInfo
+	oneapiGPUs    []OneapiGPUInfo
+)
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -46,113 +64,113 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
 
-var CudartLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *cudaHandles {
 
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 
-	gpuHandles := &handles{}
-	var cudartMgmtName string
-	var cudartMgmtPatterns []string
-	var nvcudaMgmtName string
-	var nvcudaMgmtPatterns []string
-
-	tmpDir, _ := PayloadsDir()
-	switch runtime.GOOS {
-	case "windows":
-		cudartMgmtName = "cudart64_*.dll"
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "nvcuda.dll"
-		nvcudaMgmtPatterns = NvcudaWindowsGlobs
-	case "linux":
-		cudartMgmtName = "libcudart.so*"
-		if tmpDir != "" {
-			// TODO - add "payloads" for subprocess
-			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
-		}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "libcuda.so*"
-		nvcudaMgmtPatterns = NvcudaLinuxGlobs
-	default:
-		return gpuHandles
+	cHandles := &cudaHandles{}
+	// Short Circuit if we already know which library to use
+	if nvmlLibPath != "" {
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		return cHandles
+	}
+	if nvcudaLibPath != "" {
+		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		return cHandles
+	}
+	if cudartLibPath != "" {
+		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		return cHandles
 	}
 
-	slog.Debug("Detecting GPUs")
-	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	slog.Debug("searching for GPU discovery libraries for NVIDIA")
+	var cudartMgmtPatterns []string
+
+	// Aligned with driver, we can't carry as payloads
+	nvcudaMgmtPatterns := NvcudaGlobs
+
+	if runtime.GOOS == "windows" {
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
+	}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
+
+	if len(NvmlGlobs) > 0 {
+		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
+		if len(nvmlLibPaths) > 0 {
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			if nvml != nil {
+				slog.Debug("nvidia-ml loaded", "library", libPath)
+				cHandles.nvml = nvml
+				nvmlLibPath = libPath
+			}
+		}
+	}
+
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
-			gpuHandles.nvcuda = nvcuda
-			gpuHandles.deviceCount = deviceCount
-			return gpuHandles
+			cHandles.nvcuda = nvcuda
+			cHandles.deviceCount = deviceCount
+			nvcudaLibPath = libPath
+			return cHandles
 		}
 	}
 
-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
-			gpuHandles.cudart = cudart
-			gpuHandles.deviceCount = deviceCount
-			return gpuHandles
+			cHandles.cudart = cudart
+			cHandles.deviceCount = deviceCount
+			cudartLibPath = libPath
+			return cHandles
 		}
 	}
 
-	return gpuHandles
+	return cHandles
+}
+
+// Note: gpuMutex must already be held
+func initOneAPIHandles() *oneapiHandles {
+	oHandles := &oneapiHandles{}
+
+	// Short Circuit if we already know which library to use
+	if oneapiLibPath != "" {
+		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
+		return oHandles
+	}
+
+	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
+	if len(oneapiLibPaths) > 0 {
+		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
+	}
+
+	return oHandles
+}
+
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
+	} else {
+		gpuMutex.Unlock()
+	}
+	return GpuInfoList{cpus[0].GpuInfo}
}
 
 func GetGPUInfo() GpuInfoList {
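initCudaHandles shows the caching scheme used throughout this rewrite: the first successful probe stores the winning library path in a package-level variable, and every later call short-circuits to a direct load of that path instead of re-globbing the filesystem. Reduced to its essentials (all names in this sketch are illustrative):

package main

import "fmt"

// Package-level cache of the winning library path; the real code guards it
// with gpuMutex.
var cachedLibPath string

func findLibs() []string { return []string{"/usr/lib/x86_64-linux-gnu/libcuda.so.1"} } // stand-in for FindGPULibs

func loadLib(path string) (string, bool) { return "handle:" + path, true } // stand-in for the Load*Mgmt helpers

func initHandles() string {
	// Short circuit if we already know which library to use
	if cachedLibPath != "" {
		h, _ := loadLib(cachedLibPath)
		return h
	}
	for _, p := range findLibs() {
		if h, ok := loadLib(p); ok {
			cachedLibPath = p // remember for every later call
			return h
		}
	}
	return ""
}

func main() {
	fmt.Println(initHandles()) // first call: globs and probes
	fmt.Println(initHandles()) // second call: short-circuits on the cached path
}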
@@ -160,112 +178,255 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-	gpuHandles := initGPUHandles()
+	needRefresh := true
+	var cHandles *cudaHandles
+	var oHandles *oneapiHandles
 	defer func() {
-		if gpuHandles.cudart != nil {
-			C.cudart_release(*gpuHandles.cudart)
+		if cHandles != nil {
+			if cHandles.cudart != nil {
+				C.cudart_release(*cHandles.cudart)
+			}
+			if cHandles.nvcuda != nil {
+				C.nvcuda_release(*cHandles.nvcuda)
+			}
+			if cHandles.nvml != nil {
+				C.nvml_release(*cHandles.nvml)
+			}
 		}
-		if gpuHandles.nvcuda != nil {
-			C.nvcuda_release(*gpuHandles.nvcuda)
+		if oHandles != nil {
+			if oHandles.oneapi != nil {
+				// TODO - is this needed?
+				C.oneapi_release(*oHandles.oneapi)
+			}
 		}
 	}()
 
-	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
-	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" && runtime.GOARCH == "amd64" {
-		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
-	}
-
-	// On windows we bundle the nvidia library one level above the runner dir
-	depPath := ""
-	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-		depPath = filepath.Dir(envconfig.RunnersDir)
-	}
-
-	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
-
-	// NVIDIA first
-	for i := range gpuHandles.deviceCount {
-		// TODO once we support CPU compilation variants of GPU libraries refine this...
-		if cpuVariant == "" && runtime.GOARCH == "amd64" {
-			continue
-		}
-		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
-			gpuInfo := GpuInfo{
-				Library: "cuda",
-			}
-			var driverMajor int
-			var driverMinor int
-			if gpuHandles.cudart != nil {
-				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
-			} else {
-				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
-				driverMajor = int(gpuHandles.nvcuda.driver_major)
-				driverMinor = int(gpuHandles.nvcuda.driver_minor)
-			}
-			if memInfo.err != nil {
-				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-				C.free(unsafe.Pointer(memInfo.err))
-				continue
-			}
-			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-				continue
-			}
-			gpuInfo.TotalMemory = uint64(memInfo.total)
-			gpuInfo.FreeMemory = uint64(memInfo.free)
-			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-			gpuInfo.MinimumMemory = cudaMinimumMemory
-			gpuInfo.DependencyPath = depPath
-			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DriverMajor = driverMajor
-			gpuInfo.DriverMinor = driverMinor
-
-			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-			resp = append(resp, gpuInfo)
-		}
-	}
-
-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
+	if !bootstrapped {
+		slog.Debug("Detecting GPUs")
+		needRefresh = false
+		cpuCapability = GetCPUCapability()
+		var memInfo C.mem_info_t
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		}
+		cpus = []CPUInfo{CPUInfo{
+			GpuInfo: GpuInfo{
+				memInfo: mem,
+				Library: "cpu",
+				Variant: cpuCapability,
+				ID:      "0",
+			},
+		}}
+
+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
+		// On windows we bundle the nvidia library one level above the runner dir
+		depPath := ""
+		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
+		}
+
+		// Load ALL libraries
+		cHandles = initCudaHandles()
+
+		// NVIDIA
+		for i := range cHandles.deviceCount {
+			if cHandles.cudart != nil || cHandles.nvcuda != nil {
+				gpuInfo := CudaGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "cuda",
+					},
+					index: i,
+				}
+				var driverMajor int
+				var driverMinor int
+				if cHandles.cudart != nil {
+					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
+				} else {
+					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
+					driverMajor = int(cHandles.nvcuda.driver_major)
+					driverMinor = int(cHandles.nvcuda.driver_minor)
+				}
+				if memInfo.err != nil {
+					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+					C.free(unsafe.Pointer(memInfo.err))
+					continue
+				}
+				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+					continue
+				}
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor
+
+				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+				cudaGPUs = append(cudaGPUs, gpuInfo)
+			}
+		}
+
+		// Intel
+		if envconfig.IntelGpu {
+			oHandles = initOneAPIHandles()
+			// On windows we bundle the oneapi library one level above the runner dir
+			depPath = ""
+			if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
+			}
+
+			for d := range oHandles.oneapi.num_drivers {
+				if oHandles.oneapi == nil {
+					// shouldn't happen
+					slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
+					continue
+				}
+				devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
+				for i := range devCount {
+					gpuInfo := OneapiGPUInfo{
+						GpuInfo: GpuInfo{
+							Library: "oneapi",
+						},
+						driverIndex: int(d),
+						gpuIndex:    int(i),
+					}
+					// TODO - split bootstrapping from updating free memory
+					C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
+					// TODO - convert this to MinimumMemory based on testing...
+					var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+					memInfo.free = C.uint64_t(totalFreeMem)
+					gpuInfo.TotalMemory = uint64(memInfo.total)
+					gpuInfo.FreeMemory = uint64(memInfo.free)
+					gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+					gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+					gpuInfo.DependencyPath = depPath
+					oneapiGPUs = append(oneapiGPUs, gpuInfo)
+				}
+			}
+		}
+
+		rocmGPUs = AMDGetGPUInfo()
+		bootstrapped = true
+	}
+
+	// For detected GPUs, load library if not loaded
+
+	// Refresh free memory usage
+	if needRefresh {
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
+		var memInfo C.mem_info_t
+		if cHandles == nil && len(cudaGPUs) > 0 {
+			cHandles = initCudaHandles()
+		}
+		for i, gpu := range cudaGPUs {
+			if cHandles.nvml != nil {
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			} else if cHandles.cudart != nil {
+				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+			} else if cHandles.nvcuda != nil {
+				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+				memInfo.used = memInfo.total - memInfo.free
+			} else {
+				// shouldn't happen
+				slog.Warn("no valid cuda library loaded to refresh vram usage")
+				break
+			}
+			if memInfo.err != nil {
+				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				C.free(unsafe.Pointer(memInfo.err))
+				continue
+			}
+			if memInfo.free == 0 {
+				slog.Warn("error looking up nvidia GPU memory")
+				continue
+			}
+			slog.Debug("updating cuda memory data",
+				"gpu", gpu.ID,
+				"name", gpu.Name,
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(gpu.TotalMemory),
+					"free", format.HumanBytes2(gpu.FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(uint64(memInfo.total)),
+					"free", format.HumanBytes2(uint64(memInfo.free)),
+					"used", format.HumanBytes2(uint64(memInfo.used)),
+				),
+			)
+			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
+
+		if oHandles == nil && len(oneapiGPUs) > 0 {
+			oHandles = initOneAPIHandles()
+		}
+		for i, gpu := range oneapiGPUs {
+			if oHandles.oneapi == nil {
+				// shouldn't happen
+				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
+				continue
+			}
+			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
+			// TODO - convert this to MinimumMemory based on testing...
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
+
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		if err != nil {
+			slog.Debug("problem refreshing ROCm free memory", "error", err)
+		}
+	}
+
+	resp := []GpuInfo{}
+	for _, gpu := range cudaGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range rocmGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range oneapiGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
 	if len(resp) == 0 {
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return resp
-		}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-
-		resp = append(resp, gpuInfo)
+		resp = append(resp, cpus[0].GpuInfo)
 	}
 
 	return resp
 }
 
-func GetCPUMem() (memInfo, error) {
-	var ret memInfo
-	var info C.mem_info_t
-	C.cpu_check_ram(&info)
-	if info.err != nil {
-		defer C.free(unsafe.Pointer(info.err))
-		return ret, fmt.Errorf(C.GoString(info.err))
-	}
-	ret.FreeMemory = uint64(info.free)
-	ret.TotalMemory = uint64(info.total)
-	return ret, nil
-}
-
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
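Structurally, GetGPUInfo is now split in two: a one-time bootstrap guarded by the bootstrapped flag (device enumeration, compute checks, static metadata), and a cheap refresh that only re-reads volatile free-memory figures on later calls; needRefresh is false on the bootstrap pass because those values were just read. The same pattern in miniature:

package main

import (
	"fmt"
	"sync"
)

// bootstrap-once, refresh-often: static fields are discovered a single time,
// volatile ones are re-read on every later call.
type gpuCache struct {
	mu           sync.Mutex
	bootstrapped bool
	name         string // static
	free         uint64 // volatile
}

func (c *gpuCache) info(discover func() (string, uint64), refresh func() uint64) (string, uint64) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if !c.bootstrapped {
		c.name, c.free = discover() // expensive, runs once (the needRefresh=false case)
		c.bootstrapped = true
	} else {
		c.free = refresh() // cheap, runs on every later call
	}
	return c.name, c.free
}

func main() {
	var c gpuCache
	discover := func() (string, uint64) { return "gfx1030", 8 << 30 }
	refresh := func() uint64 { return 6 << 30 }
	fmt.Println(c.info(discover, refresh)) // bootstrap pass
	fmt.Println(c.info(discover, refresh)) // refresh-only pass
}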
@@ -362,8 +523,26 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch, libPath
+		}
+	}
+	return nil, ""
+}
+
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
+	num_devices := 0
 	resp.oh.verbose = getVerboseState()
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
@@ -373,7 +552,10 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.oh, libPath
+			for i := range resp.oh.num_drivers {
+				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
+			}
+			return num_devices, &resp.oh, libPath
 		}
 	}
 	return 0, nil, ""
gpu/gpu_darwin.go
@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUVariant(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}
@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }
 
+func GetCPUInfo() GpuInfoList {
+	mem, _ := GetCPUMem()
+	return []GpuInfo{
+		{
+			Library: "cpu",
+			Variant: GetCPUCapability(),
+			memInfo: mem,
+		},
+	}
+}
+
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
gpu/gpu_info.h
@@ -47,6 +47,7 @@ typedef struct mem_info {
   char gpu_name[GPU_NAME_LEN];
   uint64_t total;
   uint64_t free;
+  uint64_t used;
 
   // Compute Capability
   int major;
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
 
 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
+#include "gpu_info_nvml.h"
 #include "gpu_info_oneapi.h"
 
 #endif  // __GPU_INFO_H__
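The new used field gives the refresh path three consistent numbers per device. Note the asymmetry visible in gpu.go above: cudart and NVML report used directly, while the nvcuda driver API only yields free and total, so the Go side back-fills used = total - free. In sketch form:

package main

import "fmt"

// memSnapshot mirrors the three memory fields carried by mem_info_t.
type memSnapshot struct{ total, free, used uint64 }

// fromFreeTotal back-fills used for sources (like the nvcuda path) that only
// report free and total.
func fromFreeTotal(free, total uint64) memSnapshot {
	return memSnapshot{total: total, free: free, used: total - free}
}

func main() {
	fmt.Printf("%+v\n", fromFreeTotal(6<<30, 8<<30)) // used = 2 GiB
}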
gpu/gpu_info_cpu.c (deleted file)
@@ -1,45 +0,0 @@
-#include "gpu_info.h"
-// Fallbacks for CPU mode
-
-#ifdef _WIN32
-#include <sysinfoapi.h>
-void cpu_check_ram(mem_info_t *resp) {
-  resp->err = NULL;
-  MEMORYSTATUSEX info;
-  info.dwLength = sizeof(info);
-  if (GlobalMemoryStatusEx(&info) != 0) {
-    resp->total = info.ullTotalPhys;
-    resp->free = info.ullAvailPhys;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  } else {
-    resp->err = LOAD_ERR();
-  }
-  return;
-}
-
-#elif __linux__
-#include <errno.h>
-#include <string.h>
-#include <sys/sysinfo.h>
-void cpu_check_ram(mem_info_t *resp) {
-  struct sysinfo info;
-  resp->err = NULL;
-  if (sysinfo(&info) != 0) {
-    resp->err = strdup(strerror(errno));
-  } else {
-    resp->total = info.totalram * info.mem_unit;
-    resp->free = info.freeram * info.mem_unit;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  }
-  return;
-}
-
-#elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-//   mem_info_t resp = {0, 0, NULL};
-//   return resp;
-// }
-#else
-#error "Unsupported platform"
-#endif
gpu/gpu_info_cudart.c
@@ -40,7 +40,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);
@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 }
 
-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   cudartMemory_t memInfo = {0,0,0};
   cudartReturn_t ret;
@@ -166,9 +166,11 @@ void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
 
   resp->total = memInfo.total;
   resp->free = memInfo.free;
+  resp->used = memInfo.used;
 
   LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
   LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 
gpu/gpu_info_cudart.h
@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
 } cudart_init_resp_t;
 
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
+// TODO - if we keep this library longer term, add cudart_get_free
 void cudart_release(cudart_handle_t ch);
 
 #endif  // __GPU_INFO_CUDART_H__
gpu/gpu_info_nvcuda.c
@@ -43,7 +43,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 
   for (i = 0; l[i].s != NULL; i++) {
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!*l[i].p) {
+    if (!*(l[i].p)) {
       char *msg = LOAD_ERR();
       LOG(resp->ch.verbose, "dlerr: %s\n", msg);
       UNLOAD_LIBRARY(resp->ch.handle);
@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 }
 
 const int buflen = 256;
-void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   nvcudaMemory_t memInfo = {0,0};
   CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
     resp->err = strdup(buf);
     return;
   }
@@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
 
   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release primary device context %d", ret);
+    LOG(1, "nvcuda failed to release device context %d", ret);
+  }
+}
+
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
+  CUresult ret;
+  CUcontext ctx = NULL;
+  CUdevice device = -1;
+  *free = 0;
+  *total = 0;
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device failed to initialize");
+    return;
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to get device context %d", ret);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(free, total);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release device context %d", ret);
   }
 }
 
gpu/gpu_info_nvcuda.h
@@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
 } nvcuda_init_resp_t;
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
 void nvcuda_release(nvcuda_handle_t ch);
 
 #endif  // __GPU_INFO_NVCUDA_H__
|
104  gpu/gpu_info_nvml.c  Normal file
@@ -0,0 +1,104 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             nvml_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*(l[i].p)) {
+      resp->ch.handle = NULL;
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+}
+
+
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+    nvmlDevice_t device;
+    nvmlMemory_t memInfo = {0};
+    nvmlReturn_t ret;
+    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+    if (ret != NVML_SUCCESS) {
+        LOG(1, "unable to get device handle %d: %d", device_id, ret);
+        *free = 0;
+        return;
+    }
+
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+    if (ret != NVML_SUCCESS) {
+        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+        *free = 0;
+        return;
+    }
+    *free = memInfo.free;
+    *total = memInfo.total;
+    *used = memInfo.used;
+}
+
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  nvmlReturn_t ret;
+  ret = (*h.nvmlShutdown)();
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "error during nvmlShutdown %d", ret);
+  }
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
+#endif  // __APPLE__
48  gpu/gpu_info_nvml.h  Normal file
@@ -0,0 +1,48 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN = 0,
+} nvmlBrandType_t;
+
+typedef struct nvml_handle {
+  void *handle;
+  uint16_t verbose;
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml_handle_t;
+
+typedef struct nvml_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  nvml_handle_t ch;
+} nvml_init_resp_t;
+
+typedef struct nvml_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} nvml_compute_capability_t;
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_release(nvml_handle_t ch);
+
+#endif  // __GPU_INFO_NVML_H__
+#endif  // __APPLE__
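A hedged end-to-end sketch of the three NVML entry points just added: load the library, read free/total/used for device 0, then shut down. The cgo wiring, include path, and helper name are assumptions; the C signatures are exactly those declared in gpu_info_nvml.h above.

package gpu

/*
#include <stdlib.h>
#include "gpu_info_nvml.h"  // assumption: header on the include path
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// pollNvmlOnce is a hypothetical helper demonstrating the init/query/release
// lifecycle; a real caller would keep the handle open and call nvml_get_free
// repeatedly.
func pollNvmlOnce(libPath string) error {
	cPath := C.CString(libPath)
	defer C.free(unsafe.Pointer(cPath))

	var resp C.nvml_init_resp_t
	C.nvml_init(cPath, &resp)
	if resp.err != nil {
		defer C.free(unsafe.Pointer(resp.err))
		return fmt.Errorf("nvml: %s", C.GoString(resp.err))
	}
	defer C.nvml_release(resp.ch)

	var free, total, used C.uint64_t
	C.nvml_get_free(resp.ch, 0, &free, &total, &used)
	fmt.Printf("vram free=%d total=%d used=%d\n", free, total, used)
	return nil
}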
@@ -4,15 +4,17 @@
 
 #include <string.h>
 
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
-{
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
+  resp->oh.devices = NULL;
+  resp->oh.num_devices = NULL;
+  resp->oh.drivers = NULL;
+  resp->oh.num_drivers = 0;
   const int buflen = 256;
   char buf[buflen + 1];
-  int i;
-  struct lookup
-  {
+  int i, d;
+  struct lookup {
     char *s;
     void **p;
   } l[] = {
@@ -28,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   };
 
   resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle)
-  {
+  if (!resp->oh.handle) {
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Intel GPUs: %s\n",
@@ -44,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
       "wiring Level-Zero management library functions in %s\n",
       oneapi_lib_path);
 
-  for (i = 0; l[i].s != NULL; i++)
-  {
+  for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p)
-    {
+    if (!*(l[i].p)) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -63,23 +62,70 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
     }
   }
+
+  LOG(resp->oh.verbose, "calling zesInit\n");
 
   ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
-    LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
-    UNLOAD_LIBRARY(resp->oh.handle);
-    resp->oh.handle = NULL;
-    snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
+  if (ret != ZE_RESULT_SUCCESS) {
+    LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
+    snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
     resp->err = strdup(buf);
+    oneapi_release(resp->oh);
+    return;
   }
 
-  (*resp->oh.zesDriverGet)(&resp->num_devices, NULL);
+  LOG(resp->oh.verbose, "calling zesDriverGet\n");
+  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
+  if (ret != ZE_RESULT_SUCCESS) {
+    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
+    snprintf(buf, buflen, "unable to get driver count: %x", ret);
+    resp->err = strdup(buf);
+    oneapi_release(resp->oh);
+    return;
+  }
+  LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
+  resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
+  resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
+  memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
+  resp->oh.devices =
+      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
+  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
+  if (ret != ZE_RESULT_SUCCESS) {
+    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
+    snprintf(buf, buflen, "unable to get driver count: %x", ret);
+    resp->err = strdup(buf);
+    oneapi_release(resp->oh);
+    return;
+  }
+
+  for (d = 0; d < resp->oh.num_drivers; d++) {
+    LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
+    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
+                                   &resp->oh.num_devices[d], NULL);
+    if (ret != ZE_RESULT_SUCCESS) {
+      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
+      snprintf(buf, buflen, "unable to get device count: %x", ret);
+      resp->err = strdup(buf);
+      oneapi_release(resp->oh);
+      return;
+    }
+    resp->oh.devices[d] =
+        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
+    ret = (*resp->oh.zesDeviceGet)(
+        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
+    if (ret != ZE_RESULT_SUCCESS) {
+      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
+      snprintf(buf, buflen, "unable to get device count: %x", ret);
+      resp->err = strdup(buf);
+      oneapi_release(resp->oh);
+      return;
+    }
+  }
 
   return;
 }
 
-void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
-{
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   uint64_t totalMem = 0;
@@ -88,127 +134,126 @@ void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
   char buf[buflen + 1];
   int i, d, m;
 
-  if (h.handle == NULL)
-  {
+  if (h.handle == NULL) {
     resp->err = strdup("Level-Zero handle not initialized");
     return;
   }
 
-  uint32_t driversCount = 0;
-  ret = (*h.zesDriverGet)(&driversCount, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
-    snprintf(buf, buflen, "unable to get driver count: %d", ret);
-    resp->err = strdup(buf);
+  if (driver > h.num_drivers || device > h.num_devices[driver]) {
+    resp->err = strdup("driver of device index out of bounds");
     return;
   }
-  LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
-
-  zes_driver_handle_t *allDrivers =
-      malloc(driversCount * sizeof(zes_driver_handle_t));
-  (*h.zesDriverGet)(&driversCount, allDrivers);
 
   resp->total = 0;
   resp->free = 0;
 
-  for (d = 0; d < driversCount; d++)
-  {
-    uint32_t deviceCount = 0;
-    ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
-      snprintf(buf, buflen, "unable to get device count: %d", ret);
-      resp->err = strdup(buf);
-      free(allDrivers);
-      return;
-    }
-
-    LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
-
-    zes_device_handle_t *devices =
-        malloc(deviceCount * sizeof(zes_device_handle_t));
-    (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
-
-    for (i = 0; i < deviceCount; i++)
-    {
-      zes_device_ext_properties_t ext_props;
-      ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
-      ext_props.pNext = NULL;
-
-      zes_device_properties_t props;
-      props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-      props.pNext = &ext_props;
-
-      ret = (*h.zesDeviceGetProperties)(devices[i], &props);
-      if (ret != ZE_RESULT_SUCCESS)
-      {
-        snprintf(buf, buflen, "unable to get device properties: %d", ret);
-        resp->err = strdup(buf);
-        free(allDrivers);
-        free(devices);
-        return;
-      }
-
-      if (h.verbose)
-      {
-        // When in verbose mode, report more information about
-        // the card we discover.
-        LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
-            props.modelName);
-        LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
-            props.brandName);
-        LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
-            props.vendorName);
-        LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
-            props.serialNumber);
-        LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
-            props.boardNumber);
-      }
-
-      uint32_t memCount = 0;
-      ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
-      if (ret != ZE_RESULT_SUCCESS)
-      {
-        snprintf(buf, buflen,
-                 "unable to enumerate Level-Zero memory modules: %d", ret);
-        resp->err = strdup(buf);
-        free(allDrivers);
-        free(devices);
-        return;
-      }
-
-      LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
-
-      zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
-      (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
-
-      for (m = 0; m < memCount; m++)
-      {
-        zes_mem_state_t state;
-        state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
-        state.pNext = NULL;
-        ret = (*h.zesMemoryGetState)(mems[m], &state);
-        if (ret != ZE_RESULT_SUCCESS)
-        {
-          snprintf(buf, buflen, "unable to get memory state: %d", ret);
-          resp->err = strdup(buf);
-          free(allDrivers);
-          free(devices);
-          free(mems);
-          return;
-        }
-
-        resp->total += state.size;
-        resp->free += state.free;
-      }
-
-      free(mems);
-    }
-
-    free(devices);
-  }
-
-  free(allDrivers);
+  zes_device_ext_properties_t ext_props;
+  ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
+  ext_props.pNext = NULL;
+
+  zes_device_properties_t props;
+  props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+  props.pNext = &ext_props;
+
+  ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
+  if (ret != ZE_RESULT_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device properties: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
+
+  // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
+  // (this is probably wrong...)
+  // TODO - the driver isn't included - what if there are multiple drivers?
+  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
+
+  if (h.verbose) {
+    // When in verbose mode, report more information about
+    // the card we discover.
+    LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
+        props.modelName);
+    LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
+        props.brandName);
+    LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
+        props.vendorName);
+    LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
+        props.serialNumber);
+    LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
+        props.boardNumber);
+  }
+
+  // TODO
+  // Compute Capability equivalent in resp->major, resp->minor, resp->patch
+
+  uint32_t memCount = 0;
+  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
+                                        NULL);
+  if (ret != ZE_RESULT_SUCCESS) {
+    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
+             ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
+
+  zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
+  (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
+
+  for (m = 0; m < memCount; m++) {
+    zes_mem_state_t state;
+    state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
+    state.pNext = NULL;
+    ret = (*h.zesMemoryGetState)(mems[m], &state);
+    if (ret != ZE_RESULT_SUCCESS) {
+      snprintf(buf, buflen, "unable to get memory state: %x", ret);
+      resp->err = strdup(buf);
+      free(mems);
+      return;
+    }
+
+    resp->total += state.size;
+    resp->free += state.free;
+  }
+
+  free(mems);
+}
+
+void oneapi_release(oneapi_handle_t h) {
+  int d;
+  LOG(h.verbose, "releasing oneapi library\n");
+  for (d = 0; d < h.num_drivers; d++) {
+    if (h.devices != NULL && h.devices[d] != NULL) {
+      free(h.devices[d]);
+    }
+  }
+  if (h.devices != NULL) {
+    free(h.devices);
+    h.devices = NULL;
+  }
+  if (h.num_devices != NULL) {
+    free(h.num_devices);
+    h.num_devices = NULL;
+  }
+  if (h.drivers != NULL) {
+    free(h.drivers);
+    h.drivers = NULL;
+  }
+  h.num_drivers = 0;
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
+int oneapi_get_device_count(oneapi_handle_t h, int driver) {
+  if (h.handle == NULL || h.num_devices == NULL) {
+    return 0;
+  }
+  if (driver > h.num_drivers) {
+    return 0;
+  }
+  return (int)h.num_devices[driver];
 }
 
 #endif  // __APPLE__
@@ -9,8 +9,7 @@
 #define ZE_BIT(_i) (1 << _i)
 
 // Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t
-{
+typedef enum ze_result_t {
   ZE_RESULT_SUCCESS = 0,
   // Other values omitted for now...
 } ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;
 
-typedef enum _ze_structure_type_t
-{
+typedef enum _ze_structure_type_t {
   ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;
 
-typedef enum _zes_structure_type_t
-{
+typedef enum _zes_structure_type_t {
   ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
   ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
   ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
   ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;
 
-typedef enum _zes_mem_type_t
-{
+typedef enum _zes_mem_type_t {
   ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;
 
-typedef enum _zes_mem_loc_t
-{
+typedef enum _zes_mem_loc_t {
   ZES_MEM_LOC_SYSTEM = 0,
   ZES_MEM_LOC_DEVICE = 1,
   ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;
 
-typedef enum _zes_mem_health_t
-{
+typedef enum _zes_mem_health_t {
   ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;
 
-typedef struct _ze_device_uuid_t
-{
+typedef struct _ze_device_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;
 
-typedef struct _zes_uuid_t
-{
+typedef struct _zes_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;
 
-typedef enum _ze_device_type_t
-{
+typedef enum _ze_device_type_t {
   ZE_DEVICE_TYPE_GPU = 1,
   ZE_DEVICE_TYPE_CPU = 2,
   ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
   ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;
 
-typedef enum _zes_device_type_t
-{
+typedef enum _zes_device_type_t {
   ZES_DEVICE_TYPE_GPU = 1,
   ZES_DEVICE_TYPE_CPU = 2,
   ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
 } zes_device_type_t;
 
 typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t
-{
+typedef enum _ze_device_property_flag_t {
   ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
 } ze_device_property_flag_t;
 
 typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t
-{
+typedef enum _zes_device_property_flag_t {
   ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
   ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;
 
-typedef struct _ze_device_properties_t
-{
+typedef struct _ze_device_properties_t {
   ze_structure_type_t stype;
   void *pNext;
   ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
   char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;
 
-typedef struct _zes_device_properties_t
-{
+typedef struct _zes_device_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
   char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;
 
-typedef struct _zes_device_ext_properties_t
-{
+typedef struct _zes_device_ext_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
   zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;
 
-typedef struct _zes_mem_properties_t
-{
+typedef struct _zes_mem_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
   int32_t numChannels;
 } zes_mem_properties_t;
 
-typedef struct _zes_mem_state_t
-{
+typedef struct _zes_mem_state_t {
   zes_structure_type_t stype;
   const void *pNext;
   zes_mem_health_t health;
@@ -171,10 +154,19 @@ typedef struct _zes_mem_state_t
   uint64_t size;
 } zes_mem_state_t;
 
-typedef struct oneapi_handle
-{
+typedef struct oneapi_handle {
   void *handle;
   uint16_t verbose;
+
+  uint32_t num_drivers;
+  zes_driver_handle_t *drivers;
+  uint32_t *num_devices;
+  zes_device_handle_t **devices;
+
+  // TODO Driver major, minor information
+  // int driver_major;
+  // int driver_minor;
+
   ze_result_t (*zesInit)(int);
   ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
   ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
@@ -191,21 +183,21 @@ typedef struct oneapi_handle
 
 } oneapi_handle_t;
 
-typedef struct oneapi_init_resp
-{
+typedef struct oneapi_init_resp {
   char *err; // If err is non-null handle is invalid
-  int num_devices;
   oneapi_handle_t oh;
 } oneapi_init_resp_t;
 
-typedef struct oneapi_version_resp
-{
+typedef struct oneapi_version_resp {
   ze_result_t status;
   char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;
 
 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp);
+void oneapi_release(oneapi_handle_t h);
+int oneapi_get_device_count(oneapi_handle_t h, int driver);
 
 #endif  // __GPU_INFO_INTEL_H__
 #endif  // __APPLE__
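With per-driver device counts now cached on oneapi_handle_t, a caller can enumerate every (driver, device) pair with two nested loops over oneapi_get_device_count and query each one through the new four-argument oneapi_check_vram. An illustrative cgo sketch; the binding, include path, and function name here are assumptions, while the C signatures come from the headers above.

package gpu

/*
#include "gpu_info_oneapi.h"  // assumption: header on the include path
*/
import "C"

// enumerateOneapi is a hypothetical walk over every Level-Zero driver and
// every device under it, collecting per-device memory state.
func enumerateOneapi(h C.oneapi_handle_t) {
	for d := C.int(0); d < C.int(h.num_drivers); d++ {
		count := C.oneapi_get_device_count(h, d)
		for dev := C.int(0); dev < count; dev++ {
			var mem C.mem_info_t
			C.oneapi_check_vram(h, d, dev, &mem)
			// mem.free / mem.total now hold this device's memory state
		}
	}
}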
89  gpu/gpu_linux.go  Normal file
@@ -0,0 +1,89 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/ollama/ollama/format"
+)
+
+var CudartGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var NvmlGlobs = []string{}
+
+var NvcudaGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+var OneapiGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
+var CudartMgmtName = "libcudart.so*"
+var NvcudaMgmtName = "libcuda.so*"
+var NvmlMgmtName = "" // not currently wired on linux
+var OneapiMgmtName = "libze_intel_gpu.so"
+
+func GetCPUMem() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		line := s.Text()
+		switch {
+		case strings.HasPrefix(line, "MemTotal:"):
+			_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
+		case strings.HasPrefix(line, "MemAvailable:"):
+			_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
+		case strings.HasPrefix(line, "MemFree:"):
+			_, err = fmt.Sscanf(line, "MemFree:%d", &free)
+		case strings.HasPrefix(line, "Buffers:"):
+			_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
+		case strings.HasPrefix(line, "Cached:"):
+			_, err = fmt.Sscanf(line, "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * format.KibiByte
+			mem.FreeMemory = available * format.KibiByte
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * format.KibiByte
+	mem.FreeMemory = (free + buffers + cached) * format.KibiByte
+	return mem, nil
+}
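One subtlety worth noting in GetCPUMem above: fmt.Sscanf verbs like %d skip the run of blanks after the field name, and any input left over after the format string (the trailing "kB") is simply never consumed, which is why the terse "MemTotal:%d" format works. A standalone demonstration:

package main

import "fmt"

func main() {
	// %d skips the leading blanks after "MemTotal:", and Sscanf does not
	// require the format to consume the whole input line, so the trailing
	// " kB" is ignored.
	var total uint64
	line := "MemTotal:       32795852 kB"
	if _, err := fmt.Sscanf(line, "MemTotal:%d", &total); err != nil {
		panic(err)
	}
	fmt.Println(total) // 32795852 (value is in KiB)
}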
55  gpu/gpu_windows.go  Normal file
@@ -0,0 +1,55 @@
+package gpu
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+type MEMORYSTATUSEX struct {
+	length               uint32
+	MemoryLoad           uint32
+	TotalPhys            uint64
+	AvailPhys            uint64
+	TotalPageFile        uint64
+	AvailPageFile        uint64
+	TotalVirtual         uint64
+	AvailVirtual         uint64
+	AvailExtendedVirtual uint64
+}
+
+var (
+	k32                      = syscall.NewLazyDLL("kernel32.dll")
+	globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
+	sizeofMemoryStatusEx     = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
+)
+
+var CudartGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+var NvmlGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
+var NvcudaGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
+var OneapiGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var CudartMgmtName = "cudart64_*.dll"
+var NvcudaMgmtName = "nvcuda.dll"
+var NvmlMgmtName = "nvml.dll"
+var OneapiMgmtName = "ze_intel_gpu64.dll"
+
+func GetCPUMem() (memInfo, error) {
+	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
+	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
+	if r1 == 0 {
+		return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
+	}
+	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
+}
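A hypothetical smoke test for this Windows path. The essential detail it exercises is that GetCPUMem pre-populates MEMORYSTATUSEX.length with the struct size before the GlobalMemoryStatusEx call; the Win32 API rejects a zero-length struct, which surfaces here as r1 == 0. The test name and thresholds are assumptions, not part of the change.

//go:build windows

package gpu

import "testing"

// TestGetCPUMem is a sketch: on any Windows host the call should succeed and
// report a plausible physical-memory pair.
func TestGetCPUMem(t *testing.T) {
	mem, err := GetCPUMem()
	if err != nil {
		t.Fatal(err)
	}
	if mem.TotalMemory == 0 || mem.FreeMemory > mem.TotalMemory {
		t.Fatalf("implausible memory stats: %+v", mem)
	}
}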
56  gpu/types.go
@@ -18,7 +18,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant,omitempty"`
+	Variant CPUCapability `json:"variant"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -26,6 +26,9 @@ type GpuInfo struct {
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath string `json:"lib_path,omitempty"`
 
+	// Extra environment variables specific to the GPU as list of [key,value]
+	EnvWorkarounds [][2]string `json:"envs,omitempty"`
+
 	// GPU information
 	ID   string `json:"gpu_id"`  // string to use for selection of this specific GPU
 	Name string `json:"name"`    // user friendly name if available
@@ -38,6 +41,30 @@ type GpuInfo struct {
 	// TODO other performance capability info to help in scheduling decisions
 }
 
+type CPUInfo struct {
+	GpuInfo
+}
+
+type CudaGPUInfo struct {
+	GpuInfo
+	index int //nolint:unused,nolintlint
+}
+type CudaGPUInfoList []CudaGPUInfo
+
+type RocmGPUInfo struct {
+	GpuInfo
+	usedFilepath string //nolint:unused,nolintlint
+	index        int    //nolint:unused,nolintlint
+}
+type RocmGPUInfoList []RocmGPUInfo
+
+type OneapiGPUInfo struct {
+	GpuInfo
+	driverIndex int //nolint:unused,nolintlint
+	gpuIndex    int //nolint:unused,nolintlint
+}
+type OneapiGPUInfoList []OneapiGPUInfo
+
 type GpuInfoList []GpuInfo
 
 // Split up the set of gpu info's by Library and variant
@@ -47,8 +74,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -86,3 +113,26 @@ type ByFreeMemory []GpuInfo
 func (a ByFreeMemory) Len() int      { return len(a) }
 func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
+
+type CPUCapability uint32
+
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+const (
+	CPUCapabilityNone CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
+
+func (c CPUCapability) String() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return "no vector extensions"
+	}
+}
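The typed CPUCapability replaces the old free-form variant string, but ByLibrary still builds the same string keys, so a CPU entry with AVX2 support groups under "cpu_avx2" exactly as before. A self-contained illustration of the key construction, mirroring the logic in ByLibrary (the types are copied from the diff above so the snippet runs standalone):

package main

import "fmt"

type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return "no vector extensions"
	}
}

func main() {
	// Mirrors the grouping key built in GpuInfoList.ByLibrary.
	library, variant := "cpu", CPUCapabilityAVX2
	requested := library
	if variant != CPUCapabilityNone {
		requested += "_" + variant.String()
	}
	fmt.Println(requested) // cpu_avx2
}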
@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
 				Model:  "orca-mini",
 				Prompt: "why is the ocean blue?",
 				Stream: &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
 				Model:  "tinydolphin",
 				Prompt: "what is the origin of the us thanksgiving holiday?",
 				Stream: &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
@@ -38,42 +40,64 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
 	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
 	defer cancel()
+
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	for i := 0; i < len(req); i++ {
+		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
+	}
+
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			GenerateTestHelper(ctx, t, req[i], resp[i])
+			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
 		}(i)
 	}
 	wg.Wait()
 }
 
 func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
+	req, resp := GenerateRequests()
+	reqLimit := len(req)
+	iterLimit := 5
+
+	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	if vram != "" {
+		max, err := strconv.ParseUint(vram, 10, 64)
+		require.NoError(t, err)
+		// Don't hammer on small VRAM cards...
+		if max < 4*1024*1024*1024 {
+			reqLimit = min(reqLimit, 2)
+			iterLimit = 2
+		}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
 
-	req, resp := GenerateRequests()
 	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
+	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
 
 	var wg sync.WaitGroup
-	wg.Add(len(req))
-	for i := 0; i < len(req); i++ {
+	wg.Add(reqLimit)
+	for i := 0; i < reqLimit; i++ {
 		go func(i int) {
 			defer wg.Done()
-			for j := 0; j < 5; j++ {
+			for j := 0; j < iterLimit; j++ {
 				slog.Info("Starting", "req", i, "iter", j)
-				// On slower GPUs it can take a while to process the 4 concurrent requests
+				// On slower GPUs it can take a while to process the concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
 			}
 		}(i)
 	}
@@ -221,5 +245,23 @@ func TestMultiModelStress(t *testing.T) {
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }
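The monitor goroutine added to TestMultiModelStress sleeps two seconds at the top of each loop, so cancellation is only noticed between sleeps. A sketch of the same loop restructured around time.Ticker, so ctx.Done() is observed while waiting rather than after it; this is an alternative shape under the same behavior, not what the change itself does (package name and imports assumed to match the integration test file):

package integration

import (
	"context"
	"log/slog"
	"time"

	"github.com/ollama/ollama/api"
)

// monitorRunning polls the server's running-model list every two seconds and
// returns as soon as the context is cancelled, even mid-wait.
func monitorRunning(ctx context.Context, client *api.Client) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			models, err := client.ListRunning(ctx)
			if err != nil {
				slog.Warn("failed to list running models", "error", err)
				continue
			}
			for _, m := range models.Models {
				slog.Info("loaded model snapshot", "model", m)
			}
		}
	}
}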
@@ -11,7 +11,8 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+	// Longer needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
 	resp := "the ollam"
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
-	GenerateTestHelper(ctx, t, req, []string{resp})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	// llava models on CPU can be quite slow to start,
+	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }
 
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
 
 	showCtx, cancel := context.WithDeadlineCause(
 		ctx,
-		time.Now().Add(5*time.Second),
+		time.Now().Add(10*time.Second),
 		fmt.Errorf("show for existing model %s took too long", modelName),
 	)
 	defer cancel()
@@ -287,41 +287,46 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 	return []api.GenerateRequest{
 		{
 			Model:  "orca-mini",
 			Prompt: "why is the ocean blue?",
 			Stream: &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
 			},
 		}, {
 			Model:  "orca-mini",
 			Prompt: "why is the color of dirt brown?",
 			Stream: &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
 			},
 		}, {
 			Model:  "orca-mini",
 			Prompt: "what is the origin of the us thanksgiving holiday?",
 			Stream: &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
 			},
 		}, {
 			Model:  "orca-mini",
 			Prompt: "what is the origin of independence day?",
 			Stream: &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
 			},
 		}, {
 			Model:  "orca-mini",
 			Prompt: "what is the composition of air?",
 			Stream: &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
@@ -331,7 +336,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}
31  llm/ext_server/server.cpp  vendored
@@ -56,7 +56,6 @@ struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
@@ -427,16 +426,6 @@ struct llama_server_context
         return true;
     }
 
-    void validate_model_chat_template(server_params & sparams) {
-        llama_chat_message chat[] = {{"user", "test"}};
-        std::vector<char> buf(1);
-        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
-        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            sparams.chat_template = "chatml";
-        }
-    }
-
     void initialize() {
         // create slots
         all_slots_are_idle = true;
@@ -2335,9 +2324,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
         }
         else if (arg == "--tensor-split" || arg == "-ts")
         {
@@ -2346,7 +2335,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -2367,8 +2356,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 }
             }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
         }
         else if (arg == "--main-gpu" || arg == "-mg")
         {
@@ -2377,7 +2366,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
             params.main_gpu = std::stoi(argv[i]);
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
@@ -2535,7 +2524,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            sparams.chat_template = argv[i];
         }
         else if (arg == "--override-kv")
         {
@@ -3008,11 +2996,6 @@ int main(int argc, char **argv) {
     }
     const auto model_meta = llama.model_meta();
 
-    if (sparams.chat_template.empty()) { // custom chat template is not supplied
-        // check if the template comes with the model is supported by us
-        llama.validate_model_chat_template(sparams);
-    }
-
     // Middleware for API key validation
     auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
         // If API key is not set, skip validation
@@ -18,7 +18,7 @@ sign() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
|
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
|
||||||
|
|
||||||
case "${GOARCH}" in
|
case "${GOARCH}" in
|
||||||
"amd64")
|
"amd64")
|
||||||
@@ -27,7 +27,7 @@ case "${GOARCH}" in
|
|||||||
# Static build for linking into the Go binary
|
# Static build for linking into the Go binary
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_TARGETS="--target llama --target ggml"
|
CMAKE_TARGETS="--target llama --target ggml"
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="../build/darwin/${ARCH}_static"
|
BUILD_DIR="../build/darwin/${ARCH}_static"
|
||||||
echo "Building static library"
|
echo "Building static library"
|
||||||
build
|
build
|
||||||
@@ -37,7 +37,7 @@ case "${GOARCH}" in
|
|||||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||||
#
|
#
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="../build/darwin/${ARCH}/cpu"
|
BUILD_DIR="../build/darwin/${ARCH}/cpu"
|
||||||
echo "Building LCD CPU"
|
echo "Building LCD CPU"
|
||||||
build
|
build
|
||||||
@@ -49,7 +49,7 @@ case "${GOARCH}" in
|
|||||||
# Approximately 400% faster than LCD on same CPU
|
# Approximately 400% faster than LCD on same CPU
|
||||||
#
|
#
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
|
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
|
||||||
echo "Building AVX CPU"
|
echo "Building AVX CPU"
|
||||||
build
|
build
|
||||||
@@ -61,7 +61,7 @@ case "${GOARCH}" in
|
|||||||
# Approximately 10% faster than AVX on same CPU
|
# Approximately 10% faster than AVX on same CPU
|
||||||
#
|
#
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
|
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
|
||||||
echo "Building AVX2 CPU"
|
echo "Building AVX2 CPU"
|
||||||
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
|
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
|
||||||
@@ -75,7 +75,7 @@ case "${GOARCH}" in
|
|||||||
# Static build for linking into the Go binary
|
# Static build for linking into the Go binary
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_TARGETS="--target llama --target ggml"
|
CMAKE_TARGETS="--target llama --target ggml"
|
||||||
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="../build/darwin/${ARCH}_static"
|
BUILD_DIR="../build/darwin/${ARCH}_static"
|
||||||
echo "Building static library"
|
echo "Building static library"
|
||||||
build
|
build
|
||||||
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
    # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
 
-    COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+    COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
        #
        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-       CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+       CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
@@ -39,7 +39,8 @@ function init_vars {
 }
 $script:cmakeDefs = @(
     "-DBUILD_SHARED_LIBS=on",
-    "-DLLAMA_NATIVE=off"
+    "-DLLAMA_NATIVE=off",
+    "-DLLAMA_OPENMP=off"
 )
 $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
 $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -122,8 +123,13 @@ function build {
     & cmake --version
     & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
-    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
+    if ($cmakeDefs -contains "-G") {
+        $extra=@("-j8")
+    } else {
+        $extra= @("--", "/p:CL_MPcount=8")
+    }
+    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
+    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     # Rearrange output to be consistent between different generators
     if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -203,7 +209,8 @@ function build_static() {
         "-DLLAMA_AVX2=off",
         "-DLLAMA_AVX512=off",
         "-DLLAMA_F16C=off",
-        "-DLLAMA_FMA=off")
+        "-DLLAMA_FMA=off",
+        "-DLLAMA_OPENMP=off")
     $script:buildDir="../build/windows/${script:ARCH}_static"
     write-host "Building static library"
     build
@@ -270,7 +277,15 @@ function build_cuda() {
     init_vars
     $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
     $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+    $script:cmakeDefs += @(
+        "-A", "x64",
+        "-DLLAMA_CUDA=ON",
+        "-DLLAMA_AVX=on",
+        "-DLLAMA_AVX2=off",
+        "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
+        "-DCMAKE_CUDA_FLAGS=-t8",
+        "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+    )
     if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
         write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
         $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
@@ -280,10 +295,12 @@ function build_cuda() {
         sign
         install
 
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
     } else {
         write-host "Skipping CUDA generation step"
     }
@@ -317,16 +334,18 @@ function build_oneapi() {
         sign
         install
 
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
     } else {
         Write-Host "Skipping oneAPI generation step"
    }
13 llm/ggla.go
@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 
-func (llm *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 	for {
 		var dims uint32
 		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
 			return err
 		}
 
+		defer func() {
+			if errors.Is(retErr, io.EOF) {
+				retErr = io.ErrUnexpectedEOF
+			}
+		}()
+
 		var namesize uint32
 		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
 			return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}
 
-		if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
+		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
 			return err
 		}
 
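A note on the alignment arithmetic in the seek fix above: (offset+31)&-32 rounds an offset up to the next 32-byte boundary, and subtracting offset turns that absolute target into a relative skip, so the seek can use io.SeekCurrent and keeps working when the reader is wrapped by the buffered seeker introduced in llm/ggml.go below. A standalone sketch of the arithmetic (illustrative only, not part of the diff):

package main

import "fmt"

func main() {
	offset := int64(100)
	aligned := (offset + 31) & -32 // round up to the next multiple of 32: 128
	fmt.Println(aligned - offset)  // relative padding to skip: 28
}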
71 llm/ggml.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"io"
 	"strings"
+
+	"github.com/ollama/ollama/util/bufioutil"
 )
 
 type GGML struct {
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
 	return 1
 }
 
+func (kv KV) EmbeddingHeadCount() uint64 {
+	if heads := kv.HeadCount(); heads > 0 {
+		return kv.EmbeddingLength() / kv.HeadCount()
+	}
+
+	return 0
+}
+
+func (kv KV) EmbeddingHeadCountK() uint64 {
+	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
+		return k
+	}
+
+	return kv.EmbeddingHeadCount()
+}
+
+func (kv KV) EmbeddingHeadCountV() uint64 {
+	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
+		return v
+	}
+
+	return kv.EmbeddingHeadCount()
+}
+
 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
 	}
 }
 
-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+// DecodeGGML decodes a GGML model from the given reader.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+	if maxArraySize == 0 {
+		maxArraySize = 1024
+	}
+
+	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
+
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
 		return nil, 0, err
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian}
+		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
 		return nil, 0, errors.New("invalid file magic")
 	}
 
 	model, err := c.Decode(rs)
-	if errors.Is(err, io.EOF) {
-		// noop
-	} else if err != nil {
+	if err != nil {
 		return nil, 0, err
 	}
 
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+
+	embeddingHeads := llm.KV().EmbeddingHeadCount()
+	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
 
 	layers := llm.Tensors().Layers()
 
@@ -307,7 +345,8 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
-			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
 
@@ -315,15 +354,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			// mixtral 8x22b
 			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
-				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
-				4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
+				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
 			)
 		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
 			// mixtral 8x7b
 			ffnGateWeight1 := ffnGateWeight.Shape[1]
 			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
 			partialOffload = max(
-				4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
@@ -366,6 +405,16 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(vocab+2*embedding),
 			fullOffload,
 		)
+	case "deepseek2":
+		fullOffload = max(
+			4*batch*(3*embedding+vocab),
+			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
+		)
+
+		partialOffload = max(
+			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
+		)
 	}
 
 	return
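A minimal usage sketch for the new DecodeGGML signature (assumed caller code, written as if inside the llm package; "model.gguf" and the surrounding imports of fmt, log, and os are illustrative, not part of the diff). Passing 0 keeps the default cap of 1024 entries per metadata array, while a negative maxArraySize collects every array:

// sketch only: DecodeGGML lives in ollama's internal llm package
f, err := os.Open("model.gguf") // hypothetical path
if err != nil {
	log.Fatal(err)
}
defer f.Close()

// Arrays larger than the cap keep their size (via array.size) but drop
// their values, so huge token lists no longer dominate decode memory.
ggml, _, err := DecodeGGML(f, 0)
if err != nil {
	log.Fatal(err)
}
fmt.Println(ggml.KV().Architecture())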
1 llm/ggml_test.go Normal file
@@ -0,0 +1 @@
+package llm
130 llm/gguf.go
@@ -3,11 +3,10 @@ package llm
 import (
 	"bytes"
 	"encoding/binary"
+	"encoding/json"
 	"fmt"
 	"io"
 	"strings"
-
-	"log/slog"
 )
 
 type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
 		NumTensor uint64
 		NumKV     uint64
 	}
+
+	maxArraySize int
+}
+
+func (c *containerGGUF) canCollectArray(size int) bool {
+	return c.maxArraySize < 0 || size <= c.maxArraySize
 }
 
 func (c *containerGGUF) Name() string {
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
 	}
 
 	model := newGGUF(c)
-	slog.Debug(fmt.Sprintf("model = %#v", model))
 	if err := model.Decode(rs); err != nil {
 		return nil, err
 	}
@@ -85,6 +89,8 @@ type gguf struct {
 	tensors []*Tensor
 
 	parameters uint64
+
+	scratch [16 << 10]byte
 }
 
 func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	}
 
 	// decode tensors
-	for i := 0; uint64(i) < llm.numTensor(); i++ {
+	for range llm.numTensor() {
 		name, err := readGGUFString(llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor name: %w", err)
 		}
 
 		// dims is the number of dimensions in the tensor
 		dims, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor dimensions: %w", err)
 		}
 
 		shape := [4]uint64{1, 1, 1, 1}
 		for i := 0; uint32(i) < dims; i++ {
 			shape[i], err = readGGUF[uint64](llm, rs)
 			if err != nil {
-				return err
+				return fmt.Errorf("failed to read tensor shape: %w", err)
 			}
 		}
 
 		kind, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor kind: %w", err)
 		}
 
 		offset, err := readGGUF[uint64](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor offset: %w", err)
 		}
 
 		tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		alignment = 32
 	}
 
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	padding := llm.padding(offset, int64(alignment))
-	if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-		return err
-	}
-
 	for _, tensor := range llm.tensors {
-		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
-			return err
+		offset, err := rs.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return fmt.Errorf("failed to get current offset: %w", err)
 		}
 
-		padding := llm.padding(int64(tensor.Size()), int64(alignment))
+		padding := llm.padding(offset, int64(alignment))
 		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return err
+			return fmt.Errorf("failed to seek to init padding: %w", err)
+		}
+
+		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
+			return fmt.Errorf("failed to seek to tensor: %w", err)
 		}
 	}
 
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
 	return b.String(), nil
 }
 
+func discardGGUFString(llm *gguf, r io.Reader) error {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
+		return err
+	}
+
+	size := int(llm.ByteOrder.Uint64(buf))
+	for size > 0 {
+		n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
+		if err != nil {
+			return err
+		}
+		size -= n
+	}
+	return nil
+}
+
 func readGGUFString(llm *gguf, r io.Reader) (string, error) {
 	if llm.Version == 1 {
 		return readGGUFV1String(llm, r)
 	}
 
-	var length uint64
-	if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
 		return "", err
 	}
 
-	var b bytes.Buffer
-	if _, err := io.CopyN(&b, r, int64(length)); err != nil {
+	length := int(llm.ByteOrder.Uint64(buf))
+	if length > len(llm.scratch) {
+		buf = make([]byte, length)
+	} else {
+		buf = llm.scratch[:length]
+	}
+	clear(buf)
+
+	_, err = io.ReadFull(r, buf)
+	if err != nil {
 		return "", err
 	}
-
-	return b.String(), nil
+	return string(buf), nil
 }
 
 func writeGGUFString(llm *gguf, w io.Writer, s string) error {
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
 	return err
 }
 
-func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
+type array struct {
+	size   int
+	values []any
+}
+
+func (a *array) MarshalJSON() ([]byte, error) {
+	return json.Marshal(a.values)
+}
+
+func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
 	t, err := readGGUF[uint32](llm, r)
 	if err != nil {
 		return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint32(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, 0, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
-func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
 	if llm.Version == 1 {
 		return readGGUFV1Array(llm, r)
 	}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint64(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		case ggufTypeBool:
 			e, err = readGGUF[bool](llm, r)
 		case ggufTypeString:
-			e, err = readGGUFString(llm, r)
+			if a.values != nil {
+				e, err = readGGUFString(llm, r)
+			} else {
+				err = discardGGUFString(llm, r)
+			}
 		default:
 			return nil, fmt.Errorf("invalid array type: %d", t)
 		}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
 func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
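The new discardGGUFString pairs with the capped arrays above: when an array's values are not being collected, each string element is drained through the decoder's fixed scratch buffer instead of being allocated. A self-contained sketch of that chunked-skip pattern (illustrative names; the real code reuses gguf.scratch):

package main

import (
	"bytes"
	"fmt"
	"io"
)

// skipN drains size bytes from r through a fixed scratch buffer, so skipping
// a large payload never allocates more than len(scratch) bytes.
func skipN(r io.Reader, size int, scratch []byte) error {
	for size > 0 {
		n, err := r.Read(scratch[:min(size, len(scratch))])
		if err != nil {
			return err
		}
		size -= n
	}
	return nil
}

func main() {
	r := bytes.NewReader(make([]byte, 1<<20))
	var scratch [16 << 10]byte
	fmt.Println(skipN(r, 1<<20, scratch[:])) // <nil>
}

io.CopyN(io.Discard, r, int64(size)) would achieve the same for a plain reader; the scratch-buffer form simply reuses the 16 KiB array the decoder already carries.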
Submodule llm/llama.cpp updated: 5921b8f089...7c26775adb
315 llm/memory.go
@@ -3,9 +3,10 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -30,24 +32,76 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
+type MemoryEstimate struct {
+	// How many layers we predict we can load
+	Layers int
+
+	// The size of the graph which occupies the main GPU
+	Graph uint64
+
+	// How much VRAM will be allocated given the number of layers we predict
+	VRAMSize uint64
+
+	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
+	TotalSize uint64
+
+	// For multi-GPU scenarios, this provides the tensor split parameter
+	TensorSplit string
+
+	// For multi-GPU scenarios, this is the size in bytes per GPU
+	GPUSizes []uint64
+
+	// internal fields for logging purposes
+	inferenceLibrary    string
+	layersRequested     int
+	layersModel         int
+	availableList       []string
+	kv                  uint64
+	allocationsList     []string
+	memoryWeights       uint64
+	memoryLayerOutput   uint64
+	graphFullOffload    uint64
+	graphPartialOffload uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	if envconfig.MaxVRAM > 0 {
-		memoryAvailable = envconfig.MaxVRAM
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+	// Graph size for a partial offload, applies to all GPUs
+	var graphPartialOffload uint64
 
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+	// Graph size when all layers are offloaded, applies to all GPUs
+	var graphFullOffload uint64
 
-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
+	// Final graph offload once we know full or partial
+	var graphOffload uint64
+
+	// Projectors loaded into GPU0 only
+	var projectorSize uint64
+
+	// Conditional output size on GPU 0
+	var memoryLayerOutput uint64
+
+	// The sizes of a layer
+	var layerSize uint64
+
+	// The sum of all the layer sizes (just for logging)
+	var memoryWeights uint64
+
+	// True if all the layers are loaded
+	var fullyLoaded bool
+
+	// Overflow that didn't fit into the GPU
+	var overflow uint64
+
+	availableList := make([]string, len(gpus))
+	for i, gpu := range gpus {
+		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+	}
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
+		projectorSize += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,127 +110,246 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		memoryMinimum += blk0.size()
+		layerSize = blk0.size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
 	}
 
-	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
+	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
+	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
 
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}
 
-	graphFullOffload *= uint64(len(gpus))
-	graphPartialOffload *= uint64(len(gpus))
-
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
+	} else if len(gpus) > 1 {
+		// multigpu should always use the partial graph size
+		graphFullOffload = graphPartialOffload
 	}
 
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
 
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
 		memoryLayerOutput += layer.size()
 	}
 
-	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
-	}
+	// Output layer handled at the end if we have space
+	gpuZeroOverhead := projectorSize
+
+	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
+	var layerCount int
+	layerCounts := make([]int, len(gpus))
+	gpuAllocations := make([]uint64, len(gpus))
+	type gs struct {
+		i int
+		g *gpu.GpuInfo
+	}
+	gpusWithSpace := []gs{}
+	for i := range gpus {
+		var gzo uint64
+		if len(gpusWithSpace) == 0 {
+			gzo = gpuZeroOverhead
+		}
+		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+			continue
+		}
+		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
+	}
 
-	var layerCount int
+	var gpuZeroID int
+	if len(gpusWithSpace) > 0 {
+		gpuZeroID = gpusWithSpace[0].i
+		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	}
+
+	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
+		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			memoryLayer := blk.size()
+			layerSize = blk.size()
+			layerSize += kv / ggml.KV().BlockCount()
+		}
+		memoryWeights += layerSize
 
-			// KV is proportional to the number of layers
-			memoryLayer += kv / ggml.KV().BlockCount()
+		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+			// Stop allocating on GPU(s) once we hit the users target NumGPU
+			continue
+		}
 
-			memoryRequiredTotal += memoryLayer
-			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-				memoryRequiredPartial += memoryLayer
+		// distribute the layers across the GPU(s) that have space
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[i%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+layerSize {
+				gpuAllocations[g.i] += layerSize
+				layerCounts[g.i]++
 				layerCount++
+				break
+			} else {
+				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
 	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSize
+		}
+	}
 
-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-	}
+	// Determine if we need to consider output then find where it fits
+	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
-	}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
+	}
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
+	}
+
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
+
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
+
+	estimate := MemoryEstimate{
+		TotalSize: memoryRequiredTotal,
+		Layers:    0,
+		Graph:     0,
+		VRAMSize:  0,
+		GPUSizes:  []uint64{},
+
+		inferenceLibrary:    gpus[0].Library,
+		layersRequested:     opts.NumGPU,
+		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		availableList:       availableList,
+		kv:                  kv,
+		allocationsList:     allocationsList,
+		memoryWeights:       memoryWeights,
+		memoryLayerOutput:   memoryLayerOutput,
+		graphFullOffload:    graphFullOffload,
+		graphPartialOffload: graphPartialOffload,
+	}
+
+	if gpus[0].Library == "cpu" {
+		return estimate
+	}
+	if layerCount == 0 {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return estimate
+	}
+	estimate.Layers = layerCount
+	estimate.Graph = graphOffload
+	estimate.VRAMSize = memoryRequiredPartial
+	estimate.TotalSize = memoryRequiredTotal
+	estimate.TensorSplit = tensorSplit
+	estimate.GPUSizes = gpuAllocations
+	return estimate
+}
+
+func (m MemoryEstimate) log() {
 	slog.Info(
-		"offload to gpu",
+		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
-			"requested", opts.NumGPU,
+			"requested", m.layersRequested,
+			// The number of layers the model has (including output)
+			"model", m.layersModel,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", m.Layers,
+			// multi-gpu split for tensors
+			"split", m.TensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", m.availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
-				"full", format.HumanBytes2(memoryRequiredTotal),
+				"full", format.HumanBytes2(m.TotalSize),
 				// memory required to offload layers.estimate layers
-				"partial", format.HumanBytes2(memoryRequiredPartial),
+				"partial", format.HumanBytes2(m.VRAMSize),
 				// memory of KV cache
-				"kv", format.HumanBytes2(kv),
+				"kv", format.HumanBytes2(m.kv),
+				// Allocations across the GPUs
+				"allocations", m.allocationsList,
 			),
 			slog.Group(
 				"weights",
 				// memory of the weights
-				"total", format.HumanBytes2(memoryWeights),
+				"total", format.HumanBytes2(m.memoryWeights),
 				// memory of repeating layers
-				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
 				// memory of non-repeating layers
-				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 			),
 			slog.Group(
 				"graph",
 				// memory of graph when fully offloaded
-				"full", format.HumanBytes2(graphFullOffload),
+				"full", format.HumanBytes2(m.graphFullOffload),
 				// memory of graph when not fully offloaded
-				"partial", format.HumanBytes2(graphPartialOffload),
+				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
 	)
-	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
-	}
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
-	}
-
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
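A worked example of the revised fp16 KV-cache formula above, under assumed llama-7B-like dimensions (32 layers, 32 KV heads, 128-dim K and V heads, 2048-token context; the shape is an assumption for illustration):

package main

import "fmt"

func main() {
	const (
		nCtx      = 2048 // context length
		nLayer    = 32   // block count
		headsKV   = 32   // n_head_kv
		embdHeadK = 128  // n_embd_head_k
		embdHeadV = 128  // n_embd_head_v
	)
	// sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
	kv := uint64(2) * nCtx * nLayer * (embdHeadK + embdHeadV) * headsKV
	fmt.Println(kv) // 1073741824 bytes: exactly 1 GiB for this shape
}

The old and new expressions agree whenever n_embd_head_k = n_embd_head_v = n_embd/n_head, as here; the new form additionally covers models that declare attention.key_length and attention.value_length explicitly, such as the deepseek2 case added in llm/ggml.go.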
130
llm/memory_test.go
Normal file
130
llm/memory_test.go
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
package llm
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/api"
|
||||||
|
"github.com/ollama/ollama/envconfig"
|
||||||
|
"github.com/ollama/ollama/gpu"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEstimateGPULayers(t *testing.T) {
|
||||||
|
envconfig.Debug = true
|
||||||
|
modelName := "dummy"
|
||||||
|
f, err := os.CreateTemp(t.TempDir(), modelName)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer f.Close()
|
||||||
|
gguf := NewGGUFV3(binary.LittleEndian)
|
||||||
|
inputLayerCount := 5
|
||||||
|
|
||||||
|
tensors := []Tensor{
|
||||||
|
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
|
}
|
||||||
|
assert.Len(t, tensors, inputLayerCount+1)
|
||||||
|
err = gguf.Encode(f, KV{
|
||||||
|
"general.architecture": "llama",
|
||||||
|
"general.name": "name",
|
||||||
|
"llama.context_length": uint32(32),
|
||||||
|
"llama.embedding_length": uint32(4096),
|
||||||
|
"llama.block_count": uint32(inputLayerCount),
|
||||||
|
"llama.attention.head_count": uint32(32),
|
||||||
|
"llama.attention.head_count_kv": uint32(32),
|
||||||
|
"tokenizer.ggml.tokens": []string{" "},
|
||||||
|
"tokenizer.ggml.scores": []float32{0},
|
||||||
|
"tokenizer.ggml.token_type": []int32{0},
|
||||||
|
}, tensors)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
ggml, err := LoadModel(f.Name(), 0)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simple CPU scenario
|
||||||
|
gpus := []gpu.GpuInfo{
|
||||||
|
{
|
||||||
|
Library: "cpu",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
projectors := []string{}
|
||||||
|
opts := api.DefaultOptions()
|
||||||
|
t.Run("cpu", func(t *testing.T) {
|
||||||
|
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
|
assert.Equal(t, 0, estimate.Layers)
|
||||||
|
assert.Equal(t, uint64(0), estimate.Graph)
|
||||||
|
})
|
||||||
|
|
||||||
|
// derived from the dummy ggml file above
|
||||||
|
graphPartialOffload := uint64(202377216)
|
||||||
|
graphFullOffload := uint64(171968512)
|
||||||
|
layerSize := uint64(33554436)
|
||||||
|
projectorSize := uint64(0)
|
||||||
|
memoryLayerOutput := uint64(4)
|
||||||
|
|
||||||
|
// Dual CUDA scenario with assymetry
|
||||||
|
gpuMinimumMemory := uint64(2048)
|
||||||
|
gpus = []gpu.GpuInfo{
|
||||||
|
{
|
||||||
|
Library: "cuda",
|
||||||
|
MinimumMemory: gpuMinimumMemory,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Library: "cuda",
|
||||||
|
MinimumMemory: gpuMinimumMemory,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
|
||||||
|
for i, s := range []struct {
|
||||||
|
layer0, layer1 uint64
|
||||||
|
expect0, expect1 uint64
|
||||||
|
}{
|
||||||
|
{1, 1, 1, 1},
|
||||||
|
{2, 1, 2, 1},
|
||||||
|
{2, 2, 2, 2},
|
||||||
|
{1, 2, 1, 2},
|
||||||
|
{3, 3, 3, 3},
|
||||||
|
{4, 4, 3, 3},
|
||||||
|
{6, 6, 3, 3},
|
||||||
|
{0, 3, 0, 3},
|
||||||
|
} {
|
||||||
|
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
|
||||||
|
gpus[0].FreeMemory = 0
|
||||||
|
gpus[1].FreeMemory = 0
|
||||||
|
gpus[0].FreeMemory += projectorSize
|
||||||
|
if s.layer0 > 0 {
|
||||||
|
gpus[0].FreeMemory += memoryLayerOutput
|
||||||
|
} else {
|
||||||
|
gpus[1].FreeMemory += memoryLayerOutput
|
||||||
|
}
|
||||||
|
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
|
||||||
|
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
|
||||||
|
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||||
|
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||||
|
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
|
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
|
||||||
|
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
|
||||||
|
var layerSums uint64
|
||||||
|
for _, b := range estimate.GPUSizes {
|
||||||
|
layerSums += b
|
||||||
|
}
|
||||||
|
if estimate.Layers < inputLayerCount+1 {
|
||||||
|
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
} else {
|
||||||
|
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
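A note on the arithmetic in the dual-GPU scenarios above: each GPU's FreeMemory is built up so that, after reserving the GPU minimum, one layer of headroom, and the compute graph, exactly layer0/layer1 additional layers fit (the output layer and projector bytes are added separately). A minimal self-contained sketch of that budgeting, using the constants from the test; EstimateGPULayers itself is the function under test and is not reimplemented here:

    package main

    import "fmt"

    // budget mirrors how the test sizes FreeMemory for one GPU: reserve the
    // GPU minimum, one layer of slack, and the graph, then leave room for
    // exactly n more layers (the +1 avoids an exact-fit boundary).
    func budget(n, minimum, layerSize, graph uint64) uint64 {
    	return minimum + layerSize + n*layerSize + 1 + graph
    }

    func main() {
    	const (
    		gpuMinimumMemory    = uint64(2048)
    		layerSize           = uint64(33554436)
    		graphPartialOffload = uint64(202377216)
    	)
    	// GPU0 sized for 2 layers, GPU1 for 1 layer, as in scenario {2, 1, 2, 1}.
    	fmt.Println(budget(2, gpuMinimumMemory, layerSize, graphPartialOffload))
    	fmt.Println(budget(1, gpuMinimumMemory, layerSize, graphPartialOffload))
    }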
@@ -1,8 +1,8 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
      mparams.use_mmap        = params.use_mmap;
      mparams.use_mlock       = params.use_mlock;
      mparams.check_tensors   = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
      mparams.kv_overrides = NULL;
  } else {
 diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
-    // multimodal models (see examples/llava)
+@@ -180,6 +180,13 @@ struct gpt_params {
+
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
-+
 +    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 +    // If the provided progress_callback returns true, model loading continues.
 +    // If it returns false, model loading is immediately aborted.
 +    llama_progress_callback progress_callback = NULL;
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
-};
-
-void gpt_params_handle_model_default(gpt_params & params);
++
+    // server params
+    int32_t port = 8080;        // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
@@ -1,8 +1,8 @@
 diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
 
      // for now, only BPE models have pre-tokenizers
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
 -        LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
 -        LLAMA_LOG_WARN("%s: \n", __func__);
 -        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--    } else if (
-+    if (
-        tokenizer_pre == "default") {
+-    } else if (tokenizer_pre == "default") {
++    if (tokenizer_pre == "default") {
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
      } else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
-         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+         tokenizer_pre == "llama3" ||
+         tokenizer_pre == "smaug-bpe") {
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+         tokenizer_pre == "poro-chat") {
+         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
      } else {
 -        throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +        LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
305 llm/patches/07-gemma.diff Normal file
@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support

---
 llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 193 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
     { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_GEMMA2, "gemma2" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1941,6 +1963,8 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_9B,
+    MODEL_27B,
 };
 
 static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
     struct ggml_tensor * attn_out_norm_b;
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
+    struct ggml_tensor * attn_post_norm;
 
     // attention
     struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
         }
     } break;
     case LLM_ARCH_GEMMA:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+            switch (hparams.n_layer) {
+                case 18: model.type = e_model::MODEL_9B; break;
+                case 28: model.type = e_model::MODEL_27B; break;
+                default: model.type = e_model::MODEL_UNKNOWN;
+            }
+        } break;
+    case LLM_ARCH_GEMMA2:
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
             }
         } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                const int64_t n_ff = hparams.n_ff;
+                const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                    layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
         {
             result = llm.build_gemma();
         } break;
+    case LLM_ARCH_GEMMA2:
+        {
+            result = llm.build_gemma2();
+        } break;
     case LLM_ARCH_STARCODER2:
         {
             result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     case LLM_ARCH_PHI2:
     case LLM_ARCH_PHI3:
     case LLM_ARCH_GEMMA:
+    case LLM_ARCH_GEMMA2:
    case LLM_ARCH_STARCODER2:
     case LLM_ARCH_GPTNEOX:
         return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
--
2.45.2
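The build_gemma2 graph in the patch above differs from the original Gemma block in that every sub-block is sandwiched by two RMSNorms: a pre-norm before attention/FFN and a post-norm applied before the residual add (attn_post_norm, ffn_post_norm). A rough Go sketch of that dataflow on plain vectors; rmsnorm is a real RMSNorm, while attn and ffn are stand-ins for the real sub-layers, not the repo's API:

    package main

    import (
    	"fmt"
    	"math"
    )

    // rmsnorm normalizes x by its root-mean-square and applies a learned scale.
    func rmsnorm(x, weight []float32, eps float32) []float32 {
    	var ss float64
    	for _, v := range x {
    		ss += float64(v) * float64(v)
    	}
    	inv := float32(1.0 / math.Sqrt(ss/float64(len(x))+float64(eps)))
    	out := make([]float32, len(x))
    	for i := range x {
    		out[i] = x[i] * inv * weight[i]
    	}
    	return out
    }

    func add(a, b []float32) []float32 {
    	out := make([]float32, len(a))
    	for i := range a {
    		out[i] = a[i] + b[i]
    	}
    	return out
    }

    type blockWeights struct{ attnNorm, attnPost, ffnNorm, ffnPost []float32 }

    // gemma2Block mirrors the sandwich-norm residual pattern from the patch:
    // attn_norm -> attention -> attn_post_norm -> residual add (sa_out), then
    // ffn_norm -> FFN -> ffn_post_norm -> residual add (l_out).
    func gemma2Block(x []float32, w blockWeights, attn, ffn func([]float32) []float32) []float32 {
    	h := rmsnorm(attn(rmsnorm(x, w.attnNorm, 1e-6)), w.attnPost, 1e-6)
    	saOut := add(x, h) // sa_out in the patch
    	h = rmsnorm(ffn(rmsnorm(saOut, w.ffnNorm, 1e-6)), w.ffnPost, 1e-6)
    	return add(saOut, h) // l_out in the patch
    }

    func main() {
    	ones := []float32{1, 1, 1, 1}
    	identity := func(x []float32) []float32 { return x }
    	w := blockWeights{ones, ones, ones, ones}
    	fmt.Println(gemma2Block([]float32{1, 2, 3, 4}, w, identity, identity))
    }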
@@ -58,7 +58,7 @@ func availableServers() map[string]string {
 	}
 
 	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*")
+	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
 
 	files, err := filepath.Glob(pattern)
 	if err != nil {
@@ -69,7 +69,7 @@ func availableServers() map[string]string {
 	servers := make(map[string]string)
 	for _, file := range files {
 		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(file)] = file
+		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
 	}
 
 	return servers
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}
 
 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
 
 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
+		variant := gpu.GetCPUCapability()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != "" {
+		if variant != gpu.CPUCapabilityNone {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
+				if cmp == "cpu_"+variant.String() {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUVariant()
+	variant := gpu.GetCPUCapability()
 	availableServers := availableServers()
-	if variant != "" {
+	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
+			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
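The payload.go change above reflects a new on-disk layout: runner binaries now live in per-variant subdirectories under payloadsDir instead of as flat files, so the map key becomes the variant directory name. A minimal sketch of the lookup under that assumption (the concrete paths here are illustrative):

    package main

    import (
    	"fmt"
    	"path/filepath"
    )

    func main() {
    	// With the new glob pattern <payloadsDir>/*/ollama_*, a match looks like:
    	file := filepath.Join("/tmp/ollama/runners", "cpu_avx2", "ollama_llama_server")

    	// The server name is the variant directory, and its value is that
    	// directory, mirroring servers[filepath.Base(filepath.Dir(file))] =
    	// filepath.Dir(file) in availableServers.
    	name := filepath.Base(filepath.Dir(file))
    	dir := filepath.Dir(file)
    	fmt.Println(name, "=>", dir) // cpu_avx2 => /tmp/ollama/runners/cpu_avx2
    }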
146 llm/server.go
@@ -37,8 +37,9 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	EstimatedVRAM() uint64
+	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -49,18 +50,22 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options
 
-	// TODO - this should be broken down by GPU
-	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-	estimatedTotal uint64 // Total size of model
-	totalLayers    uint64
-	gpuCount       int
+	estimate    MemoryEstimate
+	totalLayers uint64
+	// gpuCount int
+	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
 	loadDuration time.Duration // Record how long it took the model to load
 	loadProgress float32
 
 	sem *semaphore.Weighted
 }
 
-func LoadModel(model string) (*GGML, error) {
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
 	}
 	defer f.Close()
 
-	ggml, _, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f, maxArraySize)
 	return ggml, err
 }
 
@@ -80,43 +85,45 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimatedVRAM uint64
-	var estimatedTotal uint64
-	var systemMemory uint64
-	gpuCount := len(gpus)
-	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
-
-		cpuRunner = serverForCpu()
-		gpuCount = 0
-		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+	var estimate MemoryEstimate
+	var systemTotalMemory uint64
+	var systemFreeMemory uint64
+
+	systemMemInfo, err := gpu.GetCPUMem()
+	if err != nil {
+		slog.Error("failed to lookup system memory", "error", err)
 	} else {
-		if gpus[0].Library == "metal" {
-			memInfo, err := gpu.GetCPUMem()
-			if err != nil {
-				slog.Error("failed to lookup system memory", "error", err)
-			} else {
-				systemMemory = memInfo.TotalMemory
-				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
-			}
-		}
-		var layers int
-		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		systemTotalMemory = systemMemInfo.TotalMemory
+		systemFreeMemory = systemMemInfo.FreeMemory
+		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
+	}
+
+	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
+	if opts.NumGPU == 0 {
+		gpus = gpu.GetCPUInfo()
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
+		cpuRunner = serverForCpu()
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+	} else {
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && layers == 0:
+		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
-			gpuCount = 0
-		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = layers
+			gpus = gpu.GetCPUInfo()
+		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = estimate.Layers
 		}
 	}
 
+	estimate.log()
+
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
@@ -159,6 +166,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	params = append(params, "--log-disable")
 
+	params = append(params, "--timeout", fmt.Sprintf("%d", 600))
+
 	if opts.NumGPU >= 0 {
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}
@@ -201,7 +210,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-			opts.UseMMap = false
+			opts.UseMMap = api.TriStateFalse
 		}
 	}
 
@@ -209,7 +218,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--flash-attn")
 	}
 
-	if !opts.UseMMap {
+	// Windows CUDA should not use mmap for best performance
+	// Linux with a model larger than free space, mmap leads to thrashing
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
+		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
 
@@ -232,6 +245,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -242,8 +263,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
-			gpuCount = 0
+			gpus = gpu.GetCPUInfo()
 		}
 
 		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -265,8 +285,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// prepend the server directory to LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{dir}
+		// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
+		libraryPaths := []string{dir, filepath.Dir(dir)}
 
 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 			// Append our runner directory to the path
@@ -299,22 +319,25 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
-			port:           port,
-			cmd:            exec.Command(server, finalParams...),
-			status:         NewStatusWriter(os.Stderr),
-			options:        opts,
-			estimatedVRAM:  estimatedVRAM,
-			estimatedTotal: estimatedTotal,
-			sem:            semaphore.NewWeighted(int64(numParallel)),
-			totalLayers:    ggml.KV().BlockCount() + 1,
-			gpuCount:       gpuCount,
-			done:           make(chan error, 1),
+			port:        port,
+			cmd:         exec.Command(server, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			estimate:    estimate,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: ggml.KV().BlockCount() + 1,
+			gpus:        gpus,
+			done:        make(chan error, 1),
 		}
 
 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status
 
+		envWorkarounds := [][2]string{}
+		for _, gpu := range gpus {
+			envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
+		}
 		visibleDevicesEnv, visibleDevicesEnvVal := gpus.GetVisibleDevicesEnv()
 		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
 
@@ -329,6 +352,12 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
 				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
 				devicesNeeded = false
+			} else if len(envWorkarounds) != 0 {
+				for _, kv := range envWorkarounds {
+					if strings.EqualFold(cmp[0], kv[0]) {
+						s.cmd.Env[i] = kv[0] + "=" + kv[1]
+					}
+				}
 			}
 		}
 		if pathNeeded {
@@ -390,7 +419,7 @@ func projectorMemoryRequirements(filename string) uint64 {
 	}
 	defer file.Close()
 
-	ggml, _, err := DecodeGGML(file)
+	ggml, _, err := DecodeGGML(file, 0)
 	if err != nil {
 		return 0
 	}
@@ -1004,11 +1033,20 @@ func (s *llmServer) Close() error {
 }
 
 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimatedVRAM
+	return s.estimate.VRAMSize
 }
 
 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimatedTotal
+	return s.estimate.TotalSize
+}
+
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
+	for i, gpu := range s.gpus {
+		if gpu.ID == gpuID {
+			return s.estimate.GPUSizes[i]
+		}
+	}
+	return 0
 }
 
 func parseDurationMs(ms float64) time.Duration {
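Per the new doc comment on LoadModel above, the second argument bounds which GGUF array values are collected: 0 means the default cap of 1024 elements, and a negative value collects everything. A hedged usage sketch, assuming the llm package API shown in this diff (the model path and the []any assertion are illustrative):

    package main

    import (
    	"fmt"
    	"log"

    	"github.com/ollama/ollama/llm"
    )

    func main() {
    	// The default cap (maxArraySize == 0, i.e. 1024) is enough for
    	// scalar metadata lookups.
    	ggml, err := llm.LoadModel("/path/to/model.gguf", 0)
    	if err != nil {
    		log.Fatal(err)
    	}
    	fmt.Println(ggml.KV()["general.architecture"])

    	// A negative cap collects all array values, e.g. the full tokenizer
    	// vocab; this is what the verbose path of getKVData relies on.
    	full, err := llm.LoadModel("/path/to/model.gguf", -1)
    	if err != nil {
    		log.Fatal(err)
    	}
    	if tokens, ok := full.KV()["tokenizer.ggml.tokens"].([]any); ok {
    		fmt.Println("vocab size:", len(tokens))
    	}
    }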
@@ -178,9 +178,6 @@ func fromRequest(r ChatCompletionRequest) api.ChatRequest {
 
 	if r.Seed != nil {
 		options["seed"] = *r.Seed
-
-		// temperature=0 is required for reproducible outputs
-		options["temperature"] = 0.0
 	}
 
 	if r.FrequencyPenalty != nil {
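The hunk above drops the forced temperature=0 that previously accompanied any seeded request, so seed and temperature are now independent options. A minimal sketch of what a reproducibility-minded client would now set explicitly (option keys as used by the handler above):

    package main

    import "fmt"

    func main() {
    	// Before this change, providing a seed silently pinned temperature
    	// to 0. Now a client wanting deterministic output sets both itself.
    	options := map[string]any{
    		"seed":        42,
    		"temperature": 0.0,
    	}
    	fmt.Println(options)
    }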
@@ -103,19 +103,19 @@ function buildApp() {
 function gatherDependencies() {
     write-host "Gathering runtime dependencies"
     cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}" -ea 0 > $null
+    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
 
     # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
     # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
 
 
     cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
     if ("${env:KEY_CONTAINER}") {
         write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}/cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
             write-host "signing $file"
             & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
@@ -159,8 +159,8 @@ check_gpu() {
         esac ;;
     lshw)
         case $2 in
-            nvidia) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
-            amdgpu) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[1002\]' || return 1 ;;
+            nvidia) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
+            amdgpu) available lshw && $SUDO lshw -c display -numeric -disable network | grep -q 'vendor: .* \[1002\]' || return 1 ;;
         esac ;;
     nvidia-smi) available nvidia-smi || return 1 ;;
     esac
@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
 case $OS_NAME in
     centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
     rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
-    fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
+    fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
     amzn) install_cuda_driver_yum 'fedora' '37' ;;
     debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
     ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				return err
 			}
 
-			layers, err := parseFromFile(ctx, temp, "", fn)
+			layer, err := NewLayer(temp, baseLayer.MediaType)
 			if err != nil {
 				return err
 			}
 
-			if len(layers) != 1 {
-				return errors.New("quantization failed")
+			if _, err := temp.Seek(0, io.SeekStart); err != nil {
+				return err
 			}
 
-			baseLayer.Layer = layers[0].Layer
-			baseLayer.GGML = layers[0].GGML
+			ggml, _, err := llm.DecodeGGML(temp, 0)
+			if err != nil {
+				return err
+			}
+
+			baseLayer.Layer = layer
+			baseLayer.GGML = ggml
 		}
 	}
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	}
 	defer blob.Close()
 
-	ggml, _, err := llm.DecodeGGML(blob)
+	ggml, _, err := llm.DecodeGGML(blob, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }
 
-func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
 	stat, err := file.Stat()
 	if err != nil {
-		return nil, err
+		return err
 	}
 
 	r, err := zip.NewReader(file, stat.Size())
 	if err != nil {
-		return nil, err
+		return err
 	}
 
-	tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
-	if err != nil {
-		return nil, err
-	}
-	defer os.RemoveAll(tempdir)
-
 	fn(api.ProgressResponse{Status: "unpacking model metadata"})
 	for _, f := range r.File {
+		n := filepath.Join(p, f.Name)
+		if !strings.HasPrefix(n, p) {
+			slog.Warn("skipped extracting file outside of context", "name", f.Name)
+			continue
+		}
+
+		if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
+			return err
+		}
+
 		// TODO(mxyng): this should not write out all files to disk
-		outfile, err := os.Create(filepath.Join(tempdir, f.Name))
+		outfile, err := os.Create(n)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		defer outfile.Close()
 
 		infile, err := f.Open()
 		if err != nil {
-			return nil, err
+			return err
 		}
 		defer infile.Close()
 
 		if _, err = io.Copy(outfile, infile); err != nil {
-			return nil, err
+			return err
 		}
 
 		if err := outfile.Close(); err != nil {
-			return nil, err
+			return err
 		}
 
 		if err := infile.Close(); err != nil {
-			return nil, err
+			return err
 		}
 	}
 
-	mf, err := convert.GetModelFormat(tempdir)
+	return nil
+}
+
+func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+	tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
+	if err != nil {
+		return nil, err
+	}
+	defer os.RemoveAll(tempDir)
+
+	if err := extractFromZipFile(tempDir, file, fn); err != nil {
+		return nil, err
+	}
+
+	mf, err := convert.GetModelFormat(tempDir)
 	if err != nil {
 		return nil, err
 	}
 
-	params, err := mf.GetParams(tempdir)
+	params, err := mf.GetParams(tempDir)
 	if err != nil {
 		return nil, err
 	}
 
-	mArch, err := mf.GetModelArch("", tempdir, params)
+	mArch, err := mf.GetModelArch("", tempDir, params)
 	if err != nil {
 		return nil, err
 	}
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
 
 	// TODO(mxyng): this should write directly into a layer
 	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
-	temp, err := os.CreateTemp(tempdir, "fp16")
+	temp, err := os.CreateTemp(tempDir, "fp16")
 	if err != nil {
 		return nil, err
 	}
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
 	}
 	defer bin.Close()
 
-	ggml, _, err := llm.DecodeGGML(bin)
+	ggml, _, err := llm.DecodeGGML(bin, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 
 	var offset int64
 	for offset < stat.Size() {
		ggml, n, err := llm.DecodeGGML(file, 0)
 		if errors.Is(err, io.EOF) {
 			break
 		} else if err != nil {
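The prefix check in extractFromZipFile above is a zip-slip guard: any archive entry whose joined path escapes the destination directory is skipped rather than written. A standalone sketch of the same check (paths are illustrative):

    package main

    import (
    	"fmt"
    	"path/filepath"
    	"strings"
    )

    // safeJoin mirrors the guard in extractFromZipFile: join the entry name
    // to the extraction root, then refuse anything that escaped the root.
    // filepath.Join cleans the path, so ".." components are resolved before
    // the prefix comparison.
    func safeJoin(root, name string) (string, bool) {
    	n := filepath.Join(root, name)
    	return n, strings.HasPrefix(n, root)
    }

    func main() {
    	root := "/tmp/extract"
    	for _, name := range []string{"good.bin", "../../etc/passwd"} {
    		if p, ok := safeJoin(root, name); ok {
    			fmt.Println("extract:", p)
    		} else {
    			fmt.Println("skip (outside root):", name)
    		}
    	}
    }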
92 server/model_test.go Normal file
@@ -0,0 +1,92 @@
package server

import (
	"archive/zip"
	"bytes"
	"io"
	"os"
	"path/filepath"
	"slices"
	"testing"

	"github.com/ollama/ollama/api"
)

func createZipFile(t *testing.T, name string) *os.File {
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), "")
	if err != nil {
		t.Fatal(err)
	}

	zf := zip.NewWriter(f)
	defer zf.Close()

	zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
	if err != nil {
		t.Fatal(err)
	}

	if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
		t.Fatal(err)
	}

	return f
}

func TestExtractFromZipFile(t *testing.T) {
	cases := []struct {
		name   string
		expect []string
	}{
		{
			name:   "good",
			expect: []string{"good"},
		},
		{
			name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			f := createZipFile(t, tt.name)
			defer f.Close()

			tempDir := t.TempDir()
			if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
				t.Fatal(err)
			}

			var matches []string
			if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
				if err != nil {
					return err
				}

				if !fi.IsDir() {
					matches = append(matches, p)
				}

				return nil
			}); err != nil {
				t.Fatal(err)
			}

			var actual []string
			for _, match := range matches {
				rel, err := filepath.Rel(tempDir, match)
				if err != nil {
					t.Error(err)
				}

				actual = append(actual, rel)
			}

			if !slices.Equal(actual, tt.expect) {
				t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
			}
		})
	}
}
108 server/routes.go
@@ -646,9 +646,12 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
 
 	resp, err := GetModelInfo(req)
 	if err != nil {
-		if os.IsNotExist(err) {
+		switch {
+		case os.IsNotExist(err):
 			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
-		} else {
+		case err.Error() == "invalid model name":
+			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		default:
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		}
 		return
@@ -658,44 +661,55 @@ func (s *Server) ShowModelHandler(c *gin.Context) {
 }
 
 func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
-	model, err := GetModel(req.Model)
+	m, err := GetModel(req.Model)
 	if err != nil {
 		return nil, err
 	}
 
 	modelDetails := api.ModelDetails{
-		ParentModel:       model.ParentModel,
-		Format:            model.Config.ModelFormat,
-		Family:            model.Config.ModelFamily,
-		Families:          model.Config.ModelFamilies,
-		ParameterSize:     model.Config.ModelType,
-		QuantizationLevel: model.Config.FileType,
+		ParentModel:       m.ParentModel,
+		Format:            m.Config.ModelFormat,
+		Family:            m.Config.ModelFamily,
+		Families:          m.Config.ModelFamilies,
+		ParameterSize:     m.Config.ModelType,
+		QuantizationLevel: m.Config.FileType,
 	}
 
 	if req.System != "" {
-		model.System = req.System
+		m.System = req.System
 	}
 
 	if req.Template != "" {
-		model.Template = req.Template
+		m.Template = req.Template
 	}
 
 	msgs := make([]api.Message, 0)
-	for _, msg := range model.Messages {
+	for _, msg := range m.Messages {
 		msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
 	}
 
+	n := model.ParseName(req.Model)
+	if !n.IsValid() {
+		return nil, fmt.Errorf("invalid model name")
+	}
+
+	manifest, err := ParseNamedManifest(n)
+	if err != nil {
+		return nil, err
+	}
+
 	resp := &api.ShowResponse{
-		License:  strings.Join(model.License, "\n"),
-		System:   model.System,
-		Template: model.Template,
-		Details:  modelDetails,
-		Messages: msgs,
+		License:    strings.Join(m.License, "\n"),
+		System:     m.System,
+		Template:   m.Template,
+		Details:    modelDetails,
+		Messages:   msgs,
+		ModifiedAt: manifest.fi.ModTime(),
 	}
 
 	var params []string
 	cs := 30
-	for k, v := range model.Options {
+	for k, v := range m.Options {
 		switch val := v.(type) {
 		case []interface{}:
 			for _, nv := range val {
@@ -709,20 +723,59 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 
 	for k, v := range req.Options {
 		if _, ok := req.Options[k]; ok {
-			model.Options[k] = v
+			m.Options[k] = v
 		}
 	}
 
 	var sb strings.Builder
 	fmt.Fprintln(&sb, "# Modelfile generated by \"ollama show\"")
 	fmt.Fprintln(&sb, "# To build a new Modelfile based on this, replace FROM with:")
-	fmt.Fprintf(&sb, "# FROM %s\n\n", model.ShortName)
-	fmt.Fprint(&sb, model.String())
+	fmt.Fprintf(&sb, "# FROM %s\n\n", m.ShortName)
+	fmt.Fprint(&sb, m.String())
 	resp.Modelfile = sb.String()
 
+	kvData, err := getKVData(m.ModelPath, req.Verbose)
+	if err != nil {
+		return nil, err
+	}
+	delete(kvData, "general.name")
+	delete(kvData, "tokenizer.chat_template")
+	resp.ModelInfo = kvData
+
+	if len(m.ProjectorPaths) > 0 {
+		projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
+		if err != nil {
+			return nil, err
+		}
+		resp.ProjectorInfo = projectorData
+	}
+
 	return resp, nil
 }
 
+func getKVData(digest string, verbose bool) (llm.KV, error) {
+	maxArraySize := 0
+	if verbose {
+		maxArraySize = -1
+	}
+	kvData, err := llm.LoadModel(digest, maxArraySize)
+	if err != nil {
+		return nil, err
+	}
+
+	kv := kvData.KV()
+
+	if !verbose {
+		for k := range kv {
+			if t, ok := kv[k].([]any); len(t) > 5 && ok {
+				kv[k] = []any{}
+			}
+		}
+	}
+
+	return kv, nil
+}
+
 func (s *Server) ListModelsHandler(c *gin.Context) {
 	ms, err := Manifests()
 	if err != nil {
@@ -1052,11 +1105,20 @@ func Serve(ln net.Listener) error {
 	schedCtx, schedDone := context.WithCancel(ctx)
 	sched := InitScheduler(schedCtx)
 	s := &Server{addr: ln.Addr(), sched: sched}
-	r := s.GenerateRoutes()
+
+	http.Handle("/", s.GenerateRoutes())
 
 	slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
 	srvr := &http.Server{
-		Handler: r,
+		// Use http.DefaultServeMux so we get net/http/pprof for
+		// free.
+		//
+		// TODO(bmizerany): Decide if we want to make this
+		// configurable so it is not exposed by default, or allow
+		// users to bind it to a different port. This was a quick
+		// and easy way to get pprof, but it may not be the best
+		// way.
+		Handler: nil,
 	}
 
 	// listen for a ctrl+c and stop any loaded llm
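Two behavioral notes on the routes.go changes above: show responses now carry the model's GGUF key/value metadata (with arrays longer than five elements elided unless Verbose is set), and the server now registers its routes on http.DefaultServeMux, which brings the net/http/pprof endpoints along. A hedged client-side sketch of the first point, assuming the api client's Show method and a locally available model name:

    package main

    import (
    	"context"
    	"fmt"
    	"log"

    	"github.com/ollama/ollama/api"
    )

    func main() {
    	client, err := api.ClientFromEnvironment()
    	if err != nil {
    		log.Fatal(err)
    	}

    	// ModelInfo is populated from getKVData; without Verbose, arrays
    	// with more than five elements are emptied to keep the payload small.
    	resp, err := client.Show(context.Background(), &api.ShowRequest{Model: "llama3"})
    	if err != nil {
    		log.Fatal(err)
    	}
    	fmt.Println(resp.ModelInfo["general.architecture"])
    }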
@@ -19,6 +19,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -212,6 +213,7 @@ func Test_Routes(t *testing.T) {
 				"top_p 0.9",
 			}
 			assert.Equal(t, expectedParams, params)
+			assert.InDelta(t, 0, showResp.ModelInfo["general.parameter_count"], 1e-9, "Parameter count should be 0")
 		},
 	},
 }
@@ -325,3 +327,40 @@ func TestCase(t *testing.T) {
 		})
 	}
 }
+
+func TestShow(t *testing.T) {
+	t.Setenv("OLLAMA_MODELS", t.TempDir())
+	envconfig.LoadConfig()
+
+	var s Server
+
+	createRequest(t, s.CreateModelHandler, api.CreateRequest{
+		Name: "show-model",
+		Modelfile: fmt.Sprintf(
+			"FROM %s\nFROM %s",
+			createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
+			createBinFile(t, llm.KV{"general.architecture": "clip"}, nil),
+		),
+	})
+
+	w := createRequest(t, s.ShowModelHandler, api.ShowRequest{
+		Name: "show-model",
+	})
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected status code 200, actual %d", w.Code)
+	}
+
+	var resp api.ShowResponse
+	if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
+		t.Fatal(err)
+	}
+
+	if resp.ModelInfo["general.architecture"] != "test" {
+		t.Fatal("Expected model architecture to be 'test', but got", resp.ModelInfo["general.architecture"])
+	}
+
+	if resp.ProjectorInfo["general.architecture"] != "clip" {
+		t.Fatal("Expected projector architecture to be 'clip', but got", resp.ProjectorInfo["general.architecture"])
+	}
+}
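For context, here is a hedged sketch of what a caller sees from the new fields. This assumes the Go client's ClientFromEnvironment helper and Show method (which are not part of this diff) accept the same api.ShowRequest/api.ShowResponse types exercised by TestShow above:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	resp, err := client.Show(context.Background(), &api.ShowRequest{Name: "llama3"})
	if err != nil {
		log.Fatal(err)
	}
	// ModelInfo carries the GGUF key/value metadata surfaced by this change;
	// long arrays arrive empty unless the request was verbose.
	fmt.Println(resp.ModelInfo["general.architecture"])
}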
171 server/sched.go
@@ -7,7 +7,6 @@ import (
 	"log/slog"
 	"reflect"
 	"runtime"
-	"slices"
 	"sort"
 	"strings"
 	"sync"
@@ -27,6 +26,7 @@ type LlmRequest struct {
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
+	schedAttempts   uint
 }
 
 type Scheduler struct {
@@ -38,9 +38,11 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
 	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
 	getGpuFn     func() gpu.GpuInfoList
+	getCpuFn     func() gpu.GpuInfoList
+	reschedDelay time.Duration
 }
 
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -54,6 +56,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:       make(map[string]*runnerRef),
 		newServerFn:  llm.NewLlamaServer,
 		getGpuFn:     gpu.GetGPUInfo,
+		getCpuFn:     gpu.GetCPUInfo,
+		reschedDelay: 250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
 	return sched
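getCpuFn joins getGpuFn as an injectable function field: the constructor wires in the real hardware probes, and tests such as TestRequests further down overwrite them with fakes reporting known capacities. A generic sketch of this function-field injection pattern (all names here invented):

package main

import "fmt"

type prober struct {
	// Defaults point at real probes; tests overwrite this field.
	probe func() []string
}

func newProber() *prober {
	return &prober{probe: func() []string { return []string{"real-gpu-0"} }}
}

func main() {
	p := newProber()
	fmt.Println(p.probe()) // [real-gpu-0]

	// In a test, inject a fake with deterministic output:
	p.probe = func() []string { return []string{"fake-cpu"} }
	fmt.Println(p.probe()) // [fake-cpu]
}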
@@ -105,6 +109,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			return
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
+			pending.schedAttempts++
 
 			if pending.ctx.Err() != nil {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
@@ -131,25 +136,36 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				gpus := s.getGpuFn()
+				var gpus gpu.GpuInfoList
+				if pending.opts.NumGPU == 0 {
+					gpus = s.getCpuFn()
+				} else {
+					gpus = s.getGpuFn()
+				}
+
 				// Load model for fitting
-				ggml, err := llm.LoadModel(pending.model.ModelPath)
+				ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
 				if err != nil {
 					pending.errCh <- err
 					break
 				}
 
-				// If we're CPU only mode, just limit by envconfig.MaxRunners above
-				// TODO handle system memory exhaustion
-				if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
-					slog.Debug("cpu mode with existing models, loading")
-					s.loadFn(pending, ggml, gpus)
-					break
-				}
-
-				// No models loaded. Load the model but prefer the best fit.
-				if loadedCount == 0 {
+				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+				if len(gpus) == 1 && gpus[0].Library == "cpu" {
+					if loadedCount == 0 {
+						slog.Debug("cpu mode with first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+					if runnerToExpire == nil {
+						slog.Debug("cpu mode with available system memory or first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					// else we need to expire a runner
+				} else if loadedCount == 0 {
+					// No models loaded. Load the model but prefer the best fit.
 					slog.Debug("loading first model", "model", pending.model.ModelPath)
 					g := pickBestFitGPUs(pending, ggml, gpus)
 					if g != nil {
@@ -159,16 +175,44 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// More than one loaded model, so we have to see if the new one fits
-				// Update free memory from currently loaded models
-				s.updateFreeSpace(gpus)
-				gpus = pickBestFitGPUs(pending, ggml, gpus)
-				if gpus != nil {
-					slog.Debug("new model fits with existing models, loading")
-					s.loadFn(pending, ggml, gpus)
-					break
+				if runnerToExpire == nil {
+					// More than one loaded model, so we have to see if the
+					// new one fits
+					//
+					// We want to avoid loading on any GPUs that have other
+					// models still loading on them to avoid potential races
+					// with VRAM consumption ramping up during load
+					availGpus := s.filterGPUsWithoutLoadingModels(gpus)
+
+					// Update free memory from currently loaded models
+					s.updateFreeSpace(availGpus)
+					fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
+					if fitGpus != nil {
+						slog.Debug("new model fits with existing models, loading")
+						s.loadFn(pending, ggml, fitGpus)
+						break
+					}
+
+					// We couldn't find a set of GPUs to fully load the new
+					// model. If no other models are loading (both GPU lists
+					// are the same) then we need to unload another model to
+					// make room
+					if len(availGpus) < len(gpus) {
+						// There are other requests pending, and this one
+						// needs more time, so put it on the back of the
+						// queue so that we might satisfy other pending
+						// requests that aren't blocked
+						go func() {
+							// Process in a go routine to avoid deadlocking
+							// the scheduler if our queue is full
+							slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
+							time.Sleep(s.reschedDelay)
+							s.pendingReqCh <- pending
+						}()
+						break
+					}
+					runnerToExpire = s.findRunnerToUnload()
 				}
-				runnerToExpire = s.findRunnerToUnload()
 			}
 
 		if runnerToExpire == nil {
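The delayed requeue is worth calling out: sleeping and re-sending happen in a goroutine precisely so that a full pendingReqCh cannot deadlock the scheduler's own loop. A toy standalone version of the pattern (ints instead of *LlmRequest, timings made up):

package main

import (
	"fmt"
	"time"
)

func main() {
	pending := make(chan int, 1)
	pending <- 1

	for i := 0; i < 3; i++ {
		req := <-pending
		fmt.Println("attempt", i, "request", req)
		// Can't schedule yet: requeue from a goroutine so a full channel
		// never blocks this loop itself.
		go func(r int) {
			time.Sleep(10 * time.Millisecond)
			pending <- r
		}(req)
	}
}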
@@ -368,17 +412,9 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	s.loadedMu.Lock()
 	for _, r := range s.loaded {
 		r.refMu.Lock()
-		gpuIDs := make([]string, 0, len(r.gpus))
 		if r.llama != nil {
-			// TODO this should be broken down by GPU instead of assuming uniform spread
-			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
-			for _, gpu := range r.gpus {
-				gpuIDs = append(gpuIDs, gpu.ID)
-			}
 			for _, gpu := range allGpus {
-				if slices.Contains(gpuIDs, gpu.ID) {
-					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
-				}
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
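updateFreeSpace now sums per-GPU predictions keyed by (library, ID) instead of dividing EstimatedVRAM evenly across a runner's GPUs. The arithmetic, using the same numbers the updated TestUpdateFreeSpace asserts below:

package main

import "fmt"

type predKey struct{ library, id string }

func main() {
	// Per-GPU estimates (bytes) reported by two loaded runners, mirroring
	// the mock values in TestUpdateFreeSpace.
	runners := []map[string]uint64{
		{"1": 50, "2": 50},
		{"1": 125, "2": 75},
	}
	total := map[string]uint64{"1": 1000, "2": 2000}

	predMap := map[predKey]uint64{}
	for _, byGPU := range runners {
		for id, vram := range byGPU {
			predMap[predKey{"cuda", id}] += vram
		}
	}
	for _, id := range []string{"1", "2"} {
		fmt.Printf("gpu %s free = %d\n", id, total[id]-predMap[predKey{"cuda", id}])
	}
	// gpu 1 free = 825 (1000-50-125), gpu 2 free = 1875 (2000-50-75)
}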
@@ -401,11 +437,36 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 			// after we start our first runner, then we'll never acount for that, so picking the smallest free value seems prudent.
 			allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
 		}
-		slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
+		slog.Info("updated VRAM based on existing loaded models", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
 	}
 }
 
+// While models are loading the VRAM consumption numbers will be indeterminate, so we have
+// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
+// This routine returns the set of GPUs that do not have an active loading model.
+// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
+	ret := append(gpu.GpuInfoList{}, allGpus...)
+	s.loadedMu.Lock()
+	defer s.loadedMu.Unlock()
+	for _, runner := range s.loaded {
+		if runner.loading {
+			slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
+			for _, busyGPU := range runner.gpus {
+				for i := range ret {
+					if ret[i].ID == busyGPU.ID {
+						ret = append(ret[:i], ret[i+1:]...)
+						break
+					}
+				}
+			}
+		}
+	}
+	return ret
+}
+
+// TODO consolidate sched_types.go
 type runnerRef struct {
 	refMu sync.Mutex
 	// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
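One design note: filterGPUsWithoutLoadingModels deletes from its copy of the slice mid-iteration, which is safe here only because the inner loop breaks immediately after each removal. An equivalent non-mutating formulation, shown for comparison (a sketch with invented types, not code from this diff):

package main

import "fmt"

type gpuInfo struct{ ID string }

// keepIdle returns the GPUs whose IDs are not marked busy.
func keepIdle(all []gpuInfo, busy map[string]bool) []gpuInfo {
	var ret []gpuInfo
	for _, g := range all {
		if !busy[g.ID] {
			ret = append(ret, g)
		}
	}
	return ret
}

func main() {
	all := []gpuInfo{{"0"}, {"1"}}
	fmt.Println(keepIdle(all, map[string]bool{"0": true})) // [{1}]
}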
@@ -487,8 +548,11 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	finished := make(chan interface{}, 1)
 
-	// CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
+	// CPU or Metal don't need checking, so no waiting required
+	// windows can page VRAM, only cuda currently can report accurate used vram usage
+	if len(runner.gpus) == 0 ||
+		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		return finished
 	}
@@ -508,7 +572,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 		for {
 			<-ticker.C
 			if time.Now().After(expiresAt) {
-				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath)
 				finished <- struct{}{}
 			}
 
@@ -521,7 +585,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 			// If we're within ~80% of the estimated memory usage recovered, bail out
 			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
-				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath)
 				finished <- struct{}{}
 				return
 			}
@@ -558,10 +622,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
-		for _, g := range sgl {
-			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-				return []gpu.GpuInfo{g}
+		if !envconfig.SchedSpread {
+			for _, g := range sgl {
+				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+					return []gpu.GpuInfo{g}
+				}
 			}
 		}
 
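The single-GPU pass is now wrapped in `if !envconfig.SchedSpread`, so users who opt into spreading layers across all GPUs skip straight to the multi-GPU path (envconfig suggests an environment-variable toggle; the variable name itself is not shown in this diff). The policy being guarded is first-fit over GPUs sorted by descending free memory; a toy sketch with made-up sizes:

package main

import (
	"fmt"
	"sort"
)

func main() {
	free := []uint64{8, 24, 16} // GiB free per GPU (invented numbers)
	const required = 12         // GiB the model needs

	// Sort descending by free memory, then take the first GPU that fits.
	sort.Slice(free, func(i, j int) bool { return free[i] > free[j] })
	for _, f := range free {
		if f >= required {
			fmt.Println("single-GPU fit:", f, "GiB free")
			break
		}
	}
}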
@@ -586,6 +652,10 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
 		runnerList = append(runnerList, r)
 	}
 	s.loadedMu.Unlock()
+	if len(runnerList) == 0 {
+		slog.Debug("no loaded runner to unload")
+		return nil
+	}
 
 	// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
 	// e.g., if we have multiple options, will one make room for the request?
@@ -616,3 +686,18 @@ func (s *Scheduler) unloadAllRunners() {
 		}
 	}
 }
+
+// If other runners are loaded, make sure the pending request will fit in system memory
+// If not, pick a runner to unload, else return nil and the request can be loaded
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+	slog.Debug("evaluating if CPU model load will fit in available system memory")
+	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	if estimate.TotalSize <= gpus[0].FreeMemory {
+		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
+		return nil
+	}
+
+	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+
+	return s.findRunnerToUnload()
+}
|
@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
|
|||||||
err := <-req.errCh
|
err := <-req.errCh
|
||||||
require.Contains(t, err.Error(), "this model may be incompatible")
|
require.Contains(t, err.Error(), "this model may be incompatible")
|
||||||
|
|
||||||
server := &mockLlm{estimatedVRAM: 10}
|
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
|
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
|
||||||
return server, nil
|
return server, nil
|
||||||
}
|
}
|
||||||
@@ -128,13 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.scores":     []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
 	}, []llm.Tensor{
-		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	})
 	require.NoError(t, err)
 
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	scenario.ggml, err = llm.LoadModel(model.ModelPath)
+	scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 
 	scenario.req = &LlmRequest{
@@ -145,17 +146,17 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return scenario
 }
 
 func TestRequests(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), time.Second)
+	ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
 	defer done()
 
 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = 5 * time.Millisecond
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
@@ -166,6 +167,7 @@ func TestRequests(t *testing.T) {
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
+	scenario2a.req.sessionDuration = 5 * time.Millisecond
 
 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -181,6 +183,12 @@ func TestRequests(t *testing.T) {
 		g.FreeMemory = 12 * format.GigaByte
 		return []gpu.GpuInfo{g}
 	}
+	s.getCpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "cpu"}
+		g.TotalMemory = 32 * format.GigaByte
+		g.FreeMemory = 26 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
 	s.newServerFn = scenario1a.newServer
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -309,7 +317,6 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -419,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: 2,
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -452,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAM: 100}
-	llm2 := &mockLlm{estimatedVRAM: 200}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus}
 	r2 := &runnerRef{llama: llm2, gpus: gpus}
 
@@ -464,8 +471,42 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	s.updateFreeSpace(gpus)
-	require.Equal(t, uint64(850), gpus[0].FreeMemory)
-	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
+	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
+	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
+}
+
+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	gpus := gpu.GpuInfoList{
+		{
+			Library: "cuda",
+			ID:      "0",
+		},
+		{
+			Library: "cuda",
+			ID:      "1",
+		},
+	}
+	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+
+	s := InitScheduler(ctx)
+	s.loadedMu.Lock()
+	s.loaded["a"] = r1
+	s.loadedMu.Unlock()
+
+	tmp := s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "1", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "0", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 2)
 }
 
 func TestFindRunnerToUnload(t *testing.T) {
@@ -492,7 +533,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm := &mockLlm{}
+	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -535,8 +576,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm1 := &mockLlm{}
-	llm2 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()
 
@@ -554,7 +595,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }
 
 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
@@ -564,19 +605,20 @@ func TestUnload(t *testing.T) {
 }
 
 type mockLlm struct {
 	pingResp          error
 	waitResp          error
 	completionResp    error
 	embeddingResp     []float64
 	embeddingRespErr  error
 	tokenizeResp      []int
 	tokenizeRespErr   error
 	detokenizeResp    string
 	detonekizeRespErr error
 	closeResp         error
 	closeCalled       bool
 	estimatedVRAM     uint64
 	estimatedTotal    uint64
+	estimatedVRAMByGPU map[string]uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -597,5 +639,6 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
 func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
@@ -4,7 +4,6 @@ package model
 
 import (
 	"cmp"
-	"encoding/hex"
 	"errors"
 	"fmt"
 	"log/slog"
@@ -371,57 +370,3 @@ func cutPromised(s, sep string) (before, after string, ok bool) {
 	}
 	return cmp.Or(before, MissingPart), cmp.Or(after, MissingPart), true
 }
-
-type DigestType byte
-
-const (
-	DigestTypeInvalid DigestType = iota
-	DigestTypeSHA256
-)
-
-func (t DigestType) String() string {
-	switch t {
-	case DigestTypeSHA256:
-		return "sha256"
-	default:
-		return "invalid"
-	}
-}
-
-type Digest struct {
-	Type DigestType
-	Sum  [32]byte
-}
-
-func ParseDigest(s string) (Digest, error) {
-	i := strings.IndexAny(s, "-:")
-	if i < 0 {
-		return Digest{}, fmt.Errorf("invalid digest %q", s)
-	}
-	typ, encSum := s[:i], s[i+1:]
-	if typ != "sha256" {
-		return Digest{}, fmt.Errorf("unsupported digest type %q", typ)
-	}
-	d := Digest{
-		Type: DigestTypeSHA256,
-	}
-	n, err := hex.Decode(d.Sum[:], []byte(encSum))
-	if err != nil {
-		return Digest{}, err
-	}
-	if n != 32 {
-		return Digest{}, fmt.Errorf("digest %q decoded to %d bytes; want 32", encSum, n)
-	}
-	return d, nil
-}
-
-func (d Digest) String() string {
-	if d.Type == DigestTypeInvalid {
-		return ""
-	}
-	return fmt.Sprintf("sha256-%x", d.Sum)
-}
-
-func (d Digest) IsValid() bool {
-	return d.Type != DigestTypeInvalid
-}
@@ -284,40 +284,6 @@ func TestFilepathAllocs(t *testing.T) {
 	}
 }
 
-const (
-	validSha256    = "sha256-1000000000000000000000000000000000000000000000000000000000000000"
-	validSha256Old = "sha256:1000000000000000000000000000000000000000000000000000000000000000"
-)
-
-func TestParseDigest(t *testing.T) {
-	cases := []struct {
-		in   string
-		want string
-	}{
-		{"", ""},           // empty
-		{"sha123-12", ""},  // invalid type
-		{"sha256-", ""},    // invalid sum
-		{"sha256-123", ""}, // invalid odd length sum
-
-		{validSha256, validSha256},
-		{validSha256Old, validSha256},
-	}
-	for _, tt := range cases {
-		t.Run(tt.in, func(t *testing.T) {
-			got, err := ParseDigest(tt.in)
-			if err != nil {
-				if tt.want != "" {
-					t.Errorf("parseDigest(%q) = %v; want %v", tt.in, err, tt.want)
-				}
-				return
-			}
-			if got.String() != tt.want {
-				t.Errorf("parseDigest(%q).String() = %q; want %q", tt.in, got, tt.want)
-			}
-		})
-	}
-}
-
 func TestParseNameFromFilepath(t *testing.T) {
 	cases := map[string]Name{
 		filepath.Join("host", "namespace", "model", "tag"): {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},
34 util/bufioutil/buffer_seeker.go (new file)
@@ -0,0 +1,34 @@
+package bufioutil
+
+import (
+	"bufio"
+	"io"
+)
+
+type BufferedSeeker struct {
+	rs io.ReadSeeker
+	br *bufio.Reader
+}
+
+func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
+	return &BufferedSeeker{
+		rs: rs,
+		br: bufio.NewReaderSize(rs, size),
+	}
+}
+
+func (b *BufferedSeeker) Read(p []byte) (int, error) {
+	return b.br.Read(p)
+}
+
+func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
+	if whence == io.SeekCurrent {
+		offset -= int64(b.br.Buffered())
+	}
+	n, err := b.rs.Seek(offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	b.br.Reset(b.rs)
+	return n, nil
+}
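The subtle part of BufferedSeeker is the SeekCurrent adjustment: the underlying reader sits ahead of the caller's logical position by however many bytes bufio has read ahead, so Buffered() is subtracted before delegating, and the buffer is reset afterwards. A small usage sketch (the import path is taken from this new file; the data is invented):

package main

import (
	"fmt"
	"io"
	"strings"

	"github.com/ollama/ollama/util/bufioutil"
)

func main() {
	bs := bufioutil.NewBufferedSeeker(strings.NewReader("abcdefghij"), 16)

	buf := make([]byte, 2)
	if _, err := io.ReadFull(bs, buf); err != nil { // consumes "ab"; bufio may read further ahead
		panic(err)
	}

	// Seek +1 relative to what the caller has consumed (position 2 -> 3),
	// not relative to the underlying reader's read-ahead position.
	if _, err := bs.Seek(1, io.SeekCurrent); err != nil {
		panic(err)
	}
	if _, err := io.ReadFull(bs, buf); err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", buf) // "de"
}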
64 util/bufioutil/buffer_seeker_test.go (new file)
@@ -0,0 +1,64 @@
+package bufioutil
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+func TestBufferedSeeker(t *testing.T) {
+	const alphabet = "abcdefghijklmnopqrstuvwxyz"
+
+	bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
+
+	checkRead := func(buf []byte, expected string) {
+		t.Helper()
+		_, err := bs.Read(buf)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if !bytes.Equal(buf, []byte(expected)) {
+			t.Fatalf("expected %s, got %s", expected, buf)
+		}
+	}
+
+	// Read the first 5 bytes
+	buf := make([]byte, 5)
+	checkRead(buf, "abcde")
+
+	// Seek back to the beginning
+	_, err := bs.Seek(0, io.SeekStart)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// read 'a'
+	checkRead(buf[:1], "a")
+
+	if bs.br.Buffered() == 0 {
+		t.Fatalf("totally unexpected sanity check failed")
+	}
+
+	// Seek past 'b'
+	_, err = bs.Seek(1, io.SeekCurrent)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "cdefg")
+
+	// Seek back to the beginning
+	_, err = bs.Seek(0, io.SeekStart)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "abcde")
+
+	// Seek to the end
+	_, err = bs.Seek(-5, io.SeekEnd)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "vwxyz")
+}