Compare commits: format-con...matt/strea (20 commits)

e2389b63aa, f89c23764b, d028853879, 949553db23, 0c5a454361,
f59c4d03f7, 7dee25a07f, f221637053, 45ac07cd02, 7d749cc787,
e7e91cd71c, 3920e15386, 41e976edde, de227b620f, 63def6ca49,
738fe9c4aa, a8da0bacbe, bf146fb072, f0f4943577, 09dd2aeff9
@@ -2,3 +2,4 @@
 ollama
 app
 llm/llama.cpp/ggml
+llm/llama.cpp/gguf
.gitmodules (vendored, 11 changed lines)
@@ -1,4 +1,9 @@
[submodule "llm/llama.cpp/ggml"]
	path = llm/llama.cpp/ggml
	url = https://github.com/ggerganov/llama.cpp.git
	ignore = dirty
	shallow = true
[submodule "llm/llama.cpp/gguf"]
	path = llm/llama.cpp/gguf
	url = https://github.com/ggerganov/llama.cpp.git
	shallow = true
Dockerfile.cuda (new file, 22 lines)
@@ -0,0 +1,22 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

COPY . .
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .

FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
ENV OLLAMA_HOST 0.0.0.0

ARG USER=ollama
ARG GROUP=ollama
RUN groupadd $GROUP && useradd -m -g $GROUP $USER

COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

USER $USER:$GROUP
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,
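The dynamic behaviour behind the new `-1` default is implemented by the `NumGPU` helper added to the llama runner later in this change; the following is a simplified sketch for illustration only (the real function also queries VRAM via `nvidia-smi` on Linux, and `resolveNumGPU` is a hypothetical name):

```go
// Illustrative sketch only, not the repository's code: -1 defers to a
// heuristic, any explicit value is passed through unchanged.
func resolveNumGPU(requested int) int {
	if requested != -1 {
		return requested
	}
	// The real implementation defaults to 1 to enable Metal on macOS and,
	// on Linux, picks a layer count from the VRAM reported by nvidia-smi
	// (returning 0 when no NVIDIA GPU is found).
	return 1
}
```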
@@ -672,6 +672,12 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		origins = strings.Split(o, ",")
 	}
 
+	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+		if err := server.PruneLayers(); err != nil {
+			return err
+		}
+	}
+
 	return server.Serve(ln, origins)
 }
 
@@ -20,6 +20,10 @@ Model names follow a `model:tag` format. Some examples are `orca-mini:3b-q4_1` a
 
 All durations are returned in nanoseconds.
 
+### Streams
+
+Many API responses are streams of JSON objects showing the current status. For examples of working with streams in various languages, see [streaming.md](./streaming.md).
+
 ## Generate a completion
 
 ```
@@ -6,6 +6,10 @@
 
 Install required tools:
 
+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```
@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`
docs/streaming.md (new file, 35 lines)
@@ -0,0 +1,35 @@
# Streaming responses in the Ollama Client API

## JavaScript / TypeScript / Deno

```javascript
const pull = async () => {
  const request = await fetch("http://localhost:11434/api/pull", {
    method: "POST",
    body: JSON.stringify({ name: "llama2:7b-q5_0" }),
  });

  const reader = await request.body?.pipeThrough(new TextDecoderStream());
  if (!reader) throw new Error("No reader");
  for await (const chunk of reader) {
    const out = JSON.parse(chunk);
    if (out.status.startsWith("downloading")) {
      console.log(`${out.status} - ${(out.completed / out.total) * 100}%`);
    }
  }
}

pull();
```

## Python

```python
import requests
import json

response = requests.post("http://localhost:11434/api/pull", json={"name": "llama2:7b-q5_0"}, stream=True)
for data in response.iter_lines():
    out = json.loads(data)
    if "completed" in out:
        print(out["completed"] / out["total"] * 100)
```
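A rough Go equivalent of the two examples above, for illustration only; it assumes the pull stream is newline-delimited JSON carrying the same `status`, `completed`, and `total` fields, and the `pullStatus` struct below is not part of any published client package:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// pullStatus mirrors the JSON keys used by the JavaScript and Python examples.
type pullStatus struct {
	Status    string `json:"status"`
	Completed int64  `json:"completed"`
	Total     int64  `json:"total"`
}

func main() {
	body := strings.NewReader(`{"name": "llama2:7b-q5_0"}`)
	resp, err := http.Post("http://localhost:11434/api/pull", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line of the response is one JSON progress object.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var out pullStatus
		if err := json.Unmarshal(scanner.Bytes(), &out); err != nil {
			panic(err)
		}
		if out.Total > 0 {
			fmt.Printf("%s - %.1f%%\n", out.Status, float64(out.Completed)/float64(out.Total)*100)
		}
	}
}
```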
llm/falcon.go (new file, 22 lines)
@@ -0,0 +1,22 @@
package llm

const ModelFamilyFalcon = "falcon"

const (
	falconModelType7B   = 32
	falconModelType40B  = 60
	falconModelType180B = 80
)

func falconModelType(numLayer uint32) string {
	switch numLayer {
	case 32:
		return "7B"
	case 60:
		return "40B"
	case 80:
		return "180B"
	default:
		return "Unknown"
	}
}
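For context, the GGUF decoder added later in this change passes the `falcon.block_count` metadata value into this function; a hypothetical call for illustration:

```go
// Hypothetical illustration of the mapping above, not code from this change.
blocks := uint32(60)
fmt.Println(falconModelType(blocks)) // prints "40B"
```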
llm/ggml.go — 182 changed lines
@@ -3,72 +3,97 @@ package llm
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"path"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type ModelFamily string
|
||||
|
||||
type ModelType uint32
|
||||
|
||||
const (
|
||||
ModelType3B ModelType = 26
|
||||
ModelType7B ModelType = 32
|
||||
ModelType13B ModelType = 40
|
||||
ModelType34B ModelType = 48
|
||||
ModelType30B ModelType = 60
|
||||
ModelType65B ModelType = 80
|
||||
)
|
||||
|
||||
func (mt ModelType) String() string {
|
||||
switch mt {
|
||||
case ModelType3B:
|
||||
return "3B"
|
||||
case ModelType7B:
|
||||
return "7B"
|
||||
case ModelType13B:
|
||||
return "13B"
|
||||
case ModelType34B:
|
||||
return "34B"
|
||||
case ModelType30B:
|
||||
return "30B"
|
||||
case ModelType65B:
|
||||
return "65B"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type FileType interface {
|
||||
String() string
|
||||
}
|
||||
|
||||
type GGML struct {
|
||||
magic uint32
|
||||
container
|
||||
model
|
||||
}
|
||||
|
||||
const (
|
||||
fileTypeF32 uint32 = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ8_0 uint32 = iota + 2
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
)
|
||||
|
||||
func fileType(fileType uint32) string {
|
||||
switch fileType {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type model interface {
|
||||
ModelFamily() ModelFamily
|
||||
ModelType() ModelType
|
||||
FileType() FileType
|
||||
ModelFamily() string
|
||||
ModelType() string
|
||||
FileType() string
|
||||
}
|
||||
|
||||
type container interface {
|
||||
Name() string
|
||||
Decode(io.Reader) error
|
||||
Decode(io.Reader) (model, error)
|
||||
}
|
||||
|
||||
type containerGGML struct {
|
||||
}
|
||||
type containerGGML struct{}
|
||||
|
||||
func (c *containerGGML) Name() string {
|
||||
return "ggml"
|
||||
}
|
||||
|
||||
func (c *containerGGML) Decode(r io.Reader) error {
|
||||
return nil
|
||||
func (c *containerGGML) Decode(r io.Reader) (model, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGMF struct {
|
||||
@@ -79,18 +104,18 @@ func (c *containerGGMF) Name() string {
|
||||
return "ggmf"
|
||||
}
|
||||
|
||||
func (c *containerGGMF) Decode(r io.Reader) error {
|
||||
func (c *containerGGMF) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGJT struct {
|
||||
@@ -101,18 +126,22 @@ func (c *containerGGJT) Name() string {
|
||||
return "ggjt"
|
||||
}
|
||||
|
||||
func (c *containerGGJT) Decode(r io.Reader) error {
|
||||
func (c *containerGGJT) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1, 2, 3:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
|
||||
// different model types may have different layouts for hyperparameters
|
||||
var llama llamaModel
|
||||
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
||||
return &llama, nil
|
||||
}
|
||||
|
||||
type containerLORA struct {
|
||||
@@ -123,32 +152,51 @@ func (c *containerLORA) Name() string {
|
||||
return "ggla"
|
||||
}
|
||||
|
||||
func (c *containerLORA) Decode(r io.Reader) error {
|
||||
func (c *containerLORA) Decode(r io.Reader) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(r, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
default:
|
||||
return errors.New("invalid version")
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var (
|
||||
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
||||
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggmlInit sync.Once
|
||||
ggmlRunnerPath string
|
||||
)
|
||||
|
||||
func ggmlRunner() ModelRunner {
|
||||
ggmlInit.Do(func() {
|
||||
ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
|
||||
})
|
||||
return ModelRunner{Path: ggmlRunnerPath}
|
||||
}
|
||||
|
||||
const (
|
||||
// / Magic constant for `ggml` files (unversioned).
|
||||
// Magic constant for `ggml` files (unversioned).
|
||||
FILE_MAGIC_GGML = 0x67676d6c
|
||||
// / Magic constant for `ggml` files (versioned, ggmf).
|
||||
// Magic constant for `ggml` files (versioned, ggmf).
|
||||
FILE_MAGIC_GGMF = 0x67676d66
|
||||
// / Magic constant for `ggml` files (versioned, ggjt).
|
||||
// Magic constant for `ggml` files (versioned, ggjt).
|
||||
FILE_MAGIC_GGJT = 0x67676a74
|
||||
// / Magic constant for `ggla` files (LoRA adapter).
|
||||
// Magic constant for `ggla` files (LoRA adapter).
|
||||
FILE_MAGIC_GGLA = 0x67676C61
|
||||
// Magic constant for `gguf` files (versioned, gguf)
|
||||
FILE_MAGIC_GGUF = 0x46554747
|
||||
)
|
||||
|
||||
func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
||||
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||
var ggml GGML
|
||||
binary.Read(r, binary.LittleEndian, &ggml.magic)
|
||||
|
||||
@@ -161,24 +209,18 @@ func DecodeGGML(r io.ReadSeeker, hint ModelFamily) (*GGML, error) {
|
||||
ggml.container = &containerGGJT{}
|
||||
case FILE_MAGIC_GGLA:
|
||||
ggml.container = &containerLORA{}
|
||||
case FILE_MAGIC_GGUF:
|
||||
ggml.container = &containerGGUF{}
|
||||
default:
|
||||
return nil, errors.New("invalid file magic")
|
||||
}
|
||||
|
||||
if err := ggml.Decode(r); err != nil {
|
||||
model, err := ggml.Decode(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// different model types may have different layouts for hyperparameters
|
||||
switch hint {
|
||||
case ModelFamilyLlama:
|
||||
var llama llamaModel
|
||||
binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
|
||||
ggml.model = &llama
|
||||
// TODO: sanity check hyperparameters
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported model type: %s", hint)
|
||||
}
|
||||
ggml.model = model
|
||||
|
||||
// final model type
|
||||
return &ggml, nil
|
||||
|
llm/gguf.go (new file, 389 lines)
@@ -0,0 +1,389 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"path"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
Version uint32
|
||||
|
||||
V1 struct {
|
||||
NumTensor uint32
|
||||
NumKV uint32
|
||||
}
|
||||
|
||||
V2 struct {
|
||||
NumTensor uint64
|
||||
NumKV uint64
|
||||
}
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Name() string {
|
||||
return "gguf"
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Decode(r io.Reader) (model, error) {
|
||||
binary.Read(r, binary.LittleEndian, &c.Version)
|
||||
|
||||
switch c.Version {
|
||||
case 1:
|
||||
binary.Read(r, binary.LittleEndian, &c.V1)
|
||||
case 2:
|
||||
binary.Read(r, binary.LittleEndian, &c.V2)
|
||||
default:
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
model := newGGUFModel(c)
|
||||
if err := model.Decode(r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return model, nil
|
||||
}
|
||||
|
||||
const (
|
||||
ggufTypeUint8 uint32 = iota
|
||||
ggufTypeInt8
|
||||
ggufTypeUint16
|
||||
ggufTypeInt16
|
||||
ggufTypeUint32
|
||||
ggufTypeInt32
|
||||
ggufTypeFloat32
|
||||
ggufTypeBool
|
||||
ggufTypeString
|
||||
ggufTypeArray
|
||||
ggufTypeUint64
|
||||
ggufTypeInt64
|
||||
ggufTypeFloat64
|
||||
)
|
||||
|
||||
type kv map[string]any
|
||||
|
||||
type ggufModel struct {
|
||||
*containerGGUF
|
||||
kv
|
||||
}
|
||||
|
||||
func newGGUFModel(container *containerGGUF) *ggufModel {
|
||||
return &ggufModel{
|
||||
containerGGUF: container,
|
||||
kv: make(kv),
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumKV() uint64 {
|
||||
if llm.Version == 1 {
|
||||
return uint64(llm.V1.NumKV)
|
||||
}
|
||||
|
||||
return llm.V2.NumKV
|
||||
}
|
||||
|
||||
func (llm *ggufModel) ModelFamily() string {
|
||||
t, ok := llm.kv["general.architecture"].(string)
|
||||
if ok {
|
||||
return t
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) ModelType() string {
|
||||
switch llm.ModelFamily() {
|
||||
case "llama":
|
||||
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
|
||||
heads, headsOK := llm.kv["llama.head_count"].(uint32)
|
||||
headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
|
||||
if headsOK && headsKVsOK && heads/headKVs == 8 {
|
||||
return "70B"
|
||||
}
|
||||
|
||||
return llamaModelType(blocks)
|
||||
}
|
||||
case "falcon":
|
||||
if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
|
||||
return falconModelType(blocks)
|
||||
}
|
||||
}
|
||||
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) FileType() string {
|
||||
t, ok := llm.kv["general.file_type"].(uint32)
|
||||
if ok {
|
||||
return fileType(t)
|
||||
}
|
||||
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
read := llm.readString
|
||||
if llm.Version == 1 {
|
||||
read = llm.readStringV1
|
||||
}
|
||||
|
||||
for i := 0; uint64(i) < llm.NumKV(); i++ {
|
||||
k, err := read(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
vtype := llm.readU32(r)
|
||||
|
||||
var v any
|
||||
switch vtype {
|
||||
case ggufTypeUint8:
|
||||
v = llm.readU8(r)
|
||||
case ggufTypeInt8:
|
||||
v = llm.readI8(r)
|
||||
case ggufTypeUint16:
|
||||
v = llm.readU16(r)
|
||||
case ggufTypeInt16:
|
||||
v = llm.readI16(r)
|
||||
case ggufTypeUint32:
|
||||
v = llm.readU32(r)
|
||||
case ggufTypeInt32:
|
||||
v = llm.readI32(r)
|
||||
case ggufTypeUint64:
|
||||
v = llm.readU64(r)
|
||||
case ggufTypeInt64:
|
||||
v = llm.readI64(r)
|
||||
case ggufTypeFloat32:
|
||||
v = llm.readF32(r)
|
||||
case ggufTypeFloat64:
|
||||
v = llm.readF64(r)
|
||||
case ggufTypeBool:
|
||||
v = llm.readBool(r)
|
||||
case ggufTypeString:
|
||||
fn := llm.readString
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readStringV1
|
||||
}
|
||||
|
||||
s, err := fn(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v = s
|
||||
case ggufTypeArray:
|
||||
fn := llm.readArray
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readArrayV1
|
||||
}
|
||||
|
||||
a, err := fn(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v = a
|
||||
default:
|
||||
return fmt.Errorf("invalid type: %d", vtype)
|
||||
}
|
||||
|
||||
llm.kv[k] = v
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (ggufModel) readU8(r io.Reader) uint8 {
|
||||
var u8 uint8
|
||||
binary.Read(r, binary.LittleEndian, &u8)
|
||||
return u8
|
||||
}
|
||||
|
||||
func (ggufModel) readI8(r io.Reader) int8 {
|
||||
var i8 int8
|
||||
binary.Read(r, binary.LittleEndian, &i8)
|
||||
return i8
|
||||
}
|
||||
|
||||
func (ggufModel) readU16(r io.Reader) uint16 {
|
||||
var u16 uint16
|
||||
binary.Read(r, binary.LittleEndian, &u16)
|
||||
return u16
|
||||
}
|
||||
|
||||
func (ggufModel) readI16(r io.Reader) int16 {
|
||||
var i16 int16
|
||||
binary.Read(r, binary.LittleEndian, &i16)
|
||||
return i16
|
||||
}
|
||||
|
||||
func (ggufModel) readU32(r io.Reader) uint32 {
|
||||
var u32 uint32
|
||||
binary.Read(r, binary.LittleEndian, &u32)
|
||||
return u32
|
||||
}
|
||||
|
||||
func (ggufModel) readI32(r io.Reader) int32 {
|
||||
var i32 int32
|
||||
binary.Read(r, binary.LittleEndian, &i32)
|
||||
return i32
|
||||
}
|
||||
|
||||
func (ggufModel) readU64(r io.Reader) uint64 {
|
||||
var u64 uint64
|
||||
binary.Read(r, binary.LittleEndian, &u64)
|
||||
return u64
|
||||
}
|
||||
|
||||
func (ggufModel) readI64(r io.Reader) int64 {
|
||||
var i64 int64
|
||||
binary.Read(r, binary.LittleEndian, &i64)
|
||||
return i64
|
||||
}
|
||||
|
||||
func (ggufModel) readF32(r io.Reader) float32 {
|
||||
var f32 float32
|
||||
binary.Read(r, binary.LittleEndian, &f32)
|
||||
return f32
|
||||
}
|
||||
|
||||
func (ggufModel) readF64(r io.Reader) float64 {
|
||||
var f64 float64
|
||||
binary.Read(r, binary.LittleEndian, &f64)
|
||||
return f64
|
||||
}
|
||||
|
||||
func (ggufModel) readBool(r io.Reader) bool {
|
||||
var b bool
|
||||
binary.Read(r, binary.LittleEndian, &b)
|
||||
return b
|
||||
}
|
||||
|
||||
func (ggufModel) readStringV1(r io.Reader) (string, error) {
|
||||
var nameLength uint32
|
||||
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// gguf v1 strings are null-terminated
|
||||
b.Truncate(b.Len() - 1)
|
||||
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func (llm ggufModel) readString(r io.Reader) (string, error) {
|
||||
var nameLength uint64
|
||||
binary.Read(r, binary.LittleEndian, &nameLength)
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
|
||||
atype := llm.readU32(r)
|
||||
n := llm.readU32(r)
|
||||
|
||||
for i := 0; uint32(i) < n; i++ {
|
||||
switch atype {
|
||||
case ggufTypeUint8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeInt8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeUint16:
|
||||
arr = append(arr, llm.readU16(r))
|
||||
case ggufTypeInt16:
|
||||
arr = append(arr, llm.readI16(r))
|
||||
case ggufTypeUint32:
|
||||
arr = append(arr, llm.readU32(r))
|
||||
case ggufTypeInt32:
|
||||
arr = append(arr, llm.readI32(r))
|
||||
case ggufTypeFloat32:
|
||||
arr = append(arr, llm.readF32(r))
|
||||
case ggufTypeBool:
|
||||
arr = append(arr, llm.readBool(r))
|
||||
case ggufTypeString:
|
||||
s, err := llm.readStringV1(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
arr = append(arr, s)
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
|
||||
atype := llm.readU32(r)
|
||||
n := llm.readU64(r)
|
||||
|
||||
for i := 0; uint64(i) < n; i++ {
|
||||
switch atype {
|
||||
case ggufTypeUint8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeInt8:
|
||||
arr = append(arr, llm.readU8(r))
|
||||
case ggufTypeUint16:
|
||||
arr = append(arr, llm.readU16(r))
|
||||
case ggufTypeInt16:
|
||||
arr = append(arr, llm.readI16(r))
|
||||
case ggufTypeUint32:
|
||||
arr = append(arr, llm.readU32(r))
|
||||
case ggufTypeInt32:
|
||||
arr = append(arr, llm.readI32(r))
|
||||
case ggufTypeUint64:
|
||||
arr = append(arr, llm.readU64(r))
|
||||
case ggufTypeInt64:
|
||||
arr = append(arr, llm.readI64(r))
|
||||
case ggufTypeFloat32:
|
||||
arr = append(arr, llm.readF32(r))
|
||||
case ggufTypeFloat64:
|
||||
arr = append(arr, llm.readF64(r))
|
||||
case ggufTypeBool:
|
||||
arr = append(arr, llm.readBool(r))
|
||||
case ggufTypeString:
|
||||
s, err := llm.readString(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
arr = append(arr, s)
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid array type: %d", atype)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
|
||||
ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggufInit sync.Once
|
||||
ggufRunnerPath string
|
||||
)
|
||||
|
||||
func ggufRunner() ModelRunner {
|
||||
ggufInit.Do(func() {
|
||||
ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
|
||||
})
|
||||
|
||||
return ModelRunner{Path: ggufRunnerPath}
|
||||
}
|
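A hedged usage sketch of the decoder this container plugs into, mirroring how `llm.New` later in this change inspects a model file; the path, `os.Open`, and `log` calls here are assumptions, not code from the repository:

```go
// Illustrative only: open a model file and report the metadata the GGUF
// (or GGML) container decodes.
f, err := os.Open("/path/to/model.bin")
if err != nil {
	log.Fatal(err)
}
defer f.Close()

ggml, err := DecodeGGML(f)
if err != nil {
	log.Fatal(err)
}

log.Printf("container=%s family=%s type=%s filetype=%s",
	ggml.Name(), ggml.ModelFamily(), ggml.ModelType(), ggml.FileType())
```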
@@ -4,10 +4,14 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
@@ -1,10 +1,16 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
|
@@ -1,10 +1,16 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
||||
|
llm/llama.cpp/generate_linux.go (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
package llm
|
||||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate -command git-apply git -C ggml apply
|
||||
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
|
||||
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/gpu --target server --config Release
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build gguf/build/gpu --target server --config Release
|
@@ -0,0 +1,32 @@
|
||||
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
|
||||
From: Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||
Date: Tue, 22 Aug 2023 15:14:23 +0800
|
||||
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
|
||||
|
||||
* ggml: support CUDA's half type for aarch64(#1455)
|
||||
support CUDA's half type for aarch64 in ggml_fp16_t definition
|
||||
|
||||
* ggml: use __CUDACC__ to recognise nvcc compiler
|
||||
---
|
||||
ggml.h | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml.h b/ggml.h
|
||||
index 544ad2d..0ec7ec5 100644
|
||||
--- a/ggml.h
|
||||
+++ b/ggml.h
|
||||
@@ -259,8 +259,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
-#ifdef __ARM_NEON
|
||||
- // we use the built-in 16-bit float type
|
||||
+#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
+ typedef half ggml_fp16_t;
|
||||
+#elif defined(__ARM_NEON)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
Submodule llm/llama.cpp/gguf added at 53885d7256
@@ -20,27 +20,14 @@ import (
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
const ModelFamilyLlama ModelFamily = "llama"
|
||||
|
||||
//go:embed llama.cpp/ggml/build/*/bin/*
|
||||
//go:embed llama.cpp/*/build/*/bin/*
|
||||
var llamaCppEmbed embed.FS
|
||||
|
||||
var (
|
||||
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
|
||||
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
|
||||
)
|
||||
|
||||
var (
|
||||
ggmlInit sync.Once
|
||||
ggmlRunnerPath string
|
||||
)
|
||||
|
||||
func osPath(llamaPath string) string {
|
||||
if runtime.GOOS == "windows" {
|
||||
return path.Join(llamaPath, "Release")
|
||||
@@ -49,98 +36,98 @@ func osPath(llamaPath string) string {
|
||||
return llamaPath
|
||||
}
|
||||
|
||||
func initGGML() {
|
||||
ggmlInit.Do(func() {
|
||||
tmpDir, err := os.MkdirTemp("", "llama-*")
|
||||
if err != nil {
|
||||
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
||||
}
|
||||
func chooseRunner(gpuPath, cpuPath string) string {
|
||||
tmpDir, err := os.MkdirTemp("", "llama-*")
|
||||
if err != nil {
|
||||
log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
|
||||
}
|
||||
|
||||
llamaPath := osPath(ggmlGPU)
|
||||
llamaPath := osPath(gpuPath)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
llamaPath = osPath(cpuPath)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
llamaPath = osPath(ggmlCPU)
|
||||
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
|
||||
log.Fatalf("llama.cpp executable not found")
|
||||
}
|
||||
log.Fatalf("llama.cpp executable not found")
|
||||
}
|
||||
}
|
||||
|
||||
files := []string{"server"}
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
files = []string{"server.exe"}
|
||||
case "darwin":
|
||||
if llamaPath == osPath(ggmlGPU) {
|
||||
files = append(files, "ggml-metal.metal")
|
||||
}
|
||||
files := []string{"server"}
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
files = []string{"server.exe"}
|
||||
case "darwin":
|
||||
if llamaPath == osPath(gpuPath) {
|
||||
files = append(files, "ggml-metal.metal")
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
srcPath := path.Join(llamaPath, f)
|
||||
destPath := filepath.Join(tmpDir, f)
|
||||
|
||||
srcFile, err := llamaCppEmbed.Open(srcPath)
|
||||
if err != nil {
|
||||
log.Fatalf("read llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
||||
}
|
||||
case "linux":
|
||||
// check if there is a GPU available
|
||||
if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
|
||||
// this error was logged on start-up, so we don't need to log it again
|
||||
llamaPath = osPath(cpuPath)
|
||||
}
|
||||
}
|
||||
|
||||
ggmlRunnerPath = filepath.Join(tmpDir, "server")
|
||||
if runtime.GOOS == "windows" {
|
||||
ggmlRunnerPath = filepath.Join(tmpDir, "server.exe")
|
||||
for _, f := range files {
|
||||
srcPath := path.Join(llamaPath, f)
|
||||
destPath := filepath.Join(tmpDir, f)
|
||||
|
||||
srcFile, err := llamaCppEmbed.Open(srcPath)
|
||||
if err != nil {
|
||||
log.Fatalf("read llama.cpp %s: %v", f, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
defer srcFile.Close()
|
||||
|
||||
type ModelRunner struct {
|
||||
Path string // path to the model runner executable
|
||||
}
|
||||
destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
log.Fatalf("write llama.cpp %s: %v", f, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
|
||||
func ggmlRunner() ModelRunner {
|
||||
initGGML()
|
||||
return ModelRunner{Path: ggmlRunnerPath}
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
log.Fatalf("copy llama.cpp %s: %v", f, err)
|
||||
}
|
||||
}
|
||||
|
||||
runPath := filepath.Join(tmpDir, "server")
|
||||
if runtime.GOOS == "windows" {
|
||||
runPath = filepath.Join(tmpDir, "server.exe")
|
||||
}
|
||||
|
||||
return runPath
|
||||
}
|
||||
|
||||
type llamaModel struct {
|
||||
hyperparameters llamaHyperparameters
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelFamily() ModelFamily {
|
||||
return ModelFamilyLlama
|
||||
func (llm *llamaModel) ModelFamily() string {
|
||||
return "llama"
|
||||
}
|
||||
|
||||
func (llm *llamaModel) ModelType() ModelType {
|
||||
switch llm.hyperparameters.NumLayer {
|
||||
func llamaModelType(numLayer uint32) string {
|
||||
switch numLayer {
|
||||
case 26:
|
||||
return ModelType3B
|
||||
return "3B"
|
||||
case 32:
|
||||
return ModelType7B
|
||||
return "7B"
|
||||
case 40:
|
||||
return ModelType13B
|
||||
return "13B"
|
||||
case 48:
|
||||
return ModelType34B
|
||||
return "34B"
|
||||
case 60:
|
||||
return ModelType30B
|
||||
return "30B"
|
||||
case 80:
|
||||
return ModelType65B
|
||||
return "65B"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
// TODO: find a better default
|
||||
return ModelType7B
|
||||
}
|
||||
|
||||
func (llm *llamaModel) FileType() FileType {
|
||||
return llm.hyperparameters.FileType
|
||||
func (llm *llamaModel) ModelType() string {
|
||||
return llamaModelType(llm.hyperparameters.NumLayer)
|
||||
}
|
||||
|
||||
func (llm *llamaModel) FileType() string {
|
||||
return fileType(llm.hyperparameters.FileType)
|
||||
}
|
||||
|
||||
type llamaHyperparameters struct {
|
||||
@@ -157,70 +144,7 @@ type llamaHyperparameters struct {
|
||||
NumRot uint32
|
||||
|
||||
// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
|
||||
FileType llamaFileType
|
||||
}
|
||||
|
||||
type llamaFileType uint32
|
||||
|
||||
const (
|
||||
llamaFileTypeF32 llamaFileType = iota
|
||||
llamaFileTypeF16
|
||||
llamaFileTypeQ4_0
|
||||
llamaFileTypeQ4_1
|
||||
llamaFileTypeQ4_1_F16
|
||||
llamaFileTypeQ8_0 llamaFileType = iota + 2
|
||||
llamaFileTypeQ5_0
|
||||
llamaFileTypeQ5_1
|
||||
llamaFileTypeQ2_K
|
||||
llamaFileTypeQ3_K_S
|
||||
llamaFileTypeQ3_K_M
|
||||
llamaFileTypeQ3_K_L
|
||||
llamaFileTypeQ4_K_S
|
||||
llamaFileTypeQ4_K_M
|
||||
llamaFileTypeQ5_K_S
|
||||
llamaFileTypeQ5_K_M
|
||||
llamaFileTypeQ6_K
|
||||
)
|
||||
|
||||
func (ft llamaFileType) String() string {
|
||||
switch ft {
|
||||
case llamaFileTypeF32:
|
||||
return "F32"
|
||||
case llamaFileTypeF16:
|
||||
return "F16"
|
||||
case llamaFileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case llamaFileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case llamaFileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case llamaFileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case llamaFileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case llamaFileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case llamaFileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case llamaFileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case llamaFileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case llamaFileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case llamaFileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case llamaFileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case llamaFileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case llamaFileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case llamaFileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
FileType uint32
|
||||
}
|
||||
|
||||
type Running struct {
|
||||
@@ -229,11 +153,81 @@ type Running struct {
|
||||
Cancel context.CancelFunc
|
||||
}
|
||||
|
||||
type ModelRunner struct {
|
||||
Path string // path to the model runner executable
|
||||
}
|
||||
|
||||
type llama struct {
|
||||
api.Options
|
||||
Running
|
||||
}
|
||||
|
||||
var errNoGPU = errors.New("nvidia-smi command failed")
|
||||
|
||||
// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
|
||||
func CheckVRAM() (int, error) {
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
|
||||
var stdout bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return 0, errNoGPU
|
||||
}
|
||||
|
||||
var total int
|
||||
scanner := bufio.NewScanner(&stdout)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
vram, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
|
||||
}
|
||||
|
||||
total += vram
|
||||
}
|
||||
|
||||
return total, nil
|
||||
}
|
||||
|
||||
func NumGPU(opts api.Options) int {
|
||||
if opts.NumGPU != -1 {
|
||||
return opts.NumGPU
|
||||
}
|
||||
n := 1 // default to enable metal on macOS
|
||||
if runtime.GOOS == "linux" {
|
||||
vram, err := CheckVRAM()
|
||||
if err != nil {
|
||||
if err.Error() != "nvidia-smi command failed" {
|
||||
log.Print(err.Error())
|
||||
}
|
||||
// nvidia driver not installed or no nvidia GPU found
|
||||
return 0
|
||||
}
|
||||
// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
|
||||
switch {
|
||||
case vram < 500:
|
||||
log.Printf("WARNING: Low VRAM detected, disabling GPU")
|
||||
n = 0
|
||||
case vram < 1000:
|
||||
n = 4
|
||||
case vram < 2000:
|
||||
n = 8
|
||||
case vram < 4000:
|
||||
n = 12
|
||||
case vram < 8000:
|
||||
n = 16
|
||||
case vram < 12000:
|
||||
n = 24
|
||||
case vram < 16000:
|
||||
n = 32
|
||||
default:
|
||||
n = 48
|
||||
}
|
||||
log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
@@ -250,14 +244,17 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
params := []string{
|
||||
"--model", model,
|
||||
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
||||
"--gqa", fmt.Sprintf("%d", opts.NumGQA),
|
||||
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
||||
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
|
||||
"--embedding",
|
||||
}
|
||||
|
||||
if opts.NumGQA > 0 {
|
||||
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
|
||||
}
|
||||
|
||||
if len(adapters) > 0 {
|
||||
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
|
||||
params = append(params, "--lora", adapters[0])
|
||||
@@ -289,17 +286,25 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
runner.Path,
|
||||
append(params, "--port", strconv.Itoa(port))...,
|
||||
)
|
||||
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}
|
||||
|
||||
log.Print("starting llama.cpp server")
|
||||
if err := llm.Cmd.Start(); err != nil {
|
||||
log.Printf("error starting the external llama.cpp server: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := waitForServer(llm); err != nil {
|
||||
log.Printf("error starting llama.cpp server: %v", err)
|
||||
llm.Close()
|
||||
// try again
|
||||
continue
|
||||
}
|
||||
|
||||
// server started successfully
|
||||
return llm, nil
|
||||
}
|
||||
@@ -308,48 +313,31 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
|
||||
}
|
||||
|
||||
func waitForServer(llm *llama) error {
|
||||
log.Print("starting llama.cpp server")
|
||||
var stderr bytes.Buffer
|
||||
llm.Cmd.Stderr = &stderr
|
||||
err := llm.Cmd.Start()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error starting the external llama.cpp server: %w", err)
|
||||
}
|
||||
|
||||
exitChan := make(chan error, 1)
|
||||
|
||||
// the server is a long running process, watch for it exiting to keep track of something going wrong
|
||||
go func() {
|
||||
err := llm.Cmd.Wait()
|
||||
log.Print(stderr.String())
|
||||
exitChan <- err
|
||||
}()
|
||||
|
||||
// wait for the server to start responding
|
||||
start := time.Now()
|
||||
expiresAt := time.Now().Add(30 * time.Second)
|
||||
ticker := time.NewTicker(100 * time.Millisecond)
|
||||
expiresAt := time.Now().Add(45 * time.Second)
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
|
||||
log.Print("waiting for llama.cpp server to start responding")
|
||||
for range ticker.C {
|
||||
if time.Now().After(expiresAt) {
|
||||
return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
if time.Now().After(expiresAt) {
|
||||
return fmt.Errorf("llama.cpp server did not start responding within 30 seconds, retrying")
|
||||
}
|
||||
if err := llm.Ping(context.Background()); err == nil {
|
||||
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
||||
return nil
|
||||
}
|
||||
case err := <-exitChan:
|
||||
return fmt.Errorf("llama.cpp server exited unexpectedly: %w", err)
|
||||
if err := llm.Ping(context.Background()); err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *llama) Close() {
|
||||
llm.Running.Cmd.Cancel()
|
||||
llm.Cancel()
|
||||
if err := llm.Cmd.Wait(); err != nil {
|
||||
log.Printf("llama.cpp server exited with error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *llama) SetOptions(opts api.Options) {
|
||||
@@ -676,7 +664,7 @@ func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error
|
||||
|
||||
// Ping checks that the server subprocess is still running and responding to requests
|
||||
func (llm *llama) Ping(ctx context.Context) error {
|
||||
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Running.Port))
|
||||
resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
|
||||
if err != nil {
|
||||
return fmt.Errorf("ping resp: %w", err)
|
||||
}
|
llm/llm.go — 44 changed lines
@@ -32,15 +32,22 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, err := DecodeGGML(f, ModelFamilyLlama)
|
||||
ggml, err := DecodeGGML(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch ggml.FileType().String() {
|
||||
case "F32", "Q5_0", "Q5_1", "Q8_0":
|
||||
switch ggml.FileType() {
|
||||
case "Q8_0":
|
||||
if ggml.Name() != "gguf" && opts.NumGPU != 0 {
|
||||
// GGML Q8_0 do not support Metal API and will
|
||||
// cause the runner to segmentation fault so disable GPU
|
||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||
opts.NumGPU = 0
|
||||
}
|
||||
case "F32", "Q5_0", "Q5_1":
|
||||
if opts.NumGPU != 0 {
|
||||
// F32, F16, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
||||
// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
|
||||
// cause the runner to segmentation fault so disable GPU
|
||||
log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
|
||||
opts.NumGPU = 0
|
||||
@@ -49,34 +56,43 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
|
||||
|
||||
totalResidentMemory := memory.TotalMemory()
|
||||
switch ggml.ModelType() {
|
||||
case ModelType3B, ModelType7B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
|
||||
case "3B", "7B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
|
||||
} else if totalResidentMemory < 8*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 8GB of memory")
|
||||
}
|
||||
case ModelType13B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
|
||||
case "13B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
|
||||
} else if totalResidentMemory < 16*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 16GB of memory")
|
||||
}
|
||||
case ModelType30B, ModelType34B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
|
||||
case "30B", "34B", "40B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
|
||||
} else if totalResidentMemory < 32*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 32GB of memory")
|
||||
}
|
||||
case ModelType65B:
|
||||
if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
|
||||
case "65B", "70B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
|
||||
} else if totalResidentMemory < 64*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 64GB of memory")
|
||||
}
|
||||
case "180B":
|
||||
if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
|
||||
return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
|
||||
} else if totalResidentMemory < 128*1024*1024 {
|
||||
return nil, fmt.Errorf("model requires at least 128GB of memory")
|
||||
}
|
||||
}
|
||||
|
||||
switch ggml.ModelFamily() {
|
||||
case ModelFamilyLlama:
|
||||
switch ggml.Name() {
|
||||
case "gguf":
|
||||
opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
|
||||
return newLlama(model, adapters, ggufRunner(), opts)
|
||||
case "ggml", "ggmf", "ggjt", "ggla":
|
||||
return newLlama(model, adapters, ggmlRunner(), opts)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
||||
|
@@ -8,7 +8,7 @@ GO_LDFLAGS="$GO_LDFLAGS -X github.com/jmorganca/ollama/server.mode=release"
|
||||
# build universal binary
|
||||
GOARCH=arm64 go generate ./...
|
||||
GOARCH=arm64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-arm64
|
||||
rm -rf llm/llama.cpp/ggml/build/*/bin
|
||||
rm -rf llm/llama.cpp/*/build/*/bin
|
||||
GOARCH=amd64 go generate ./...
|
||||
GOARCH=amd64 go build -ldflags "$GO_LDFLAGS" -o dist/ollama-darwin-amd64
|
||||
lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
|
||||
|
@@ -103,7 +103,7 @@ func getAuthToken(ctx context.Context, redirData AuthRedirect, regOpts *Registry
|
||||
|
||||
headers := make(http.Header)
|
||||
headers.Set("Authorization", sig)
|
||||
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, regOpts)
|
||||
resp, err := makeRequest(ctx, "GET", redirectURL, headers, nil, nil)
|
||||
if err != nil {
|
||||
log.Printf("couldn't get token: %q", err)
|
||||
}
|
||||
|
server/images.go — 189 changed lines
@@ -114,10 +114,11 @@ type LayerReader struct {
|
||||
}
|
||||
|
||||
type ConfigV2 struct {
|
||||
ModelFamily llm.ModelFamily `json:"model_family"`
|
||||
ModelType string `json:"model_type"`
|
||||
FileType string `json:"file_type"`
|
||||
RootFS RootFS `json:"rootfs"`
|
||||
ModelFormat string `json:"model_format"`
|
||||
ModelFamily string `json:"model_family"`
|
||||
ModelType string `json:"model_type"`
|
||||
FileType string `json:"file_type"`
|
||||
RootFS RootFS `json:"rootfs"`
|
||||
|
||||
// required by spec
|
||||
Architecture string `json:"architecture"`
|
||||
@@ -268,6 +269,29 @@ func filenameWithPath(path, f string) (string, error) {
|
||||
}
|
||||
|
||||
func CreateModel(ctx context.Context, name string, path string, fn func(resp api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
|
||||
var manifest *ManifestV2
|
||||
var err error
|
||||
var noprune string
|
||||
|
||||
// build deleteMap to prune unused layers
|
||||
deleteMap := make(map[string]bool)
|
||||
|
||||
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
manifest, _, err = GetManifest(mp)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
|
||||
if manifest != nil {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := os.Open(path)
|
||||
if err != nil {
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't open modelfile '%s'", path)})
|
||||
@@ -328,14 +352,15 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, err := llm.DecodeGGML(file, llm.ModelFamilyLlama)
|
||||
ggml, err := llm.DecodeGGML(file)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.ModelFormat = ggml.Name()
|
||||
config.ModelFamily = ggml.ModelFamily()
|
||||
config.ModelType = ggml.ModelType().String()
|
||||
config.FileType = ggml.FileType().String()
|
||||
config.ModelType = ggml.ModelType()
|
||||
config.FileType = ggml.FileType()
|
||||
|
||||
// reset the file
|
||||
file.Seek(0, io.SeekStart)
|
||||
@@ -369,6 +394,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
// copy the model metadata
|
||||
config.ModelFamily = source.ModelFamily
|
||||
config.ModelType = source.ModelType
|
||||
config.ModelFormat = source.ModelFormat
|
||||
config.FileType = source.FileType
|
||||
|
||||
for _, l := range mf.Layers {
|
||||
@@ -472,6 +498,12 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
}
|
||||
}
|
||||
|
||||
if config.ModelType == "65B" {
|
||||
if numGQA, ok := formattedParams["num_gqa"].(int); ok && numGQA == 8 {
|
||||
config.ModelType = "70B"
|
||||
}
|
||||
}
|
||||
|
||||
bts, err := json.Marshal(formattedParams)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -503,6 +535,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
var manifestLayers []*Layer
|
||||
for _, l := range layers {
|
||||
manifestLayers = append(manifestLayers, &l.Layer)
|
||||
delete(deleteMap, l.Layer.Digest)
|
||||
}
|
||||
|
||||
// Create a layer for the config object
|
||||
@@ -512,6 +545,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
layers = append(layers, cfg)
|
||||
delete(deleteMap, cfg.Layer.Digest)
|
||||
|
||||
if err := SaveLayers(layers, fn, false); err != nil {
|
||||
return err
|
||||
@@ -524,6 +558,14 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
|
||||
return err
|
||||
}
|
||||
|
||||
if noprune == "" {
|
||||
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "success"})
|
||||
return nil
|
||||
}
|
||||
@@ -779,14 +821,14 @@ func formatParams(params map[string][]string) (map[string]interface{}, error) {
|
||||
return nil, fmt.Errorf("invalid float value %s", vals)
|
||||
}
|
||||
|
||||
out[key] = floatVal
|
||||
out[key] = float32(floatVal)
|
||||
case reflect.Int:
|
||||
intVal, err := strconv.ParseInt(vals[0], 10, 0)
|
||||
intVal, err := strconv.ParseInt(vals[0], 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid int value %s", vals)
|
||||
}
|
||||
|
||||
out[key] = intVal
|
||||
out[key] = int(intVal)
|
||||
case reflect.Bool:
|
||||
boolVal, err := strconv.ParseBool(vals[0])
|
||||
if err != nil {
|
||||
@@ -866,18 +908,7 @@ func CopyModel(src, dest string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func DeleteModel(name string) error {
|
||||
mp := ParseModelPath(name)
|
||||
manifest, _, err := GetManifest(mp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
deleteMap := make(map[string]bool)
|
||||
for _, layer := range manifest.Layers {
|
||||
deleteMap[layer.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
|
||||
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]bool, dryRun bool) error {
|
||||
fp, err := GetManifestPath()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -894,14 +925,13 @@ func DeleteModel(name string) error {
|
||||
fmp := ParseModelPath(tag)
|
||||
|
||||
// skip the manifest we're trying to delete
|
||||
if mp.GetFullTagname() == fmp.GetFullTagname() {
|
||||
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// save (i.e. delete from the deleteMap) any files used in other manifests
|
||||
manifest, _, err := GetManifest(fmp)
|
||||
if err != nil {
|
||||
log.Printf("skipping file: %s", fp)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -925,14 +955,72 @@ func DeleteModel(name string) error {
|
||||
log.Printf("couldn't get file path for '%s': %v", k, err)
|
||||
continue
|
||||
}
|
||||
if err := os.Remove(fp); err != nil {
|
||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||
continue
|
||||
if !dryRun {
|
||||
if err := os.Remove(fp); err != nil {
|
||||
log.Printf("couldn't remove file '%s': %v", fp, err)
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
log.Printf("wanted to remove: %s", fp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fp, err = mp.GetManifestPath(false)
|
||||
return nil
|
||||
}
|
||||
|
||||
func PruneLayers() error {
|
||||
deleteMap := make(map[string]bool)
|
||||
p, err := GetBlobsPath("")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
blobs, err := os.ReadDir(p)
|
||||
if err != nil {
|
||||
log.Printf("couldn't read dir '%s': %v", p, err)
|
||||
return err
|
||||
}
|
||||
|
||||
for _, blob := range blobs {
|
||||
name := blob.Name()
|
||||
if runtime.GOOS == "windows" {
|
||||
name = strings.ReplaceAll(name, "-", ":")
|
||||
}
|
||||
deleteMap[name] = true
|
||||
}
|
||||
|
||||
log.Printf("total blobs: %d", len(deleteMap))
|
||||
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("total unused blobs removed: %d", len(deleteMap))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func DeleteModel(name string) error {
|
||||
mp := ParseModelPath(name)
|
||||
manifest, _, err := GetManifest(mp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deleteMap := make(map[string]bool)
|
||||
for _, layer := range manifest.Layers {
|
||||
deleteMap[layer.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
|
||||
err = deleteUnusedLayers(&mp, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fp, err := mp.GetManifestPath(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -1111,13 +1199,34 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn func(api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
|
||||
var manifest *ManifestV2
|
||||
var err error
|
||||
var noprune string
|
||||
|
||||
// build deleteMap to prune unused layers
|
||||
deleteMap := make(map[string]bool)
|
||||
|
||||
if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
|
||||
manifest, _, err = GetManifest(mp)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
|
||||
if manifest != nil {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = true
|
||||
}
|
||||
deleteMap[manifest.Config.Digest] = true
|
||||
}
|
||||
}
|
||||
|
||||
if mp.ProtocolScheme == "http" && !regOpts.Insecure {
|
||||
return fmt.Errorf("insecure protocol http")
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "pulling manifest"})
|
||||
|
||||
manifest, err := pullModelManifest(ctx, mp, regOpts)
|
||||
manifest, err = pullModelManifest(ctx, mp, regOpts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("pull model manifest: %s", err)
|
||||
}
|
||||
@@ -1137,7 +1246,9 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
delete(deleteMap, layer.Digest)
|
||||
}
|
||||
delete(deleteMap, manifest.Config.Digest)
|
||||
|
||||
fn(api.ProgressResponse{Status: "verifying sha256 digest"})
|
||||
for _, layer := range layers {
|
||||
@@ -1175,6 +1286,14 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
|
||||
return err
|
||||
}
|
||||
|
||||
if noprune == "" {
|
||||
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||
err = deleteUnusedLayers(nil, deleteMap, false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
fn(api.ProgressResponse{Status: "success"})
|
||||
|
||||
return nil
|
||||
@@ -1300,7 +1419,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
|
||||
}
|
||||
|
||||
func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *RegistryOptions) (*http.Response, error) {
|
||||
if requestURL.Scheme != "http" && regOpts.Insecure {
|
||||
if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
|
||||
requestURL.Scheme = "http"
|
||||
}
|
||||
|
||||
@@ -1313,10 +1432,12 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
|
||||
req.Header = headers
|
||||
}
|
||||
|
||||
if regOpts.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
||||
} else if regOpts.Username != "" && regOpts.Password != "" {
|
||||
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
||||
if regOpts != nil {
|
||||
if regOpts.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+regOpts.Token)
|
||||
} else if regOpts.Username != "" && regOpts.Password != "" {
|
||||
req.SetBasicAuth(regOpts.Username, regOpts.Password)
|
||||
}
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
||||
|
@@ -133,7 +133,12 @@ func GetBlobsPath(digest string) (string, error) {
|
||||
}
|
||||
|
||||
path := filepath.Join(home, ".ollama", "models", "blobs", digest)
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||
dirPath := filepath.Dir(path)
|
||||
if digest == "" {
|
||||
dirPath = path
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(dirPath, 0o755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
|
@@ -12,6 +12,7 @@ import (
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -363,6 +364,7 @@ func DeleteModelHandler(c *gin.Context) {
|
||||
}
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusOK, nil)
|
||||
}
|
||||
|
||||
func ShowModelHandler(c *gin.Context) {
|
||||
@@ -547,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
|
||||
os.Exit(0)
|
||||
}()
|
||||
|
||||
if runtime.GOOS == "linux" {
|
||||
// check compatibility to log warnings
|
||||
if _, err := llm.CheckVRAM(); err != nil {
|
||||
log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return s.Serve(ln)
|
||||
}
|
||||
|
||||
|
@@ -66,31 +66,39 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
|
||||
sectionReader := io.NewSectionReader(f, int64(offset), chunk)
|
||||
for try := 0; try < MaxRetries; try++ {
|
||||
ch := make(chan error, 1)
|
||||
|
||||
r, w := io.Pipe()
|
||||
defer r.Close()
|
||||
go func() {
|
||||
defer w.Close()
|
||||
|
||||
for chunked := int64(0); chunked < chunk; {
|
||||
n, err := io.CopyN(w, sectionReader, 1024*1024)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
select {
|
||||
case err := <-ch:
|
||||
log.Printf("chunk interrupted: %v", err)
|
||||
return
|
||||
default:
|
||||
n, err := io.CopyN(w, sectionReader, 1024*1024)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error reading chunk: %v", err),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset),
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
chunked += n
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("error reading chunk: %v", err),
|
||||
Status: fmt.Sprintf("uploading %s", layer.Digest),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset),
|
||||
Completed: int(offset) + int(chunked),
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
chunked += n
|
||||
fn(api.ProgressResponse{
|
||||
Status: fmt.Sprintf("uploading %s", layer.Digest),
|
||||
Digest: layer.Digest,
|
||||
Total: layer.Size,
|
||||
Completed: int(offset) + int(chunked),
|
||||
})
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -113,6 +121,8 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusUnauthorized:
|
||||
ch <- errors.New("unauthorized")
|
||||
|
||||
auth := resp.Header.Get("www-authenticate")
|
||||
authRedir := ParseAuthRedirectString(auth)
|
||||
token, err := getAuthToken(ctx, authRedir, regOpts)
|
||||
@@ -121,10 +131,7 @@ func uploadBlobChunked(ctx context.Context, requestURL *url.URL, layer *Layer, r
|
||||
}
|
||||
|
||||
regOpts.Token = token
|
||||
if _, err := sectionReader.Seek(0, io.SeekStart); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
sectionReader = io.NewSectionReader(f, int64(offset), chunk)
|
||||
continue
|
||||
case resp.StatusCode >= http.StatusBadRequest:
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
|