From 3b2d2c8326c245f0210a549777d0a77c2ccd92d1 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 5 May 2025 14:54:40 -0700
Subject: [PATCH] api: remove unused or unsupported api options (#10574)

Some options listed in api/types.go are not supported in newer models, or
have been deprecated in the past. This is the first of a series of PRs to
clean up the API options
---
 api/types.go                  | 7 -------
 docs/api.md                   | 3 ---
 llama/llama.go                | 2 --
 llm/server.go                 | 4 ----
 parser/parser.go              | 9 ++++++++-
 parser/parser_test.go         | 4 ----
 runner/llamarunner/runner.go  | 2 --
 runner/ollamarunner/runner.go | 1 -
 8 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/api/types.go b/api/types.go
index 7d8b6e532..bb9f181c7 100644
--- a/api/types.go
+++ b/api/types.go
@@ -283,12 +283,7 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
-	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-	LogitsAll bool  `json:"logits_all,omitempty"`
-	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
-	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
-			LowVRAM:   false,
-			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
diff --git a/docs/api.md b/docs/api.md
index 7f3e5e2d2..df5edff6d 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
diff --git a/llama/llama.go b/llama/llama.go
index 7f9f9549a..ccd63b5a4 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -199,7 +199,6 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
-	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
-	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)

 	if len(params.TensorSplit) > 0 {
diff --git a/llm/server.go b/llm/server.go
index 7172d9240..d7c466a9a 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--no-mmap")
 	}

-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
 	// TODO - NUMA support currently doesn't work properly

 	params = append(params, "--parallel", strconv.Itoa(numParallel))
diff --git a/parser/parser.go b/parser/parser.go
index 0a732653c..7c94d2002 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
 	return sb.String()
 }

-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+	"penalize_newline",
+	"low_vram",
+	"f16_kv",
+	"logits_all",
+	"vocab_only",
+	"use_mlock",
+}

 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
diff --git a/parser/parser_test.go b/parser/parser_test.go
index 097c058fb..f2aa5ab79 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gqa 1":        {"num_gqa", "1"},
 		"num_gpu 1":        {"num_gpu", "1"},
 		"main_gpu 1":       {"main_gpu", "1"},
-		"low_vram true":    {"low_vram", "true"},
-		"logits_all true":  {"logits_all", "true"},
-		"vocab_only true":  {"vocab_only", "true"},
 		"use_mmap true":    {"use_mmap", "true"},
-		"use_mlock true":   {"use_mlock", "true"},
 		"num_thread 1":     {"num_thread", "1"},
 		"num_keep 1":       {"num_keep", "1"},
 		"seed 1":           {"seed", "1"},
diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go
index d8169be40..5b7d6c317 100644
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -820,7 +820,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -876,7 +875,6 @@ func Execute(args []string) error {
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
 		UseMmap:      !*noMmap && lpaths.String() == "",
-		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
 			server.progress = progress
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 3e0bb34ec..b028a7216 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -818,7 +818,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
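
Reviewer note, not part of the patch: a minimal, hypothetical sketch of how the expanded deprecatedParameters list could be applied when Modelfile parameters are turned into a request. The helper name filterDeprecated and the silent-drop behavior are assumptions for illustration; the actual handling in parser/parser.go is not shown in this diff and may warn instead.

// Hypothetical illustration only (not part of this patch): drops any
// parameter whose key appears in deprecatedParameters before it would be
// forwarded to the runner.
package main

import (
	"fmt"
	"slices"
)

var deprecatedParameters = []string{
	"penalize_newline",
	"low_vram",
	"f16_kv",
	"logits_all",
	"vocab_only",
	"use_mlock",
}

// filterDeprecated returns a copy of params with deprecated keys removed.
func filterDeprecated(params map[string]any) map[string]any {
	kept := make(map[string]any, len(params))
	for k, v := range params {
		if slices.Contains(deprecatedParameters, k) {
			continue // drop deprecated options instead of passing them through
		}
		kept[k] = v
	}
	return kept
}

func main() {
	in := map[string]any{"num_gpu": 1, "use_mlock": true, "num_thread": 8}
	fmt.Println(filterDeprecated(in)) // map[num_gpu:1 num_thread:8]
}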