api: remove unused or unsupported api options (#10574)
Some of the options listed in api/types.go are not supported by newer models or have already been deprecated. This is the first in a series of PRs to clean up the API options.
commit 3b2d2c8326
parent d931ee8f22
@@ -283,12 +283,7 @@ type Runner struct {
     NumBatch  int   `json:"num_batch,omitempty"`
     NumGPU    int   `json:"num_gpu,omitempty"`
     MainGPU   int   `json:"main_gpu,omitempty"`
-    LowVRAM   bool  `json:"low_vram,omitempty"`
-    F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-    LogitsAll bool  `json:"logits_all,omitempty"`
-    VocabOnly bool  `json:"vocab_only,omitempty"`
     UseMMap   *bool `json:"use_mmap,omitempty"`
-    UseMLock  bool  `json:"use_mlock,omitempty"`
     NumThread int   `json:"num_thread,omitempty"`
 }
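Judging from the context lines in this hunk, the trimmed Runner struct ends up looking roughly like the sketch below; fields above the hunk (such as NumCtx) are unchanged and omitted here:

type Runner struct {
    NumBatch  int   `json:"num_batch,omitempty"`
    NumGPU    int   `json:"num_gpu,omitempty"`
    MainGPU   int   `json:"main_gpu,omitempty"`
    // UseMMap stays a pointer so an unset value (nil) can be told apart from
    // an explicit false; DefaultOptions below leaves it as nil.
    UseMMap   *bool `json:"use_mmap,omitempty"`
    NumThread int   `json:"num_thread,omitempty"`
}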
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
             NumBatch:  512,
             NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
             NumThread: 0,  // let the runtime decide
-            LowVRAM:   false,
-            UseMLock:  false,
             UseMMap:   nil,
         },
     }
@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
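For reference, the trimmed options from this documentation example can be exercised from Go with a plain net/http call. This is a standalone sketch; the model name and prompt are placeholders, not values taken from the docs:

package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    // Only options that remain documented after this change; low_vram,
    // vocab_only, and use_mlock are no longer part of the example.
    body := []byte(`{
        "model": "llama3.2",
        "prompt": "Why is the sky blue?",
        "options": {
            "num_batch": 2,
            "num_gpu": 1,
            "main_gpu": 0,
            "use_mmap": true,
            "num_thread": 8
        }
    }`)

    resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
    if err != nil {
        fmt.Println("request failed:", err)
        return
    }
    defer resp.Body.Close()
    fmt.Println("status:", resp.Status)
}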
@@ -199,7 +199,6 @@ type ModelParams struct {
     NumGpuLayers int
     MainGpu      int
     UseMmap      bool
-    UseMlock     bool
     TensorSplit  []float32
     Progress     func(float32)
     VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
     cparams.n_gpu_layers = C.int(params.NumGpuLayers)
     cparams.main_gpu = C.int32_t(params.MainGpu)
     cparams.use_mmap = C.bool(params.UseMmap)
-    cparams.use_mlock = C.bool(params.UseMlock)
     cparams.vocab_only = C.bool(params.VocabOnly)

     if len(params.TensorSplit) > 0 {
@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         params = append(params, "--no-mmap")
     }

-    if opts.UseMLock {
-        params = append(params, "--mlock")
-    }
-
     // TODO - NUMA support currently doesn't work properly

     params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
     return sb.String()
 }

-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+    "penalize_newline",
+    "low_vram",
+    "f16_kv",
+    "logits_all",
+    "vocab_only",
+    "use_mlock",
+}

 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
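Not part of this diff, but as a rough, self-contained sketch of how a list like deprecatedParameters can be used to drop deprecated keys before they are forwarded (the filterParameters helper is made up for illustration):

package main

import (
    "fmt"
    "slices"
)

var deprecatedParameters = []string{
    "penalize_newline",
    "low_vram",
    "f16_kv",
    "logits_all",
    "vocab_only",
    "use_mlock",
}

// filterParameters returns a copy of params with all deprecated keys removed.
func filterParameters(params map[string]any) map[string]any {
    out := make(map[string]any, len(params))
    for k, v := range params {
        if slices.Contains(deprecatedParameters, k) {
            continue // deprecated options are dropped rather than forwarded
        }
        out[k] = v
    }
    return out
}

func main() {
    fmt.Println(filterParameters(map[string]any{
        "num_thread": 8,
        "use_mlock":  true, // deprecated: dropped by the sketch above
    }))
}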
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
         "num_gqa 1": {"num_gqa", "1"},
         "num_gpu 1": {"num_gpu", "1"},
         "main_gpu 1": {"main_gpu", "1"},
-        "low_vram true": {"low_vram", "true"},
-        "logits_all true": {"logits_all", "true"},
-        "vocab_only true": {"vocab_only", "true"},
         "use_mmap true": {"use_mmap", "true"},
-        "use_mlock true": {"use_mlock", "true"},
         "num_thread 1": {"num_thread", "1"},
         "num_keep 1": {"num_keep", "1"},
         "seed 1": {"seed", "1"},
@@ -820,7 +820,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -876,7 +875,6 @@ func Execute(args []string) error {
         NumGpuLayers: *nGpuLayers,
         MainGpu:      *mainGpu,
         UseMmap:      !*noMmap && lpaths.String() == "",
-        UseMlock:     *mlock,
         TensorSplit:  tensorSplitFloats,
         Progress: func(progress float32) {
             server.progress = progress
@@ -818,7 +818,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    _ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")