From 3b2d2c8326c245f0210a549777d0a77c2ccd92d1 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 5 May 2025 14:54:40 -0700
Subject: [PATCH] api: remove unused or unsupported api options (#10574)

Some options listed in api/types.go are not supported in newer models, or
have been deprecated in the past. This is the first of a series of PRs to
clean up the API options
---
 api/types.go                  | 7 -------
 docs/api.md                   | 3 ---
 llama/llama.go                | 2 --
 llm/server.go                 | 4 ----
 parser/parser.go              | 9 ++++++++-
 parser/parser_test.go         | 4 ----
 runner/llamarunner/runner.go  | 2 --
 runner/ollamarunner/runner.go | 1 -
 8 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/api/types.go b/api/types.go
index 7d8b6e532..bb9f181c7 100644
--- a/api/types.go
+++ b/api/types.go
@@ -283,12 +283,7 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
-	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-	LogitsAll bool  `json:"logits_all,omitempty"`
-	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
-	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
-			LowVRAM:   false,
-			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
diff --git a/docs/api.md b/docs/api.md
index 7f3e5e2d2..df5edff6d 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
diff --git a/llama/llama.go b/llama/llama.go
index 7f9f9549a..ccd63b5a4 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -199,7 +199,6 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
-	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
-	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)

 	if len(params.TensorSplit) > 0 {
diff --git a/llm/server.go b/llm/server.go
index 7172d9240..d7c466a9a 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--no-mmap")
 	}

-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
 	// TODO - NUMA support currently doesn't work properly

 	params = append(params, "--parallel", strconv.Itoa(numParallel))
diff --git a/parser/parser.go b/parser/parser.go
index 0a732653c..7c94d2002 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
 	return sb.String()
 }

-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+	"penalize_newline",
+	"low_vram",
+	"f16_kv",
+	"logits_all",
+	"vocab_only",
+	"use_mlock",
+}

 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
diff --git a/parser/parser_test.go b/parser/parser_test.go
index 097c058fb..f2aa5ab79 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gqa 1":        {"num_gqa", "1"},
 		"num_gpu 1":        {"num_gpu", "1"},
 		"main_gpu 1":       {"main_gpu", "1"},
-		"low_vram true":    {"low_vram", "true"},
-		"logits_all true":  {"logits_all", "true"},
-		"vocab_only true":  {"vocab_only", "true"},
 		"use_mmap true":    {"use_mmap", "true"},
-		"use_mlock true":   {"use_mlock", "true"},
 		"num_thread 1":     {"num_thread", "1"},
 		"num_keep 1":       {"num_keep", "1"},
 		"seed 1":           {"seed", "1"},
diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go
index d8169be40..5b7d6c317 100644
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -820,7 +820,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -876,7 +875,6 @@ func Execute(args []string) error {
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
 		UseMmap:      !*noMmap && lpaths.String() == "",
-		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
 			server.progress = progress
diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go
index 3e0bb34ec..b028a7216 100644
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -818,7 +818,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
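
Reviewer note, not part of the patch: a minimal, hypothetical sketch of how the expanded deprecatedParameters list could be applied when Modelfile parameters are turned into a request. The helper name filterDeprecated and the silent-drop behavior are assumptions for illustration; the actual handling in parser/parser.go is not shown in this diff and may warn instead.

// Hypothetical illustration only (not part of this patch): drops any
// parameter whose key appears in deprecatedParameters before it would be
// forwarded to the runner.
package main

import (
	"fmt"
	"slices"
)

var deprecatedParameters = []string{
	"penalize_newline",
	"low_vram",
	"f16_kv",
	"logits_all",
	"vocab_only",
	"use_mlock",
}

// filterDeprecated returns a copy of params with deprecated keys removed.
func filterDeprecated(params map[string]any) map[string]any {
	kept := make(map[string]any, len(params))
	for k, v := range params {
		if slices.Contains(deprecatedParameters, k) {
			continue // drop deprecated options instead of passing them through
		}
		kept[k] = v
	}
	return kept
}

func main() {
	in := map[string]any{"num_gpu": 1, "use_mlock": true, "num_thread": 8}
	fmt.Println(filterDeprecated(in)) // map[num_gpu:1 num_thread:8]
}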