api: remove unused or unsupported api options (#10574)

Some options listed in api/types.go are not supported in
newer models, or have been deprecated in the past. This is
the first of a series of PRs to clean up the API options.
Jeffrey Morgan 2025-05-05 14:54:40 -07:00 committed by GitHub
parent d931ee8f22
commit 3b2d2c8326
8 changed files with 8 additions and 24 deletions
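
For API consumers, the practical effect is that only the surviving runner options are worth sending. A minimal client sketch in Go, assuming this repo's api package and an illustrative model name:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3.2", // illustrative model name
		Prompt: "Why is the sky blue?",
		// Only runner options that survive this cleanup; low_vram,
		// f16_kv, logits_all, vocab_only, and use_mlock are gone.
		Options: map[string]any{
			"num_batch":  512,
			"num_gpu":    -1, // -1: set dynamically
			"main_gpu":   0,
			"use_mmap":   true,
			"num_thread": 8,
		},
	}

	err = client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
		fmt.Print(r.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}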

api/types.go

@@ -283,12 +283,7 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
-	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-	LogitsAll bool  `json:"logits_all,omitempty"`
-	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
-	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
-			LowVRAM:   false,
-			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
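
With LowVRAM and UseMLock gone from the defaults, callers who want non-default runner settings start from DefaultOptions and override the fields that remain. A short sketch, assuming the api package above (the embedded Runner struct promotes these fields):

package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	opts := api.DefaultOptions()
	opts.NumBatch = 256 // promoted from the embedded Runner struct
	opts.NumGPU = 1     // layers to offload; -1 means decide dynamically
	opts.NumThread = 8  // 0 lets the runtime decide
	fmt.Printf("%+v\n", opts.Runner)
}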

docs/api.md

@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
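
Requests that still carry the deleted keys should not break: as far as I can tell, the server's option mapping warns on unknown keys and skips them, but clients can prune them proactively. A hypothetical helper (removedOptions and pruneOptions are illustrative, not part of the API):

package main

import "fmt"

// Keys deleted from the request example above.
var removedOptions = []string{"low_vram", "f16_kv", "logits_all", "vocab_only", "use_mlock"}

// pruneOptions strips the removed keys from an options map in place.
func pruneOptions(opts map[string]any) map[string]any {
	for _, k := range removedOptions {
		delete(opts, k)
	}
	return opts
}

func main() {
	opts := map[string]any{"use_mlock": false, "num_thread": 8}
	fmt.Println(pruneOptions(opts)) // map[num_thread:8]
}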

llama/llama.go

@@ -199,7 +199,6 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
-	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
-	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)
 	if len(params.TensorSplit) > 0 {

llm/server.go

@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--no-mmap")
 	}
-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
 	// TODO - NUMA support currently doesn't work properly
 	params = append(params, "--parallel", strconv.Itoa(numParallel))
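
After this hunk, the only memory-related flag the server forwards to the runner is --no-mmap. A hypothetical distillation of the surviving logic (memoryFlags is not a real function in this file):

package main

import "fmt"

// memoryFlags mirrors the surviving logic: --no-mmap is emitted when
// memory mapping is explicitly disabled; --mlock is no longer an option.
func memoryFlags(useMMap *bool) []string {
	var params []string
	if useMMap != nil && !*useMMap {
		params = append(params, "--no-mmap")
	}
	return params
}

func main() {
	off := false
	fmt.Println(memoryFlags(&off)) // [--no-mmap]
	fmt.Println(memoryFlags(nil))  // []
}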

parser/parser.go

@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
 	return sb.String()
 }
 
-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+	"penalize_newline",
+	"low_vram",
+	"f16_kv",
+	"logits_all",
+	"vocab_only",
+	"use_mlock",
+}
 
 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
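
One plausible way the expanded list gets used when turning PARAMETER lines into a request is sketched below; filterDeprecated is hypothetical, and the real handling lives in CreateRequest:

package main

import (
	"fmt"
	"log/slog"
	"slices"
)

var deprecatedParameters = []string{
	"penalize_newline", "low_vram", "f16_kv",
	"logits_all", "vocab_only", "use_mlock",
}

// filterDeprecated drops deprecated parameters with a warning so old
// Modelfiles keep working instead of failing to parse.
func filterDeprecated(params map[string]any) map[string]any {
	out := make(map[string]any, len(params))
	for k, v := range params {
		if slices.Contains(deprecatedParameters, k) {
			slog.Warn("ignoring deprecated parameter", "name", k)
			continue
		}
		out[k] = v
	}
	return out
}

func main() {
	fmt.Println(filterDeprecated(map[string]any{"use_mlock": true, "num_ctx": 4096}))
}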

parser/parser_test.go

@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gqa 1":       {"num_gqa", "1"},
 		"num_gpu 1":       {"num_gpu", "1"},
 		"main_gpu 1":      {"main_gpu", "1"},
-		"low_vram true":   {"low_vram", "true"},
-		"logits_all true": {"logits_all", "true"},
-		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true":   {"use_mmap", "true"},
-		"use_mlock true":  {"use_mlock", "true"},
 		"num_thread 1":    {"num_thread", "1"},
 		"num_keep 1":      {"num_keep", "1"},
 		"seed 1":          {"seed", "1"},

runner/llamarunner/runner.go

@@ -820,7 +820,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -876,7 +875,6 @@ func Execute(args []string) error {
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
 		UseMmap:      !*noMmap && lpaths.String() == "",
-		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
 			server.progress = progress

runner/ollamarunner/runner.go

@@ -818,7 +818,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
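
Note the asymmetry between the two runners: the previous file drops --mlock and keeps a live --no-mmap flag, while this one keeps --no-mmap registered but discards its value so existing command lines still parse. A minimal sketch of that accept-but-ignore pattern, using only the standard flag package:

package main

import (
	"flag"
	"fmt"
	"log"
)

func main() {
	fs := flag.NewFlagSet("runner", flag.ExitOnError)
	// Registered for compatibility; the returned pointer is discarded,
	// so the value is parsed and then ignored.
	_ = fs.Bool("no-mmap", false, "accepted for compatibility; ignored")
	threads := fs.Int("threads", 4, "number of threads to use")
	if err := fs.Parse([]string{"--no-mmap", "--threads", "8"}); err != nil {
		log.Fatal(err)
	}
	fmt.Println("threads:", *threads) // threads: 8
}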