api: remove unused or unsupported api options (#10574)
Some of the options listed in api/types.go are not supported by newer models or have already been deprecated. This is the first in a series of PRs to clean up the API options.
commit 3b2d2c8326
parent d931ee8f22
@@ -283,12 +283,7 @@ type Runner struct {
     NumBatch  int   `json:"num_batch,omitempty"`
     NumGPU    int   `json:"num_gpu,omitempty"`
     MainGPU   int   `json:"main_gpu,omitempty"`
-    LowVRAM   bool  `json:"low_vram,omitempty"`
-    F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-    LogitsAll bool  `json:"logits_all,omitempty"`
-    VocabOnly bool  `json:"vocab_only,omitempty"`
     UseMMap   *bool `json:"use_mmap,omitempty"`
-    UseMLock  bool  `json:"use_mlock,omitempty"`
     NumThread int   `json:"num_thread,omitempty"`
 }
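Judging from the context lines in this hunk, the trimmed Runner struct ends up looking roughly like the sketch below; fields above the hunk (such as NumCtx) are unchanged and omitted here:

type Runner struct {
    NumBatch  int   `json:"num_batch,omitempty"`
    NumGPU    int   `json:"num_gpu,omitempty"`
    MainGPU   int   `json:"main_gpu,omitempty"`
    // UseMMap stays a pointer so an unset value (nil) can be told apart from
    // an explicit false; DefaultOptions below leaves it as nil.
    UseMMap   *bool `json:"use_mmap,omitempty"`
    NumThread int   `json:"num_thread,omitempty"`
}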
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
             NumBatch:  512,
             NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
             NumThread: 0,  // let the runtime decide
-            LowVRAM:   false,
-            UseMLock:  false,
             UseMMap:   nil,
         },
     }
@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
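For reference, the trimmed options from this documentation example can be exercised from Go with a plain net/http call. This is a standalone sketch; the model name and prompt are placeholders, not values taken from the docs:

package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    // Only options that remain documented after this change; low_vram,
    // vocab_only, and use_mlock are no longer part of the example.
    body := []byte(`{
        "model": "llama3.2",
        "prompt": "Why is the sky blue?",
        "options": {
            "num_batch": 2,
            "num_gpu": 1,
            "main_gpu": 0,
            "use_mmap": true,
            "num_thread": 8
        }
    }`)

    resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
    if err != nil {
        fmt.Println("request failed:", err)
        return
    }
    defer resp.Body.Close()
    fmt.Println("status:", resp.Status)
}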
@@ -199,7 +199,6 @@ type ModelParams struct {
     NumGpuLayers int
     MainGpu      int
     UseMmap      bool
-    UseMlock     bool
     TensorSplit  []float32
     Progress     func(float32)
     VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
     cparams.n_gpu_layers = C.int(params.NumGpuLayers)
     cparams.main_gpu = C.int32_t(params.MainGpu)
     cparams.use_mmap = C.bool(params.UseMmap)
-    cparams.use_mlock = C.bool(params.UseMlock)
     cparams.vocab_only = C.bool(params.VocabOnly)

     if len(params.TensorSplit) > 0 {
@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         params = append(params, "--no-mmap")
     }

-    if opts.UseMLock {
-        params = append(params, "--mlock")
-    }
-
     // TODO - NUMA support currently doesn't work properly

     params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
     return sb.String()
 }

-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+    "penalize_newline",
+    "low_vram",
+    "f16_kv",
+    "logits_all",
+    "vocab_only",
+    "use_mlock",
+}

 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
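Not part of this diff, but as a rough, self-contained sketch of how a list like deprecatedParameters can be used to drop deprecated keys before they are forwarded (the filterParameters helper is made up for illustration):

package main

import (
    "fmt"
    "slices"
)

var deprecatedParameters = []string{
    "penalize_newline",
    "low_vram",
    "f16_kv",
    "logits_all",
    "vocab_only",
    "use_mlock",
}

// filterParameters returns a copy of params with all deprecated keys removed.
func filterParameters(params map[string]any) map[string]any {
    out := make(map[string]any, len(params))
    for k, v := range params {
        if slices.Contains(deprecatedParameters, k) {
            continue // deprecated options are dropped rather than forwarded
        }
        out[k] = v
    }
    return out
}

func main() {
    fmt.Println(filterParameters(map[string]any{
        "num_thread": 8,
        "use_mlock":  true, // deprecated: dropped by the sketch above
    }))
}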
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
         "num_gqa 1": {"num_gqa", "1"},
         "num_gpu 1": {"num_gpu", "1"},
         "main_gpu 1": {"main_gpu", "1"},
-        "low_vram true": {"low_vram", "true"},
-        "logits_all true": {"logits_all", "true"},
-        "vocab_only true": {"vocab_only", "true"},
         "use_mmap true": {"use_mmap", "true"},
-        "use_mlock true": {"use_mlock", "true"},
         "num_thread 1": {"num_thread", "1"},
         "num_keep 1": {"num_keep", "1"},
         "seed 1": {"seed", "1"},
@@ -820,7 +820,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -876,7 +875,6 @@ func Execute(args []string) error {
         NumGpuLayers: *nGpuLayers,
         MainGpu:      *mainGpu,
         UseMmap:      !*noMmap && lpaths.String() == "",
-        UseMlock:     *mlock,
         TensorSplit:  tensorSplitFloats,
         Progress: func(progress float32) {
             server.progress = progress
@@ -818,7 +818,6 @@ func Execute(args []string) error {
     threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
     verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
     _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-    _ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
     tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
     multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")