api: remove unused or unsupported api options (#10574)
Some options listed in api/types.go are not supported in newer models or have been deprecated. This is the first of a series of PRs to clean up the API options.
parent d931ee8f22
commit 3b2d2c8326
api/types.go
@@ -283,12 +283,7 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
-	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
-	LogitsAll bool  `json:"logits_all,omitempty"`
-	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
-	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
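All surviving Runner fields keep their omitempty tags, so zero-valued options drop out of the serialized request. A minimal, self-contained sketch of that behavior, mirroring only the fields in the hunk above rather than the full struct:

package main

import (
	"encoding/json"
	"fmt"
)

// Runner mirrors just the fields that survive this commit.
type Runner struct {
	NumBatch  int   `json:"num_batch,omitempty"`
	NumGPU    int   `json:"num_gpu,omitempty"`
	MainGPU   int   `json:"main_gpu,omitempty"`
	UseMMap   *bool `json:"use_mmap,omitempty"`
	NumThread int   `json:"num_thread,omitempty"`
}

func main() {
	useMMap := true
	b, _ := json.Marshal(Runner{NumGPU: 1, UseMMap: &useMMap})
	// Prints {"num_gpu":1,"use_mmap":true}; zero values are omitted.
	fmt.Println(string(b))
}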
@@ -671,8 +666,6 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
-			LowVRAM:   false,
-			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
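The -1 sentinel for NumGPU is resolved later, once the runtime knows how many layers actually fit. A hypothetical sketch of that resolution step; resolveNumGPU and estimatedLayers are illustrative names, not ollama's actual API:

package main

import "fmt"

// resolveNumGPU returns the number of layers to offload. A request value of
// -1 defers to the runtime's own estimate; anything else is treated as an
// explicit user choice.
func resolveNumGPU(requested, estimatedLayers int) int {
	if requested == -1 {
		return estimatedLayers
	}
	return requested
}

func main() {
	fmt.Println(resolveNumGPU(-1, 33)) // 33: decided dynamically
	fmt.Println(resolveNumGPU(10, 33)) // 10: explicit request wins
}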
docs/api.md
@@ -404,10 +404,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_batch": 2,
     "num_gpu": 1,
     "main_gpu": 0,
-    "low_vram": false,
-    "vocab_only": false,
     "use_mmap": true,
-    "use_mlock": false,
     "num_thread": 8
   }
 }'
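The same pared-down request, expressed as a small Go client against the /api/generate endpoint shown above. The model name is illustrative, and "stream": false is set so the response arrives as a single JSON object:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Only options that survive this commit are sent.
	payload, _ := json.Marshal(map[string]any{
		"model":  "llama3.2", // illustrative model name
		"prompt": "Why is the sky blue?",
		"stream": false,
		"options": map[string]any{
			"num_batch":  2,
			"num_gpu":    1,
			"main_gpu":   0,
			"use_mmap":   true,
			"num_thread": 8,
		},
	})

	resp, err := http.Post("http://localhost:11434/api/generate",
		"application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}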
llama/llama.go
@@ -199,7 +199,6 @@ type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
 	UseMmap      bool
-	UseMlock     bool
 	TensorSplit  []float32
 	Progress     func(float32)
 	VocabOnly    bool
@@ -218,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
 	cparams.main_gpu = C.int32_t(params.MainGpu)
 	cparams.use_mmap = C.bool(params.UseMmap)
-	cparams.use_mlock = C.bool(params.UseMlock)
 	cparams.vocab_only = C.bool(params.VocabOnly)
 
 	if len(params.TensorSplit) > 0 {
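The pattern in this hunk, explicit C.bool/C.int32_t conversions into a C params struct, is plain cgo: nothing coerces implicitly across the boundary. A stripped-down sketch with a stand-in struct; cparams_t here is illustrative, not llama.cpp's real type:

package main

/*
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	int32_t main_gpu;
	bool    use_mmap;
} cparams_t;
*/
import "C"

import "fmt"

func main() {
	// Go values must be converted explicitly at the cgo boundary.
	var cparams C.cparams_t
	cparams.main_gpu = C.int32_t(0)
	cparams.use_mmap = C.bool(true)
	fmt.Println(cparams.main_gpu, cparams.use_mmap)
}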
llm/server.go
@@ -217,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--no-mmap")
 	}
 
-	if opts.UseMLock {
-		params = append(params, "--mlock")
-	}
-
 	// TODO - NUMA support currently doesn't work properly
 
 	params = append(params, "--parallel", strconv.Itoa(numParallel))
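With the --mlock branch gone, flag construction reduces to the conditional-append pattern visible in the surrounding context. A self-contained sketch of that pattern; serverFlags is an illustrative name, not the real server's builder:

package main

import (
	"fmt"
	"strconv"
)

// serverFlags shows the conditional-append style used when assembling the
// subprocess command line: optional flags are appended only when their
// corresponding option is set.
func serverFlags(useMMap bool, numParallel int) []string {
	var params []string
	if !useMMap {
		params = append(params, "--no-mmap")
	}
	params = append(params, "--parallel", strconv.Itoa(numParallel))
	return params
}

func main() {
	fmt.Println(serverFlags(false, 4)) // [--no-mmap --parallel 4]
	fmt.Println(serverFlags(true, 1))  // [--parallel 1]
}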
parser/parser.go
@@ -39,7 +39,14 @@ func (f Modelfile) String() string {
 	return sb.String()
 }
 
-var deprecatedParameters = []string{"penalize_newline"}
+var deprecatedParameters = []string{
+	"penalize_newline",
+	"low_vram",
+	"f16_kv",
+	"logits_all",
+	"vocab_only",
+	"use_mlock",
+}
 
 // CreateRequest creates a new *api.CreateRequest from an existing Modelfile
 func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) {
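One plausible consumer of the expanded list is a filter that drops deprecated keys while warning the user. Ollama's actual handling may differ, so treat this as a sketch under that assumption:

package main

import (
	"fmt"
	"log"
	"slices"
)

var deprecatedParameters = []string{
	"penalize_newline", "low_vram", "f16_kv",
	"logits_all", "vocab_only", "use_mlock",
}

// filterDeprecated removes deprecated keys from a parameter map, logging a
// warning for each one it drops.
func filterDeprecated(params map[string]string) map[string]string {
	out := make(map[string]string, len(params))
	for k, v := range params {
		if slices.Contains(deprecatedParameters, k) {
			log.Printf("warning: parameter %q is deprecated and ignored", k)
			continue
		}
		out[k] = v
	}
	return out
}

func main() {
	fmt.Println(filterDeprecated(map[string]string{
		"num_gpu":  "1",
		"low_vram": "true", // dropped with a warning
	}))
}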
parser/parser_test.go
@@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gqa 1":     {"num_gqa", "1"},
 		"num_gpu 1":     {"num_gpu", "1"},
 		"main_gpu 1":    {"main_gpu", "1"},
-		"low_vram true": {"low_vram", "true"},
-		"logits_all true": {"logits_all", "true"},
-		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true": {"use_mmap", "true"},
-		"use_mlock true": {"use_mlock", "true"},
 		"num_thread 1":  {"num_thread", "1"},
 		"num_keep 1":    {"num_keep", "1"},
 		"seed 1":        {"seed", "1"},
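The cases above follow Go's table-driven test idiom: input string on the left, expected name/value pair on the right. A reduced, self-contained version of the same shape; the strings.Cut parse is a stand-in for the real Modelfile parser, and the package name is illustrative:

package parser

import (
	"strings"
	"testing"
)

func TestParseParameterLine(t *testing.T) {
	cases := map[string]struct{ name, value string }{
		"num_gpu 1":     {"num_gpu", "1"},
		"use_mmap true": {"use_mmap", "true"},
		"seed 1":        {"seed", "1"},
	}
	for input, want := range cases {
		name, value, ok := strings.Cut(input, " ")
		if !ok || name != want.name || value != want.value {
			t.Errorf("parse(%q) = (%q, %q), want (%q, %q)",
				input, name, value, want.name, want.value)
		}
	}
}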
runner/llamarunner/runner.go
@@ -820,7 +820,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
@@ -876,7 +875,6 @@ func Execute(args []string) error {
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
 		UseMmap:      !*noMmap && lpaths.String() == "",
-		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
 			server.progress = progress
runner/ollamarunner/runner.go
@@ -818,7 +818,6 @@ func Execute(args []string) error {
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
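Note the _ = fs.Bool(...) idiom in this runner: the flag stays registered so existing invocations keep parsing, but its value is discarded. A minimal sketch of the same accept-but-ignore pattern; the flag set and usage strings are illustrative:

package main

import (
	"flag"
	"fmt"
)

func main() {
	fs := flag.NewFlagSet("runner", flag.ExitOnError)
	threads := fs.Int("threads", 4, "number of threads")
	// Deprecated flags are still accepted so old command lines don't break,
	// but their values are deliberately ignored.
	_ = fs.Bool("mlock", false, "deprecated: ignored")
	_ = fs.Bool("no-mmap", false, "deprecated: ignored")

	_ = fs.Parse([]string{"--threads", "8", "--mlock"})
	fmt.Println(*threads) // 8; --mlock parses but has no effect
}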