diff --git a/cmd/cmd.go b/cmd/cmd.go
index 79ff87ac8..befe578d6 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/docs/faq.md b/docs/faq.md
index f418da47f..327afc6e5 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
 ```
 
 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 4096
+    "num_ctx": 8192
   }
 }'
 ```
diff --git a/envconfig/config.go b/envconfig/config.go
index fc702198f..fcb0a6947 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
 )
 
 func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
+		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index 5694eb8a3..72bfb4df5 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
 		"4096": 4096,
 	}
 
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 56121d41b..dd77b574a 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Help me write tests."},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
diff --git a/server/sched.go b/server/sched.go
index f3978796c..d5b19fbfd 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
 
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	req := &LlmRequest{
 		ctx:   c,
 		model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }
 
+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)
+
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					gpus = s.getGpuFn()
 				}
 
+				if pending.origNumCtx == -1 {
+					if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+						slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+						pending.opts.NumCtx = smallGpuContextLength
+						pending.origNumCtx = smallGpuContextLength
+					} else {
+						pending.opts.NumCtx = defaultContextLength
+						pending.origNumCtx = defaultContextLength
+					}
+				}
+
 				if envconfig.MaxRunners() <= 0 {
 					// No user specified MaxRunners, so figure out what automatic setting to use
 					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
diff --git a/server/sched_test.go b/server/sched_test.go
index 274e18cec..1b620329c 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
+	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }
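
For context, here is a minimal standalone Go sketch (not part of the patch; the names `contextLengthFromEnv`, `resolveContextLength`, and `gpuInfo` are invented for illustration) of the behaviour the patch introduces: `OLLAMA_CONTEXT_LENGTH` now defaults to the sentinel -1, and the scheduler later resolves -1 to 4096, or to 2048 when there is a single non-CPU GPU with at most 4 GiB of total memory.

```go
// Sketch only: folds the envconfig.Int64 parsing and the scheduler's
// small-GPU fallback from the patch into one runnable example.
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// gpuInfo is a stand-in for the scheduler's GPU descriptor (assumed shape).
type gpuInfo struct {
	Library     string // "cpu", "cuda", "rocm", ...
	TotalMemory uint64 // bytes
}

// contextLengthFromEnv mimics Int64("OLLAMA_CONTEXT_LENGTH", -1): unset or
// unparsable values fall back to -1, meaning "let the scheduler decide".
func contextLengthFromEnv() int64 {
	if s := os.Getenv("OLLAMA_CONTEXT_LENGTH"); s != "" {
		if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			return n
		}
		slog.Warn("invalid OLLAMA_CONTEXT_LENGTH, using default", "value", s)
	}
	return -1
}

// resolveContextLength mimics the processPending logic: the -1 sentinel
// becomes 4096, or 2048 on a single non-CPU GPU with <= 4 GiB of VRAM.
func resolveContextLength(numCtx int64, gpus []gpuInfo) int64 {
	if numCtx != -1 {
		return numCtx // explicitly set via env var or request options
	}
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	gpus := []gpuInfo{{Library: "cuda", TotalMemory: 4 << 30}}
	fmt.Println(resolveContextLength(contextLengthFromEnv(), gpus)) // 2048 if the env var is unset

	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(resolveContextLength(contextLengthFromEnv(), gpus)) // 8192
}
```

In the patch itself these two steps live apart: the parsing in `envconfig.Int64` and the small-GPU fallback in `Scheduler.processPending`, which also records the resolved value in `pending.origNumCtx` so parallelism adjustments later scale from the right baseline.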