From 424f648632c925ce14a75018c4dcab395e035993 Mon Sep 17 00:00:00 2001
From: Devon Rifkin
Date: Tue, 22 Apr 2025 16:33:24 -0700
Subject: [PATCH] increase default context length to 4096 (#10364)

* increase default context length to 4096

We lower the default numParallel from 4 to 2 and use these "savings" to
double the default context length from 2048 to 4096. We're memory
neutral in cases when we previously would've used numParallel == 4, but
we add the following mitigation to handle some cases where we would have
previously fallen back to 1x2048 due to low VRAM: we decide between 2048
and 4096 using a runtime check, choosing 2048 if we're on a one GPU
system with total VRAM of <= 4 GB. We purposefully don't check the
available VRAM because we don't want the context window size to change
unexpectedly based on the available VRAM.

We plan on making the default even larger, but this is a relatively
low-risk change we can make to quickly double it.

* fix tests

add an explicit context length so they don't get truncated. The code
that converts -1 from being a signal for doing a runtime check isn't
running as part of these tests.

* tweak small gpu message

* clarify context length default

also make it actually show up in `ollama serve --help`
---
 cmd/cmd.go                     |  1 +
 docs/faq.md                    |  6 +++---
 envconfig/config.go            | 18 ++++++++++++++++--
 envconfig/config_test.go       |  4 ++--
 server/routes_generate_test.go |  9 +++++++++
 server/sched.go                | 22 +++++++++++++++++-----
 server/sched_test.go           |  1 +
 7 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 79ff87ac8..befe578d6 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/docs/faq.md b/docs/faq.md
index f418da47f..327afc6e5 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:

 ```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
 ```

 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 4096
+    "num_ctx": 8192
   }
 }'
 ```
diff --git a/envconfig/config.go b/envconfig/config.go
index fc702198f..fcb0a6947 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
 )

 func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }

+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":          {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":     {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":  {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":   {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
+		"OLLAMA_CONTEXT_LENGTH":   {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
 		"OLLAMA_NEW_ENGINE":       {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

 		// Informational
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index 5694eb8a3..72bfb4df5 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }

 func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
 		"4096": 4096,
 	}

diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 56121d41b..dd77b574a 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Help me write tests."},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})

 		if w.Code != http.StatusOK {
diff --git a/server/sched.go b/server/sched.go
index f3978796c..d5b19fbfd 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2

 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")

@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {

 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }

+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)
+
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				gpus = s.getGpuFn()
 			}

+			if pending.origNumCtx == -1 {
+				if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+					slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+					pending.opts.NumCtx = smallGpuContextLength
+					pending.origNumCtx = smallGpuContextLength
+				} else {
+					pending.opts.NumCtx = defaultContextLength
+					pending.origNumCtx = defaultContextLength
+				}
+			}
+
 			if envconfig.MaxRunners() <= 0 {
 				// No user specified MaxRunners, so figure out what automatic setting to use
 				// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
diff --git a/server/sched_test.go b/server/sched_test.go
index 274e18cec..1b620329c 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
+	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }
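The heart of the patch is the runtime default decision described in the first commit-message bullet. Below is a minimal, self-contained Go sketch of that rule; `gpuInfo` and `defaultNumCtx` are illustrative stand-ins (the real scheduler operates on `pending.origNumCtx` and Ollama's GPU discovery types), but the threshold and branch structure mirror the hunk in `server/sched.go`.

```go
package main

import (
	"fmt"
	"log/slog"
)

// gpuInfo is a hypothetical stand-in for the fields the scheduler
// consults; it is not Ollama's actual GPU discovery type.
type gpuInfo struct {
	Library     string // "cpu", "cuda", "rocm", ...
	TotalMemory uint64 // total VRAM in bytes
}

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// defaultNumCtx applies the patch's runtime check: fall back to 2048
// only on a single non-CPU GPU with <= 4 GiB of *total* VRAM. Total
// rather than free VRAM is checked on purpose, so the default doesn't
// fluctuate with whatever else happens to be loaded.
func defaultNumCtx(gpus []gpuInfo) int {
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	small := []gpuInfo{{Library: "cuda", TotalMemory: 4 << 30}}  // exactly 4 GiB
	large := []gpuInfo{{Library: "cuda", TotalMemory: 24 << 30}} // 24 GiB
	fmt.Println(defaultNumCtx(small)) // 2048
	fmt.Println(defaultNumCtx(large)) // 4096
}
```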
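The `-1` sentinel from `envconfig` is what makes this runtime check possible: `OLLAMA_CONTEXT_LENGTH` now defaults to -1 instead of a concrete length, and the scheduler resolves it on first use. A sketch of that parsing behavior follows, using a hypothetical flat `int64FromEnv` helper in place of the package's `Int64` (which returns a closure and logs through `slog`):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// int64FromEnv mirrors the shape of the patch's envconfig.Int64 helper:
// an unset or unparseable value yields the default, so -1 survives as
// the "decide at runtime" sentinel when the variable isn't set.
func int64FromEnv(key string, defaultValue int64) int64 {
	s := os.Getenv(key)
	if s == "" {
		return defaultValue
	}
	n, err := strconv.ParseInt(s, 10, 64)
	if err != nil {
		// the real helper warns via slog and falls back to the default
		fmt.Printf("invalid %s=%q, using default %d\n", key, s, defaultValue)
		return defaultValue
	}
	return n
}

func main() {
	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(int64FromEnv("OLLAMA_CONTEXT_LENGTH", -1)) // 8192: explicit user override

	os.Unsetenv("OLLAMA_CONTEXT_LENGTH")
	fmt.Println(int64FromEnv("OLLAMA_CONTEXT_LENGTH", -1)) // -1: scheduler later picks 4096 or 2048
}
```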