diff --git a/cmd/cmd.go b/cmd/cmd.go
index 79ff87ac8..befe578d6 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_CONTEXT_LENGTH"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/docs/faq.md b/docs/faq.md
index f418da47f..327afc6e5 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 2048 tokens.
+By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
 
 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
 To change this when using `ollama run`, use `/set parameter`:
 
 ```shell
-/set parameter num_ctx 4096
+/set parameter num_ctx 8192
 ```
 
 When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
   "model": "llama3.2",
   "prompt": "Why is the sky blue?",
   "options": {
-    "num_ctx": 4096
+    "num_ctx": 8192
   }
 }'
 ```
diff --git a/envconfig/config.go b/envconfig/config.go
index fc702198f..fcb0a6947 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
 )
 
 func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
+func Int64(key string, defaultValue int64) func() int64 {
+	return func() int64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseInt(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
 // Set aside VRAM per GPU
 var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
+		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 
 		// Informational
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index 5694eb8a3..72bfb4df5 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -278,8 +278,8 @@ func TestVar(t *testing.T) {
 }
 
 func TestContextLength(t *testing.T) {
-	cases := map[string]uint{
-		"":     2048,
+	cases := map[string]int64{
+		"":     -1,
 		"4096": 4096,
 	}
 
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 56121d41b..dd77b574a 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Hello!"},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
 				{Role: "user", Content: "Help me write tests."},
 			},
 			Stream: &stream,
+			Options: map[string]any{
+				"num_ctx": 1024,
+			},
 		})
 
 		if w.Code != http.StatusOK {
diff --git a/server/sched.go b/server/sched.go
index f3978796c..d5b19fbfd 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -58,7 +58,7 @@ var defaultModelsPerGPU = 3
 // Default automatic value for parallel setting
 // Model will still need to fit in VRAM. If this setting won't fit
 // we'll back off down to 1 to try to get it to fit
-var defaultParallel = 4
+var defaultParallel = 2
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
 
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	req := &LlmRequest{
 		ctx:   c,
 		model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
 	}()
 }
 
+const (
+	defaultContextLength  = 4096
+	smallGpuContextLength = 2048
+)
+
 func (s *Scheduler) processPending(ctx context.Context) {
 	for {
 		select {
@@ -166,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					gpus = s.getGpuFn()
 				}
 
+				if pending.origNumCtx == -1 {
+					if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
+						slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
+						pending.opts.NumCtx = smallGpuContextLength
+						pending.origNumCtx = smallGpuContextLength
+					} else {
+						pending.opts.NumCtx = defaultContextLength
+						pending.origNumCtx = defaultContextLength
+					}
+				}
+
 				if envconfig.MaxRunners() <= 0 {
 					// No user specified MaxRunners, so figure out what automatic setting to use
 					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
diff --git a/server/sched_test.go b/server/sched_test.go
index 274e18cec..1b620329c 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
+	b.req.opts.NumCtx = 4096
 	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return b
 }
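
For context, here is a minimal standalone Go sketch (not part of the patch; the names `contextLengthFromEnv`, `resolveContextLength`, and `gpuInfo` are invented for illustration) of the behaviour the patch introduces: `OLLAMA_CONTEXT_LENGTH` now defaults to the sentinel -1, and the scheduler later resolves -1 to 4096, or to 2048 when there is a single non-CPU GPU with at most 4 GiB of total memory.

```go
// Sketch only: folds the envconfig.Int64 parsing and the scheduler's
// small-GPU fallback from the patch into one runnable example.
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

const (
	defaultContextLength  = 4096
	smallGpuContextLength = 2048
)

// gpuInfo is a stand-in for the scheduler's GPU descriptor (assumed shape).
type gpuInfo struct {
	Library     string // "cpu", "cuda", "rocm", ...
	TotalMemory uint64 // bytes
}

// contextLengthFromEnv mimics Int64("OLLAMA_CONTEXT_LENGTH", -1): unset or
// unparsable values fall back to -1, meaning "let the scheduler decide".
func contextLengthFromEnv() int64 {
	if s := os.Getenv("OLLAMA_CONTEXT_LENGTH"); s != "" {
		if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			return n
		}
		slog.Warn("invalid OLLAMA_CONTEXT_LENGTH, using default", "value", s)
	}
	return -1
}

// resolveContextLength mimics the processPending logic: the -1 sentinel
// becomes 4096, or 2048 on a single non-CPU GPU with <= 4 GiB of VRAM.
func resolveContextLength(numCtx int64, gpus []gpuInfo) int64 {
	if numCtx != -1 {
		return numCtx // explicitly set via env var or request options
	}
	if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
		return smallGpuContextLength
	}
	return defaultContextLength
}

func main() {
	gpus := []gpuInfo{{Library: "cuda", TotalMemory: 4 << 30}}
	fmt.Println(resolveContextLength(contextLengthFromEnv(), gpus)) // 2048 if the env var is unset

	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(resolveContextLength(contextLengthFromEnv(), gpus)) // 8192
}
```

In the patch itself these two steps live apart: the parsing in `envconfig.Int64` and the small-GPU fallback in `Scheduler.processPending`, which also records the resolved value in `pending.origNumCtx` so parallelism adjustments later scale from the right baseline.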