parent 82a9e9462a
commit f95a1f2bef
@@ -4,20 +4,14 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"path/filepath"
 	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/logutil"
 )
 
 func InitLogging() {
-	level := slog.LevelInfo
-
-	if envconfig.Debug() {
-		level = slog.LevelDebug
-	}
-
 	var logFile *os.File
 	var err error
 	// Detect if we're a GUI app on windows, and if not, send logs to console
@@ -33,20 +27,8 @@ func InitLogging() {
 			return
 		}
 	}
-	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
 
-	slog.SetDefault(slog.New(handler))
+	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 
 	slog.Info("ollama app started")
 }
 
@@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
 }
 
 func getVerboseState() C.uint16_t {
-	if envconfig.Debug() {
+	if envconfig.LogLevel() < slog.LevelInfo {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
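Reviewer note: this comparison works because slog levels are ordered integers (DEBUG is -4, INFO is 0, WARN is 4, ERROR is 8), so `envconfig.LogLevel() < slog.LevelInfo` is true for DEBUG and for the new TRACE level alike. A minimal standalone sketch of the ordering; the `levelTrace` constant here mirrors `logutil.LevelTrace`:

package main

import (
	"fmt"
	"log/slog"
)

const levelTrace slog.Level = -8 // mirrors logutil.LevelTrace

func main() {
	for _, l := range []slog.Level{levelTrace, slog.LevelDebug, slog.LevelInfo, slog.LevelWarn} {
		// Only levels below INFO count as "verbose".
		fmt.Println(l, l < slog.LevelInfo)
	}
}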
@@ -149,9 +149,22 @@ func Bool(k string) func() bool {
 	}
 }
 
+// LogLevel returns the log level for the application.
+// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE
+func LogLevel() slog.Level {
+	level := slog.LevelInfo
+	if s := Var("OLLAMA_DEBUG"); s != "" {
+		if b, _ := strconv.ParseBool(s); b {
+			level = slog.LevelDebug
+		} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
+			level = slog.Level(i * -4)
+		}
+	}
+
+	return level
+}
+
 var (
-	// Debug enabled additional debug information.
-	Debug = Bool("OLLAMA_DEBUG")
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
 	// KvCacheType is the quantization type for the K/V cache.
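Reviewer note: the `i * -4` multiplier leans on slog spacing its named levels four apart, so `OLLAMA_DEBUG=2` lands at -8, one full step below DEBUG, which `logutil` names TRACE. A quick standalone sketch of the mapping:

package main

import (
	"fmt"
	"log/slog"
)

func main() {
	// Integer OLLAMA_DEBUG values under the i * -4 rule above:
	// 2 -> -8 (TRACE), 1 -> -4 (DEBUG), -1 -> 4 (WARN), -2 -> 8 (ERROR).
	for _, i := range []int64{2, 1, -1, -2} {
		fmt.Printf("OLLAMA_DEBUG=%d -> %v\n", i, slog.Level(i*-4))
	}
}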
@@ -236,7 +249,7 @@ type EnvVar struct {
 
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG":           {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_DEBUG":           {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_KV_CACHE_TYPE":   {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
 		"OLLAMA_GPU_OVERHEAD":    {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
@@ -1,11 +1,13 @@
 package envconfig
 
 import (
+	"log/slog"
 	"math"
 	"testing"
 	"time"
 
 	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/logutil"
 )
 
 func TestHost(t *testing.T) {
@@ -292,3 +294,34 @@ func TestContextLength(t *testing.T) {
 		})
 	}
 }
+
+func TestLogLevel(t *testing.T) {
+	cases := map[string]slog.Level{
+		// Default to INFO
+		"":      slog.LevelInfo,
+		"false": slog.LevelInfo,
+		"f":     slog.LevelInfo,
+		"0":     slog.LevelInfo,
+
+		// True values enable Debug
+		"true": slog.LevelDebug,
+		"t":    slog.LevelDebug,
+
+		// Positive values increase verbosity
+		"1": slog.LevelDebug,
+		"2": logutil.LevelTrace,
+
+		// Negative values decrease verbosity
+		"-1": slog.LevelWarn,
+		"-2": slog.LevelError,
+	}
+
+	for k, v := range cases {
+		t.Run(k, func(t *testing.T) {
+			t.Setenv("OLLAMA_DEBUG", k)
+			if i := LogLevel(); i != v {
+				t.Errorf("%s: expected %d, got %d", k, v, i)
+			}
+		})
+	}
+}
@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"runtime"
+	"slices"
 	"strconv"
 	"strings"
 	"sync"
@@ -30,9 +31,37 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/model"
 )
 
+type filteredEnv []string
+
+func (e filteredEnv) LogValue() slog.Value {
+	var attrs []slog.Attr
+	for _, env := range e {
+		if key, value, ok := strings.Cut(env, "="); ok {
+			switch {
+			case strings.HasPrefix(key, "OLLAMA_"),
+				strings.HasPrefix(key, "CUDA_"),
+				strings.HasPrefix(key, "ROCR_"),
+				strings.HasPrefix(key, "ROCM_"),
+				strings.HasPrefix(key, "HIP_"),
+				strings.HasPrefix(key, "GPU_"),
+				strings.HasPrefix(key, "HSA_"),
+				strings.HasPrefix(key, "GGML_"),
+				slices.Contains([]string{
+					"PATH",
+					"LD_LIBRARY_PATH",
+					"DYLD_LIBRARY_PATH",
+				}, key):
+				attrs = append(attrs, slog.String(key, value))
+			}
+		}
+	}
+	return slog.GroupValue(attrs...)
+}
+
 type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
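Reviewer note: `filteredEnv` satisfies the standard `slog.LogValuer` interface, so the allow-list scan runs only when a record is actually emitted at an enabled level; at INFO and above, passing `filteredEnv(s.cmd.Env)` to a suppressed `slog.Debug` call costs almost nothing. A minimal standalone sketch of the same lazy-redaction pattern; the `redactedToken` type is illustrative, not from this change:

package main

import (
	"log/slog"
	"os"
)

// redactedToken resolves lazily: the redaction work happens only if
// the log record is actually handled.
type redactedToken string

func (t redactedToken) LogValue() slog.Value {
	if len(t) <= 4 {
		return slog.StringValue("****")
	}
	return slog.StringValue(string(t[:4]) + "****")
}

func main() {
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, nil)))
	slog.Info("auth", "token", redactedToken("s3cr3t-value"))
	// prints: ... msg=auth token=s3cr****
}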
@@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU))
 	}
 
-	if envconfig.Debug() {
-		params = append(params, "--verbose")
-	}
-
 	if opts.MainGPU > 0 {
 		params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU))
 	}
@@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 	}
 
 	slog.Info("starting llama server", "cmd", s.cmd)
-	if envconfig.Debug() {
-		filteredEnv := []string{}
-		for _, ev := range s.cmd.Env {
-			if strings.HasPrefix(ev, "OLLAMA_") ||
-				strings.HasPrefix(ev, "CUDA_") ||
-				strings.HasPrefix(ev, "ROCR_") ||
-				strings.HasPrefix(ev, "ROCM_") ||
-				strings.HasPrefix(ev, "HIP_") ||
-				strings.HasPrefix(ev, "GPU_") ||
-				strings.HasPrefix(ev, "HSA_") ||
-				strings.HasPrefix(ev, "GGML_") ||
-				strings.HasPrefix(ev, "PATH=") ||
-				strings.HasPrefix(ev, "LD_LIBRARY_PATH=") ||
-				strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") {
-				filteredEnv = append(filteredEnv, ev)
-			}
-		}
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", filteredEnv)
-	}
+	slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
 
 	if err = s.cmd.Start(); err != nil {
 		var msg string
@@ -721,6 +727,9 @@ type CompletionResponse struct {
 }
 
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
+	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
+	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
+
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
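Reviewer note: unlike the fixed-level helpers (`slog.Debug`, `slog.Info`), `slog.Log` takes an explicit context and an arbitrary `slog.Level`, which is what allows emitting at the custom TRACE level below DEBUG. A minimal standalone sketch, again mirroring `logutil.LevelTrace` as -8:

package main

import (
	"context"
	"log/slog"
	"os"
)

const levelTrace slog.Level = -8 // mirrors logutil.LevelTrace

func main() {
	// The handler must opt in to the lower level or the record is dropped.
	h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: levelTrace})
	slog.SetDefault(slog.New(h))
	slog.Log(context.TODO(), levelTrace, "very chatty detail", "step", 1)
}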
@@ -884,6 +893,8 @@ type EmbeddingResponse struct {
 }
 
 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
+	slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
+
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting embedding request due to client closing the connection")
logutil/logutil.go (new file, 29 lines)
@@ -0,0 +1,29 @@
+package logutil
+
+import (
+	"io"
+	"log/slog"
+	"path/filepath"
+)
+
+const LevelTrace slog.Level = -8
+
+func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
+	return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			switch attr.Key {
+			case slog.LevelKey:
+				switch attr.Value.Any().(slog.Level) {
+				case LevelTrace:
+					attr.Value = slog.StringValue("TRACE")
+				}
+			case slog.SourceKey:
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+			return attr
+		},
+	}))
+}
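With the new package in place, every process configures logging the same way. A short usage sketch built from the identifiers in this diff:

package main

import (
	"log/slog"
	"os"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/logutil"
)

func main() {
	// One line replaces the hand-rolled slog.NewTextHandler blocks that
	// this commit deletes from the app, server, and both runners.
	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
	slog.Debug("visible when OLLAMA_DEBUG is 1 or true")
}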
@@ -27,6 +27,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs"
 	fsggml "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 	"golang.org/x/sync/errgroup"
@@ -222,7 +223,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
 			C.ggml_set_name(tt, cname)
 
-			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -19,6 +19,7 @@ import (
 	"github.com/ollama/ollama/fs"
 	fsggml "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	_ "github.com/ollama/ollama/ml/backend"
 	"github.com/ollama/ollama/model/input"
@@ -202,7 +203,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 			names := fn(tagsCopy)
 			for _, name := range names {
 				if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
-					slog.Debug("found tensor", "", tensor)
+					slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
 					vv.Set(reflect.ValueOf(tensor))
 					break
 				}
@@ -2,6 +2,7 @@ package model
 
 import (
 	"cmp"
+	"context"
 	"iter"
 	"log/slog"
 	"slices"
@@ -10,6 +11,7 @@ import (
 
 	"github.com/dlclark/regexp2"
 	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
+	"github.com/ollama/ollama/logutil"
 )
 
 type Special int32
@@ -322,6 +324,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 		}
 	}
 
+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }
 
@@ -349,5 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 		}
 	}
 
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
 	return sb.String(), nil
 }
@@ -2,10 +2,13 @@ package model
 
 import (
 	"container/heap"
+	"context"
 	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
+
+	"github.com/ollama/ollama/logutil"
 )
 
 const spmWhitespaceSep = "▁"
@@ -22,7 +25,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary {
 }
 
 func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
-	slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
+	slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
 
 	counter := map[int]int{}
 	var maxTokenLen int
@@ -36,7 +39,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
 		}
 	}
 
-	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+	slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
 		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
 		"max token len", maxTokenLen)
 
@@ -199,6 +202,7 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 		}
 	}
 
+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }
 
@@ -257,5 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 		}
 	}
 
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
 	return sb.String(), nil
 }
@@ -11,7 +11,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
@@ -23,8 +22,10 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/runner/common"
 )
 
@@ -680,8 +681,6 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 
 	w.Header().Set("Content-Type", "application/json")
 
-	slog.Debug("embedding request", "content", req.Content)
-
 	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
@@ -815,7 +814,7 @@ func Execute(args []string) error {
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -830,22 +829,7 @@ func Execute(args []string) error {
 	if err := fs.Parse(args); err != nil {
 		return err
 	}
-	level := slog.LevelInfo
-	if *verbose {
-		level = slog.LevelDebug
-	}
-	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
-	slog.SetDefault(slog.New(handler))
+	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	slog.Info("starting go runner")
 
 	llama.BackendInit()
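Reviewer note: `--verbose` is still parsed (`_ = fs.Bool(...)`) so existing command lines keep working, but it is now a no-op; the runner derives its level from `OLLAMA_DEBUG` via `envconfig.LogLevel()`. A hedged sketch of spawning the runner with trace logging from Go; the exact subcommand and flags here are illustrative assumptions, not taken from this diff:

package main

import (
	"os"
	"os/exec"
)

func main() {
	// Hypothetical invocation; subcommand and flags are illustrative.
	cmd := exec.Command("ollama", "runner", "--port", "8080")
	// The runner now reads OLLAMA_DEBUG itself; 2 maps to TRACE.
	cmd.Env = append(os.Environ(), "OLLAMA_DEBUG=2")
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
	_ = cmd.Run()
}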
@@ -12,7 +12,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
@@ -24,7 +23,9 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
@@ -816,7 +817,7 @@ func Execute(args []string) error {
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
 	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -831,22 +832,7 @@ func Execute(args []string) error {
 	if err := fs.Parse(args); err != nil {
 		return err
 	}
-	level := slog.LevelInfo
-	if *verbose {
-		level = slog.LevelDebug
-	}
-	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
-	slog.SetDefault(slog.New(handler))
+	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	slog.Info("starting ollama engine")
 
 	server := &Server{
@@ -17,7 +17,6 @@ import (
 	"net/netip"
 	"os"
 	"os/signal"
-	"path/filepath"
 	"regexp"
 	"slices"
 	"strings"
@@ -33,6 +32,7 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/internal/client/ollama"
@@ -295,8 +295,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}
 
-	slog.Debug("generate request", "images", len(images), "prompt", prompt)
-
 	ch := make(chan any)
 	go func() {
 		// TODO (jmorganca): avoid building the response twice both here and below
@@ -1226,26 +1224,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 }
 
 func Serve(ln net.Listener) error {
-	level := slog.LevelInfo
-	if envconfig.Debug() {
-		level = slog.LevelDebug
-	}
-
+	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	slog.Info("server config", "env", envconfig.Values())
-	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-
-			return attr
-		},
-	})
-
-	slog.SetDefault(slog.New(handler))
 
 	blobsDir, err := GetBlobsPath("")
 	if err != nil {
@@ -1521,8 +1501,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 
-	slog.Debug("chat request", "images", len(images), "prompt", prompt)
-
 	ch := make(chan any)
 	go func() {
 		defer close(ch)