WIP but got logits n stuff

2024-12-13 16:20:44 -08:00 · 2024-12-13 16:20:44 -08:00 · c92d418a7c
commit c92d418a7c
parent d7e7e6a01e
5 changed files with 114 additions and 47 deletions
--- a/api/types.go
+++ b/api/types.go
@ -80,6 +80,8 @@ type GenerateRequest struct {
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]interface{} `json:"options"`
 	ReturnLogits bool `json:"return_logits,omitempty"`
 }
 // ChatRequest describes a request sent by [Client.Chat].
@ -105,6 +107,8 @@ type ChatRequest struct {
 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 	ReturnLogits bool `json:"return_logits,omitempty"`
 }
 type Tools []Tool
@ -189,6 +193,7 @@ type ChatResponse struct {
 	CreatedAt  time.Time `json:"created_at"`
 	Message    Message   `json:"message"`
 	DoneReason string    `json:"done_reason,omitempty"`
 	Logits     []float32 `json:"logits"`
 	Done bool `json:"done"`
@ -204,6 +209,15 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }
 type TokenLogprob struct {
 	Token   string  `json:"token"`
 	Logprob float32 `json:"logprob"`
 }
 type LogProbs struct {
 	TopLogprobs []TokenLogprob `json:"top_logprobs"`
 }
 // Options specified in [GenerateRequest].  If you add a new option here, also
 // add it to the API docs.
 type Options struct {
@ -450,6 +464,8 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`
 	Metrics
 	Logits []float32 `json:"logits"`
 }
 // ModelDetails provides details about a model.
--- a/llama/llama.go
+++ b/llama/llama.go
@ -260,6 +260,19 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
 	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
 }
 // GetLogits returns the logits from the last decode operation.
 // The returned slice has length equal to the vocabulary size.
 func (c *Context) GetLogits() []float32 {
 	logits := unsafe.Pointer(C.llama_get_logits(c.c))
 	if logits == nil {
 		return nil
 	}
 	// Get the number of vocabulary tokens to determine array size
 	vocabSize := c.Model().NumVocab()
 	return unsafe.Slice((*float32)(logits), vocabSize)
 }
 type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
@ -737,14 +750,3 @@ func SchemaToGrammar(schema []byte) []byte {
 	}
 	return buf[:n]
 }
 // GetLogits returns the logits from the last decode operation.
 // The returned slice has length equal to the vocabulary size.
 func (c *Context) GetLogits() []float32 {
 	logits := unsafe.Pointer(C.llama_get_logits(c.c))
 	if logits == nil {
 		return nil
 	}
 	// Get the number of vocabulary tokens to determine array size
 	vocabSize := c.Model().NumVocab()
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@ -8,12 +8,14 @@ import (
 	"fmt"
 	"log"
 	"log/slog"
 	"math"
 	"net"
 	"net/http"
 	"os"
 	"path/filepath"
 	"regexp"
 	"runtime"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
@ -59,7 +61,7 @@ type Sequence struct {
 	crossAttention bool
 	// channel to send responses over
-	responses chan string
+	responses chan CompletionResponse
 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@ -88,6 +90,15 @@ type Sequence struct {
 	startGenerationTime time.Time
 	numDecoded          int
 	numPromptInputs     int
 	// New flag we need to add to Sequence struct
 	returnLogits bool
 	// Using our new GetLogits() method
 	logits []float32
 	// Add new channel for logits
 	logitsOut chan []float32
 }
 type NewSequenceParams struct {
@ -96,6 +107,7 @@ type NewSequenceParams struct {
 	numKeep        int
 	samplingParams *llama.SamplingParams
 	embedding      bool
 	returnLogits   bool
 }
 func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
@ -149,13 +161,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
 		pendingResponses:    make([]string, 0),
-		responses:           make(chan string, 100),
+		responses:           make(chan CompletionResponse, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		samplingCtx:         sc,
 		embeddingOnly:       params.embedding,
 		stop:                params.stop,
 		numKeep:             params.numKeep,
 		returnLogits:        params.returnLogits,
 		logitsOut:           make(chan []float32, 100),
 	}, nil
 }
@ -274,25 +288,36 @@ func (s *Server) allNil() bool {
 }
 func flushPending(seq *Sequence) bool {
-	joined := strings.Join(seq.pendingResponses, "")
+	if len(seq.pendingResponses) == 0 {
-	seq.pendingResponses = []string{}
+		return true
 	}
 	content := strings.Join(seq.pendingResponses, "")
 	// Check if there are any partial UTF-8 characters remaining.
 	// We already check and queue as we are generating but some may
 	// still make it here:
 	// - Sequence is ending, e.g. generation limit has been hit
 	// - Invalid characters in the middle of a string
 	// This is a stricter check to ensure we never output invalid Unicode.
-	for !utf8.ValidString(joined) {
+	for !utf8.ValidString(content) {
-		joined = joined[:len(joined)-1]
+		content = content[:len(content)-1]
 	}
 	seq.pendingResponses = nil
 	resp := CompletionResponse{
 		Content: content,
 	}
-	if len(joined) == 0 {
+	// Add logits if requested and available
-		return true
+	if seq.returnLogits && seq.logits != nil {
 		slog.Info("returning logits - flushPending")
 		resp.Logits = seq.logits
 		seq.logits = nil
 	}
 	slog.Info("returning logits - flushPending", "logits", resp.Logits[0])
 	select {
-	case seq.responses <- joined:
+	case seq.responses <- resp:
 		return true
 	case <-seq.quit:
 		return false
@ -476,7 +501,14 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}
-		// sample a token
+		// Before sampling:
 		if seq.returnLogits { // New flag we need to add to Sequence struct
 			slog.Info("returning logits")
 			seq.logits = s.lc.GetLogits() // Using our new GetLogits() method
 		}
 		// Then sample token
 		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
 		seq.samplingCtx.Accept(token, true)
 		piece := s.model.TokenToPiece(token)
@ -576,6 +608,7 @@ type CompletionRequest struct {
 	Images       []ImageData `json:"image_data"`
 	Grammar      string      `json:"grammar"`
 	CachePrompt  bool        `json:"cache_prompt"`
 	ReturnLogits bool        `json:"return_logits"`
 	Options
 }
@ -589,6 +622,8 @@ type Timings struct {
 type CompletionResponse struct {
 	Content string    `json:"content"`
 	Logits  []float32 `json:"logits,omitempty"`
 	Tokens  []string  `json:"tokens,omitempty"`
 	Stop    bool      `json:"stop"`
 	Model        string  `json:"model,omitempty"`
@ -637,12 +672,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.Seed = uint32(req.Seed)
 	samplingParams.Grammar = req.Grammar
 	slog.Info("completion request", "return_logits", req.ReturnLogits)
 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
 		numPredict:     req.NumPredict,
 		stop:           req.Stop,
 		numKeep:        req.NumKeep,
 		samplingParams: &samplingParams,
 		embedding:      false,
 		returnLogits:   req.ReturnLogits,
 	})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
@ -691,10 +728,10 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			close(seq.quit)
 			return
 		case content, ok := <-seq.responses:
 			slog.Info("logits in last chan", "content", content.Logits[0])
 			if ok {
-				if err := json.NewEncoder(w).Encode(&CompletionResponse{
+				slog.Info("content", "content", content.Content)
-					Content: content,
+				if err := json.NewEncoder(w).Encode(&content); err != nil {
 				}); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return
--- a/llm/server.go
+++ b/llm/server.go
@ -647,6 +647,7 @@ type completion struct {
 	Prompt       string    `json:"prompt"`
 	Stop         bool      `json:"stop"`
 	StoppedLimit bool      `json:"stopped_limit"`
 	Logits       []float32 `json:"logits,omitempty"`
 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@ -661,6 +662,7 @@ type CompletionRequest struct {
 	Format       json.RawMessage
 	Images       []ImageData
 	Options      *api.Options
 	ReturnLogits bool
 }
 type CompletionResponse struct {
@ -671,6 +673,7 @@ type CompletionResponse struct {
 	PromptEvalDuration time.Duration
 	EvalCount          int
 	EvalDuration       time.Duration
 	Logits             []float32
 }
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
@ -696,6 +699,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"seed":              req.Options.Seed,
 		"stop":              req.Options.Stop,
 		"image_data":        req.Images,
 		"return_logits":     req.ReturnLogits,
 		"cache_prompt":      true,
 	}
@ -821,6 +825,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			if c.Content != "" {
 				fn(CompletionResponse{
 					Content: c.Content,
 					Logits:  c.Logits,
 				})
 			}
@ -837,6 +842,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 					PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 					EvalCount:          c.Timings.PredictedN,
 					EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
 					Logits:             c.Logits,
 				})
 				return nil
 			}
--- a/server/routes.go
+++ b/server/routes.go
@ -299,6 +299,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			Images:       images,
 			Format:       req.Format,
 			Options:      opts,
 			ReturnLogits: req.ReturnLogits,
 		}, func(cr llm.CompletionResponse) {
 			res := api.GenerateResponse{
 				Model:      req.Model,
@ -312,6 +313,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 					EvalCount:          cr.EvalCount,
 					EvalDuration:       cr.EvalDuration,
 				},
 				Logits: cr.Logits,
 			}
 			if _, err := sb.WriteString(cr.Content); err != nil {
@ -1541,6 +1543,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	slog.Debug("chat request", "images", len(images), "prompt", prompt)
 	slog.Info("chat request", "return_logits", req.ReturnLogits)
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
@ -1551,6 +1555,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			Images:       images,
 			Format:       req.Format,
 			Options:      opts,
 			ReturnLogits: true,
 		}, func(r llm.CompletionResponse) {
 			res := api.ChatResponse{
 				Model:      req.Model,
@ -1558,6 +1563,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				Message:    api.Message{Role: "assistant", Content: r.Content},
 				Done:       r.Done,
 				DoneReason: r.DoneReason,
 				Logits:     r.Logits,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,