Update readme.md

Add a function-calling example for Python; a TypeScript example already existed.
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-29 10:45:07 -08:00 · 2023-11-29 10:06:11 -08:00
43 changed files with 1212 additions and 2280 deletions
--- a/6
+++ b/6
@@ -19,11 +19,5 @@ RUN apt-get update && apt-get install -y ca-certificates
 COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
-
-# set some environment variable for better NVIDIA compatibility
-ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
--- a/README.md
+++ b/README.md
@@ -57,7 +57,6 @@ Here are some example open-source models that can be downloaded:
 | Llama 2 70B        | 70B        | 39GB  | `ollama run llama2:70b`        |
 | Orca Mini          | 3B         | 1.9GB | `ollama run orca-mini`         |
 | Vicuna             | 7B         | 3.8GB | `ollama run vicuna`            |
-| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |

 > Note: You should have at least 8 GB of RAM to run the 3B models, 16 GB to run the 7B models, and 32 GB to run the 13B models.

@@ -105,7 +104,7 @@ FROM llama2
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1

-# set the system message
+# set the system prompt
 SYSTEM """
 You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
 """
@@ -159,13 +158,6 @@ For multiline input, you can wrap text with `"""`:
 I'm a basic program that prints the famous "Hello, world!" message to the console.
 ```

-### Multimodal models
-
-```
->>> What's in this image? /Users/jmorgan/Desktop/smile.png
-The image features a yellow smiley face, which is likely the central focus of the picture.
-```
-
 ### Pass in prompt as arguments

 ```
@@ -213,8 +205,7 @@ Finally, in a separate shell, run a model:
 ## REST API

 Ollama has a REST API for running and managing models.
-
-### Generate a response
+For example, to generate text from a model:

 ```
 curl http://localhost:11434/api/generate -d '{
@@ -223,21 +214,14 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```

-### Chat with a model
-
-```
-curl http://localhost:11434/api/chat -d '{
-  "model": "mistral",
-  "messages": [
-    { "role": "user", "content": "why is the sky blue?" }
-  ]
-}'
-```
-
 See the [API documentation](./docs/api.md) for all endpoints.

 ## Community Integrations

+### Mobile
+
+- [Mobile Artificial Intelligence Distribution](https://github.com/MaidFoundation/Maid) (Maid)
+
 ### Web & Desktop

 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
@@ -249,7 +233,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
-- [chatd](https://github.com/BruceMacD/chatd)

 ### Terminal

@@ -262,10 +245,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [gptel Emacs client](https://github.com/karthink/gptel)
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)

-### Database
-
-- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md)
-
 ### Package managers

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
@@ -297,7 +276,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
 - [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
 - [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
-- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
 - [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
--- a/api/client.go
+++ b/api/client.go
@@ -221,19 +221,6 @@ func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn Generate
 	})
 }

-type ChatResponseFunc func(ChatResponse) error
-
-func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
-	return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
-		var resp ChatResponse
-		if err := json.Unmarshal(bts, &resp); err != nil {
-			return err
-		}
-
-		return fn(resp)
-	})
-}
-
 type PullProgressFunc func(ProgressResponse) error

 func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
@@ -324,15 +311,3 @@ func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) err

 	return nil
 }
-
-func (c *Client) Version(ctx context.Context) (string, error) {
-	var version struct {
-		Version string `json:"version"`
-	}
-
-	if err := c.do(ctx, http.MethodGet, "/api/version", nil, &version); err != nil {
-		return "", err
-	}
-
-	return version.Version, nil
-}
--- a/api/types.go
+++ b/api/types.go
@@ -6,7 +6,6 @@ import (
 	"math"
 	"os"
 	"reflect"
-	"strconv"
 	"strings"
 	"time"
 )
@@ -31,56 +30,19 @@ func (e StatusError) Error() string {
 	}
 }

-type ImageData []byte
-
 type GenerateRequest struct {
-	Model    string      `json:"model"`
-	Prompt   string      `json:"prompt"`
-	System   string      `json:"system"`
-	Template string      `json:"template"`
-	Context  []int       `json:"context,omitempty"`
-	Stream   *bool       `json:"stream,omitempty"`
-	Raw      bool        `json:"raw,omitempty"`
-	Format   string      `json:"format"`
-	Images   []ImageData `json:"images,omitempty"`
+	Model    string `json:"model"`
+	Prompt   string `json:"prompt"`
+	System   string `json:"system"`
+	Template string `json:"template"`
+	Context  []int  `json:"context,omitempty"`
+	Stream   *bool  `json:"stream,omitempty"`
+	Raw      bool   `json:"raw,omitempty"`
+	Format   string `json:"format"`

 	Options map[string]interface{} `json:"options"`
 }

-type ChatRequest struct {
-	Model    string    `json:"model"`
-	Messages []Message `json:"messages"`
-	Stream   *bool     `json:"stream,omitempty"`
-	Format   string    `json:"format"`
-
-	Options map[string]interface{} `json:"options"`
-}
-
-type Message struct {
-	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
-	Content string      `json:"content"`
-	Images  []ImageData `json:"images, omitempty"`
-}
-
-type ChatResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Message   *Message  `json:"message,omitempty"`
-
-	Done bool `json:"done"`
-
-	Metrics
-}
-
-type Metrics struct {
-	TotalDuration      time.Duration `json:"total_duration,omitempty"`
-	LoadDuration       time.Duration `json:"load_duration,omitempty"`
-	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
-	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
-	EvalCount          int           `json:"eval_count,omitempty"`
-	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
-}
-
 // Options specfied in GenerateRequest, if you add a new option here add it to the API docs also
 type Options struct {
 	Runner
@@ -152,12 +114,11 @@ type ShowRequest struct {
 }

 type ShowResponse struct {
-	License    string       `json:"license,omitempty"`
-	Modelfile  string       `json:"modelfile,omitempty"`
-	Parameters string       `json:"parameters,omitempty"`
-	Template   string       `json:"template,omitempty"`
-	System     string       `json:"system,omitempty"`
-	Details    ModelDetails `json:"details,omitempty"`
+	License    string `json:"license,omitempty"`
+	Modelfile  string `json:"modelfile,omitempty"`
+	Parameters string `json:"parameters,omitempty"`
+	Template   string `json:"template,omitempty"`
+	System     string `json:"system,omitempty"`
 }

 type CopyRequest struct {
@@ -193,11 +154,10 @@ type ListResponse struct {
 }

 type ModelResponse struct {
-	Name       string       `json:"name"`
-	ModifiedAt time.Time    `json:"modified_at"`
-	Size       int64        `json:"size"`
-	Digest     string       `json:"digest"`
-	Details    ModelDetails `json:"details,omitempty"`
+	Name       string    `json:"name"`
+	ModifiedAt time.Time `json:"modified_at"`
+	Size       int64     `json:"size"`
+	Digest     string    `json:"digest"`
 }

 type TokenResponse struct {
@@ -212,42 +172,39 @@ type GenerateResponse struct {
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`

-	Metrics
+	TotalDuration      time.Duration `json:"total_duration,omitempty"`
+	LoadDuration       time.Duration `json:"load_duration,omitempty"`
+	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
+	EvalCount          int           `json:"eval_count,omitempty"`
+	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-type ModelDetails struct {
-	Format            string   `json:"format"`
-	Family            string   `json:"family"`
-	Families          []string `json:"families"`
-	ParameterSize     string   `json:"parameter_size"`
-	QuantizationLevel string   `json:"quantization_level"`
-}
-
-func (m *Metrics) Summary() {
-	if m.TotalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
+func (r *GenerateResponse) Summary() {
+	if r.TotalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "total duration:       %v\n", r.TotalDuration)
 	}

-	if m.LoadDuration > 0 {
-		fmt.Fprintf(os.Stderr, "load duration:        %v\n", m.LoadDuration)
+	if r.LoadDuration > 0 {
+		fmt.Fprintf(os.Stderr, "load duration:        %v\n", r.LoadDuration)
 	}

-	if m.PromptEvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval count:    %d token(s)\n", m.PromptEvalCount)
+	if r.PromptEvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval count:    %d token(s)\n", r.PromptEvalCount)
 	}

-	if m.PromptEvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", m.PromptEvalDuration)
-		fmt.Fprintf(os.Stderr, "prompt eval rate:     %.2f tokens/s\n", float64(m.PromptEvalCount)/m.PromptEvalDuration.Seconds())
+	if r.PromptEvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "prompt eval duration: %s\n", r.PromptEvalDuration)
+		fmt.Fprintf(os.Stderr, "prompt eval rate:     %.2f tokens/s\n", float64(r.PromptEvalCount)/r.PromptEvalDuration.Seconds())
 	}

-	if m.EvalCount > 0 {
-		fmt.Fprintf(os.Stderr, "eval count:           %d token(s)\n", m.EvalCount)
+	if r.EvalCount > 0 {
+		fmt.Fprintf(os.Stderr, "eval count:           %d token(s)\n", r.EvalCount)
 	}

-	if m.EvalDuration > 0 {
-		fmt.Fprintf(os.Stderr, "eval duration:        %s\n", m.EvalDuration)
-		fmt.Fprintf(os.Stderr, "eval rate:            %.2f tokens/s\n", float64(m.EvalCount)/m.EvalDuration.Seconds())
+	if r.EvalDuration > 0 {
+		fmt.Fprintf(os.Stderr, "eval duration:        %s\n", r.EvalDuration)
+		fmt.Fprintf(os.Stderr, "eval rate:            %.2f tokens/s\n", float64(r.EvalCount)/r.EvalDuration.Seconds())
 	}
 }

@@ -403,63 +360,3 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {

 	return nil
 }
-
-// FormatParams converts specified parameter options to their correct types
-func FormatParams(params map[string][]string) (map[string]interface{}, error) {
-	opts := Options{}
-	valueOpts := reflect.ValueOf(&opts).Elem() // names of the fields in the options struct
-	typeOpts := reflect.TypeOf(opts)           // types of the fields in the options struct
-
-	// build map of json struct tags to their types
-	jsonOpts := make(map[string]reflect.StructField)
-	for _, field := range reflect.VisibleFields(typeOpts) {
-		jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
-		if jsonTag != "" {
-			jsonOpts[jsonTag] = field
-		}
-	}
-
-	out := make(map[string]interface{})
-	// iterate params and set values based on json struct tags
-	for key, vals := range params {
-		if opt, ok := jsonOpts[key]; !ok {
-			return nil, fmt.Errorf("unknown parameter '%s'", key)
-		} else {
-			field := valueOpts.FieldByName(opt.Name)
-			if field.IsValid() && field.CanSet() {
-				switch field.Kind() {
-				case reflect.Float32:
-					floatVal, err := strconv.ParseFloat(vals[0], 32)
-					if err != nil {
-						return nil, fmt.Errorf("invalid float value %s", vals)
-					}
-
-					out[key] = float32(floatVal)
-				case reflect.Int:
-					intVal, err := strconv.ParseInt(vals[0], 10, 64)
-					if err != nil {
-						return nil, fmt.Errorf("invalid int value %s", vals)
-					}
-
-					out[key] = intVal
-				case reflect.Bool:
-					boolVal, err := strconv.ParseBool(vals[0])
-					if err != nil {
-						return nil, fmt.Errorf("invalid bool value %s", vals)
-					}
-
-					out[key] = boolVal
-				case reflect.String:
-					out[key] = vals[0]
-				case reflect.Slice:
-					// TODO: only string slices are supported right now
-					out[key] = vals
-				default:
-					return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
-				}
-			}
-		}
-	}
-
-	return out, nil
-}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -17,9 +17,7 @@ import (
 	"os/exec"
 	"os/signal"
 	"path/filepath"
-	"regexp"
 	"runtime"
-	"slices"
 	"strings"
 	"syscall"
 	"time"
@@ -38,8 +36,6 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

-type ImageData []byte
-
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -137,7 +133,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)}
-	if err := client.Create(cmd.Context(), &request, fn); err != nil {
+	if err := client.Create(context.Background(), &request, fn); err != nil {
 		return err
 	}

@@ -152,7 +148,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	name := args[0]
 	// check if the model exists on the server
-	_, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+	_, err = client.Show(context.Background(), &api.ShowRequest{Name: name})
 	var statusError api.StatusError
 	switch {
 	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
@@ -212,7 +208,7 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.PushRequest{Name: args[0], Insecure: insecure}
-	if err := client.Push(cmd.Context(), &request, fn); err != nil {
+	if err := client.Push(context.Background(), &request, fn); err != nil {
 		return err
 	}

@@ -226,7 +222,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	models, err := client.List(cmd.Context())
+	models, err := client.List(context.Background())
 	if err != nil {
 		return err
 	}
@@ -261,7 +257,7 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {

 	for _, name := range args {
 		req := api.DeleteRequest{Name: name}
-		if err := client.Delete(cmd.Context(), &req); err != nil {
+		if err := client.Delete(context.Background(), &req); err != nil {
 			return err
 		}
 		fmt.Printf("deleted '%s'\n", name)
@@ -326,7 +322,7 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	}

 	req := api.ShowRequest{Name: args[0]}
-	resp, err := client.Show(cmd.Context(), &req)
+	resp, err := client.Show(context.Background(), &req)
 	if err != nil {
 		return err
 	}
@@ -354,7 +350,7 @@ func CopyHandler(cmd *cobra.Command, args []string) error {
 	}

 	req := api.CopyRequest{Source: args[0], Destination: args[1]}
-	if err := client.Copy(cmd.Context(), &req); err != nil {
+	if err := client.Copy(context.Background(), &req); err != nil {
 		return err
 	}
 	fmt.Printf("copied '%s' to '%s'\n", args[0], args[1])
@@ -408,7 +404,7 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 	}

 	request := api.PullRequest{Name: args[0], Insecure: insecure}
-	if err := client.Pull(cmd.Context(), &request, fn); err != nil {
+	if err := client.Pull(context.Background(), &request, fn); err != nil {
 		return err
 	}

@@ -416,22 +412,13 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 }

 func RunGenerate(cmd *cobra.Command, args []string) error {
-	interactive := true
-
-	opts := generateOptions{
-		Model:    args[0],
-		WordWrap: os.Getenv("TERM") == "xterm-256color",
-		Options:  map[string]interface{}{},
-		Images:   []ImageData{},
-	}
-
 	format, err := cmd.Flags().GetString("format")
 	if err != nil {
 		return err
 	}
-	opts.Format = format

 	prompts := args[1:]
+
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
 		in, err := io.ReadAll(os.Stdin)
@@ -440,41 +427,34 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 		}

 		prompts = append([]string{string(in)}, prompts...)
-		opts.WordWrap = false
-		interactive = false
 	}
-	opts.Prompt = strings.Join(prompts, " ")
-	if len(prompts) > 0 {
-		interactive = false
+
+	// output is being piped
+	if !term.IsTerminal(int(os.Stdout.Fd())) {
+		return generate(cmd, args[0], strings.Join(prompts, " "), false, format)
 	}

+	wordWrap := os.Getenv("TERM") == "xterm-256color"
+
 	nowrap, err := cmd.Flags().GetBool("nowordwrap")
 	if err != nil {
 		return err
 	}
-	opts.WordWrap = !nowrap
-
-	if !interactive {
-		return generate(cmd, opts)
+	if nowrap {
+		wordWrap = false
 	}

-	return generateInteractive(cmd, opts)
+	// prompts are provided via stdin or args so don't enter interactive mode
+	if len(prompts) > 0 {
+		return generate(cmd, args[0], strings.Join(prompts, " "), wordWrap, format)
+	}
+
+	return generateInteractive(cmd, args[0], wordWrap, format)
 }

 type generateContextKey string

-type generateOptions struct {
-	Model    string
-	Prompt   string
-	WordWrap bool
-	Format   string
-	System   string
-	Template string
-	Images   []ImageData
-	Options  map[string]interface{}
-}
-
-func generate(cmd *cobra.Command, opts generateOptions) error {
+func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
@@ -495,39 +475,34 @@ func generate(cmd *cobra.Command, opts generateOptions) error {

 	termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
 	if err != nil {
-		opts.WordWrap = false
+		wordWrap = false
 	}

-	ctx, cancel := context.WithCancel(cmd.Context())
+	cancelCtx, cancel := context.WithCancel(context.Background())
 	defer cancel()

 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT)
+	var abort bool

 	go func() {
 		<-sigChan
 		cancel()
+		abort = true
 	}()

 	var currentLineLength int
 	var wordBuffer string

+	request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext, Format: format}
 	fn := func(response api.GenerateResponse) error {
 		p.StopAndClear()

 		latest = response

-		termWidth, _, _ = term.GetSize(int(os.Stdout.Fd()))
-		if opts.WordWrap && termWidth >= 10 {
+		if wordWrap {
 			for _, ch := range response.Response {
 				if currentLineLength+1 > termWidth-5 {
-					if len(wordBuffer) > termWidth-10 {
-						fmt.Printf("%s%c", wordBuffer, ch)
-						wordBuffer = ""
-						currentLineLength = 0
-						continue
-					}
-
 					// backtrack the length of the last word and clear to the end of the line
 					fmt.Printf("\x1b[%dD\x1b[K\n", len(wordBuffer))
 					fmt.Printf("%s%c", wordBuffer, ch)
@@ -547,43 +522,28 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 				}
 			}
 		} else {
-			fmt.Printf("%s%s", wordBuffer, response.Response)
-			if len(wordBuffer) > 0 {
-				wordBuffer = ""
-			}
+			fmt.Print(response.Response)
 		}

 		return nil
 	}

-	images := make([]api.ImageData, 0)
-	for _, i := range opts.Images {
-		images = append(images, api.ImageData(i))
-	}
-	request := api.GenerateRequest{
-		Model:    opts.Model,
-		Prompt:   opts.Prompt,
-		Context:  generateContext,
-		Format:   opts.Format,
-		System:   opts.System,
-		Template: opts.Template,
-		Options:  opts.Options,
-		Images:   images,
-	}
-
-	if err := client.Generate(ctx, &request, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
+	if err := client.Generate(cancelCtx, &request, fn); err != nil {
+		if strings.Contains(err.Error(), "context canceled") && abort {
 			return nil
 		}
 		return err
 	}
-	if opts.Prompt != "" {
+	if prompt != "" {
 		fmt.Println()
 		fmt.Println()
 	}

 	if !latest.Done {
-		return nil
+		if abort {
+			return nil
+		}
+		return errors.New("unexpected end of response")
 	}

 	verbose, err := cmd.Flags().GetBool("verbose")
@@ -595,48 +555,16 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 		latest.Summary()
 	}

-	ctx = context.WithValue(cmd.Context(), generateContextKey("context"), latest.Context)
+	ctx := cmd.Context()
+	ctx = context.WithValue(ctx, generateContextKey("context"), latest.Context)
 	cmd.SetContext(ctx)

 	return nil
 }

-type MultilineState int
-
-const (
-	MultilineNone MultilineState = iota
-	MultilinePrompt
-	MultilineSystem
-	MultilineTemplate
-)
-
-func modelIsMultiModal(cmd *cobra.Command, name string) bool {
-	// get model details
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		fmt.Println("error: couldn't connect to ollama server")
-		return false
-	}
-
-	req := api.ShowRequest{Name: name}
-	resp, err := client.Show(cmd.Context(), &req)
-	if err != nil {
-		return false
-	}
-
-	return slices.Contains(resp.Details.Families, "clip")
-}
-
-func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
-	multiModal := modelIsMultiModal(cmd, opts.Model)
-
+func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format string) error {
 	// load the model
-	loadOpts := generateOptions{
-		Model:  opts.Model,
-		Prompt: "",
-		Images: []ImageData{},
-	}
-	if err := generate(cmd, loadOpts); err != nil {
+	if err := generate(cmd, model, "", false, ""); err != nil {
 		return err
 	}

@@ -653,17 +581,14 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {

 	usageSet := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
-		fmt.Fprintln(os.Stderr, "  /set parameter ...     Set a parameter")
-		fmt.Fprintln(os.Stderr, "  /set system <string>   Set system message")
-		fmt.Fprintln(os.Stderr, "  /set template <string> Set prompt template")
-		fmt.Fprintln(os.Stderr, "  /set history           Enable history")
-		fmt.Fprintln(os.Stderr, "  /set nohistory         Disable history")
-		fmt.Fprintln(os.Stderr, "  /set wordwrap          Enable wordwrap")
-		fmt.Fprintln(os.Stderr, "  /set nowordwrap        Disable wordwrap")
-		fmt.Fprintln(os.Stderr, "  /set format json       Enable JSON mode")
-		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
-		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
-		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
+		fmt.Fprintln(os.Stderr, "  /set history      Enable history")
+		fmt.Fprintln(os.Stderr, "  /set nohistory    Disable history")
+		fmt.Fprintln(os.Stderr, "  /set wordwrap     Enable wordwrap")
+		fmt.Fprintln(os.Stderr, "  /set nowordwrap   Disable wordwrap")
+		fmt.Fprintln(os.Stderr, "  /set format json  Enable JSON mode")
+		fmt.Fprintln(os.Stderr, "  /set noformat     Disable formatting")
+		fmt.Fprintln(os.Stderr, "  /set verbose      Show LLM stats")
+		fmt.Fprintln(os.Stderr, "  /set quiet        Disable LLM stats")
 		fmt.Fprintln(os.Stderr, "")
 	}

@@ -672,27 +597,11 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 		fmt.Fprintln(os.Stderr, "  /show license      Show model license")
 		fmt.Fprintln(os.Stderr, "  /show modelfile    Show Modelfile for this model")
 		fmt.Fprintln(os.Stderr, "  /show parameters   Show parameters for this model")
-		fmt.Fprintln(os.Stderr, "  /show system       Show system message")
+		fmt.Fprintln(os.Stderr, "  /show system       Show system prompt")
 		fmt.Fprintln(os.Stderr, "  /show template     Show prompt template")
 		fmt.Fprintln(os.Stderr, "")
 	}

-	// only list out the most common parameters
-	usageParameters := func() {
-		fmt.Fprintln(os.Stderr, "Available Parameters:")
-		fmt.Fprintln(os.Stderr, "  /set parameter seed <int>             Random number seed")
-		fmt.Fprintln(os.Stderr, "  /set parameter num_predict <int>      Max number of tokens to predict")
-		fmt.Fprintln(os.Stderr, "  /set parameter top_k <int>            Pick from top k num of tokens")
-		fmt.Fprintln(os.Stderr, "  /set parameter top_p <float>          Pick token based on sum of probabilities")
-		fmt.Fprintln(os.Stderr, "  /set parameter num_ctx <int>          Set the context size")
-		fmt.Fprintln(os.Stderr, "  /set parameter temperature <float>    Set creativity level")
-		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty <float> How strongly to penalize repetitions")
-		fmt.Fprintln(os.Stderr, "  /set parameter repeat_last_n <int>    Set how far back to look for repetitions")
-		fmt.Fprintln(os.Stderr, "  /set parameter num_gpu <int>          The number of layers to send to the GPU")
-		fmt.Fprintln(os.Stderr, "  /set parameter stop \"<string>\", ...   Set the stop parameters")
-		fmt.Fprintln(os.Stderr, "")
-	}
-
 	scanner, err := readline.New(readline.Prompt{
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
@@ -706,7 +615,6 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	fmt.Print(readline.StartBracketedPaste)
 	defer fmt.Printf(readline.EndBracketedPaste)

-	var multiline MultilineState
 	var prompt string

 	for {
@@ -733,30 +641,16 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			// if the prompt so far starts with """ then we're in multiline mode
 			// and we need to keep reading until we find a line that ends with """
 			cut, found := strings.CutSuffix(line, `"""`)
-			prompt += cut
+			prompt += cut + "\n"

 			if !found {
-				prompt += "\n"
 				continue
 			}

 			prompt = strings.TrimPrefix(prompt, `"""`)
 			scanner.Prompt.UseAlt = false
-
-			switch multiline {
-			case MultilineSystem:
-				opts.System = prompt
-				prompt = ""
-				fmt.Println("Set system message.")
-			case MultilineTemplate:
-				opts.Template = prompt
-				prompt = ""
-				fmt.Println("Set prompt template.")
-			}
-			multiline = MultilineNone
 		case strings.HasPrefix(line, `"""`) && len(prompt) == 0:
 			scanner.Prompt.UseAlt = true
-			multiline = MultilinePrompt
 			prompt += line + "\n"
 			continue
 		case scanner.Pasting:
@@ -776,10 +670,10 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 				case "nohistory":
 					scanner.HistoryDisable()
 				case "wordwrap":
-					opts.WordWrap = true
+					wordWrap = true
 					fmt.Println("Set 'wordwrap' mode.")
 				case "nowordwrap":
-					opts.WordWrap = false
+					wordWrap = false
 					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
 					cmd.Flags().Set("verbose", "true")
@@ -791,60 +685,12 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					if len(args) < 3 || args[2] != "json" {
 						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
 					} else {
-						opts.Format = args[2]
+						format = args[2]
 						fmt.Printf("Set format to '%s' mode.\n", args[2])
 					}
 				case "noformat":
-					opts.Format = ""
+					format = ""
 					fmt.Println("Disabled format.")
-				case "parameter":
-					if len(args) < 4 {
-						usageParameters()
-						continue
-					}
-					var params []string
-					for _, p := range args[3:] {
-						params = append(params, p)
-					}
-					fp, err := api.FormatParams(map[string][]string{args[2]: params})
-					if err != nil {
-						fmt.Printf("Couldn't set parameter: %q\n\n", err)
-						continue
-					}
-					fmt.Printf("Set parameter '%s' to '%s'\n\n", args[2], strings.Join(params, ", "))
-					opts.Options[args[2]] = fp[args[2]]
-				case "system", "template":
-					if len(args) < 3 {
-						usageSet()
-						continue
-					}
-					line := strings.Join(args[2:], " ")
-					line = strings.TrimPrefix(line, `"""`)
-					if strings.HasPrefix(args[2], `"""`) {
-						cut, found := strings.CutSuffix(line, `"""`)
-						prompt += cut
-						if found {
-							if args[1] == "system" {
-								opts.System = prompt
-								fmt.Println("Set system message.")
-							} else {
-								opts.Template = prompt
-								fmt.Println("Set prompt template.")
-							}
-							prompt = ""
-						} else {
-							prompt = `"""` + prompt + "\n"
-							if args[1] == "system" {
-								multiline = MultilineSystem
-							} else {
-								multiline = MultilineTemplate
-							}
-							scanner.Prompt.UseAlt = true
-						}
-					} else {
-						opts.System = line
-						fmt.Println("Set system message.")
-					}
 				default:
 					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
 				}
@@ -859,7 +705,7 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					fmt.Println("error: couldn't connect to ollama server")
 					return err
 				}
-				resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: opts.Model})
+				resp, err := client.Show(cmd.Context(), &api.ShowRequest{Name: model})
 				if err != nil {
 					fmt.Println("error: couldn't get model")
 					return err
@@ -878,33 +724,19 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 					if resp.Parameters == "" {
 						fmt.Print("No parameters were specified for this model.\n\n")
 					} else {
-						if len(opts.Options) > 0 {
-							fmt.Println("User defined parameters:")
-							for k, v := range opts.Options {
-								fmt.Printf("%-*s %v\n", 30, k, v)
-							}
-							fmt.Println()
-						}
-						fmt.Println("Model defined parameters:")
 						fmt.Println(resp.Parameters)
 					}
 				case "system":
-					switch {
-					case opts.System != "":
-						fmt.Println(opts.System + "\n")
-					case resp.System != "":
-						fmt.Println(resp.System + "\n")
-					default:
-						fmt.Print("No system message was specified for this model.\n\n")
+					if resp.System == "" {
+						fmt.Print("No system prompt was specified for this model.\n\n")
+					} else {
+						fmt.Println(resp.System)
 					}
 				case "template":
-					switch {
-					case opts.Template != "":
-						fmt.Println(opts.Template + "\n")
-					case resp.Template != "":
-						fmt.Println(resp.Template)
-					default:
+					if resp.Template == "" {
 						fmt.Print("No prompt template was specified for this model.\n\n")
+					} else {
+						fmt.Println(resp.Template)
 					}
 				default:
 					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
@@ -934,30 +766,8 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 			prompt += line
 		}

-		if len(prompt) > 0 && multiline == MultilineNone {
-			opts.Prompt = prompt
-			if multiModal {
-				newPrompt, images, err := extractFileNames(prompt)
-				if err != nil {
-					return err
-				}
-				opts.Prompt = newPrompt
-
-				// reset the context if we find another image
-				if len(images) > 0 {
-					opts.Images = images
-					ctx := cmd.Context()
-					ctx = context.WithValue(ctx, generateContextKey("context"), []int{})
-					cmd.SetContext(ctx)
-				}
-				if len(opts.Images) == 0 {
-					fmt.Println("This model requires you to add a jpeg, png, or svg image.")
-					fmt.Println()
-					prompt = ""
-					continue
-				}
-			}
-			if err := generate(cmd, opts); err != nil {
+		if len(prompt) > 0 && prompt[0] != '/' {
+			if err := generate(cmd, model, prompt, wordWrap, format); err != nil {
 				return err
 			}

@@ -966,57 +776,6 @@ func generateInteractive(cmd *cobra.Command, opts generateOptions) error {
 	}
 }

-func normalizeFilePath(fp string) string {
-	// Define a map of escaped characters and their replacements
-	replacements := map[string]string{
-		"\\ ":  " ",  // Escaped space
-		"\\(":  "(",  // Escaped left parenthesis
-		"\\)":  ")",  // Escaped right parenthesis
-		"\\[":  "[",  // Escaped left square bracket
-		"\\]":  "]",  // Escaped right square bracket
-		"\\{":  "{",  // Escaped left curly brace
-		"\\}":  "}",  // Escaped right curly brace
-		"\\$":  "$",  // Escaped dollar sign
-		"\\&":  "&",  // Escaped ampersand
-		"\\;":  ";",  // Escaped semicolon
-		"\\'":  "'",  // Escaped single quote
-		"\\\\": "\\", // Escaped backslash
-		"\\*":  "*",  // Escaped asterisk
-		"\\?":  "?",  // Escaped question mark
-	}
-
-	for escaped, actual := range replacements {
-		fp = strings.ReplaceAll(fp, escaped, actual)
-	}
-	return fp
-}
-
-func extractFileNames(input string) (string, []ImageData, error) {
-	// Regex to match file paths starting with / or ./ and include escaped spaces (\ or %20)
-	// and followed by more characters and a file extension
-	regexPattern := `(?:\./|/)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
-	re := regexp.MustCompile(regexPattern)
-
-	filePaths := re.FindAllString(input, -1)
-	var imgs []ImageData
-
-	for _, fp := range filePaths {
-		nfp := normalizeFilePath(fp)
-		data, err := getImageData(nfp)
-		if err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-			fmt.Printf("Couldn't process image: %q\n", err)
-			return "", imgs, err
-		}
-		fmt.Printf("Added image '%s'\n", nfp)
-		input = strings.ReplaceAll(input, fp, "")
-		imgs = append(imgs, data)
-	}
-	return input, imgs, nil
-}
-
 func RunServer(cmd *cobra.Command, _ []string) error {
 	host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
 	if err != nil {
@@ -1043,50 +802,6 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 	return server.Serve(ln, origins)
 }

-func getImageData(filePath string) ([]byte, error) {
-	file, err := os.Open(filePath)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
-
-	buf := make([]byte, 512)
-	_, err = file.Read(buf)
-	if err != nil {
-		return nil, err
-	}
-
-	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/svg+xml", "image/png"}
-	if !slices.Contains(allowedTypes, contentType) {
-		return nil, fmt.Errorf("invalid image type: %s", contentType)
-	}
-
-	info, err := file.Stat()
-	if err != nil {
-		return nil, err
-	}
-
-	// Check if the file size exceeds 100MB
-	var maxSize int64 = 100 * 1024 * 1024 // 100MB in bytes
-	if info.Size() > maxSize {
-		return nil, fmt.Errorf("file size exceeds maximum limit (100MB)")
-	}
-
-	buf = make([]byte, info.Size())
-	_, err = file.Seek(0, 0)
-	if err != nil {
-		return nil, err
-	}
-
-	_, err = io.ReadFull(file, buf)
-	if err != nil {
-		return nil, err
-	}
-
-	return buf, nil
-}
-
 func initializeKeypair() error {
 	home, err := os.UserHomeDir()
 	if err != nil {
@@ -1136,7 +851,7 @@ func initializeKeypair() error {
 	return nil
 }

-func startMacApp(ctx context.Context, client *api.Client) error {
+func startMacApp(client *api.Client) error {
 	exe, err := os.Executable()
 	if err != nil {
 		return err
@@ -1160,24 +875,24 @@ func startMacApp(ctx context.Context, client *api.Client) error {
 		case <-timeout:
 			return errors.New("timed out waiting for server to start")
 		case <-tick:
-			if err := client.Heartbeat(ctx); err == nil {
+			if err := client.Heartbeat(context.Background()); err == nil {
 				return nil // server has started
 			}
 		}
 	}
 }

-func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
+func checkServerHeartbeat(_ *cobra.Command, _ []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
-	if err := client.Heartbeat(cmd.Context()); err != nil {
+	if err := client.Heartbeat(context.Background()); err != nil {
 		if !strings.Contains(err.Error(), "connection refused") {
 			return err
 		}
 		if runtime.GOOS == "darwin" {
-			if err := startMacApp(cmd.Context(), client); err != nil {
+			if err := startMacApp(client); err != nil {
 				return fmt.Errorf("could not connect to ollama app, is it running?")
 			}
 		} else {
@@ -1187,29 +902,8 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 	return nil
 }

-func versionHandler(cmd *cobra.Command, _ []string) {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return
-	}
-
-	serverVersion, err := client.Version(cmd.Context())
-	if err != nil {
-		fmt.Println("Warning: could not connect to a running Ollama instance")
-	}
-
-	if serverVersion != "" {
-		fmt.Printf("ollama version is %s\n", serverVersion)
-	}
-
-	if serverVersion != version.Version {
-		fmt.Printf("Warning: client version is %s\n", version.Version)
-	}
-}
-
 func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
-	cobra.EnableCommandSorting = false

 	rootCmd := &cobra.Command{
 		Use:           "ollama",
@@ -1219,17 +913,10 @@ func NewCLI() *cobra.Command {
 		CompletionOptions: cobra.CompletionOptions{
 			DisableDefaultCmd: true,
 		},
-		Run: func(cmd *cobra.Command, args []string) {
-			if version, _ := cmd.Flags().GetBool("version"); version {
-				versionHandler(cmd, args)
-				return
-			}
-
-			cmd.Print(cmd.UsageString())
-		},
+		Version: version.Version,
 	}

-	rootCmd.Flags().BoolP("version", "v", false, "Show version information")
+	cobra.EnableCommandSorting = false

 	createCmd := &cobra.Command{
 		Use:     "create MODEL",
@@ -1253,7 +940,7 @@ func NewCLI() *cobra.Command {
 	showCmd.Flags().Bool("modelfile", false, "Show Modelfile of a model")
 	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
 	showCmd.Flags().Bool("template", false, "Show template of a model")
-	showCmd.Flags().Bool("system", false, "Show system message of a model")
+	showCmd.Flags().Bool("system", false, "Show system prompt of a model")

 	runCmd := &cobra.Command{
 		Use:     "run MODEL [PROMPT]",
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,7 +3,6 @@
 ## Endpoints

 - [Generate a completion](#generate-a-completion)
- [Generate a chat completion](#generate-a-chat-completion)
 - [Create a Model](#create-a-model)
 - [List Local Models](#list-local-models)
 - [Show Model Information](#show-model-information)
@@ -25,7 +24,7 @@ All durations are returned in nanoseconds.

 ### Streaming responses

-Certain endpoints stream responses as JSON objects.
+Certain endpoints stream responses as JSON objects delimited by the newline (`\n`) character.

 ## Generate a completion

@@ -33,23 +32,22 @@ Certain endpoints stream responses as JSON objects.
 POST /api/generate
 ```

-Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
+Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.

 ### Parameters

 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
- `images`: a list of base64-encoded images (for multimodal models such as `llava`)

 Advanced parameters (optional):

 - `format`: the format to return a response in. Currently the only accepted value is `json`
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system message to (overrides what is defined in the `Modelfile`)
+- `system`: system prompt to use (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API.
+- `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.

 ### JSON mode

@@ -116,8 +114,6 @@ To calculate how fast the response is generated in tokens per second (token/s),

 #### Request (No streaming)

-A response can be recieved in one reply when streaming is off.
-
 ```shell
 curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
@@ -148,40 +144,9 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```

-#### Request (with images)
+#### Request (Raw mode)

-To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "llava",
-  "prompt":"What is in this picture?",
-  "stream": false,
-  "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jD
SsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKP
qsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
-}'
-```
-
-#### Response
-
-```
-{
-  "model": "llava",
-  "created_at": "2023-11-03T15:36:02.583064Z",
-  "response": "A happy cartoon character, which is cute and cheerful.",
-  "context": [1, 2, 3],
-  "done": true,
-  "total_duration": 14648695333,
-  "load_duration": 3302671417,
-  "prompt_eval_count": 14,
-  "prompt_eval_duration": 286243000,
-  "eval_count": 129,
-  "eval_duration": 10931424000
-}
-```
-
-#### Request (Raw Mode)
-
-In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting.
+In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.

 ```shell
 curl http://localhost:11434/api/generate -d '{
@@ -199,7 +164,6 @@ curl http://localhost:11434/api/generate -d '{
  "model": "mistral",
  "created_at": "2023-11-03T15:36:02.583064Z",
  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
-  "context": [1, 2, 3],
  "done": true,
  "total_duration": 14648695333,
  "load_duration": 3302671417,
@@ -285,7 +249,7 @@ curl http://localhost:11434/api/generate -d '{
    "penalize_newline": true,
    "stop": ["\n", "user:"],
    "numa": false,
-    "num_ctx": 1024,
+    "num_ctx": 4,
    "num_batch": 2,
    "num_gqa": 1,
    "num_gpu": 1,
@@ -300,7 +264,7 @@ curl http://localhost:11434/api/generate -d '{
    "rope_frequency_base": 1.1,
    "rope_frequency_scale": 0.8,
    "num_thread": 8
-  }
+    }
 }'
 ```

@@ -311,6 +275,7 @@ curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
+  "context": [1, 2, 3],
  "done": true,
  "total_duration": 5589157167,
  "load_duration": 3013701500,
@@ -323,159 +288,6 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```

-## Generate a chat completion
-
-```shell
-POST /api/chat
-```
-
-Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
-
-### Parameters
-
- `model`: (required) the [model name](#model-names)
- `messages`: the messages of the chat, this can be used to keep a chat memory
-
-The `message` object has the following fields:
-
- `role`: the role of the message, either `system`, `user` or `assistant`
- `content`: the content of the message
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
-
-Advanced parameters (optional):
-
- `format`: the format to return a response in. Currently the only accepted value is `json`
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
-
-### Examples
-
-#### Request
-
-Send a chat message with a streaming response.
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "why is the sky blue?"
-    }
-  ]
-}'
-```
-
-#### Response
-
-A stream of JSON objects is returned:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T08:52:19.385406455-07:00",
-  "message": {
-    "role": "assisant",
-    "content": "The"
-  },
-  "done": false
-}
-```
-
-Final response:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T19:22:45.499127Z",
-  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
-}
-```
-
-#### Request (With History)
-
-Send a chat message with a conversation history.
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "why is the sky blue?"
-    },
-    {
-      "role": "assistant",
-      "content": "due to rayleigh scattering."
-    },
-    {
-      "role": "user",
-      "content": "how is that different than mie scattering?"
-    }
-  ]
-}'
-```
-
-#### Response
-
-A stream of JSON objects is returned:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T08:52:19.385406455-07:00",
-  "message": {
-    "role": "assisant",
-    "content": "The"
-  },
-  "done": false
-}
-```
-
-Final response:
-
-```json
-{
-  "model": "llama2",
-  "created_at": "2023-08-04T19:22:45.499127Z",
-  "done": true,
-  "total_duration": 5589157167,
-  "load_duration": 3013701500,
-  "sample_count": 114,
-  "sample_duration": 81442000,
-  "prompt_eval_count": 46,
-  "prompt_eval_duration": 1160282000,
-  "eval_count": 113,
-  "eval_duration": 1325948000
-}
-```
-
-#### Request (with images)
-
-Send a chat message with a conversation history.
-
-```shell
-curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
-  "messages": [
-    {
-      "role": "user",
-      "content": "what is in this image?",
-      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQn
S4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jr
UwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
-    },
-  ]
-}'
-```
-
 ## Create a Model

 ```shell
@@ -603,7 +415,7 @@ A single JSON object will be returned.
 POST /api/show
 ```

-Show information about a model including details, modelfile, template, parameters, license, and system prompt.
+Show details about a model including modelfile, template, parameters, license, and system prompt.

 ### Parameters

@@ -623,16 +435,10 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM mike/llava:latest\nTEMPLATE \"\"\"\nUSER:{{ .Prompt }}\nASSISTANT:\n\"\"\"\nPARAMETER num_ctx 4096",
-  "parameters": "num_ctx                        4096",
-  "template": "\nUSER:{{ .Prompt }}\nASSISTANT:\n",
-  "license:": "<license>",
-  "details": {
-    "format": "gguf",
-    "families": ["llama", "clip"],
-    "parameter_size": "7B",
-    "quantization_level": "Q4_0"
-  }
+  "license": "<contents of license block>",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
+  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
 }
 ```

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -23,7 +23,7 @@ Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with th
 On macOS:

 ```bash
-OLLAMA_HOST=0.0.0.0:11434 ollama serve
+OLLAMA_HOST=0.0.0.0:11435 ollama serve
 ```

 On Linux:
@@ -59,7 +59,7 @@ OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
 On Linux:

 ```bash
-echo 'Environment="OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
+echo 'Environment="OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com"' >>/etc/systemd/system/ollama.service.d/environment.conf
 ```

 Reload `systemd` and restart Ollama:
--- a/docs/import.md
+++ b/docs/import.md
@@ -43,6 +43,7 @@ Ollama supports a set of model architectures, with support for more coming soon:

 - Llama & Mistral
 - Falcon & RW
+- GPT-NeoX
 - BigCode

 To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).
@@ -183,6 +184,9 @@ python convert.py <path to model directory>
 # FalconForCausalLM
 python convert-falcon-hf-to-gguf.py <path to model directory>

+# GPTNeoXForCausalLM
+python convert-gptneox-hf-to-gguf.py <path to model directory>
+
 # GPTBigCodeForCausalLM
 python convert-starcoder-hf-to-gguf.py <path to model directory>
 ```
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -30,14 +30,14 @@ The format of the `Modelfile`:
 INSTRUCTION arguments
 ```

-| Instruction                         | Description                                                    |
-| ----------------------------------- | -------------------------------------------------------------- |
-| [`FROM`](#from-required) (required) | Defines the base model to use.                                 |
-| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.         |
-| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.              |
-| [`SYSTEM`](#system)                 | Specifies the system message that will be set in the template. |
-| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.            |
-| [`LICENSE`](#license)               | Specifies the legal license.                                   |
+| Instruction                         | Description                                                   |
+| ----------------------------------- | ------------------------------------------------------------- |
+| [`FROM`](#from-required) (required) | Defines the base model to use.                                |
+| [`PARAMETER`](#parameter)           | Sets the parameters for how Ollama will run the model.        |
+| [`TEMPLATE`](#template)             | The full prompt template to be sent to the model.             |
+| [`SYSTEM`](#system)                 | Specifies the system prompt that will be set in the template. |
+| [`ADAPTER`](#adapter)               | Defines the (Q)LoRA adapters to apply to the model.           |
+| [`LICENSE`](#license)               | Specifies the legal license.                                  |

 ## Examples

@@ -52,7 +52,7 @@ PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
 PARAMETER num_ctx 4096

-# sets a custom system message to specify the behavior of the chat assistant
+# sets a custom system prompt to specify the behavior of the chat assistant
 SYSTEM You are Mario from super mario bros, acting as an assistant.
 ```

@@ -70,9 +70,9 @@ More examples are available in the [examples directory](../examples).
 There are two ways to view `Modelfile`s underlying the models in [ollama.ai/library][1]:

 - Option 1: view a details page from a model's tags page:
-  1.  Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
-  2.  Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
-  3.  Scroll down to "Layers"
+   1. Go to a particular model's tags (e.g. https://ollama.ai/library/llama2/tags)
+   2. Click on a tag (e.g. https://ollama.ai/library/llama2:13b)
+   3. Scroll down to "Layers"
      - Note: if the [`FROM` instruction](#from-required) is not present,
        it means the model was created from a local file
 - Option 2: use `ollama show` to print the `Modelfile` like so:
@@ -152,15 +152,15 @@ PARAMETER <parameter> <parametervalue>

 ### TEMPLATE

-`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system message and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.
+`TEMPLATE` of the full prompt template to be passed into the model. It may include (optionally) a system prompt and a user's prompt. This is used to create a full custom prompt, and syntax may be model specific. You can usually find the template for a given model in the readme for that model.

 #### Template Variables

-| Variable        | Description                                                                                                   |
-| --------------- | ------------------------------------------------------------------------------------------------------------- |
-| `{{ .System }}` | The system message used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
-| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                  |
-| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.           |
+| Variable        | Description                                                                                                  |
+| --------------- | ------------------------------------------------------------------------------------------------------------ |
+| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
+| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                 |
+| `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.          |

 ```modelfile
 TEMPLATE """
@@ -180,7 +180,7 @@ SYSTEM """<system message>"""

 ### SYSTEM

-The `SYSTEM` instruction specifies the system message to be used in the template, if applicable.
+The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.

 ```modelfile
 SYSTEM """<system message>"""
--- a/docs/tutorials/fly-gpu.md
+++ b/docs/tutorials/fly-gpu.md
@@ -1,83 +0,0 @@
-# Running Ollama on Fly.io GPU Instances
-
-Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started.
-
-Create a new app with `fly apps create`:
-
-```bash
-fly apps create
-```
-
-Then create a `fly.toml` file in a new folder that looks like this:
-
-```toml
-app = "sparkling-violet-709"
-primary_region = "ord"
-vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info
-
-[build]
-  image = "ollama/ollama"
-
-[http_service]
-  internal_port = 11434
-  force_https = false
-  auto_stop_machines = true
-  auto_start_machines = true
-  min_machines_running = 0
-  processes = ["app"]
-
-[mounts]
-  source = "models"
-  destination = "/root/.ollama"
-  initial_size = "100gb"
-```
-
-Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app:
-
-```bash
-fly ips allocate-v6 --private
-```
-
-Then deploy your app:
-
-```bash
-fly deploy
-```
-
-And finally you can access it interactively with a new Fly.io Machine:
-
-```
-fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama
-```
-
-```bash
-$ ollama run openchat:7b-v3.5-fp16
->>> How do I bake chocolate chip cookies?
- To bake chocolate chip cookies, follow these steps:
-
-1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat.
-
-2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4
-cup packed brown sugar until light and fluffy.
-
-3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1
-teaspoon of pure vanilla extract.
-
-4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon
-salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined.
-
-5. Fold in 2 cups of chocolate chips (or chunks) into the dough.
-
-6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart.
-
-7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft.
-
-8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack
-to cool completely.
-
-Enjoy your homemade chocolate chip cookies!
-```
-
-When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`.
-
-And that's it!
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -42,13 +42,12 @@ text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
 all_splits = text_splitter.split_documents(data)
 ```

-It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install GPT4All chromadb`
+It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. Ollama doesn't have embeddings built in yet, though we will be adding that soon, so for now we can use the GPT4All library for that. We will use ChromaDB in this example for a vector database. `pip install GPT4All chromadb`

 ```python
-from langchain.embeddings import OllamaEmbeddings
+from langchain.embeddings import GPT4AllEmbeddings
 from langchain.vectorstores import Chroma
-oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="llama2")
-vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
+vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())
 ```

 Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text.
--- a/examples/kubernetes/gpu.yaml
+++ b/examples/kubernetes/gpu.yaml
@@ -25,11 +25,9 @@ spec:
        image: ollama/ollama:latest
        env:
        - name: PATH
-          value: /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+          value: /usr/local/nvidia/bin:/usr/local/nvidia/lib64:/usr/bin:/usr/sbin:/bin:/sbin
        - name: LD_LIBRARY_PATH
-          value: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
-        - name: NVIDIA_DRIVER_CAPABILITIES
-          value: compute,utility
+          value: /usr/local/nvidia/lib64
        ports:
        - name: http
          containerPort: 11434
--- a/examples/python-functioncalling/emails.txt
+++ b/examples/python-functioncalling/emails.txt
@@ -0,0 +1,17 @@
+---
+Hi matt, 
+
+thanks for letting me know that you are going to come today, November 16, for my tea party. My address is 123 Falk St on Bainbridge Island. I live in the house with the red door. I will be home all day so just come by whenever you want.
+
+Fred
+
+---
+Great, send the check to our office at 1917 1st St, Seattle, WA 98101. I will let you know when we receive it.
+
+Mark Richardson
+Big Corp
+---
+We are looking forward to seeing you at our Local AI Meetup. It will be held on December 3. It will be at the offices of Enormous Co. Our address is 344 1st Ave, Seattle, WA 98101. We will be meeting in the conference room on the 3rd floor.
+
+Barbara Reilly
+Enormous Co.
--- a/examples/python-functioncalling/extractemail.py
+++ b/examples/python-functioncalling/extractemail.py
@@ -0,0 +1,108 @@
+import requests
+import json
+
+model = "openchat"
+
+
+def reportEvents(name, date, location):
+    nameString = name if name else "an event"
+    dateString = f" on {date}" if date else ""
+    locationString = f" at {location}" if location else ""
+    print(f"You have an event: {nameString}{dateString}{locationString}")
+
+
+def reportAddresses(address):
+    for field in address:
+        if field == "city":
+            city = address["city"]
+            state = f", {address['state']}" if address["state"] else ""
+            zip = f" {address['zip']}"
+            print(f"{city}{state}{zip}\n")
+            break
+        else:
+            print(address[field])
+
+
+systemPrompt = "You will be given a text along with a prompt and a schema. You will have to extract the information requested in the prompt from the text and generate output in JSON observing the schema provided. If the schema shows a type of integer or number, you must only show a integer for that field. A string should always be a valid string. If a value is unknown, leave it empty. Output the JSON with extra spaces to ensure that it pretty prints."
+
+schema = {
+    "eventsQuantity": {
+        "type": "integer",
+        "description": "The number of events in the source text",
+    },
+    "addressesQuantity": {
+        "type": "integer",
+        "description": "The number of addresses in the source text",
+    },
+    "events": [
+        {
+            "name": {"type": "string", "description": "Name of the event"},
+            "date": {"type": "string", "description": "Date of the event"},
+            "location": {"type": "string", "description": "Location of the event"},
+            "extraInfo": {
+                "type": "string",
+                "description": "Any extra information that is provided about the event.",
+            },
+        }
+    ],
+    "people": [
+        {
+            "name": {"type": "string", "description": "Name of the person"},
+            "company": {
+                "type": "string",
+                "description": "Name of the company where they work",
+            },
+            "street": {
+                "type": "string",
+                "description": "Street address of the person or company. This is only the street name and the numerical address. Do not include city, state, or zip of the address in this field.",
+            },
+            "city": {
+                "type": "string",
+                "description": "City portion of the address of the person or company",
+            },
+            "state": {
+                "type": "string",
+                "description": "State portion of the address of the person or company",
+            },
+            "zip": {
+                "type": "string",
+                "description": "Zip code of the person or company",
+            },
+            "extraInfo": {
+                "type": "string",
+                "description": "Any extra information that is provided about the location.",
+            },
+        }
+    ],
+}
+
+with open("emails.txt") as f:
+    content=f.read()
+
+prompt = f"The source text is a series of emails that have been put into a single file. They are separated by three dashes. Review the source text and determine the full address of the person sending each of the emails as well as any events that we need to track. If they provide a company address use that. If any extra info is provided, such as a description of the place, or a floor, add it to extraInfo. The first field in the address JSON is quantity of events and should be set to the number of events tracked and the second field should be set to the number of addresses tracked in the file. Don't stuff an event into the output that isn't an event. Only add data to the mostly appropriate field. Don't make up fields that aren't in the schema. If there isn't a value for a field, use null. Output should be in JSON.\n\nSchema: \n{schema}\n\nSource Text:\n{content}"
+
+
+r = requests.post(
+    "http://localhost:11434/api/generate",
+    json={
+        "model": model,
+        "system": systemPrompt,
+        "prompt": prompt,
+        "format": "json",
+        "stream": False,
+    },
+)
+
+j = json.loads(r.text)
+
+output = json.loads(j["response"])
+events = output["events"]
+addresses = output["people"]
+
+print(f"Here are your {output['eventsQuantity']} events:")
+for event in events:
+    reportEvents(event["name"], event["date"], event["location"])
+
+print(f"\n\nHere are your {output['addressesQuantity']} addresses")
+for address in addresses:
+    reportAddresses(address)
--- a/examples/python-functioncalling/extractwp.py
+++ b/examples/python-functioncalling/extractwp.py
@@ -0,0 +1,52 @@
+import requests
+import json
+
+model = "orca2"
+
+systemprompt = "You will be given a text along with a prompt and a schema. You will have to extract the information requested in the prompt from the text and generate output in JSON observing the schema provided. If the schema shows a type of integer or number, you must only show a integer for that field. A string should always be a valid string. If a value is unknown, leave it empty. Output the JSON with extra spaces to ensure that it pretty prints."
+
+schema = {
+    "people": [
+        {
+            "name": {"type": "string", "description": "Name of the person"},
+            "title": {"type": "string", "description": "Title of the person"},
+        }
+    ],
+}
+
+# Read the content from the file
+words = []
+with open("wp.txt") as f:
+    maxwords = 2000
+    count = 0
+    lines = f.readlines()
+    for line in lines:
+        for word in line.split(" "):
+            count += 1
+            if count > maxwords:
+                break
+            words.append(word)
+content = ' '.join(words)
+
+# Use the text and schema to set the prompt
+prompt = f"Review the source text and determine 10 the most important people to focus on. Then extract the name and title for those people. Output should be in JSON.\n\nSchema: {schema}\n\nSource Text:\n{content}"
+
+
+# Make the actual request to the model
+r = requests.post(
+    "http://localhost:11434/api/generate",
+    json={
+        "model": model,
+        "system": systemprompt,
+        "prompt": prompt,
+        "format": "json",
+        "stream": False
+    },
+)
+
+# Get the response as JSON.
+j = json.loads(r.text)
+
+# Return the result.
+print(j["response"])
+
--- a/examples/python-functioncalling/readme.md
+++ b/examples/python-functioncalling/readme.md
@@ -0,0 +1,28 @@
+# Function calling
+
+![function calling 2023-11-16 16_12_58](https://github.com/jmorganca/ollama/assets/633681/a0acc247-9746-45ab-b325-b65dfbbee4fb)
+
+Function calling in the context of LLMs simply means that the output of the model is formatted in JSON, using a preconfigured schema, and uses the expected types. Then your code can use the output of the model and call functions with it. Using the JSON format in Ollama, you can use any model for function calling.
+
+The two examples provided can extract information out of the provided texts. The first example uses the first couple of chapters from War and Peace by Lev Nikolayevich Tolstoy, and extracts the names and titles of the characters introduced in the story. The second example uses a more complicated schema to pull out addresses and event information from a series of emails.
+
+## Running the examples
+
+1. Clone this repo and navigate to the `examples/python-functioncalling` directory.
+2. Install the dependencies with `pip install -r requirements.txt`.
+3. Review the `wp.txt` file.
+4. Run `python extractwp.py`.
+5. Review the `emails.txt` file.
+6. Run `python extractemail.py`.
+
+## Review the Code
+
+Both examples do roughly the same thing with different source material. They both use the same system prompt, which tells the model to expect some instructions and a schema. Then we inject the schema into the prompt and generate an answer.
+
+The first example, `extractwp.py`, outputs the resulting JSON to the console, listing the characters introduced at the start of War and Peace. The second example, `extractemail.py`, is a bit more complicated, extracting two different types of information: addresses and events. It outputs the results to a JSON blob, then the addresses are handed off to one function called `reportAddresses` and the events are handed off to another function called `reportEvents`.
+
+Notice that the examples use general-purpose models: `extractwp.py` uses `orca2` and `extractemail.py` uses `openchat`. Neither is a model tuned for function calling, yet they perform very well at this task.
+
+## Next Steps
+
+Try exporting some of your real emails to the input file and seeing how well the model does. Try pointing the first example at other books. You could even have it cycle through all the sections and maybe add up the number of times any character is seen throughout the book, determining the most important characters. You can also try out different models.
--- a/examples/python-functioncalling/wp.txt
+++ b/examples/python-functioncalling/wp.txt
@@ -0,0 +1,183 @@
+"Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, if you don't tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that Antichrist - I really believe he is Antichrist - I will have nothing more to do with you and you are no longer my friend, no longer my 'faithful slave,' as you call yourself! But how do you do? I see I have frightened you - sit down and tell me all the news."
+
+It was in July, 1805, and the speaker was the well-known Anna Pavlovna Scherer, maid of honor and favorite of the Empress Marya Fedorovna. With these words she greeted Prince Vasili Kuragin, a man of high rank and importance, who was the first to arrive at her reception. Anna Pavlovna had had a cough for some days. She was, as she said, suffering from la grippe; grippe being then a new word in St. Petersburg, used only by the elite.
+
+All her invitations without exception, written in French, and delivered by a scarlet-liveried footman that morning, ran as follows:
+
+"If you have nothing better to do, Count (or Prince), and if the prospect of spending an evening with a poor invalid is not too terrible, I shall be very charmed to see you tonight between 7 and 10 - Annette Scherer."
+
+"Heavens! what a virulent attack!" replied the prince, not in the least disconcerted by this reception. He had just entered, wearing an embroidered court uniform, knee breeches, and shoes, and had stars on his breast and a serene expression on his flat face. He spoke in that refined French in which our grandfathers not only spoke but thought, and with the gentle, patronizing intonation natural to a man of importance who had grown old in society and at court. He went up to Anna Pavlovna, kissed her hand, presenting to her his bald, scented, and shining head, and complacently seated himself on the sofa.
+
+"First of all, dear friend, tell me how you are. Set your friend's mind at rest," said he without altering his tone, beneath the politeness and affected sympathy of which indifference and even irony could be discerned.
+
+"Can one be well while suffering morally? Can one be calm in times like these if one has any feeling?" said Anna Pavlovna. "You are staying the whole evening, I hope?"
+
+"And the fete at the English ambassador's? Today is Wednesday. I must put in an appearance there," said the prince. "My daughter is coming for me to take me there."
+
+"I thought today's fete had been canceled. I confess all these festivities and fireworks are becoming wearisome."
+
+"If they had known that you wished it, the entertainment would have been put off," said the prince, who, like a wound-up clock, by force of habit said things he did not even wish to be believed.
+
+"Don't tease! Well, and what has been decided about Novosiltsev's dispatch? You know everything."
+
+"What can one say about it?" replied the prince in a cold, listless tone. "What has been decided? They have decided that Buonaparte has burnt his boats, and I believe that we are ready to burn ours."
+
+Prince Vasili always spoke languidly, like an actor repeating a stale part. Anna Pavlovna Scherer on the contrary, despite her forty years, overflowed with animation and impulsiveness. To be an enthusiast had become her social vocation and, sometimes even when she did not feel like it, she became enthusiastic in order not to disappoint the expectations of those who knew her. The subdued smile which, though it did not suit her faded features, always played round her lips expressed, as in a spoiled child, a continual consciousness of her charming defect, which she neither wished, nor could, nor considered it necessary, to correct.
+
+In the midst of a conversation on political matters Anna Pavlovna burst out:
+
+"Oh, don't speak to me of Austria. Perhaps I don't understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander's loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosiltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don't believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe!"
+
+She suddenly paused, smiling at her own impetuosity.
+
+"I think," said the prince with a smile, "that if you had been sent instead of our dear Wintzingerode you would have captured the King of Prussia's consent by assault. You are so eloquent. Will you give me a cup of tea?"
+
+"In a moment. A propos," she added, becoming calm again, "I am expecting two very interesting men tonight, le Vicomte de Mortemart, who is connected with the Montmorencys through the Rohans, one of the best French families. He is one of the genuine emigres, the good ones. And also the Abbe Morio. Do you know that profound thinker? He has been received by the Emperor. Had you heard?"
+
+"I shall be delighted to meet them," said the prince. "But tell me," he added with studied carelessness as if it had only just occurred to him, though the question he was about to ask was the chief motive of his visit, "is it true that the Dowager Empress wants Baron Funke to be appointed first secretary at Vienna? The baron by all accounts is a poor creature."
+
+Prince Vasili wished to obtain this post for his son, but others were trying through the Dowager Empress Marya Fedorovna to secure it for the baron.
+
+Anna Pavlovna almost closed her eyes to indicate that neither she nor anyone else had a right to criticize what the Empress desired or was pleased with.
+
+"Baron Funke has been recommended to the Dowager Empress by her sister," was all she said, in a dry and mournful tone.
+
+As she named the Empress, Anna Pavlovna's face suddenly assumed an expression of profound and sincere devotion and respect mingled with sadness, and this occurred every time she mentioned her illustrious patroness. She added that Her Majesty had deigned to show Baron Funke beaucoup d'estime, and again her face clouded over with sadness.
+
+The prince was silent and looked indifferent. But, with the womanly and courtierlike quickness and tact habitual to her, Anna Pavlovna wished both to rebuke him (for daring to speak as he had done of a man recommended to the Empress) and at the same time to console him, so she said:
+
+"Now about your family. Do you know that since your daughter came out everyone has been enraptured by her? They say she is amazingly beautiful."
+
+The prince bowed to signify his respect and gratitude.
+
+"I often think," she continued after a short pause, drawing nearer to the prince and smiling amiably at him as if to show that political and social topics were ended and the time had come for intimate conversation - "I often think how unfairly sometimes the joys of life are distributed. Why has fate given you two such splendid children? I don't speak of Anatole, your youngest. I don't like him," she added in a tone admitting of no rejoinder and raising her eyebrows. "Two such charming children. And really you appreciate them less than anyone, and so you don't deserve to have them."
+
+And she smiled her ecstatic smile.
+
+"I can't help it," said the prince. "Lavater would have said I lack the bump of paternity."
+
+"Don't joke; I mean to have a serious talk with you. Do you know I am dissatisfied with your younger son? Between ourselves" (and her face assumed its melancholy expression), "he was mentioned at Her Majesty's and you were pitied...."
+
+The prince answered nothing, but she looked at him significantly, awaiting a reply. He frowned.
+
+"What would you have me do?" he said at last. "You know I did all a father could for their education, and they have both turned out fools. Hippolyte is at least a quiet fool, but Anatole is an active one. That is the only difference between them." He said this smiling in a way more natural and animated than usual, so that the wrinkles round his mouth very clearly revealed something unexpectedly coarse and unpleasant.
+
+"And why are children born to such men as you? If you were not a father there would be nothing I could reproach you with," said Anna Pavlovna, looking up pensively.
+
+"I am your faithful slave and to you alone I can confess that my children are the bane of my life. It is the cross I have to bear. That is how I explain it to myself. It can't be helped!"
+
+He said no more, but expressed his resignation to cruel fate by a gesture. Anna Pavlovna meditated.
+
+"Have you never thought of marrying your prodigal son Anatole?" she asked. "They say old maids have a mania for matchmaking, and though I don't feel that weakness in myself as yet, I know a little person who is very unhappy with her father. She is a relation of yours, Princess Mary Bolkonskaya."
+
+Prince Vasili did not reply, though, with the quickness of memory and perception befitting a man of the world, he indicated by a movement of the head that he was considering this information.
+
+"Do you know," he said at last, evidently unable to check the sad current of his thoughts, "that Anatole is costing me forty thousand rubles a year? And," he went on after a pause, "what will it be in five years, if he goes on like this?" Presently he added: "That's what we fathers have to put up with.... Is this princess of yours rich?"
+
+"Her father is very rich and stingy. He lives in the country. He is the well-known Prince Bolkonski who had to retire from the army under the late Emperor, and was nicknamed 'the King of Prussia.' He is very clever but eccentric, and a bore. The poor girl is very unhappy. She has a brother; I think you know him, he married Lise Meinen lately. He is an aide-de-camp of Kutuzov's and will be here tonight."
+
+"Listen, dear Annette," said the prince, suddenly taking Anna Pavlovna's hand and for some reason drawing it downwards. "Arrange that affair for me and I shall always be your most devoted slave-slafe with an f, as a village elder of mine writes in his reports. She is rich and of good family and that's all I want."
+
+And with the familiarity and easy grace peculiar to him, he raised the maid of honor's hand to his lips, kissed it, and swung it to and fro as he lay back in his armchair, looking in another direction.
+
+"Attendez," said Anna Pavlovna, reflecting, "I'll speak to Lise, young Bolkonski's wife, this very evening, and perhaps the thing can be arranged. It shall be on your family's behalf that I'll start my apprenticeship as old maid."
+
+Anna Pavlovna's drawing room was gradually filling. The highest Petersburg society was assembled there: people differing widely in age and character but alike in the social circle to which they belonged. Prince Vasili's daughter, the beautiful Helene, came to take her father to the ambassador's entertainment; she wore a ball dress and her badge as maid of honor. The youthful little Princess Bolkonskaya, known as la femme la plus seduisante de Petersbourg, * was also there. She had been married during the previous winter, and being pregnant did not go to any large gatherings, but only to small receptions. Prince Vasili's son, Hippolyte, had come with Mortemart, whom he introduced. The Abbe Morio and many others had also come.
+
+* The most fascinating woman in Petersburg.
+
+To each new arrival Anna Pavlovna said, "You have not yet seen my aunt," or "You do not know my aunt?" and very gravely conducted him or her to a little old lady, wearing large bows of ribbon in her cap, who had come sailing in from another room as soon as the guests began to arrive; and slowly turning her eyes from the visitor to her aunt, Anna Pavlovna mentioned each one's name and then left them.
+
+Each visitor performed the ceremony of greeting this old aunt whom not one of them knew, not one of them wanted to know, and not one of them cared about; Anna Pavlovna observed these greetings with mournful and solemn interest and silent approval. The aunt spoke to each of them in the same words, about their health and her own, and the health of Her Majesty, "who, thank God, was better today." And each visitor, though politeness prevented his showing impatience, left the old woman with a sense of relief at having performed a vexatious duty and did not return to her the whole evening.
+
+The young Princess Bolkonskaya had brought some work in a gold-embroidered velvet bag. Her pretty little upper lip, on which a delicate dark down was just perceptible, was too short for her teeth, but it lifted all the more sweetly, and was especially charming when she occasionally drew it down to meet the lower lip. As is always the case with a thoroughly attractive woman, her defect - the shortness of her upper lip and her half-open mouth - seemed to be her own special and peculiar form of beauty. Everyone brightened at the sight of this pretty young woman, so soon to become a mother, so full of life and health, and carrying her burden so lightly. Old men and dull dispirited young ones who looked at her, after being in her company and talking to her a little while, felt as if they too were becoming, like her, full of life and health. All who talked to her, and at each word saw her bright smile and the constant gleam of her white teeth, thought that they were in a specially amiable mood that day.
+
+The little princess went round the table with quick, short, swaying steps, her workbag on her arm, and gaily spreading out her dress sat down on a sofa near the silver samovar, as if all she was doing was a pleasure to herself and to all around her. "I have brought my work," said she in French, displaying her bag and addressing all present. "Mind, Annette, I hope you have not played a wicked trick on me," she added, turning to her hostess. "You wrote that it was to be quite a small reception, and just see how badly I am dressed." And she spread out her arms to show her short-waisted, lace-trimmed, dainty gray dress, girdled with a broad ribbon just below the breast.
+
+"Soyez tranquille, Lise, you will always be prettier than anyone else," replied Anna Pavlovna.
+
+"You know," said the princess in the same tone of voice and still in French, turning to a general, "my husband is deserting me? He is going to get himself killed. Tell me what this wretched war is for?" she added, addressing Prince Vasili, and without waiting for an answer she turned to speak to his daughter, the beautiful Helene.
+
+"What a delightful woman this little princess is!" said Prince Vasili to Anna Pavlovna.
+
+One of the next arrivals was a stout, heavily built young man with close-cropped hair, spectacles, the light-colored breeches fashionable at that time, a very high ruffle, and a brown dress coat. This stout young man was an illegitimate son of Count Bezukhov, a well-known grandee of Catherine's time who now lay dying in Moscow. The young man had not yet entered either the military or civil service, as he had only just returned from abroad where he had been educated, and this was his first appearance in society. Anna Pavlovna greeted him with the nod she accorded to the lowest hierarchy in her drawing room. But in spite of this lowest-grade greeting, a look of anxiety and fear, as at the sight of something too large and unsuited to the place, came over her face when she saw Pierre enter. Though he was certainly rather bigger than the other men in the room, her anxiety could only have reference to the clever though shy, but observant and natural, expression which distinguished him from everyone else in that drawing room.
+
+"It is very good of you, Monsieur Pierre, to come and visit a poor invalid," said Anna Pavlovna, exchanging an alarmed glance with her aunt as she conducted him to her.
+
+Pierre murmured something unintelligible, and continued to look round as if in search of something. On his way to the aunt he bowed to the little princess with a pleased smile, as to an intimate acquaintance.
+
+Anna Pavlovna's alarm was justified, for Pierre turned away from the aunt without waiting to hear her speech about Her Majesty's health. Anna Pavlovna in dismay detained him with the words: "Do you know the Abbe Morio? He is a most interesting man."
+
+"Yes, I have heard of his scheme for perpetual peace, and it is very interesting but hardly feasible."
+
+"You think so?" rejoined Anna Pavlovna in order to say something and get away to attend to her duties as hostess. But Pierre now committed a reverse act of impoliteness. First he had left a lady before she had finished speaking to him, and now he continued to speak to another who wished to get away. With his head bent, and his big feet spread apart, he began explaining his reasons for thinking the abbe's plan chimerical.
+
+"We will talk of it later," said Anna Pavlovna with a smile.
+
+And having got rid of this young man who did not know how to behave, she resumed her duties as hostess and continued to listen and watch, ready to help at any point where the conversation might happen to flag. As the foreman of a spinning mill, when he has set the hands to work, goes round and notices here a spindle that has stopped or there one that creaks or makes more noise than it should, and hastens to check the machine or set it in proper motion, so Anna Pavlovna moved about her drawing room, approaching now a silent, now a too-noisy group, and by a word or slight rearrangement kept the conversational machine in steady, proper, and regular motion. But amid these cares her anxiety about Pierre was evident. She kept an anxious watch on him when he approached the group round Mortemart to listen to what was being said there, and again when he passed to another group whose center was the abbe.
+
+Pierre had been educated abroad, and this reception at Anna Pavlovna's was the first he had attended in Russia. He knew that all the intellectual lights of Petersburg were gathered there and, like a child in a toyshop, did not know which way to look, afraid of missing any clever conversation that was to be heard. Seeing the self-confident and refined expression on the faces of those present he was always expecting to hear something very profound. At last he came up to Morio. Here the conversation seemed interesting and he stood waiting for an opportunity to express his own views, as young people are fond of doing.
+
+CHAPTER III
+Anna Pavlovna's reception was in full swing. The spindles hummed steadily and ceaselessly on all sides. With the exception of the aunt, beside whom sat only one elderly lady, who with her thin careworn face was rather out of place in this brilliant society, the whole company had settled into three groups. One, chiefly masculine, had formed round the abbe. Another, of young people, was grouped round the beautiful Princess Helene, Prince Vasili's daughter, and the little Princess Bolkonskaya, very pretty and rosy, though rather too plump for her age. The third group was gathered round Mortemart and Anna Pavlovna.
+
+The vicomte was a nice-looking young man with soft features and polished manners, who evidently considered himself a celebrity but out of politeness modestly placed himself at the disposal of the circle in which he found himself. Anna Pavlovna was obviously serving him up as a treat to her guests. As a clever maitre d'hotel serves up as a specially choice delicacy a piece of meat that no one who had seen it in the kitchen would have cared to eat, so Anna Pavlovna served up to her guests, first the vicomte and then the abbe, as peculiarly choice morsels. The group about Mortemart immediately began discussing the murder of the Duc d'Enghien. The vicomte said that the Duc d'Enghien had perished by his own magnanimity, and that there were particular reasons for Buonaparte's hatred of him.
+
+"Ah, yes! Do tell us all about it, Vicomte," said Anna Pavlovna, with a pleasant feeling that there was something à la Louis XV in the sound of that sentence: "Contez nous cela, Vicomte."
+
+The vicomte bowed and smiled courteously in token of his willingness to comply. Anna Pavlovna arranged a group round him, inviting everyone to listen to his tale.
+
+"The vicomte knew the duc personally," whispered Anna Pavlovna to one of the guests. "The vicomte is a wonderful raconteur," said she to another. "How evidently he belongs to the best society," said she to a third; and the vicomte was served up to the company in the choicest and most advantageous style, like a well-garnished joint of roast beef on a hot dish.
+
+The vicomte wished to begin his story and gave a subtle smile.
+
+"Come over here, Helene, dear," said Anna Pavlovna to the beautiful young princess who was sitting some way off, the center of another group.
+
+The princess smiled. She rose with the same unchanging smile with which she had first entered the room - the smile of a perfectly beautiful woman. With a slight rustle of her white dress trimmed with moss and ivy, with a gleam of white shoulders, glossy hair, and sparkling diamonds, she passed between the men who made way for her, not looking at any of them but smiling on all, as if graciously allowing each the privilege of admiring her beautiful figure and shapely shoulders, back, and bosom - which in the fashion of those days were very much exposed - and she seemed to bring the glamour of a ballroom with her as she moved toward Anna Pavlovna. Helene was so lovely that not only did she not show any trace of coquetry, but on the contrary she even appeared shy of her unquestionable and all too victorious beauty. She seemed to wish, but to be unable, to diminish its effect.
+
+"How lovely!" said everyone who saw her; and the vicomte lifted his shoulders and dropped his eyes as if startled by something extraordinary when she took her seat opposite and beamed upon him also with her unchanging smile.
+
+"Madame, I doubt my ability before such an audience," said he, smilingly inclining his head.
+
+The princess rested her bare round arm on a little table and considered a reply unnecessary. She smilingly waited. All the time the story was being told she sat upright, glancing now at her beautiful round arm, altered in shape by its pressure on the table, now at her still more beautiful bosom, on which she readjusted a diamond necklace. From time to time she smoothed the folds of her dress, and whenever the story produced an effect she glanced at Anna Pavlovna, at once adopted just the expression she saw on the maid of honor's face, and again relapsed into her radiant smile.
+
+The little princess had also left the tea table and followed Helene.
+
+"Wait a moment, I'll get my work.... Now then, what are you thinking of?" she went on, turning to Prince Hippolyte. "Fetch me my workbag."
+
+There was a general movement as the princess, smiling and talking merrily to everyone at once, sat down and gaily arranged herself in her seat.
+
+"Now I am all right," she said, and asking the vicomte to begin, she took up her work.
+
+Prince Hippolyte, having brought the workbag, joined the circle and moving a chair close to hers seated himself beside her.
+
+Le charmant Hippolyte was surprising by his extraordinary resemblance to his beautiful sister, but yet more by the fact that in spite of this resemblance he was exceedingly ugly. His features were like his sister's, but while in her case everything was lit up by a joyous, self-satisfied, youthful, and constant smile of animation, and by the wonderful classic beauty of her figure, his face on the contrary was dulled by imbecility and a constant expression of sullen self-confidence, while his body was thin and weak. His eyes, nose, and mouth all seemed puckered into a vacant, wearied grimace, and his arms and legs always fell into unnatural positions.
+
+"It's not going to be a ghost story?" said he, sitting down beside the princess and hastily adjusting his lorgnette, as if without this instrument he could not begin to speak.
+
+"Why no, my dear fellow," said the astonished narrator, shrugging his shoulders.
+
+"Because I hate ghost stories," said Prince Hippolyte in a tone which showed that he only understood the meaning of his words after he had uttered them.
+
+He spoke with such self-confidence that his hearers could not be sure whether what he said was very witty or very stupid. He was dressed in a dark-green dress coat, knee breeches of the color of cuisse de nymphe effrayee, as he called it, shoes, and silk stockings.
+
+The vicomte told his tale very neatly. It was an anecdote, then current, to the effect that the Duc d'Enghien had gone secretly to Paris to visit Mademoiselle George; that at her house he came upon Bonaparte, who also enjoyed the famous actress' favors, and that in his presence Napoleon happened to fall into one of the fainting fits to which he was subject, and was thus at the duc's mercy. The latter spared him, and this magnanimity Bonaparte subsequently repaid by death.
+
+The story was very pretty and interesting, especially at the point where the rivals suddenly recognized one another; and the ladies looked agitated.
+
+"Charming!" said Anna Pavlovna with an inquiring glance at the little princess.
+
+"Charming!" whispered the little princess, sticking the needle into her work as if to testify that the interest and fascination of the story prevented her from going on with it.
+
+The vicomte appreciated this silent praise and smiling gratefully prepared to continue, but just then Anna Pavlovna, who had kept a watchful eye on the young man who so alarmed her, noticed that he was talking too loudly and vehemently with the abbe, so she hurried to the rescue. Pierre had managed to start a conversation with the abbe about the balance of power, and the latter, evidently interested by the young man's simple-minded eagerness, was explaining his pet theory. Both were talking and listening too eagerly and too naturally, which was why Anna Pavlovna disapproved.
+
+"The means are ... the balance of power in Europe and the rights of the people," the abbe was saying. "It is only necessary for one powerful nation like Russia - barbaric as she is said to be - to place herself disinterestedly at the head of an alliance having for its object the maintenance of the balance of power of Europe, and it would save the world!"
+
+"But how are you to get that balance?" Pierre was beginning.
+
+At that moment Anna Pavlovna came up and, looking severely at Pierre, asked the Italian how he stood the Russian climate. The Italian's face instantly changed and assumed an offensively affected, sugary expression, evidently habitual to him when conversing with women.
+
+"I am so enchanted by the brilliancy of the wit and culture of the society, more especially of the feminine society, in which I have had the honor of being received, that I have not yet had time to think of the climate," said he.
+
+Not letting the abbe and Pierre escape, Anna Pavlovna, the more conveniently to keep them under observation, brought them into the larger circle.
+
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -1,46 +0,0 @@
-import json
-import requests
-
-# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama2"  # TODO: update this for whatever model you wish to use
-
-
-def chat(messages):
-    r = requests.post(
-        "http://0.0.0.0:11434/api/chat",
-        json={"model": model, "messages": messages, "stream": True},
-    )
-    r.raise_for_status()
-    output = ""
-
-    for line in r.iter_lines():
-        body = json.loads(line)
-        if "error" in body:
-            raise Exception(body["error"])
-        if body.get("done") is False:
-            message = body.get("message", "")
-            content = message.get("content", "")
-            output += content
-            # the response streams one token at a time, print that as we receive it
-            print(content, end="", flush=True)
-
-
-        if body.get("done", False):
-            message["content"] = output
-            return message
-
-
-def main():
-    messages = []
-    
-    while True:
-        user_input = input("Enter a prompt: ")
-        print()
-        messages.append({"role": "user", "content": user_input})
-        message = chat(messages)
-        messages.append(message)
-        print("\n\n")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -1,24 +0,0 @@
-# Simple Chat Example
-
-The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of objects with a role and content specified. Then with each output and prompt, you add more of those role/content objects, which builds up the history.
-
-## Review the Code
-
-You can see in the **chat** function that actually calling the endpoint is done simply with:
-
-```python
-r = requests.post(
-  "http://0.0.0.0:11434/api/chat",
-  json={"model": model, "messages": messages, "stream": True},
-)
-```
-
-With the **generate** endpoint, you need to provide a `prompt`. But with **chat**, you provide `messages`. And the resulting stream of responses includes a `message` object with a `content` field.
-
-The final JSON object doesn't provide the full content, so you will need to build the content yourself.
-
-In the **main** function, we collect `user_input` and add it as a message to our messages and that is passed to the chat function. When the LLM is done responding the output is added as another message.
-
-## Next Steps
-
-In this example, all generations are kept. You might want to experiment with summarizing everything older than 10 conversations to enable longer history with less context being used.
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -1,77 +0,0 @@
-import * as readline from "readline";
-
-const model = "llama2";
-type Message = {
-  role: "assistant" | "user" | "system";
-  content: string;
-}
-const messages: Message[] = [{
-  role: "system",
-  content: "You are a helpful AI agent."
-}]
-
-const rl = readline.createInterface({
-  input: process.stdin,
-  output: process.stdout
-})
-
-async function chat(messages: Message[]): Promise<Message> {
-  const body = {
-    model: model,
-    messages: messages
-  }
-
-  const response = await fetch("http://localhost:11434/api/chat", {
-    method: "POST",
-    body: JSON.stringify(body)
-  })
-
-  const reader = response.body?.getReader()
-  if (!reader) {
-    throw new Error("Failed to read response body")
-  }
-  let content = ""
-  while (true) {
-    const { done, value } = await reader.read()
-    if (done) {
-      break;
-    }
-    const rawjson = new TextDecoder().decode(value);
-    const json = JSON.parse(rawjson)
-
-    if (json.done === false) {
-      process.stdout.write(json.message.content);
-      content += json.message.content
-    }
-
-  }
-  return { role: "assistant", content: content };
-}
-
-async function askQuestion(): Promise<void> {
-  return new Promise<void>((resolve) => {
-    rl.question("\n\nAsk a question: (press enter alone to quit)\n\n", async (user_input) => {
-      if (user_input.trim() === "") {
-        rl.close();
-        console.log("Thankyou. Goodbye.\n")
-        console.log("=======\nHere is the message history that was used in this conversation.\n=======\n")
-        messages.forEach(message => {
-          console.log(message)
-        })
-        resolve();
-      } else {
-        console.log();
-        messages.push({ role: "user", content: user_input });
-        messages.push(await chat(messages));
-        await askQuestion(); // Ask the next question
-      }
-    });
-  });
-}
-
-async function main() {
-  await askQuestion();
-
-}
-
-main();
--- a/examples/typescript-simplechat/package.json
+++ b/examples/typescript-simplechat/package.json
@@ -1 +0,0 @@
-{ "dependencies": { "@types/node": "^20.10.4", "prompt-sync": "^4.2.0", "readline": "^1.3.0" } }
--- a/examples/typescript-simplechat/readme.md
+++ b/examples/typescript-simplechat/readme.md
@@ -1,39 +0,0 @@
-# Simple Chat Example
-
-The **chat** endpoint is one of two ways to generate text from an LLM with Ollama. At a high level you provide the endpoint an array of message objects with a role and content specified. Then with each output and prompt, you add more messages, which builds up the history.
-
-## Run the Example
-
-There are a few ways to run this, just like any Typescript code:
-
-1. Compile with `tsc` and then run it with `node client.js`.
-2. Install `tsx` and run it with `tsx client.ts`.
-3. Install `bun` and run it with `bun client.ts`.
-
-## Review the Code
-
-You can see in the **chat** function that is actually calling the endpoint is simply done with:
-
-```typescript
-const body = {
-  model: model,
-  messages: messages
-}
-
-const response = await fetch("http://localhost:11434/api/chat", {
-  method: "POST",
-  body: JSON.stringify(body)
-})
-```
-
-With the **generate** endpoint, you need to provide a `prompt`. But with **chat**, you provide `messages`. And the resulting stream of responses includes a `message` object with a `content` field.
-
-The final JSON object doesn't provide the full content, so you will need to build the content yourself. In this example, **chat** takes the full array of messages and outputs the resulting message from this call of the chat endpoint.
-
-In the **askQuestion** function, we collect `user_input` and add it as a message to our messages and that is passed to the chat function. When the LLM is done responding the output is added as another message to the messages array.
-
-At the end, you will see a printout of all the messages.
-
-## Next Steps
-
-In this example, all generations are kept. You might want to experiment with summarizing everything older than 10 conversations to enable longer history with less context being used.
--- a/go.mod
+++ b/go.mod
@@ -5,15 +5,14 @@ go 1.20
 require (
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
+	github.com/mattn/go-runewidth v0.0.14
+	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
 	golang.org/x/sync v0.3.0
 )

-require (
-	github.com/mattn/go-runewidth v0.0.14 // indirect
-	github.com/rivo/uniseg v0.2.0 // indirect
-)
+require github.com/rivo/uniseg v0.2.0 // indirect

 require (
 	github.com/bytedance/sonic v1.9.1 // indirect
--- a/go.sum
+++ b/go.sum
@@ -63,6 +63,8 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
+github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
--- a/llm/falcon.go
+++ b/llm/falcon.go
@@ -0,0 +1,23 @@
+package llm
+
+// Layer counts that identify each Falcon model size.
+const (
+	falconModelType7B   = 32
+	falconModelType40B  = 60
+	falconModelType180B = 80
+)
+
+// falconModelType maps a Falcon model's layer count to its size label
+// ("7B", "40B" or "180B"); any other count yields "unknown".
+func falconModelType(numLayer uint32) string {
+	switch numLayer {
+	case falconModelType7B:
+		return "7B"
+	case falconModelType40B:
+		return "40B"
+	case falconModelType180B:
+		return "180B"
+	default:
+		return "unknown"
+	}
+}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -7,10 +7,9 @@ import (
 )

 type GGML struct {
+	magic uint32
 	container
 	model
-
-	Size int64
 }

 const (
@@ -83,7 +82,7 @@ type model interface {

 type container interface {
 	Name() string
-	Decode(*readSeekOffset) (model, error)
+	Decode(io.Reader) (model, error)
 }

 type containerGGML struct{}
@@ -92,9 +91,7 @@ func (c *containerGGML) Name() string {
 	return "ggml"
 }

-func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
-	// file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
+func (c *containerGGML) Decode(r io.Reader) (model, error) {
 	return nil, nil
 }

@@ -106,9 +103,9 @@ func (c *containerGGMF) Name() string {
 	return "ggmf"
 }

-func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
+func (c *containerGGMF) Decode(r io.Reader) (model, error) {
 	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
+	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
@@ -117,10 +114,6 @@ func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
 	}

 	c.version = version
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
 	return nil, nil
 }

@@ -132,9 +125,9 @@ func (c *containerGGJT) Name() string {
 	return "ggjt"
 }

-func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
+func (c *containerGGJT) Decode(r io.Reader) (model, error) {
 	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
+	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1, 2, 3:
@@ -146,11 +139,7 @@ func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {

 	// different model types may have different layouts for hyperparameters
 	var llama llamaModel
-	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
+	binary.Read(r, binary.LittleEndian, &llama.hyperparameters)
 	return &llama, nil
 }

@@ -162,9 +151,9 @@ func (c *containerLORA) Name() string {
 	return "ggla"
 }

-func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
+func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
+	binary.Read(r, binary.LittleEndian, &version)

 	switch version {
 	case 1:
@@ -173,10 +162,6 @@ func (c *containerLORA) Decode(ro *readSeekOffset) (model, error) {
 	}

 	c.version = version
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
 	return nil, nil
 }

@@ -195,61 +180,41 @@ const (
 )

+// DecodeGGML reads the 4-byte little-endian file magic from r, selects the
+// container that matches it, and passes r to that container's Decode. It
+// returns an error when the magic cannot be read, when it matches none of the
+// FILE_MAGIC_* cases, or when the container's Decode fails.
 func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
-	ro := readSeekOffset{ReadSeeker: r}
+	var ggml GGML
+	// surface a short or failed read directly instead of continuing with a
+	// zero magic
+	if err := binary.Read(r, binary.LittleEndian, &ggml.magic); err != nil {
+		return nil, err
+	}

-	var magic uint32
-	if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil {
-		return nil, err
-	}
-
-	var c container
-	switch magic {
+	switch ggml.magic {
 	case FILE_MAGIC_GGML:
-		c = &containerGGML{}
+		ggml.container = &containerGGML{}
 	case FILE_MAGIC_GGMF:
-		c = &containerGGMF{}
+		ggml.container = &containerGGMF{}
 	case FILE_MAGIC_GGJT:
-		c = &containerGGJT{}
+		ggml.container = &containerGGJT{}
 	case FILE_MAGIC_GGLA:
-		c = &containerLORA{}
+		ggml.container = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{bo: binary.LittleEndian}
+		ggml.container = &containerGGUF{bo: binary.LittleEndian}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{bo: binary.BigEndian}
+		ggml.container = &containerGGUF{bo: binary.BigEndian}
 	default:
 		return nil, errors.New("invalid file magic")
 	}

-	model, err := c.Decode(&ro)
+	model, err := ggml.Decode(r)
 	if err != nil {
 		return nil, err
 	}

+	ggml.model = model
+
 	// final model type
-	return &GGML{
-		container: c,
-		model:     model,
-		Size:      ro.offset,
-	}, nil
-}
-
-type readSeekOffset struct {
-	io.ReadSeeker
-	offset int64
-}
-
-func (rso *readSeekOffset) Seek(offset int64, whence int) (int64, error) {
-	offset, err := rso.ReadSeeker.Seek(offset, whence)
-	if err != nil {
-		return 0, err
-	}
-
-	rso.offset = offset
-	return offset, nil
-}
-
-func (rso *readSeekOffset) Read(p []byte) (int, error) {
-	n, err := rso.ReadSeeker.Read(p)
-	rso.offset += int64(n)
-	return n, err
+	return &ggml, nil
 }
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -23,24 +23,26 @@ type containerGGUF struct {
 		NumTensor uint64
 		NumKV     uint64
 	}
+
+	parameters uint64
 }

 func (c *containerGGUF) Name() string {
 	return "gguf"
 }

-func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) {
-	binary.Read(rso, c.bo, &c.Version)
+func (c *containerGGUF) Decode(r io.Reader) (model, error) {
+	binary.Read(r, c.bo, &c.Version)

 	switch c.Version {
 	case 1:
-		binary.Read(rso, c.bo, &c.V1)
+		binary.Read(r, c.bo, &c.V1)
 	default:
-		binary.Read(rso, c.bo, &c.V2)
+		binary.Read(r, c.bo, &c.V2)
 	}

 	model := newGGUFModel(c)
-	if err := model.Decode(rso); err != nil {
+	if err := model.Decode(r); err != nil {
 		return nil, err
 	}

@@ -65,23 +67,9 @@ const (

 type kv map[string]any

-type tensor struct {
-	name   string
-	kind   uint32
-	offset uint64
-	size   uint64
-
-	// shape is the number of elements in each dimension
-	shape [4]uint64
-}
-
 type ggufModel struct {
 	*containerGGUF
-
 	kv
-	tensors []tensor
-
-	parameters uint64
 }

 func newGGUFModel(container *containerGGUF) *ggufModel {
@@ -108,7 +96,8 @@ func (llm *ggufModel) NumKV() uint64 {
 }

 func (llm *ggufModel) ModelFamily() string {
-	if t, ok := llm.kv["general.architecture"].(string); ok {
+	t, ok := llm.kv["general.architecture"].(string)
+	if ok {
 		return t
 	}

@@ -120,60 +109,85 @@ func (llm *ggufModel) ModelType() string {
 		return format.HumanNumber(llm.parameters)
 	}

+	// derive the type from the family-specific block count metadata
+	switch llm.ModelFamily() {
+	case "llama":
+		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
+			heads, headsOK := llm.kv["llama.head_count"].(uint32)
+			headKVs, headsKVsOK := llm.kv["llama.head_count_kv"].(uint32)
+			// a head ratio of 8 is reported as "70B"; the zero check keeps a
+			// zero head_count_kv from panicking with an integer divide by zero
+			if headsOK && headsKVsOK && headKVs != 0 && heads/headKVs == 8 {
+				return "70B"
+			}
+
+			return llamaModelType(blocks)
+		}
+	case "falcon":
+		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
+			return falconModelType(blocks)
+		}
+	case "starcoder":
+		if blocks, ok := llm.kv["starcoder.block_count"].(uint32); ok {
+			return starCoderModelType(blocks)
+		}
+	}
+
 	return "unknown"
 }

 func (llm *ggufModel) FileType() string {
-	if t, ok := llm.kv["general.file_type"].(uint32); ok {
+	t, ok := llm.kv["general.file_type"].(uint32)
+	if ok {
 		return fileType(t)
 	}

 	return "unknown"
 }

-func (llm *ggufModel) Decode(rso *readSeekOffset) error {
+func (llm *ggufModel) Decode(r io.Reader) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.NumKV(); i++ {
-		k, err := llm.readString(rso)
+		k, err := llm.readString(r)
 		if err != nil {
 			return err
 		}

-		vtype := llm.readU32(rso)
+		vtype := llm.readU32(r)

 		var v any
 		switch vtype {
 		case ggufTypeUint8:
-			v = llm.readU8(rso)
+			v = llm.readU8(r)
 		case ggufTypeInt8:
-			v = llm.readI8(rso)
+			v = llm.readI8(r)
 		case ggufTypeUint16:
-			v = llm.readU16(rso)
+			v = llm.readU16(r)
 		case ggufTypeInt16:
-			v = llm.readI16(rso)
+			v = llm.readI16(r)
 		case ggufTypeUint32:
-			v = llm.readU32(rso)
+			v = llm.readU32(r)
 		case ggufTypeInt32:
-			v = llm.readI32(rso)
+			v = llm.readI32(r)
 		case ggufTypeUint64:
-			v = llm.readU64(rso)
+			v = llm.readU64(r)
 		case ggufTypeInt64:
-			v = llm.readI64(rso)
+			v = llm.readI64(r)
 		case ggufTypeFloat32:
-			v = llm.readF32(rso)
+			v = llm.readF32(r)
 		case ggufTypeFloat64:
-			v = llm.readF64(rso)
+			v = llm.readF64(r)
 		case ggufTypeBool:
-			v = llm.readBool(rso)
+			v = llm.readBool(r)
 		case ggufTypeString:
-			s, err := llm.readString(rso)
+			s, err := llm.readString(r)
 			if err != nil {
 				return err
 			}

 			v = s
 		case ggufTypeArray:
-			a, err := llm.readArray(rso)
+			a, err := llm.readArray(r)
 			if err != nil {
 				return err
 			}
@@ -188,85 +199,21 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {

 	// decode tensors
 	for i := 0; uint64(i) < llm.NumTensor(); i++ {
-		name, err := llm.readString(rso)
-		if err != nil {
+		if _, err := llm.readString(r); err != nil {
 			return err
 		}

-		// dims is the number of dimensions in the tensor
-		dims := llm.readU32(rso)
+		dimensions := llm.readU32(r)

-		shape := [4]uint64{1, 1, 1, 1}
-		for i := 0; uint32(i) < dims; i++ {
-			shape[i] = llm.readU64(rso)
+		var elements uint64 = 1
+		for i := 0; uint32(i) < dimensions; i++ {
+			elements *= llm.readU64(r)
 		}

-		kind := llm.readU32(rso)
-		offset := llm.readU64(rso)
+		llm.readU32(r) // type
+		llm.readU64(r) // offset

-		var blockSize uint64
-		switch {
-		case kind < 2:
-			blockSize = 1
-		case kind < 10:
-			blockSize = 32
-		default:
-			blockSize = 256
-		}
-
-		var typeSize uint64
-		switch kind {
-		case 0: // FP32
-			typeSize = 4
-		case 1: // FP16
-			typeSize = 2
-		case 2: // Q4_0
-			typeSize = 2 + blockSize/2
-		case 3: // Q4_1
-			typeSize = 2 + 2 + blockSize/2
-		case 6: // Q5_0
-			typeSize = 2 + 4 + blockSize/2
-		case 7: // Q5_1
-			typeSize = 2 + 2 + 4 + blockSize/2
-		case 8: // Q8_0
-			typeSize = 2 + blockSize
-		case 9: // Q8_1
-			typeSize = 4 + 4 + blockSize
-		case 10: // Q2_K
-			typeSize = blockSize/16 + blockSize/4 + 2 + 2
-		case 11: // Q3_K
-			typeSize = blockSize/8 + blockSize/4 + 12 + 2
-		case 12: // Q4_K
-			typeSize = 2 + 2 + 12 + blockSize/2
-		case 13: // Q5_K
-			typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2
-		case 14: // Q6_K
-			typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2
-		}
-
-		parameters := shape[0] * shape[1] * shape[2] * shape[3]
-		size := parameters * typeSize / blockSize
-
-		llm.tensors = append(llm.tensors, tensor{
-			name:   name,
-			kind:   kind,
-			offset: offset,
-			size:   size,
-			shape:  shape,
-		})
-
-		llm.parameters += parameters
-	}
-
-	alignment, ok := llm.kv["general.alignment"].(uint32)
-	if !ok {
-		alignment = 32
-	}
-
-	rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
-	for _, tensor := range llm.tensors {
-		padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1)
-		rso.Seek(padded, io.SeekCurrent)
+		llm.parameters += elements
 	}

 	return nil
--- a/llm/llama.cpp/gguf
+++ b/llm/llama.cpp/gguf
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -59,7 +59,6 @@ ws ::= ([ \t\n] ws)?
 var llamaCppEmbed embed.FS

 type ModelRunner struct {
-	Type        string // "gguf" or "ggml"
 	Path        string // path to the model runner executable
 	Accelerated bool
 }
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	switch runtime.GOOS {
 	case "darwin":
 		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
 		} else {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
 		}
 	case "linux":
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
+			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
 		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	}

@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	for _, r := range runners {
 		// clean the ModelRunner paths so that they match the OS we are running on
 		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
-			Type:        r.Type,
 			Path:        filepath.Clean(path.Join(workDir, r.Path)),
 			Accelerated: r.Accelerated,
 		})
@@ -223,14 +221,8 @@ type Running struct {
 	*StatusWriter            // captures error messages from the llama runner process
 }

-type ImageData struct {
-	Data []byte `json:"data"`
-	ID   int    `json:"id"`
-}
-
 type llama struct {
 	api.Options
-	ImageData []ImageData
 	Running
 }

@@ -333,7 +325,7 @@ func (w *StatusWriter) Write(b []byte) (int, error) {
 	return os.Stderr.Write(b)
 }

-func newLlama(model string, adapters, projectors []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
+func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
 	fileInfo, err := os.Stat(model)
 	if err != nil {
 		return nil, err
@@ -373,11 +365,6 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		params = append(params, "--lora", adapters[0])
 	}

-	if len(projectors) > 0 {
-		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-		params = append(params, "--mmproj", projectors[0])
-	}
-
 	if opts.NumThread > 0 {
 		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
 	}
@@ -410,13 +397,11 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		}

 		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-		params := append(params, "--port", strconv.Itoa(port))
-
 		ctx, cancel := context.WithCancel(context.Background())
 		cmd := exec.CommandContext(
 			ctx,
 			runner.Path,
-			params...,
+			append(params, "--port", strconv.Itoa(port))...,
 		)

 		var libraryPaths []string
@@ -545,39 +530,22 @@ type prediction struct {
 }

 const maxBufferSize = 512 * format.KiloByte
-const maxRetries = 6

-type PredictOpts struct {
-	Prompt string
-	Format string
-	Images []api.ImageData
-}
-
-type PredictResult struct {
-	Content            string
-	Done               bool
-	PromptEvalCount    int
-	PromptEvalDuration time.Duration
-	EvalCount          int
-	EvalDuration       time.Duration
-}
-
-// IsRetryable checks if the line matches a condition that can be retried
-func isRetryable(line []byte) bool {
-	return bytes.Contains(line, []byte("slot unavailable"))
-}
-
-func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
-	imageData := llm.ImageData
-	if len(predict.Images) > 0 {
-		for cnt, i := range predict.Images {
-			imageData = append(imageData, ImageData{Data: i, ID: cnt})
-		}
+func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error {
+	prevConvo, err := llm.Decode(ctx, prevContext)
+	if err != nil {
+		return err
 	}
-	log.Printf("loaded %d images", len(imageData))
+
+	// Remove a single leading space from prevConvo if present
+	prevConvo = strings.TrimPrefix(prevConvo, " ")
+
+	var nextContext strings.Builder
+	nextContext.WriteString(prevConvo)
+	nextContext.WriteString(prompt)

 	request := map[string]any{
-		"prompt":            predict.Prompt,
+		"prompt":            nextContext.String(),
 		"stream":            true,
 		"n_predict":         llm.NumPredict,
 		"n_keep":            llm.NumKeep,
@@ -597,121 +565,103 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
 		"penalize_nl":       llm.PenalizeNewline,
 		"seed":              llm.Seed,
 		"stop":              llm.Stop,
-		"image_data":        imageData,
 	}

-	if predict.Format == "json" {
+	if format == "json" {
 		request["grammar"] = jsonGrammar
 	}

-	retryDelay := 100 * time.Microsecond
-	for retries := 0; retries < maxRetries; retries++ {
-		if retries > 0 {
-			time.Sleep(retryDelay) // wait before retrying
-			retryDelay *= 2        // exponential backoff
-		}
+	// Handling JSON marshaling with special characters unescaped.
+	buffer := &bytes.Buffer{}
+	enc := json.NewEncoder(buffer)
+	enc.SetEscapeHTML(false)

-		// Handling JSON marshaling with special characters unescaped.
-		buffer := &bytes.Buffer{}
-		enc := json.NewEncoder(buffer)
-		enc.SetEscapeHTML(false)
+	if err := enc.Encode(request); err != nil {
+		return fmt.Errorf("failed to marshal data: %v", err)
+	}

-		if err := enc.Encode(request); err != nil {
-			return fmt.Errorf("failed to marshal data: %v", err)
-		}
+	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+	if err != nil {
+		return fmt.Errorf("error creating POST request: %v", err)
+	}
+	req.Header.Set("Content-Type", "application/json")

-		endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
-		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("POST predict: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode >= 400 {
+		bodyBytes, err := io.ReadAll(resp.Body)
 		if err != nil {
-			return fmt.Errorf("error creating POST request: %v", err)
+			return fmt.Errorf("failed reading llm error response: %w", err)
 		}
-		req.Header.Set("Content-Type", "application/json")
+		log.Printf("llm predict error: %s", bodyBytes)
+		return fmt.Errorf("%s", bodyBytes)
+	}

-		resp, err := http.DefaultClient.Do(req)
-		if err != nil {
-			return fmt.Errorf("POST predict: %v", err)
-		}
-		defer resp.Body.Close()
-
-		if resp.StatusCode >= 400 {
-			bodyBytes, err := io.ReadAll(resp.Body)
-			if err != nil {
-				return fmt.Errorf("failed reading llm error response: %w", err)
+	scanner := bufio.NewScanner(resp.Body)
+	// increase the buffer size to avoid running out of space
+	buf := make([]byte, 0, maxBufferSize)
+	scanner.Buffer(buf, maxBufferSize)
+	for scanner.Scan() {
+		select {
+		case <-ctx.Done():
+			// This handles the request cancellation
+			return ctx.Err()
+		default:
+			line := scanner.Bytes()
+			if len(line) == 0 {
+				continue
 			}
-			log.Printf("llm predict error: %s", bodyBytes)
-			return fmt.Errorf("%s", bodyBytes)
-		}
-
-		scanner := bufio.NewScanner(resp.Body)
-		// increase the buffer size to avoid running out of space
-		buf := make([]byte, 0, maxBufferSize)
-		scanner.Buffer(buf, maxBufferSize)
-
-		retryNeeded := false
-		for scanner.Scan() {
-			select {
-			case <-ctx.Done():
-				// This handles the request cancellation
-				return ctx.Err()
-			default:
-				line := scanner.Bytes()
-				if len(line) == 0 {
-					continue
-				}
-
-				if isRetryable(line) {
-					retryNeeded = true
-					break
-				}
-
-				evt, ok := bytes.CutPrefix(line, []byte("data: "))
-				if !ok {
-					return fmt.Errorf("error parsing llm response stream: %s", line)
-				}

+			if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
 				var p prediction
 				if err := json.Unmarshal(evt, &p); err != nil {
 					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
 				}

 				if p.Content != "" {
-					fn(PredictResult{
-						Content: p.Content,
-					})
+					fn(api.GenerateResponse{Response: p.Content})
+					nextContext.WriteString(p.Content)
 				}

 				if p.Stop {
-					fn(PredictResult{
+					embd, err := llm.Encode(ctx, nextContext.String())
+					if err != nil {
+						return fmt.Errorf("encoding context: %v", err)
+					}
+
+					fn(api.GenerateResponse{
 						Done:               true,
+						Context:            embd,
 						PromptEvalCount:    p.Timings.PromptN,
 						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
 						EvalCount:          p.Timings.PredictedN,
 						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
 					})
+
 					return nil
 				}
 			}
 		}
-
-		if err := scanner.Err(); err != nil {
-			if strings.Contains(err.Error(), "unexpected EOF") {
-				// this means the llama runner subprocess crashed
-				llm.Close()
-				if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
-					return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
-				}
-				return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
-			}
-			return fmt.Errorf("error reading llm response: %v", err)
-		}
-
-		if !retryNeeded {
-			return nil // success
-		}
 	}

-	// should never reach here ideally
-	return fmt.Errorf("max retries exceeded")
+	if err := scanner.Err(); err != nil {
+		if strings.Contains(err.Error(), "unexpected EOF") {
+			// this means the llama runner subprocess crashed
+			llm.Close()
+			if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
+				return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
+			}
+			return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
+		}
+		return fmt.Errorf("error reading llm response: %v", err)
+	}
+
+	return nil
 }

 type TokenizeRequest struct {
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -14,7 +14,7 @@ import (
 )

 type LLM interface {
-	Predict(context.Context, PredictOpts, func(PredictResult)) error
+	Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)
@@ -23,7 +23,7 @@ type LLM interface {
 	Ping(context.Context) error
 }

-func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -82,9 +82,9 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumGQA = 0
 		opts.RopeFrequencyBase = 0.0
 		opts.RopeFrequencyScale = 0.0
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
 	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
 	}
--- a/llm/starcoder.go
+++ b/llm/starcoder.go
@@ -0,0 +1,23 @@
+package llm
+
+const (
+	starCoderModelType1B  = 24
+	starCoderModelType3B  = 36
+	starCoderModelType7B  = 42
+	starCoderModelType15B = 40
+)
+
+func starCoderModelType(numLayer uint32) string {
+	switch numLayer {
+	case 24:
+		return "1B"
+	case 36:
+		return "3B"
+	case 42:
+		return "7B"
+	case 40:
+		return "15B"
+	default:
+		return "unknown"
+	}
+}
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -37,13 +37,10 @@ func Parse(reader io.Reader) ([]Command, error) {
 		switch string(bytes.ToUpper(fields[0])) {
 		case "FROM":
 			command.Name = "model"
-			command.Args = string(bytes.TrimSpace(fields[1]))
+			command.Args = string(fields[1])
 			// copy command for validation
 			modelCommand = command
-		case "ADAPTER":
-			command.Name = string(bytes.ToLower(fields[0]))
-			command.Args = string(bytes.TrimSpace(fields[1]))
-		case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
+		case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT", "ADAPTER":
 			command.Name = string(bytes.ToLower(fields[0]))
 			command.Args = string(fields[1])
 		case "PARAMETER":
@@ -53,7 +50,7 @@ func Parse(reader io.Reader) ([]Command, error) {
 			}

 			command.Name = string(fields[0])
-			command.Args = string(bytes.TrimSpace(fields[1]))
+			command.Args = string(fields[1])
 		case "EMBED":
 			return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
 		default:
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -191,8 +191,6 @@ func (i *Instance) Readline() (string, error) {
 			buf.ClearScreen()
 		case CharCtrlW:
 			buf.DeleteWord()
-		case CharCtrlZ:
-			return handleCharCtrlZ(fd, termios)
 		case CharEnter:
 			output := buf.String()
 			if output != "" {
--- a/readline/readline_unix.go
+++ b/readline/readline_unix.go
@@ -1,18 +0,0 @@
-//go:build !windows
-
-package readline
-
-import (
-	"syscall"
-)
-
-func handleCharCtrlZ(fd int, termios *Termios) (string, error) {
-	if err := UnsetRawMode(fd, termios); err != nil {
-		return "", err
-	}
-
-	syscall.Kill(0, syscall.SIGSTOP)
-
-	// on resume...
-	return "", nil
-}
--- a/readline/readline_windows.go
+++ b/readline/readline_windows.go
@@ -1,6 +0,0 @@
-package readline
-
-func handleCharCtrlZ(fd int, state *State) (string, error) {
-	// not supported
-	return "", nil
-}
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -217,7 +217,7 @@ fi

 if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
    case $OS_NAME in
-        centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
+        centos|rhel) install_cuda_driver_yum 'rhel' $OS_VERSION ;;
        rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
        fedora) install_cuda_driver_yum $OS_NAME $OS_VERSION ;;
        amzn) install_cuda_driver_yum 'fedora' '35' ;;
@@ -230,8 +230,7 @@ fi
 if ! lsmod | grep -q nvidia; then
    KERNEL_RELEASE="$(uname -r)"
    case $OS_NAME in
-        rocky) $SUDO $PACKAGE_MANAGER -y install kernel-devel kernel-headers ;;
-        centos|rhel|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
+        centos|rhel|rocky|amzn) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE kernel-headers-$KERNEL_RELEASE ;;
        fedora) $SUDO $PACKAGE_MANAGER -y install kernel-devel-$KERNEL_RELEASE ;;
        debian|ubuntu) $SUDO apt-get -y install linux-headers-$KERNEL_RELEASE ;;
        *) exit ;;
--- a/server/images.go
+++ b/server/images.go
@@ -14,6 +14,7 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
+	"reflect"
 	"runtime"
 	"strconv"
 	"strings"
@@ -35,160 +36,80 @@ type RegistryOptions struct {
 }

 type Model struct {
-	Name           string `json:"name"`
-	Config         ConfigV2
-	ShortName      string
-	ModelPath      string
-	OriginalModel  string
-	AdapterPaths   []string
-	ProjectorPaths []string
-	Template       string
-	System         string
-	License        []string
-	Digest         string
-	Size           int64
-	Options        map[string]interface{}
+	Name          string `json:"name"`
+	ShortName     string
+	ModelPath     string
+	OriginalModel string
+	AdapterPaths  []string
+	Template      string
+	System        string
+	License       []string
+	Digest        string
+	Options       map[string]interface{}
 }

-type PromptVars struct {
-	System   string
-	Prompt   string
-	Response string
-	First    bool
-}
+func (m *Model) Prompt(request api.GenerateRequest) (string, error) {
+	t := m.Template
+	if request.Template != "" {
+		t = request.Template
+	}

-func (m *Model) Prompt(p PromptVars) (string, error) {
-	var prompt strings.Builder
-	// Use the "missingkey=zero" option to handle missing variables without panicking
-	tmpl, err := template.New("").Option("missingkey=zero").Parse(m.Template)
+	tmpl, err := template.New("").Parse(t)
 	if err != nil {
 		return "", err
 	}

-	if p.System == "" {
-		// use the default system message for this model if one is not specified
-		p.System = m.System
+	var vars struct {
+		First  bool
+		System string
+		Prompt string
 	}

-	vars := map[string]any{
-		"System":   p.System,
-		"Prompt":   p.Prompt,
-		"Response": p.Response,
-		"First":    p.First,
+	vars.First = len(request.Context) == 0
+	vars.System = m.System
+	vars.Prompt = request.Prompt
+
+	if request.System != "" {
+		vars.System = request.System
 	}

 	var sb strings.Builder
 	if err := tmpl.Execute(&sb, vars); err != nil {
 		return "", err
 	}
-	prompt.WriteString(sb.String())
-	prompt.WriteString(p.Response)
-	return prompt.String(), nil
-}

-func (m *Model) ChatPrompt(msgs []api.Message) (string, []api.ImageData, error) {
-	// build the prompt from the list of messages
-	var prompt strings.Builder
-	var currentImages []api.ImageData
-	currentVars := PromptVars{
-		First: true,
-	}
-
-	writePrompt := func() error {
-		p, err := m.Prompt(currentVars)
-		if err != nil {
-			return err
-		}
-		prompt.WriteString(p)
-		currentVars = PromptVars{}
-		return nil
-	}
-
-	for _, msg := range msgs {
-		switch strings.ToLower(msg.Role) {
-		case "system":
-			if currentVars.System != "" {
-				if err := writePrompt(); err != nil {
-					return "", nil, err
-				}
-			}
-			currentVars.System = msg.Content
-		case "user":
-			if currentVars.Prompt != "" {
-				if err := writePrompt(); err != nil {
-					return "", nil, err
-				}
-			}
-			currentVars.Prompt = msg.Content
-			currentImages = msg.Images
-		case "assistant":
-			currentVars.Response = msg.Content
-			if err := writePrompt(); err != nil {
-				return "", nil, err
-			}
-		default:
-			return "", nil, fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
-		}
-	}
-
-	// Append the last set of vars if they are non-empty
-	if currentVars.Prompt != "" || currentVars.System != "" {
-		if err := writePrompt(); err != nil {
-			return "", nil, err
-		}
-	}
-
-	return prompt.String(), currentImages, nil
+	return sb.String(), nil
 }

 type ManifestV2 struct {
 	SchemaVersion int      `json:"schemaVersion"`
 	MediaType     string   `json:"mediaType"`
-	Config        *Layer   `json:"config"`
+	Config        Layer    `json:"config"`
 	Layers        []*Layer `json:"layers"`
 }

+type Layer struct {
+	MediaType string `json:"mediaType"`
+	Digest    string `json:"digest"`
+	Size      int64  `json:"size"`
+	From      string `json:"from,omitempty"`
+}
+
+type LayerReader struct {
+	Layer
+	io.Reader
+}
+
 type ConfigV2 struct {
-	ModelFormat   string   `json:"model_format"`
-	ModelFamily   string   `json:"model_family"`
-	ModelFamilies []string `json:"model_families"`
-	ModelType     string   `json:"model_type"`
-	FileType      string   `json:"file_type"`
+	ModelFormat string `json:"model_format"`
+	ModelFamily string `json:"model_family"`
+	ModelType   string `json:"model_type"`
+	FileType    string `json:"file_type"`
+	RootFS      RootFS `json:"rootfs"`

 	// required by spec
 	Architecture string `json:"architecture"`
 	OS           string `json:"os"`
-	RootFS       RootFS `json:"rootfs"`
-}
-
-func (c *ConfigV2) SetModelFormat(format string) {
-	if c.ModelFormat == "" {
-		c.ModelFormat = format
-	}
-}
-
-func (c *ConfigV2) SetModelFamily(families ...string) {
-	for _, family := range families {
-		if c.ModelFamily == "" {
-			c.ModelFamily = family
-		}
-
-		if !slices.Contains(c.ModelFamilies, family) {
-			c.ModelFamilies = append(c.ModelFamilies, family)
-		}
-	}
-}
-
-func (c *ConfigV2) SetModelType(modelType string) {
-	if c.ModelType == "" {
-		c.ModelType = modelType
-	}
-}
-
-func (c *ConfigV2) SetFileType(fileType string) {
-	if c.FileType == "" {
-		c.FileType = fileType
-	}
 }

 type RootFS struct {
@@ -245,22 +166,6 @@ func GetModel(name string) (*Model, error) {
 		Digest:    digest,
 		Template:  "{{ .Prompt }}",
 		License:   []string{},
-		Size:      manifest.GetTotalSize(),
-	}
-
-	filename, err := GetBlobsPath(manifest.Config.Digest)
-	if err != nil {
-		return nil, err
-	}
-
-	configFile, err := os.Open(filename)
-	if err != nil {
-		return nil, err
-	}
-	defer configFile.Close()
-
-	if err := json.NewDecoder(configFile).Decode(&model.Config); err != nil {
-		return nil, err
 	}

 	for _, layer := range manifest.Layers {
@@ -279,8 +184,6 @@ func GetModel(name string) (*Model, error) {
 			log.Print("WARNING: model contains embeddings, but embeddings in modelfiles have been deprecated and will be ignored.")
 		case "application/vnd.ollama.image.adapter":
 			model.AdapterPaths = append(model.AdapterPaths, filename)
-		case "application/vnd.ollama.image.projector":
-			model.ProjectorPaths = append(model.ProjectorPaths, filename)
 		case "application/vnd.ollama.image.template":
 			bts, err := os.ReadFile(filename)
 			if err != nil {
@@ -354,14 +257,11 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 	config := ConfigV2{
 		OS:           "linux",
 		Architecture: "amd64",
-		RootFS: RootFS{
-			Type: "layers",
-		},
 	}

 	deleteMap := make(map[string]struct{})

-	var layers Layers
+	var layers []*LayerReader

 	params := make(map[string][]string)
 	fromParams := make(map[string]any)
@@ -418,10 +318,10 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 					return err
 				}

-				config.SetModelFormat(fromConfig.ModelFormat)
-				config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
-				config.SetModelType(fromConfig.ModelType)
-				config.SetFileType(fromConfig.FileType)
+				config.ModelFormat = fromConfig.ModelFormat
+				config.ModelFamily = fromConfig.ModelFamily
+				config.ModelType = fromConfig.ModelType
+				config.FileType = fromConfig.FileType

 				for _, layer := range manifest.Layers {
 					deleteMap[layer.Digest] = struct{}{}
@@ -442,12 +342,13 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 						}
 					}

-					layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
+					layer, err := GetLayerWithBufferFromLayer(layer)
 					if err != nil {
 						return err
 					}

-					layers.Add(layer)
+					layer.From = modelpath.GetShortTagname()
+					layers = append(layers, layer)
 				}

 				deleteMap[manifest.Config.Digest] = struct{}{}
@@ -455,48 +356,26 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()

-			var offset int64
-			for {
-				fn(api.ProgressResponse{Status: "creating model layer"})
-
-				bin.Seek(offset, io.SeekStart)
-				ggml, err := llm.DecodeGGML(bin)
-				if errors.Is(err, io.EOF) {
-					break
-				} else if err != nil {
-					return err
-				}
-
-				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.ModelFamily())
-				config.SetModelType(ggml.ModelType())
-				config.SetFileType(ggml.FileType())
-
-				mediatype := mediatype
-				if ggml.ModelFamily() == "clip" {
-					mediatype = "application/vnd.ollama.image.projector"
-				}
-
-				sr := io.NewSectionReader(bin, offset, ggml.Size)
-				layer, err := NewLayer(sr, mediatype)
-				if err != nil {
-					return err
-				}
-
-				layers.Add(layer)
-
-				offset += ggml.Size
+			fn(api.ProgressResponse{Status: "creating model layer"})
+			ggml, err := llm.DecodeGGML(bin)
+			if err != nil {
+				return err
 			}
+
+			config.ModelFormat = ggml.Name()
+			config.ModelFamily = ggml.ModelFamily()
+			config.ModelType = ggml.ModelType()
+			config.FileType = ggml.FileType()
+
+			bin.Seek(0, io.SeekStart)
+			layer, err := CreateLayer(bin)
+			if err != nil {
+				return err
+			}
+
+			layer.MediaType = mediatype
+			layers = append(layers, layer)
 		case "adapter":
-			if strings.HasPrefix(c.Args, "@") {
-				blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
-				if err != nil {
-					return err
-				}
-
-				c.Args = blobPath
-			}
-
 			fn(api.ProgressResponse{Status: "creating adapter layer"})
 			bin, err := os.Open(realpath(modelFileDir, c.Args))
 			if err != nil {
@@ -504,32 +383,41 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()

-			layer, err := NewLayer(bin, mediatype)
+			layer, err := CreateLayer(bin)
 			if err != nil {
 				return err
 			}

-			layers.Add(layer)
+			if layer.Size > 0 {
+				layer.MediaType = mediatype
+				layers = append(layers, layer)
+			}
 		case "license":
 			fn(api.ProgressResponse{Status: "creating license layer"})
-
-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
+			layer, err := CreateLayer(strings.NewReader(c.Args))
 			if err != nil {
 				return err
 			}

-			layers.Add(layer)
+			if layer.Size > 0 {
+				layer.MediaType = mediatype
+				layers = append(layers, layer)
+			}
 		case "template", "system":
 			fn(api.ProgressResponse{Status: fmt.Sprintf("creating %s layer", c.Name)})

-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
+			// remove duplicate layers
+			layers = removeLayerFromLayers(layers, mediatype)
+
+			layer, err := CreateLayer(strings.NewReader(c.Args))
 			if err != nil {
 				return err
 			}

-			layers.Replace(layer)
+			if layer.Size > 0 {
+				layer.MediaType = mediatype
+				layers = append(layers, layer)
+			}
 		default:
 			params[c.Name] = append(params[c.Name], c.Args)
 		}
@@ -538,7 +426,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 	if len(params) > 0 {
 		fn(api.ProgressResponse{Status: "creating parameters layer"})

-		formattedParams, err := api.FormatParams(params)
+		formattedParams, err := formatParams(params)
 		if err != nil {
 			return err
 		}
@@ -549,7 +437,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 		}

-		// xxx - can this be removed?
 		if config.ModelType == "65B" {
 			if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
 				config.ModelType = "70B"
@@ -562,51 +449,40 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 		}

 		fn(api.ProgressResponse{Status: "creating config layer"})
-		layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
+		layer, err := CreateLayer(bytes.NewReader(b.Bytes()))
 		if err != nil {
 			return err
 		}

-		layers.Replace(layer)
+		layer.MediaType = "application/vnd.ollama.image.params"
+		layers = append(layers, layer)
 	}

-	digests := make([]string, len(layers.items))
-	for i, layer := range layers.items {
-		digests[i] = layer.Digest
-	}
-
-	config.RootFS.DiffIDs = digests
-
-	var b bytes.Buffer
-	if err := json.NewEncoder(&b).Encode(config); err != nil {
-		return err
-	}
-
-	configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
+	digests, err := getLayerDigests(layers)
 	if err != nil {
 		return err
 	}

+	configLayer, err := createConfigLayer(config, digests)
+	if err != nil {
+		return err
+	}
+
+	layers = append(layers, configLayer)
 	delete(deleteMap, configLayer.Digest)

-	for _, layer := range append(layers.items, configLayer) {
-		committed, err := layer.Commit()
-		if err != nil {
-			return err
-		}
-
-		status := "writing layer"
-		if !committed {
-			status = "using already created layer"
-		}
-
-		fn(api.ProgressResponse{Status: fmt.Sprintf("%s %s", status, layer.Digest)})
+	if err := SaveLayers(layers, fn, false); err != nil {
+		return err
+	}

+	var contentLayers []*Layer
+	for _, layer := range layers {
+		contentLayers = append(contentLayers, &layer.Layer)
 		delete(deleteMap, layer.Digest)
 	}

 	fn(api.ProgressResponse{Status: "writing manifest"})
-	if err := WriteManifest(name, configLayer, layers.items); err != nil {
+	if err := CreateManifest(name, configLayer, contentLayers); err != nil {
 		return err
 	}

@@ -620,6 +496,177 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 	return nil
 }

+func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
+	return slices.DeleteFunc(layers, func(layer *LayerReader) bool {
+		return layer.MediaType == mediaType
+	})
+}
+
+func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force bool) error {
+	// Write each of the layers to disk
+	for _, layer := range layers {
+		fp, err := GetBlobsPath(layer.Digest)
+		if err != nil {
+			return err
+		}
+
+		_, err = os.Stat(fp)
+		if os.IsNotExist(err) || force {
+			fn(api.ProgressResponse{Status: fmt.Sprintf("writing layer %s", layer.Digest)})
+
+			out, err := os.Create(fp)
+			if err != nil {
+				log.Printf("couldn't create %s", fp)
+				return err
+			}
+			defer out.Close()
+
+			if _, err = io.Copy(out, layer.Reader); err != nil {
+				return err
+			}
+
+		} else {
+			fn(api.ProgressResponse{Status: fmt.Sprintf("using already created layer %s", layer.Digest)})
+		}
+	}
+
+	return nil
+}
+
+func CreateManifest(name string, cfg *LayerReader, layers []*Layer) error {
+	mp := ParseModelPath(name)
+	manifest := ManifestV2{
+		SchemaVersion: 2,
+		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
+		Config: Layer{
+			MediaType: cfg.MediaType,
+			Size:      cfg.Size,
+			Digest:    cfg.Digest,
+		},
+		Layers: layers,
+	}
+
+	manifestJSON, err := json.Marshal(manifest)
+	if err != nil {
+		return err
+	}
+
+	fp, err := mp.GetManifestPath()
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(filepath.Dir(fp), 0o755); err != nil {
+		return err
+	}
+	return os.WriteFile(fp, manifestJSON, 0o644)
+}
+
+func GetLayerWithBufferFromLayer(layer *Layer) (*LayerReader, error) {
+	fp, err := GetBlobsPath(layer.Digest)
+	if err != nil {
+		return nil, err
+	}
+
+	file, err := os.Open(fp)
+	if err != nil {
+		return nil, fmt.Errorf("could not open blob: %w", err)
+	}
+	defer file.Close()
+
+	newLayer, err := CreateLayer(file)
+	if err != nil {
+		return nil, err
+	}
+	newLayer.MediaType = layer.MediaType
+	return newLayer, nil
+}
+
+// formatParams converts specified parameter options to their correct types
+func formatParams(params map[string][]string) (map[string]interface{}, error) {
+	opts := api.Options{}
+	valueOpts := reflect.ValueOf(&opts).Elem() // settable value of the options struct, used to look up fields by name
+	typeOpts := reflect.TypeOf(opts)           // type of the options struct, used to enumerate its fields and json tags
+
+	// build map of json struct tag names to their struct fields
+	jsonOpts := make(map[string]reflect.StructField)
+	for _, field := range reflect.VisibleFields(typeOpts) {
+		jsonTag := strings.Split(field.Tag.Get("json"), ",")[0]
+		if jsonTag != "" {
+			jsonOpts[jsonTag] = field
+		}
+	}
+
+	out := make(map[string]interface{})
+	// iterate params and set values based on json struct tags
+	for key, vals := range params {
+		if opt, ok := jsonOpts[key]; ok {
+			field := valueOpts.FieldByName(opt.Name)
+			if field.IsValid() && field.CanSet() {
+				switch field.Kind() {
+				case reflect.Float32:
+					floatVal, err := strconv.ParseFloat(vals[0], 32)
+					if err != nil {
+						return nil, fmt.Errorf("invalid float value %s", vals)
+					}
+
+					out[key] = float32(floatVal)
+				case reflect.Int:
+					intVal, err := strconv.ParseInt(vals[0], 10, 64)
+					if err != nil {
+						return nil, fmt.Errorf("invalid int value %s", vals)
+					}
+
+					out[key] = intVal
+				case reflect.Bool:
+					boolVal, err := strconv.ParseBool(vals[0])
+					if err != nil {
+						return nil, fmt.Errorf("invalid bool value %s", vals)
+					}
+
+					out[key] = boolVal
+				case reflect.String:
+					out[key] = vals[0]
+				case reflect.Slice:
+					// TODO: only string slices are supported right now
+					out[key] = vals
+				default:
+					return nil, fmt.Errorf("unknown type %s for %s", field.Kind(), key)
+				}
+			}
+		}
+	}
+
+	return out, nil
+}
+
+func getLayerDigests(layers []*LayerReader) ([]string, error) {
+	var digests []string
+	for _, l := range layers {
+		if l.Digest == "" {
+			return nil, fmt.Errorf("layer is missing a digest")
+		}
+		digests = append(digests, l.Digest)
+	}
+	return digests, nil
+}
+
+// CreateLayer builds a LayerReader around the given reader, recording its SHA256 digest and size
+func CreateLayer(f io.ReadSeeker) (*LayerReader, error) {
+	digest, size := GetSHA256Digest(f)
+	f.Seek(0, io.SeekStart)
+
+	layer := &LayerReader{
+		Layer: Layer{
+			MediaType: "application/vnd.docker.image.rootfs.diff.tar",
+			Digest:    digest,
+			Size:      size,
+		},
+		Reader: f,
+	}
+
+	return layer, nil
+}
+
 func CopyModel(src, dest string) error {
 	srcModelPath := ParseModelPath(src)
 	srcPath, err := srcModelPath.GetManifestPath()
@@ -887,7 +934,7 @@ func PushModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu

 	var layers []*Layer
 	layers = append(layers, manifest.Layers...)
-	layers = append(layers, manifest.Config)
+	layers = append(layers, &manifest.Config)

 	for _, layer := range layers {
 		if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
@@ -958,7 +1005,7 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu

 	var layers []*Layer
 	layers = append(layers, manifest.Layers...)
-	layers = append(layers, manifest.Config)
+	layers = append(layers, &manifest.Config)

 	for _, layer := range layers {
 		if err := downloadBlob(
@@ -1046,6 +1093,30 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *RegistryOptio
 	return m, err
 }

+// createConfigLayer points config's RootFS at the given layer digests,
+// marshals the config to JSON, and returns a LayerReader that carries the
+// JSON bytes along with their digest and size.
+func createConfigLayer(config ConfigV2, layers []string) (*LayerReader, error) {
+	// config is received by value, so this assignment stays local
+	config.RootFS = RootFS{Type: "layers", DiffIDs: layers}
+
+	payload, err := json.Marshal(config)
+	if err != nil {
+		return nil, err
+	}
+
+	// the digest is computed from its own buffer; the layer gets a fresh one
+	sum, n := GetSHA256Digest(bytes.NewBuffer(payload))
+
+	return &LayerReader{
+		Layer: Layer{
+			MediaType: "application/vnd.docker.container.image.v1+json",
+			Digest:    sum,
+			Size:      n,
+		},
+		Reader: bytes.NewBuffer(payload),
+	}, nil
+}
+
 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
 func GetSHA256Digest(r io.Reader) (string, int64) {
 	h := sha256.New()
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,98 +1,23 @@
 package server

 import (
-	"strings"
 	"testing"

 	"github.com/jmorganca/ollama/api"
 )

-func TestChat(t *testing.T) {
-	tests := []struct {
-		name     string
-		template string
-		msgs     []api.Message
-		want     string
-		wantErr  string
-	}{
-		{
-			name:     "Single Message",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a Wizard.",
-				},
-				{
-					Role:    "user",
-					Content: "What are the potion ingredients?",
-				},
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]",
-		},
-		{
-			name:     "Message History",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a Wizard.",
-				},
-				{
-					Role:    "user",
-					Content: "What are the potion ingredients?",
-				},
-				{
-					Role:    "assistant",
-					Content: "sugar",
-				},
-				{
-					Role:    "user",
-					Content: "Anything else?",
-				},
-			},
-			want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]sugar[INST]  Anything else? [/INST]",
-		},
-		{
-			name:     "Assistant Only",
-			template: "[INST] {{ .System }} {{ .Prompt }} [/INST]",
-			msgs: []api.Message{
-				{
-					Role:    "assistant",
-					Content: "everything nice",
-				},
-			},
-			want: "[INST]   [/INST]everything nice",
-		},
-		{
-			name: "Invalid Role",
-			msgs: []api.Message{
-				{
-					Role:    "not-a-role",
-					Content: "howdy",
-				},
-			},
-			wantErr: "invalid role: not-a-role",
-		},
+func TestModelPrompt(t *testing.T) {
+	var m Model
+	req := api.GenerateRequest{
+		Template: "a{{ .Prompt }}b",
+		Prompt:   "<h1>",
 	}
-
-	for _, tt := range tests {
-		m := Model{
-			Template: tt.template,
-		}
-		t.Run(tt.name, func(t *testing.T) {
-			got, _, err := m.ChatPrompt(tt.msgs)
-			if tt.wantErr != "" {
-				if err == nil {
-					t.Errorf("ChatPrompt() expected error, got nil")
-				}
-				if !strings.Contains(err.Error(), tt.wantErr) {
-					t.Errorf("ChatPrompt() error = %v, wantErr %v", err, tt.wantErr)
-				}
-			}
-			if got != tt.want {
-				t.Errorf("ChatPrompt() got = %v, want %v", got, tt.want)
-			}
-		})
+	s, err := m.Prompt(req)
+	if err != nil {
+		t.Fatal(err)
+	}
+	want := "a<h1>b"
+	if s != want {
+		t.Errorf("got %q, want %q", s, want)
 	}
 }
--- a/server/layers.go
+++ b/server/layers.go
@@ -1,109 +0,0 @@
-package server
-
-import (
-	"crypto/sha256"
-	"fmt"
-	"io"
-	"os"
-	"runtime"
-	"strings"
-
-	"golang.org/x/exp/slices"
-)
-
-type Layers struct {
-	items []*Layer
-}
-
-func (ls *Layers) Add(layer *Layer) {
-	if layer.Size > 0 {
-		ls.items = append(ls.items, layer)
-	}
-}
-
-func (ls *Layers) Replace(layer *Layer) {
-	if layer.Size > 0 {
-		mediatype := layer.MediaType
-		layers := slices.DeleteFunc(ls.items, func(l *Layer) bool {
-			return l.MediaType == mediatype
-		})
-
-		ls.items = append(layers, layer)
-	}
-}
-
-type Layer struct {
-	MediaType string `json:"mediaType"`
-	Digest    string `json:"digest"`
-	Size      int64  `json:"size"`
-	From      string `json:"from,omitempty"`
-
-	tempFileName string
-}
-
-func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
-	blobs, err := GetBlobsPath("")
-	if err != nil {
-		return nil, err
-	}
-
-	delimiter := ":"
-	if runtime.GOOS == "windows" {
-		delimiter = "-"
-	}
-
-	pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
-	temp, err := os.CreateTemp(blobs, pattern)
-	if err != nil {
-		return nil, err
-	}
-	defer temp.Close()
-
-	sha256sum := sha256.New()
-	n, err := io.Copy(io.MultiWriter(temp, sha256sum), r)
-	if err != nil {
-		return nil, err
-	}
-
-	return &Layer{
-		MediaType:    mediatype,
-		Digest:       fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)),
-		Size:         n,
-		tempFileName: temp.Name(),
-	}, nil
-}
-
-func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) {
-	blob, err := GetBlobsPath(digest)
-	if err != nil {
-		return nil, err
-	}
-
-	fi, err := os.Stat(blob)
-	if err != nil {
-		return nil, err
-	}
-
-	return &Layer{
-		MediaType: mediatype,
-		Digest:    digest,
-		Size:      fi.Size(),
-		From:      from,
-	}, nil
-}
-
-func (l *Layer) Commit() (bool, error) {
-	// always remove temp
-	defer os.Remove(l.tempFileName)
-
-	blob, err := GetBlobsPath(l.Digest)
-	if err != nil {
-		return false, err
-	}
-
-	if _, err := os.Stat(blob); err != nil {
-		return true, os.Rename(l.tempFileName, blob)
-	}
-
-	return false, nil
-}
--- a/server/manifests.go
+++ b/server/manifests.go
@@ -1,34 +0,0 @@
-package server
-
-import (
-	"bytes"
-	"encoding/json"
-	"os"
-	"path/filepath"
-)
-
-func WriteManifest(name string, config *Layer, layers []*Layer) error {
-	manifest := ManifestV2{
-		SchemaVersion: 2,
-		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
-		Config:        config,
-		Layers:        layers,
-	}
-
-	var b bytes.Buffer
-	if err := json.NewEncoder(&b).Encode(manifest); err != nil {
-		return err
-	}
-
-	modelpath := ParseModelPath(name)
-	manifestPath, err := modelpath.GetManifestPath()
-	if err != nil {
-		return err
-	}
-
-	if err := os.MkdirAll(filepath.Dir(manifestPath), 0755); err != nil {
-		return err
-	}
-
-	return os.WriteFile(manifestPath, b.Bytes(), 0644)
-}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -67,20 +67,6 @@ func ParseModelPath(name string) ModelPath {
 	return mp
 }

-var errModelPathInvalid = errors.New("invalid model path")
-
-func (mp ModelPath) Validate() error {
-	if mp.Repository == "" {
-		return fmt.Errorf("%w: model repository name is required", errModelPathInvalid)
-	}
-
-	if strings.Contains(mp.Tag, ":") {
-		return fmt.Errorf("%w: ':' (colon) is not allowed in tag names", errModelPathInvalid)
-	}
-
-	return nil
-}
-
 func (mp ModelPath) GetNamespaceRepository() string {
 	return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
 }
--- a/server/routes.go
+++ b/server/routes.go
@@ -2,6 +2,7 @@ package server

 import (
 	"context"
+	"crypto/sha256"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -59,26 +60,17 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute

 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sessionDuration time.Duration) (*Model, error) {
-	model, err := GetModel(modelName)
-	if err != nil {
-		return nil, err
-	}
-
-	workDir := c.GetString("workDir")
-
+func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]interface{}, sessionDuration time.Duration) error {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {
 		log.Printf("could not load model options: %v", err)
-		return nil, err
+		return err
 	}

 	if err := opts.FromMap(reqOpts); err != nil {
-		return nil, err
+		return err
 	}

-	ctx := c.Request.Context()
-
 	// check if the loaded model is still running in a subprocess, in case something unexpected happened
 	if loaded.runner != nil {
 		if err := loaded.runner.Ping(ctx); err != nil {
@@ -105,7 +97,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
 			loaded.Options = nil
 		}

-		llmRunner, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llmRunner, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -114,7 +106,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
 				err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
 			}

-			return nil, err
+			return err
 		}

 		loaded.Model = model
@@ -148,7 +140,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
 	}

 	loaded.expireTimer.Reset(sessionDuration)
-	return model, nil
+	return nil
 }

 func GenerateHandler(c *gin.Context) {
@@ -156,9 +148,9 @@ func GenerateHandler(c *gin.Context) {
 	defer loaded.mu.Unlock()

 	checkpointStart := time.Now()
+
 	var req api.GenerateRequest
 	err := c.ShouldBindJSON(&req)
-
 	switch {
 	case errors.Is(err, io.EOF):
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
@@ -181,150 +173,88 @@ func GenerateHandler(c *gin.Context) {
 		return
 	}

-	sessionDuration := defaultSessionDuration
-	model, err := load(c, req.Model, req.Options, sessionDuration)
+	model, err := GetModel(req.Model)
 	if err != nil {
 		var pErr *fs.PathError
-		switch {
-		case errors.As(err, &pErr):
+		if errors.As(err, &pErr) {
 			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-		case errors.Is(err, api.ErrInvalidOpts):
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		default:
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
 		}
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

-	// an empty request loads the model
-	if req.Prompt == "" && req.Template == "" && req.System == "" {
-		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true})
+	workDir := c.GetString("workDir")
+
+	// TODO: set this duration from the request if specified
+	sessionDuration := defaultSessionDuration
+	if err := load(c.Request.Context(), workDir, model, req.Options, sessionDuration); err != nil {
+		if errors.Is(err, api.ErrInvalidOpts) {
+			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+			return
+		}
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

 	checkpointLoaded := time.Now()

-	var prompt string
-	switch {
-	case req.Raw:
-		prompt = req.Prompt
-	case req.Prompt != "":
-		if req.Template != "" {
-			// override the default model template
-			model.Template = req.Template
-		}
-
-		var rebuild strings.Builder
-		if req.Context != nil {
-			// TODO: context is deprecated, at some point the context logic within this conditional should be removed
-			prevCtx, err := loaded.runner.Decode(c.Request.Context(), req.Context)
-			if err != nil {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-				return
-			}
-
-			// Remove leading spaces from prevCtx if present
-			prevCtx = strings.TrimPrefix(prevCtx, " ")
-			rebuild.WriteString(prevCtx)
-		}
-		p, err := model.Prompt(PromptVars{
-			System: req.System,
-			Prompt: req.Prompt,
-			First:  len(req.Context) == 0,
-		})
+	prompt := req.Prompt
+	if !req.Raw {
+		prompt, err = model.Prompt(req)
 		if err != nil {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}
-		rebuild.WriteString(p)
-		prompt = rebuild.String()
 	}

 	ch := make(chan any)
-	var generated strings.Builder
 	go func() {
 		defer close(ch)
+		// an empty request loads the model
+		if req.Prompt == "" && req.Template == "" && req.System == "" {
+			ch <- api.GenerateResponse{CreatedAt: time.Now().UTC(), Model: req.Model, Done: true}
+			return
+		}

-		fn := func(r llm.PredictResult) {
-			// Update model expiration
+		fn := func(r api.GenerateResponse) {
 			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

-			// Build up the full response
-			if _, err := generated.WriteString(r.Content); err != nil {
-				ch <- gin.H{"error": err.Error()}
-				return
-			}
-
-			resp := api.GenerateResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Done:      r.Done,
-				Response:  r.Content,
-				Metrics: api.Metrics{
-					PromptEvalCount:    r.PromptEvalCount,
-					PromptEvalDuration: r.PromptEvalDuration,
-					EvalCount:          r.EvalCount,
-					EvalDuration:       r.EvalDuration,
-				},
-			}
-
+			r.Model = req.Model
+			r.CreatedAt = time.Now().UTC()
 			if r.Done {
-				resp.TotalDuration = time.Since(checkpointStart)
-				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-
-				if !req.Raw {
-					embd, err := loaded.runner.Encode(c.Request.Context(), prompt+generated.String())
-					if err != nil {
-						ch <- gin.H{"error": err.Error()}
-						return
-					}
-					resp.Context = embd
-				}
+				r.TotalDuration = time.Since(checkpointStart)
+				r.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 			}

-			ch <- resp
+			if req.Raw {
+				// in raw mode the client must manage history on their own
+				r.Context = nil
+			}
+
+			ch <- r
 		}

-		// Start prediction
-		predictReq := llm.PredictOpts{
-			Prompt: prompt,
-			Format: req.Format,
-			Images: req.Images,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()

 	if req.Stream != nil && !*req.Stream {
-		// Accumulate responses into the final response
-		var final api.GenerateResponse
-		var sb strings.Builder
+		var response api.GenerateResponse
+		generated := ""
 		for resp := range ch {
-			switch r := resp.(type) {
-			case api.GenerateResponse:
-				sb.WriteString(r.Response)
-				final = r
-			case gin.H:
-				if errorMsg, ok := r["error"].(string); ok {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
-					return
-				} else {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
-					return
-				}
-			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
+			if r, ok := resp.(api.GenerateResponse); ok {
+				generated += r.Response
+				response = r
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
 			}
 		}
-
-		final.Response = sb.String()
-		c.JSON(http.StatusOK, final)
+		response.Response = generated
+		c.JSON(http.StatusOK, response)
 		return
 	}

@@ -351,18 +281,15 @@ func EmbeddingHandler(c *gin.Context) {
 		return
 	}

-	sessionDuration := defaultSessionDuration
-	_, err = load(c, req.Model, req.Options, sessionDuration)
+	model, err := GetModel(req.Model)
 	if err != nil {
-		var pErr *fs.PathError
-		switch {
-		case errors.As(err, &pErr):
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-		case errors.Is(err, api.ErrInvalidOpts):
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		default:
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		}
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	workDir := c.GetString("workDir")
+	if err := load(c.Request.Context(), workDir, model, req.Options, 5*time.Minute); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

@@ -489,11 +416,6 @@ func CreateModelHandler(c *gin.Context) {
 		return
 	}

-	if err := ParseModelPath(req.Name).Validate(); err != nil {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
 	if req.Path == "" && req.Modelfile == "" {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or modelfile are required"})
 		return
@@ -616,19 +538,10 @@ func GetModelInfo(name string) (*api.ShowResponse, error) {
 		return nil, err
 	}

-	modelDetails := api.ModelDetails{
-		Format:            model.Config.ModelFormat,
-		Family:            model.Config.ModelFamily,
-		Families:          model.Config.ModelFamilies,
-		ParameterSize:     model.Config.ModelType,
-		QuantizationLevel: model.Config.FileType,
-	}
-
 	resp := &api.ShowResponse{
 		License:  strings.Join(model.License, "\n"),
 		System:   model.System,
 		Template: model.Template,
-		Details:  modelDetails,
 	}

 	mf, err := ShowModelfile(model)
@@ -678,42 +591,25 @@ func ListModelsHandler(c *gin.Context) {
 		return
 	}

-	modelResponse := func(modelName string) (api.ModelResponse, error) {
-		model, err := GetModel(modelName)
-		if err != nil {
-			return api.ModelResponse{}, err
-		}
-
-		modelDetails := api.ModelDetails{
-			Format:            model.Config.ModelFormat,
-			Family:            model.Config.ModelFamily,
-			Families:          model.Config.ModelFamilies,
-			ParameterSize:     model.Config.ModelType,
-			QuantizationLevel: model.Config.FileType,
-		}
-
-		return api.ModelResponse{
-			Name:    model.ShortName,
-			Size:    model.Size,
-			Digest:  model.Digest,
-			Details: modelDetails,
-		}, nil
-	}
-
 	walkFunc := func(path string, info os.FileInfo, _ error) error {
 		if !info.IsDir() {
 			dir, file := filepath.Split(path)
 			dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
 			tag := strings.Join([]string{dir, file}, ":")

-			resp, err := modelResponse(tag)
+			mp := ParseModelPath(tag)
+			manifest, digest, err := GetManifest(mp)
 			if err != nil {
 				log.Printf("skipping file: %s", fp)
 				return nil
 			}

-			resp.ModifiedAt = info.ModTime()
-			models = append(models, resp)
+			models = append(models, api.ModelResponse{
+				Name:       mp.GetShortTagname(),
+				Size:       manifest.GetTotalSize(),
+				Digest:     digest,
+				ModifiedAt: info.ModTime(),
+			})
 		}

 		return nil
@@ -744,11 +640,6 @@ func CopyModelHandler(c *gin.Context) {
 		return
 	}

-	if err := ParseModelPath(req.Destination).Validate(); err != nil {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
 	if err := CopyModel(req.Source, req.Destination); err != nil {
 		if os.IsNotExist(err) {
 			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Source)})
@@ -775,18 +666,37 @@ func HeadBlobHandler(c *gin.Context) {
 }

 func CreateBlobHandler(c *gin.Context) {
-	layer, err := NewLayer(c.Request.Body, "")
+	targetPath, err := GetBlobsPath(c.Param("digest"))
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	if layer.Digest != c.Param("digest") {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("digest mismatch, expected %q, got %q", c.Param("digest"), layer.Digest)})
+	hash := sha256.New()
+	temp, err := os.CreateTemp(filepath.Dir(targetPath), c.Param("digest")+"-")
+	if err != nil {
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+	defer temp.Close()
+	defer os.Remove(temp.Name())
+
+	if _, err := io.Copy(temp, io.TeeReader(c.Request.Body, hash)); err != nil {
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	if _, err := layer.Commit(); err != nil {
+	if fmt.Sprintf("sha256:%x", hash.Sum(nil)) != c.Param("digest") {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "digest does not match body"})
+		return
+	}
+
+	if err := temp.Close(); err != nil {
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	if err := os.Rename(temp.Name(), targetPath); err != nil {
 		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -847,7 +757,6 @@ func Serve(ln net.Listener, allowOrigins []string) error {

 	r.POST("/api/pull", PullModelHandler)
 	r.POST("/api/generate", GenerateHandler)
-	r.POST("/api/chat", ChatHandler)
 	r.POST("/api/embeddings", EmbeddingHandler)
 	r.POST("/api/create", CreateModelHandler)
 	r.POST("/api/push", PushModelHandler)
@@ -863,9 +772,6 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 		})

 		r.Handle(method, "/api/tags", ListModelsHandler)
-		r.Handle(method, "/api/version", func(c *gin.Context) {
-			c.JSON(http.StatusOK, gin.H{"version": version.Version})
-		})
 	}

 	log.Printf("Listening on %s (version %s)", ln.Addr(), version.Version)
@@ -888,7 +794,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	if runtime.GOOS == "linux" {
 		// check compatibility to log warnings
 		if _, err := llm.CheckVRAM(); err != nil {
-			log.Print(err.Error())
+			log.Printf(err.Error())
 		}
 	}

@@ -944,136 +850,3 @@ func streamResponse(c *gin.Context, ch chan any) {
 		return true
 	})
 }
-
-func ChatHandler(c *gin.Context) {
-	loaded.mu.Lock()
-	defer loaded.mu.Unlock()
-
-	checkpointStart := time.Now()
-
-	var req api.ChatRequest
-	err := c.ShouldBindJSON(&req)
-	switch {
-	case errors.Is(err, io.EOF):
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
-		return
-	case err != nil:
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
-	// validate the request
-	switch {
-	case req.Model == "":
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
-		return
-	case len(req.Format) > 0 && req.Format != "json":
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
-		return
-	}
-
-	sessionDuration := defaultSessionDuration
-	model, err := load(c, req.Model, req.Options, sessionDuration)
-	if err != nil {
-		var pErr *fs.PathError
-		switch {
-		case errors.As(err, &pErr):
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-		case errors.Is(err, api.ErrInvalidOpts):
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		default:
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		}
-		return
-	}
-
-	// an empty request loads the model
-	if len(req.Messages) == 0 {
-		c.JSON(http.StatusOK, api.ChatResponse{CreatedAt: time.Now().UTC(), Model: req.Model, Done: true})
-		return
-	}
-
-	checkpointLoaded := time.Now()
-
-	prompt, images, err := model.ChatPrompt(req.Messages)
-	if err != nil {
-		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
-	ch := make(chan any)
-
-	go func() {
-		defer close(ch)
-
-		fn := func(r llm.PredictResult) {
-			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
-			loaded.expireTimer.Reset(sessionDuration)
-
-			resp := api.ChatResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Done:      r.Done,
-				Metrics: api.Metrics{
-					PromptEvalCount:    r.PromptEvalCount,
-					PromptEvalDuration: r.PromptEvalDuration,
-					EvalCount:          r.EvalCount,
-					EvalDuration:       r.EvalDuration,
-				},
-			}
-
-			if r.Done {
-				resp.TotalDuration = time.Since(checkpointStart)
-				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-			} else {
-				resp.Message = &api.Message{Role: "assistant", Content: r.Content}
-			}
-
-			ch <- resp
-		}
-
-		// Start prediction
-		predictReq := llm.PredictOpts{
-			Prompt: prompt,
-			Format: req.Format,
-			Images: images,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
-			ch <- gin.H{"error": err.Error()}
-		}
-	}()
-
-	if req.Stream != nil && !*req.Stream {
-		// Accumulate responses into the final response
-		var final api.ChatResponse
-		var sb strings.Builder
-		for resp := range ch {
-			switch r := resp.(type) {
-			case api.ChatResponse:
-				if r.Message != nil {
-					sb.WriteString(r.Message.Content)
-				}
-
-				final = r
-			case gin.H:
-				if errorMsg, ok := r["error"].(string); ok {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
-					return
-				} else {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
-					return
-				}
-			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
-				return
-			}
-		}
-
-		final.Message = &api.Message{Role: "assistant", Content: sb.String()}
-		c.JSON(http.StatusOK, final)
-		return
-	}
-
-	streamResponse(c, ch)
-}
--- a/server/upload.go
+++ b/server/upload.go
@@ -5,7 +5,6 @@ import (
 	"crypto/md5"
 	"errors"
 	"fmt"
-	"hash"
 	"io"
 	"log"
 	"math"
@@ -103,7 +102,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *Reg
 		}

 		// set part.N to the current number of parts
-		b.Parts = append(b.Parts, blobUploadPart{N: len(b.Parts), Offset: offset, Size: size})
+		b.Parts = append(b.Parts, blobUploadPart{blobUpload: b, N: len(b.Parts), Offset: offset, Size: size})
 		offset += size
 	}

@@ -148,13 +147,14 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
 			g.Go(func() error {
 				var err error
 				for try := 0; try < maxRetries; try++ {
-					err = b.uploadPart(inner, http.MethodPatch, requestURL, part, opts)
+					err = b.uploadChunk(inner, http.MethodPatch, requestURL, part, opts)
 					switch {
 					case errors.Is(err, context.Canceled):
 						return err
 					case errors.Is(err, errMaxRetriesExceeded):
 						return err
 					case err != nil:
+						part.Reset()
 						sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
 						log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
 						time.Sleep(sleep)
@@ -176,10 +176,17 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {

 	requestURL := <-b.nextURL

-	// calculate md5 checksum and add it to the commit request
 	var sb strings.Builder
+
+	// calculate md5 checksum and add it to the commit request
 	for _, part := range b.Parts {
-		sb.Write(part.Sum(nil))
+		hash := md5.New()
+		if _, err := io.Copy(hash, io.NewSectionReader(b.file, part.Offset, part.Size)); err != nil {
+			b.err = err
+			return
+		}
+
+		sb.Write(hash.Sum(nil))
 	}

 	md5sum := md5.Sum([]byte(sb.String()))
@@ -194,25 +201,27 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
 	headers.Set("Content-Length", "0")

 	for try := 0; try < maxRetries; try++ {
-		var resp *http.Response
-		resp, err = makeRequestWithRetry(ctx, http.MethodPut, requestURL, headers, nil, opts)
-		if errors.Is(err, context.Canceled) {
-			break
-		} else if err != nil {
+		resp, err := makeRequestWithRetry(ctx, http.MethodPut, requestURL, headers, nil, opts)
+		if err != nil {
+			b.err = err
+			if errors.Is(err, context.Canceled) {
+				return
+			}
+
 			sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
 			log.Printf("%s complete upload attempt %d failed: %v, retrying in %s", b.Digest[7:19], try, err, sleep)
 			time.Sleep(sleep)
 			continue
 		}
 		defer resp.Body.Close()
-		break
-	}

-	b.err = err
-	b.done = true
+		b.err = nil
+		b.done = true
+		return
+	}
 }

-func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *url.URL, part *blobUploadPart, opts *RegistryOptions) error {
+func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL *url.URL, part *blobUploadPart, opts *RegistryOptions) error {
 	headers := make(http.Header)
 	headers.Set("Content-Type", "application/octet-stream")
 	headers.Set("Content-Length", fmt.Sprintf("%d", part.Size))
@@ -223,13 +232,8 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 	}

 	sr := io.NewSectionReader(b.file, part.Offset, part.Size)
-
-	md5sum := md5.New()
-	w := &progressWriter{blobUpload: b}
-
-	resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sr, io.MultiWriter(w, md5sum)), opts)
+	resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(sr, part), opts)
 	if err != nil {
-		w.Rollback()
 		return err
 	}
 	defer resp.Body.Close()
@@ -241,13 +245,11 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *

 	nextURL, err := url.Parse(location)
 	if err != nil {
-		w.Rollback()
 		return err
 	}

 	switch {
 	case resp.StatusCode == http.StatusTemporaryRedirect:
-		w.Rollback()
 		b.nextURL <- nextURL

 		redirectURL, err := resp.Location()
@@ -257,13 +259,14 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *

 		// retry uploading to the redirect URL
 		for try := 0; try < maxRetries; try++ {
-			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
+			err = b.uploadChunk(ctx, http.MethodPut, redirectURL, part, nil)
 			switch {
 			case errors.Is(err, context.Canceled):
 				return err
 			case errors.Is(err, errMaxRetriesExceeded):
 				return err
 			case err != nil:
+				part.Reset()
 				sleep := time.Second * time.Duration(math.Pow(2, float64(try)))
 				log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep)
 				time.Sleep(sleep)
@@ -276,7 +279,6 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 		return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)

 	case resp.StatusCode == http.StatusUnauthorized:
-		w.Rollback()
 		auth := resp.Header.Get("www-authenticate")
 		authRedir := ParseAuthRedirectString(auth)
 		token, err := getAuthToken(ctx, authRedir)
@@ -287,7 +289,6 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 		opts.Token = token
 		fallthrough
 	case resp.StatusCode >= http.StatusBadRequest:
-		w.Rollback()
 		body, err := io.ReadAll(resp.Body)
 		if err != nil {
 			return err
@@ -300,7 +301,6 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 		b.nextURL <- nextURL
 	}

-	part.Hash = md5sum
 	return nil
 }

@@ -341,26 +341,22 @@ func (b *blobUpload) Wait(ctx context.Context, fn func(api.ProgressResponse)) er

 type blobUploadPart struct {
 	// N is the part number
-	N      int
-	Offset int64
-	Size   int64
-	hash.Hash
-}
-
-type progressWriter struct {
+	N       int
+	Offset  int64
+	Size    int64
 	written int64
 	*blobUpload
 }

-func (p *progressWriter) Write(b []byte) (n int, err error) {
+func (p *blobUploadPart) Write(b []byte) (n int, err error) {
 	n = len(b)
 	p.written += int64(n)
 	p.Completed.Add(int64(n))
 	return n, nil
 }

-func (p *progressWriter) Rollback() {
-	p.Completed.Add(-p.written)
+func (p *blobUploadPart) Reset() {
+	p.Completed.Add(-int64(p.written))
 	p.written = 0
 }
Author	SHA1	Message	Date
Matt Williams	05162c56aa	Update readme.md	2023-11-29 10:45:07 -08:00
Matt Williams	edd1a2b6e8	function calling for python. already had ts. Signed-off-by: Matt Williams <m@technovangelist.com>	2023-11-29 10:06:11 -08:00
				`@@ -1 +0,0 @@`
				`{ "dependencies": { "@types/node": "^20.10.4", "prompt-sync": "^4.2.0", "readline": "^1.3.0" } }`