diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 29adf56f3..ac9af6411 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -73,12 +73,12 @@ jobs:
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
- go generate -x ./...
+ $env:GOARCH=""; $env:OLLAMA_BUILD_TARGET_ARCH="${{ matrix.arch }}"; go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }}
- name: 'Windows Go Generate'
- - run: go generate -x ./...
+ name: 'Windows Generate'
+ - run: GOARCH= OLLAMA_BUILD_TARGET_ARCH=${{ matrix.arch }} go generate -x ./...
if: ${{ ! startsWith(matrix.os, 'windows-') }}
- name: 'Unix Go Generate'
+ name: 'Unix Generate'
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
@@ -184,7 +184,7 @@ jobs:
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
go generate -x ./...
- name: go generate
+ name: go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
# TODO - do we need any artifacts?
@@ -217,7 +217,7 @@ jobs:
- name: 'Verify CUDA'
run: nvcc -V
- run: go get ./...
- - name: go generate
+ - name: go generate -x ./...
run: |
$gopath=(get-command go).source | split-path -parent
$cudabin=(get-command nvcc).source | split-path
@@ -312,7 +312,10 @@ jobs:
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
if: ${{ startsWith(matrix.os, 'macos-') }}
shell: bash
- - run: go generate ./...
+ - run: $env:GOARCH=""; $env:OLLAMA_BUILD_TARGET_ARCH="${{ matrix.arch }}"; go generate -x ./...
+ if: ${{ startsWith(matrix.os, 'windows-') }}
+ - run: GOARCH= OLLAMA_BUILD_TARGET_ARCH=${{ matrix.arch }} go generate -x ./...
+ if: ${{ ! startsWith(matrix.os, 'windows-') }}
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4
diff --git a/README.md b/README.md
index 62f5cd65c..4bbf38556 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
-

+
# Ollama
[](https://discord.gg/ollama)
-Get up and running with large language models.
+Get up and running with large language models locally.
### macOS
@@ -51,17 +51,15 @@ Here are some example models that can be downloaded:
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
-| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
-| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
-| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
-| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
+| Phi-3 | 3.8B | 2.3GB | `ollama run phi3` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
-| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
+| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
+| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
| Solar | 10.7B | 6.1GB | `ollama run solar` |
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -175,19 +173,13 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
The image features a yellow smiley face, which is likely the central focus of the picture.
```
-### Pass the prompt as an argument
+### Pass in prompt as arguments
```
$ ollama run llama3 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
-### Show model information
-
-```
-ollama show llama3
-```
-
### List models on your computer
```
@@ -200,7 +192,19 @@ ollama list
## Building
-See the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
+Install `cmake` and `go`:
+
+```
+brew install cmake go
+```
+
+Then build the binary:
+
+```
+go run build.go
+```
+
+More detailed instructions can be found in the [developer guide](https://github.com/ollama/ollama/blob/main/docs/development.md)
### Running local builds
@@ -248,7 +252,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Open WebUI](https://github.com/open-webui/open-webui)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
-- [Hollama](https://github.com/fmaclen/hollama)
- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
@@ -275,24 +278,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
-- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
+- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
-- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
-- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
-- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
-- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
-- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
-- [chat](https://github.com/swuecho/chat) (chat web app for teams)
+- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
+- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
+- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
+- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
+- [chat: chat web app for teams](https://github.com/swuecho/chat)
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
-- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
-- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
-- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
-- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
-- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
-- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
-- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
+- [Ollama RAG Chatbot: Local Chat with multiple PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
### Terminal
@@ -315,7 +311,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ShellOracle](https://github.com/djcopley/ShellOracle)
- [tlm](https://github.com/yusufcanb/tlm)
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
-- [gollama](https://github.com/sammcj/gollama)
### Database
@@ -326,20 +321,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
-- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
-- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
-- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
@@ -350,13 +342,9 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
-- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
-- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
-- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
-- [LlamaScript](https://github.com/Project-Llama/llamascript)
### Mobile
@@ -376,23 +364,18 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
+- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
-- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
-- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
-- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
-- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
-
-### Supported backends
-
-- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
+### Supported backends
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
diff --git a/build.go b/build.go
new file mode 100644
index 000000000..e6446d2a3
--- /dev/null
+++ b/build.go
@@ -0,0 +1,199 @@
+//go:build ignore
+
+package main
+
+import (
+ "cmp"
+ "errors"
+ "flag"
+ "log"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+)
+
+// Flags
+var (
+ flagRegenerateDestroy = flag.Bool("d", false, "force regenerate the dependencies (destructive)")
+ flagRegenerateGently = flag.Bool("g", false, "regenerate the dependencies (non-destructive)")
+ flagSkipBuild = flag.Bool("s", false, "generate dependencies only (e.g. skip 'go build .')")
+
+ // Flag to set GOARCH explicitly for cross-platform builds,
+ // e.g., in CI to target a different platform than the build matrix
+ // default. This lets us run generate without a separate build
+ // step for compiling the script binary for the host ARCH and then
+ // running the generate script for the target ARCH. Instead, we can
+ // just run `go run build.go -target=$GOARCH` to generate the
+ // deps.
+ flagGOARCH = flag.String("target", "", "sets GOARCH to use when generating dependencies and building")
+)
+
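+// buildEnv returns the environment for the generate and build commands,
+// forcing GOARCH to the -target flag value if set, then to
+// OLLAMA_BUILD_TARGET_ARCH (set by CI), and finally to the host architecture.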
+func buildEnv() []string {
+ return append(os.Environ(), "GOARCH="+cmp.Or(
+ *flagGOARCH,
+ os.Getenv("OLLAMA_BUILD_TARGET_ARCH"),
+ runtime.GOARCH,
+ ))
+}
+
+func main() {
+ log.SetFlags(0)
+ flag.Usage = func() {
+ log.Printf("Usage: go run build.go [flags]")
+ log.Println()
+ log.Println("Flags:")
+ flag.PrintDefaults()
+ log.Println()
+ log.Println("This script builds the Ollama server binary and generates the llama.cpp")
+ log.Println("bindings for the current platform. It assumes that the current working")
+ log.Println("directory is the root directory of the Ollama project.")
+ log.Println()
+ log.Println("If the -d flag is provided, the script will force regeneration of the")
+ log.Println("dependencies; removing the 'llm/build' directory before starting.")
+ log.Println()
+ log.Println("If the -g flag is provided, the script will regenerate the dependencies")
+ log.Println("without removing the 'llm/build' directory.")
+ log.Println()
+ log.Println("If the -s flag is provided, the script will skip building the Ollama binary")
+ log.Println()
+ log.Println("If the -target flag is provided, the script will set GOARCH to the value")
+ log.Println("of the flag. This is useful for cross-platform builds.")
+ log.Println()
+ log.Println("The script will check for the required dependencies (cmake, gcc) and")
+ log.Println("print their version.")
+ log.Println()
+ log.Println("The script will also check if it is being run from the root directory of")
+ log.Println("the Ollama project.")
+ log.Println()
+ os.Exit(1)
+ }
+ flag.Parse()
+
+ log.Printf("=== Building Ollama ===")
+ defer func() {
+ log.Printf("=== Done building Ollama ===")
+ if !*flagSkipBuild {
+ log.Println()
+ log.Println("To run the Ollama server, use:")
+ log.Println()
+ log.Println(" ./ollama serve")
+ log.Println()
+ }
+ }()
+
+ if flag.NArg() > 0 {
+ flag.Usage()
+ }
+
+ if !inRootDir() {
+ log.Fatalf("Please run this script from the root directory of the Ollama project.")
+ }
+
+ if err := checkDependencies(); err != nil {
+ log.Fatalf("Failed dependency check: %v", err)
+ }
+ if err := buildLlamaCPP(); err != nil {
+ log.Fatalf("Failed to build llama.cpp: %v", err)
+ }
+ if err := goBuildOllama(); err != nil {
+ log.Fatalf("Failed to build ollama Go binary: %v", err)
+ }
+}
+
+// checkDependencies does a quick check to see if the required dependencies are
+// installed on the system and functioning enough to print their version.
+//
+// TODO(bmizerany): Check the actual version of the dependencies? Seems a
+// little daunting given diff versions might print diff things. This should
+// be good enough for now.
+func checkDependencies() error {
+ var err error
+ check := func(name string, args ...string) {
+ log.Printf("=== Checking for %s ===", name)
+ defer log.Printf("=== Done checking for %s ===\n\n", name)
+ cmd := exec.Command(name, args...)
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ err = errors.Join(err, cmd.Run())
+ }
+
+ check("cmake", "--version")
+ check("gcc", "--version")
+ return err
+}
+
+func goBuildOllama() error {
+ log.Println("=== Building Ollama binary ===")
+ defer log.Printf("=== Done building Ollama binary ===\n\n")
+ if *flagSkipBuild {
+ log.Println("Skipping 'go build -o ollama .'")
+ return nil
+ }
+ cmd := exec.Command("go", "build", "-o", "ollama", ".")
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ cmd.Env = buildEnv()
+ return cmd.Run()
+}
+
+// buildLlamaCPP generates the llama.cpp bindings for the current platform.
+//
+// It assumes that the current working directory is the root directory of the
+// Ollama project.
+func buildLlamaCPP() error {
+ log.Println("=== Generating dependencies ===")
+ defer log.Printf("=== Done generating dependencies ===\n\n")
+ if *flagRegenerateDestroy {
+ if err := os.RemoveAll(filepath.Join("llm", "build")); err != nil {
+ return err
+ }
+ }
+ if isDirectory(filepath.Join("llm", "build")) && !*flagRegenerateGently {
+ log.Println("llm/build already exists; skipping. Use -d or -g to re-generate.")
+ return nil
+ }
+
+ scriptDir, err := filepath.Abs(filepath.Join("llm", "generate"))
+ if err != nil {
+ return err
+ }
+
+ var cmd *exec.Cmd
+ switch runtime.GOOS {
+ case "windows":
+ script := filepath.Join(scriptDir, "gen_windows.ps1")
+ cmd = exec.Command("powershell", "-ExecutionPolicy", "Bypass", "-File", script)
+ case "linux":
+ script := filepath.Join(scriptDir, "gen_linux.sh")
+ cmd = exec.Command("bash", script)
+ case "darwin":
+ script := filepath.Join(scriptDir, "gen_darwin.sh")
+ cmd = exec.Command("bash", script)
+ default:
+ log.Fatalf("Unsupported OS: %s", runtime.GOOS)
+ }
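+ // The gen_*.sh and gen_windows.ps1 scripts expect to be invoked from the
+ // llm/generate directory, so run the command from there.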
+ cmd.Dir = filepath.Join("llm", "generate")
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ cmd.Env = buildEnv()
+
+ log.Printf("Running GOOS=%s GOARCH=%s %s", runtime.GOOS, cmp.Or(*flagGOARCH, os.Getenv("OLLAMA_BUILD_TARGET_ARCH"), runtime.GOARCH), cmd.Args)
+
+ return cmd.Run()
+}
+
+func isDirectory(path string) bool {
+ info, err := os.Stat(path)
+ if err != nil {
+ return false
+ }
+ return info.IsDir()
+}
+
+// inRootDir returns true if the current working directory is the root
+// directory of the Ollama project. It looks for a file named "go.mod".
+func inRootDir() bool {
+ _, err := os.Stat("go.mod")
+ return err == nil
+}
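
The GOARCH selection in `buildEnv` leans on `cmp.Or` from Go 1.22+, which returns the first non-zero value among its arguments. Below is a minimal, stand-alone sketch of that precedence (not part of the patch itself); the `flagTarget` local stands in for the `-target` flag value, and `OLLAMA_BUILD_TARGET_ARCH` is the variable the workflow above exports:

```go
package main

import (
	"cmp"
	"fmt"
	"os"
	"runtime"
)

func main() {
	// Same precedence as build.go's buildEnv: an explicit -target value wins,
	// then OLLAMA_BUILD_TARGET_ARCH (set by CI), then the host architecture.
	flagTarget := "" // stands in for the -target flag value
	arch := cmp.Or(
		flagTarget,
		os.Getenv("OLLAMA_BUILD_TARGET_ARCH"),
		runtime.GOARCH,
	)
	fmt.Println("GOARCH=" + arch)
}
```

With `GOARCH` cleared and `OLLAMA_BUILD_TARGET_ARCH` set to the matrix arch, as the workflow steps above do, the generate step can target arm64 even on an amd64 runner.
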
diff --git a/docs/development.md b/docs/development.md
index 2a6886a43..1f54e0320 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -25,13 +25,7 @@ export OLLAMA_DEBUG=1
Get the required libraries and build the native LLM code:
```bash
-go generate ./...
-```
-
-Then build ollama:
-
-```bash
-go build .
+go run build.go
```
Now you can run `ollama`:
@@ -40,6 +34,16 @@ Now you can run `ollama`:
./ollama
```
+### Rebuilding the native code
+
+If at any point you need to rebuild the native code, you can run the
+build.go script again using the `-d` flag to force regeneration of the
+dependencies and, optionally, the `-s` flag to skip building the Go binary:
+
+```bash
+go run build.go -d -s
+```
+
### Linux
#### Linux CUDA (NVIDIA)
@@ -55,16 +59,10 @@ specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-Then generate dependencies:
-
-```
-go generate ./...
-```
-
Then build the binary:
```
-go build .
+go run build.go
```
#### Linux ROCm (AMD)
@@ -80,21 +78,17 @@ install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-```
-go generate ./...
-```
-
Then build the binary:
```
-go build .
+go run build.go
```
ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
#### Advanced CPU Settings
-By default, running `go generate ./...` will compile a few different variations
+By default, running `go run build.go` will compile a few different variations
of the LLM library based on common CPU families and vector math capabilities,
including a lowest-common-denominator which should run on almost any 64 bit CPU
somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
@@ -104,8 +98,7 @@ like to use. For example, to compile an optimized binary for an Intel i9-9880H,
you might use:
```
-OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./...
-go build .
+OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go run build.go
```
#### Containerized Linux Build
@@ -129,8 +122,7 @@ Then, build the `ollama` binary:
```powershell
$env:CGO_ENABLED="1"
-go generate ./...
-go build .
+go run build.go
```
#### Windows CUDA (NVIDIA)
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 721a9ae80..7a019942a 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be ./llm/generate/
+# This script is intended to be run by the `go run build.go` script, which
+# sets the working directory to the correct location: ./llm/generate/.
# TODO - add hardening to detect missing tools (cmake, etc.)
@@ -92,10 +92,10 @@ case "${GOARCH}" in
;;
*)
echo "GOARCH must be set"
- echo "this script is meant to be run from within go generate"
+ echo "this script is meant to be run from within 'go run build.go'"
exit 1
;;
esac
cleanup
-echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "code generation completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 28ce1f21d..b2cd76a6f 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be llm/generate/
+# This script is intended to be run by the `go run build.go` script, which
+# sets the working directory to the correct location: ./llm/generate/.
# First we build one or more CPU based LLM libraries
#
@@ -281,4 +281,4 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
fi
cleanup
-echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "code generation completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index e217a0382..8c07ad065 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -26,26 +26,15 @@ function amdGPUs {
$GPU_LIST -join ';'
}
-
function init_vars {
- if (!$script:SRC_DIR) {
- $script:SRC_DIR = $(resolve-path "..\..\")
- }
- if (!$script:llamacppDir) {
- $script:llamacppDir = "../llama.cpp"
- }
- if (!$script:cmakeTargets) {
- $script:cmakeTargets = @("ollama_llama_server")
- }
+ $script:SRC_DIR = $(resolve-path "..\..\")
+ $script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @(
"-DBUILD_SHARED_LIBS=on",
- "-DLLAMA_NATIVE=off",
- "-DLLAMA_OPENMP=off"
+ "-DLLAMA_NATIVE=off"
)
- $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
- $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
- $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
- md "$script:DIST_BASE" -ea 0 > $null
+ $script:cmakeTargets = @("ollama_llama_server")
+ $script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
$script:config = "RelWithDebInfo"
@@ -66,6 +55,7 @@ function init_vars {
} else {
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
}
+ $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
@@ -123,13 +113,8 @@ function build {
& cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- if ($cmakeDefs -contains "-G") {
- $extra=@("-j8")
- } else {
- $extra= @("--", "/p:CL_MPcount=8")
- }
- write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
- & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
+ write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
+ & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
@@ -149,18 +134,21 @@ function sign {
}
}
-function install {
- write-host "Installing binaries to dist dir ${script:distDir}"
- mkdir ${script:distDir} -ErrorAction SilentlyContinue
+function compress {
+ if ($script:GZIP -eq $null) {
+ write-host "gzip not installed, not compressing files"
+ return
+ }
+ write-host "Compressing binaries..."
$binaries = dir "${script:buildDir}/bin/*.exe"
foreach ($file in $binaries) {
- copy-item -Path $file -Destination ${script:distDir} -Force
+ & "$script:GZIP" --best -f $file
}
- write-host "Installing dlls to dist dir ${script:distDir}"
+ write-host "Compressing dlls..."
$dlls = dir "${script:buildDir}/bin/*.dll"
foreach ($file in $dlls) {
- copy-item -Path $file -Destination ${script:distDir} -Force
+ & "$script:GZIP" --best -f $file
}
}
@@ -181,252 +169,132 @@ function cleanup {
}
}
+init_vars
+git_module_setup
+apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
+$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-function build_static() {
- if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
- # GCC build for direct linking into the Go binary
- init_vars
- # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
- # as we need this to be compiled by gcc for golang to be able to link with itx
- write-host "Checking for MinGW..."
- # error action ensures we exit on failure
- get-command gcc
- get-command mingw32-make
- $oldTargets = $script:cmakeTargets
- $script:cmakeTargets = @("llama", "ggml")
- $script:cmakeDefs = @(
- "-G", "MinGW Makefiles"
- "-DCMAKE_C_COMPILER=gcc.exe",
- "-DCMAKE_CXX_COMPILER=g++.exe",
- "-DBUILD_SHARED_LIBS=off",
- "-DLLAMA_NATIVE=off",
- "-DLLAMA_AVX=off",
- "-DLLAMA_AVX2=off",
- "-DLLAMA_AVX512=off",
- "-DLLAMA_F16C=off",
- "-DLLAMA_FMA=off",
- "-DLLAMA_OPENMP=off")
- $script:buildDir="../build/windows/${script:ARCH}_static"
- write-host "Building static library"
- build
- $script:cmakeTargets = $oldTargets
- } else {
- write-host "Skipping CPU generation step as requested"
- }
+if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
+
+# GCC build for direct linking into the Go binary
+init_vars
+# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
+# as we need this to be compiled by gcc for golang to be able to link with itx
+write-host "Checking for MinGW..."
+# error action ensures we exit on failure
+get-command gcc
+get-command mingw32-make
+$script:cmakeTargets = @("llama", "ggml")
+$script:cmakeDefs = @(
+ "-G", "MinGW Makefiles"
+ "-DCMAKE_C_COMPILER=gcc.exe",
+ "-DCMAKE_CXX_COMPILER=g++.exe",
+ "-DBUILD_SHARED_LIBS=off",
+ "-DLLAMA_NATIVE=off",
+ "-DLLAMA_AVX=off",
+ "-DLLAMA_AVX2=off",
+ "-DLLAMA_AVX512=off",
+ "-DLLAMA_F16C=off",
+ "-DLLAMA_FMA=off")
+$script:buildDir="../build/windows/${script:ARCH}_static"
+write-host "Building static library"
+build
+
+# remaining llama.cpp builds use MSVC
+ init_vars
+ $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+ $script:buildDir="../build/windows/${script:ARCH}/cpu"
+ write-host "Building LCD CPU"
+ build
+ sign
+ compress
+
+ init_vars
+ $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+ $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
+ write-host "Building AVX CPU"
+ build
+ sign
+ compress
+
+ init_vars
+ $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
+ $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
+ write-host "Building AVX2 CPU"
+ build
+ sign
+ compress
+} else {
+ write-host "Skipping CPU generation step as requested"
}
-function build_cpu($gen_arch) {
- if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
- # remaining llama.cpp builds use MSVC
- init_vars
- $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
- $script:buildDir="../build/windows/${script:ARCH}/cpu"
- $script:distDir="$script:DIST_BASE\cpu"
- write-host "Building LCD CPU"
- build
- sign
- install
- } else {
- write-host "Skipping CPU generation step as requested"
- }
-}
-
-function build_cpu_avx() {
- if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
- init_vars
- $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
- $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
- $script:distDir="$script:DIST_BASE\cpu_avx"
- write-host "Building AVX CPU"
- build
- sign
- install
- } else {
- write-host "Skipping CPU AVX generation step as requested"
- }
-}
-
-function build_cpu_avx2() {
- if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
- init_vars
- $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
- $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
- $script:distDir="$script:DIST_BASE\cpu_avx2"
- write-host "Building AVX2 CPU"
- build
- sign
- install
- } else {
- write-host "Skipping CPU AVX2 generation step as requested"
- }
-}
-
-function build_cuda() {
- if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
- # Then build cuda as a dynamically loaded library
- $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
- $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
- if ($null -ne $script:CUDA_VERSION) {
- $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
- }
- init_vars
- $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
- $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
- $script:cmakeDefs += @(
- "-A", "x64",
- "-DLLAMA_CUDA=ON",
- "-DLLAMA_AVX=on",
- "-DLLAMA_AVX2=off",
- "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
- "-DCMAKE_CUDA_FLAGS=-t8",
- "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
- )
- if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
- write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
- $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
- write-host "building custom CUDA GPU"
- }
- build
- sign
- install
-
- rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
- md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
- write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
- cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
- cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
- cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
- } else {
- write-host "Skipping CUDA generation step"
- }
-}
-
-function build_oneapi() {
- if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
- # Get oneAPI version
- $script:ONEAPI_VERSION = icpx --version
- $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?\d+\.\d+\.\d+)').Value
- if ($null -ne $script:ONEAPI_VERSION) {
- $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
+if ($null -ne $script:CUDA_LIB_DIR) {
+ # Then build cuda as a dynamically loaded library
+ $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
+ $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+ if ($null -ne $script:CUDA_VERSION) {
+ $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
}
init_vars
- $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
- $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
- $script:cmakeDefs += @(
- "-G", "MinGW Makefiles",
- "-DLLAMA_SYCL=ON",
- "-DCMAKE_C_COMPILER=icx",
- "-DCMAKE_CXX_COMPILER=icx",
- "-DCMAKE_BUILD_TYPE=Release"
- )
+ $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+ $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
+ if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
+ write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
+ $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
+ write-host "building custom CUDA GPU"
+ }
+ build
+ sign
+ compress
+}
- Write-Host "Building oneAPI"
+if ($null -ne $env:HIP_PATH) {
+ $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
+ if ($null -ne $script:ROCM_VERSION) {
+ $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
+ }
+
+ init_vars
+ $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+ $script:cmakeDefs += @(
+ "-G", "Ninja",
+ "-DCMAKE_C_COMPILER=clang.exe",
+ "-DCMAKE_CXX_COMPILER=clang++.exe",
+ "-DLLAMA_HIPBLAS=on",
+ "-DHIP_PLATFORM=amd",
+ "-DLLAMA_AVX=on",
+ "-DLLAMA_AVX2=off",
+ "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
+ "-DAMDGPU_TARGETS=$(amdGPUs)",
+ "-DGPU_TARGETS=$(amdGPUs)"
+ )
+
+ # Make sure the ROCm binary dir is first in the path
+ $env:PATH="$env:HIP_PATH\bin;$env:PATH"
+
+ # We have to clobber the LIB var from the developer shell for clang to work properly
+ $env:LIB=""
+ if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
+ write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
+ $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
+ write-host "building custom ROCM GPU"
+ }
+ write-host "Building ROCm"
build
# Ninja doesn't prefix with config name
+ ${script:config}=""
if ($null -ne $script:DUMPBIN) {
- & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
+ & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
}
sign
- install
-
- rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
- } else {
- Write-Host "Skipping oneAPI generation step"
- }
+ compress
}
-function build_rocm() {
- if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
- $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
- if ($null -ne $script:ROCM_VERSION) {
- $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
- }
- init_vars
- $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
- $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
- $script:cmakeDefs += @(
- "-G", "Ninja",
- "-DCMAKE_C_COMPILER=clang.exe",
- "-DCMAKE_CXX_COMPILER=clang++.exe",
- "-DLLAMA_HIPBLAS=on",
- "-DHIP_PLATFORM=amd",
- "-DLLAMA_AVX=on",
- "-DLLAMA_AVX2=off",
- "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
- "-DAMDGPU_TARGETS=$(amdGPUs)",
- "-DGPU_TARGETS=$(amdGPUs)"
- )
-
- # Make sure the ROCm binary dir is first in the path
- $env:PATH="$env:HIP_PATH\bin;$env:PATH"
-
- # We have to clobber the LIB var from the developer shell for clang to work properly
- $env:LIB=""
- if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
- write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
- $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
- write-host "building custom ROCM GPU"
- }
- write-host "Building ROCm"
- build
- # Ninja doesn't prefix with config name
- ${script:config}=""
- if ($null -ne $script:DUMPBIN) {
- & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
- }
- sign
- install
-
- # Assumes v5.7, may need adjustments for v6
- rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
- md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
- cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
- cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
- # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
- cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
- } else {
- write-host "Skipping ROCm generation step"
- }
-}
-
-init_vars
-if ($($args.count) -eq 0) {
- git_module_setup
- apply_patches
- build_static
- if ($script:ARCH -eq "arm64") {
- build_cpu("ARM64")
- } else { # amd64
- build_cpu("x64")
- build_cpu_avx
- build_cpu_avx2
- build_cuda
- build_oneapi
- build_rocm
- }
-
- cleanup
- write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
-} else {
- for ( $i = 0; $i -lt $args.count; $i++ ) {
- write-host "performing $($args[$i])"
- & $($args[$i])
- }
-}
\ No newline at end of file
+cleanup
+write-host "`ncode generation completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
diff --git a/llm/generate/generate_darwin.go b/llm/generate/generate_darwin.go
deleted file mode 100644
index 776852342..000000000
--- a/llm/generate/generate_darwin.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_darwin.sh
diff --git a/llm/generate/generate_linux.go b/llm/generate/generate_linux.go
deleted file mode 100644
index 2b7e116db..000000000
--- a/llm/generate/generate_linux.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_linux.sh
diff --git a/llm/generate/generate_windows.go b/llm/generate/generate_windows.go
deleted file mode 100644
index d2ee5428a..000000000
--- a/llm/generate/generate_windows.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
diff --git a/main.go b/main.go
index 158f0063c..4f7e26158 100644
--- a/main.go
+++ b/main.go
@@ -1,5 +1,7 @@
package main
+//go:generate go run build.go -g -s
+
import (
"context"