Compare commits
	
		
			1 Commits
		
	
	
		
			brucemacd/
			...
			scratch
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 2789ed31a7 | 
| @@ -190,13 +190,7 @@ func getCPUMem() (memInfo, error) { | |||||||
| func CheckVRAM() (int64, error) { | func CheckVRAM() (int64, error) { | ||||||
| 	gpuInfo := GetGPUInfo() | 	gpuInfo := GetGPUInfo() | ||||||
| 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { | 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { | ||||||
| 		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead | 		return int64(gpuInfo.FreeMemory), nil | ||||||
| 		overhead := gpuInfo.FreeMemory / 10 |  | ||||||
| 		gpus := uint64(gpuInfo.DeviceCount) |  | ||||||
| 		if overhead < gpus*512*1024*1024 { |  | ||||||
| 			overhead = gpus * 512 * 1024 * 1024 |  | ||||||
| 		} |  | ||||||
| 		return int64(gpuInfo.FreeMemory - overhead), nil |  | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination | 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination | ||||||
|   | |||||||
							
								
								
									
										14
									
								
								llm/llm.go
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								llm/llm.go
									
									
									
									
									
								
							| @@ -50,10 +50,10 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) | |||||||
| 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value | 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value | ||||||
| 	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead()) | 	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead()) | ||||||
|  |  | ||||||
| 	// this amount is the overhead + tensors in memory | 	// rough estimation for scratch space based on context size, batch size and number of layers in the model | ||||||
| 	// TODO: get this from the llama.cpp's graph calculations instead of | 	// TODO: instead call llama.cpp's alloc functions to measure required memory | ||||||
| 	// estimating it's 1/6 * kv_cache_size * num_gqa | 	// TODO: account for quantization levels | ||||||
| 	graph := int64(ggml.NumGQA()) * kv / 6 | 	scratch := 8*int64(opts.NumCtx)*int64(opts.NumBatch)*int64(ggml.NumLayers()) + 1536*1024*1024 // 1536MiB overhead | ||||||
|  |  | ||||||
| 	info := gpu.GetGPUInfo() | 	info := gpu.GetGPUInfo() | ||||||
| 	switch runtime.GOOS { | 	switch runtime.GOOS { | ||||||
| @@ -62,7 +62,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) | |||||||
| 			break | 			break | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		if size+kv+graph > vram { | 		if size+kv+scratch > vram { | ||||||
| 			slog.Info("not enough vram available, falling back to CPU only") | 			slog.Info("not enough vram available, falling back to CPU only") | ||||||
| 			info.Library = "cpu" | 			info.Library = "cpu" | ||||||
| 			info.Variant = gpu.GetCPUVariant() | 			info.Variant = gpu.GetCPUVariant() | ||||||
| @@ -99,13 +99,13 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) | |||||||
| 		maxlayers := int64(ggml.NumLayers()) + 1 | 		maxlayers := int64(ggml.NumLayers()) + 1 | ||||||
| 		devices := int64(info.DeviceCount) | 		devices := int64(info.DeviceCount) | ||||||
| 		avg := vram / devices | 		avg := vram / devices | ||||||
| 		layers := maxlayers * (avg - graph) / (kv + size/devices) | 		layers := maxlayers * (avg - scratch) / (kv + size/devices) | ||||||
| 		if layers > maxlayers { | 		if layers > maxlayers { | ||||||
| 			layers = maxlayers | 			layers = maxlayers | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// 1 + 2 must fit on the main gpu | 		// 1 + 2 must fit on the main gpu | ||||||
| 		min := graph + kv*layers/maxlayers | 		min := scratch + kv*layers/maxlayers | ||||||
| 		if layers <= 0 || min > avg { | 		if layers <= 0 || min > avg { | ||||||
| 			slog.Info("not enough vram available, falling back to CPU only") | 			slog.Info("not enough vram available, falling back to CPU only") | ||||||
| 			info.Library = "cpu" | 			info.Library = "cpu" | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user