diff --git a/README.md b/README.md
index caf3bf8..94aa79a 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 [![.github/workflows/shellcheck.yaml](https://github.com/geerlingguy/ollama-benchmark/actions/workflows/shellcheck.yaml/badge.svg)](https://github.com/geerlingguy/ollama-benchmark/actions/workflows/shellcheck.yaml)
 
+## Linux
+
 This bash script benchmarks ollama on any system where it is installed. For a quick installation, try:
@@ -26,7 +28,7 @@ Then run this benchmark script:
 Uninstall Ollama following the [official uninstall instructions](https://github.com/ollama/ollama/blob/main/docs/linux.md#uninstall).
 
-## CLI Options
+### CLI Options
 
 ```
 Usage: ./obench.sh [OPTIONS]
@@ -39,12 +41,32 @@ Options:
   --markdown      Format output as markdown
 ```
 
+## Windows
+
+This PowerShell script benchmarks ollama on any Windows system where it is installed.
+
+> Make sure you have Ollama installed on your Windows machine.
+
+### CLI Options
+
+```
+Usage: .\obench.ps1 [OPTIONS]
+Options:
+  -Help       Display this help message
+  -Default    Run a benchmark using some default small models
+  -Model      Specify a model to use
+  -Count      Number of times to run the benchmark
+  -OllamaBin  Point to ollama executable or command (e.g. if using Docker)
+  -Markdown   Format output as markdown
+```
+
 ## Findings
 
 ### DeepSeek
 
 | System | CPU/GPU | Model | Eval Rate | Power (Peak) |
-| :--- | :--- | :--- | :--- | :--- |
+| :--- | :--- | :--- | ---: | :--- |
 | Pi 5 - 16GB | CPU | deepseek-r1:14b | 1.20 Tokens/s | 13.0 W |
 | Pi 5 - 16GB (AMD Pro W77001) | GPU | deepseek-r1:14b | 19.90 Tokens/s | 164 W |
 | GMKtek G3 Plus (Intel N150) - 16GB | CPU | deepseek-r1:1.5b | 17.02 Tokens/s | 25.6 W |
@@ -53,11 +75,15 @@ Options:
 | AmpereOne A192-32X - 512GB | CPU | deepseek-r1:671b | 4.18 Tokens/s | 477 W |
 | M1 Ultra (48 GPU Core) 64GB | GPU | deepseek-r1:1.5b | 126.21 Tokens/s | N/A |
 | M1 Ultra (48 GPU Core) 64GB | GPU | deepseek-r1:14b | 35.89 Tokens/s | N/A |
+| M1 MacBook Air (8 GPU Core) 8GB | GPU | deepseek-r1:8b | 9.09 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | deepseek-r1:8b | 67.99 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | deepseek-r1:14b | 37.53 Tokens/s | N/A |
+
 ### Llama
 
 | System | CPU/GPU | Model | Eval Rate | Power (Peak) |
-| :--- | :--- | :--- | :--- | :--- |
+| :--- | :--- | :--- | ---: | :--- |
 | Pi 400 - 4GB | CPU | llama3.2:3b | 1.60 Tokens/s | 6 W |
 | Pi 5 - 8GB | CPU | llama3.2:3b | 4.61 Tokens/s | 13.9 W |
 | Pi 5 - 8GB | CPU | llama3.1:8b | 1.99 Tokens/s | 13.2 W |
@@ -80,6 +106,9 @@ Options:
 | Pi 5 - 8GB (AMD Pro W77001) | GPU | llama3.2:3b | 56.14 Tokens/s | 145 W |
 | Pi 5 - 8GB (AMD Pro W77001) | GPU | llama3.1:8b | 39.87 Tokens/s | 52 W |
 | Pi 5 - 8GB (AMD Pro W77001) | GPU | llama2:13b | 4.38 Tokens/s | 108 W |
+| M1 MacBook Air (8 GB) | GPU | llama3.2:8b | 22.95 Tokens/s | N/A |
+| M1 MacBook Air (8 GB) | GPU | llama3.1:8b | 9.18 Tokens/s | N/A |
+| M1 MacBook Air (8 GB) | GPU | llama2:7b | 14.12 Tokens/s | N/A |
 | M4 Mac mini (10 core - 32GB) | GPU | llama3.2:3b | 41.31 Tokens/s | 30.1 W |
 | M4 Mac mini (10 core - 32GB) | GPU | llama3.1:8b | 20.95 Tokens/s | 29.4 W |
 | M4 Mac mini (10 core - 32GB) | GPU | llama2:13b | 13.60 Tokens/s | 29.8 W |
@@ -89,6 +118,9 @@ Options:
 | M1 Max Mac Studio (10 core - 64GB) | GPU | llama3.1:70b | 7.25 Tokens/s | N/A |
 | M1 Ultra (48 GPU Core) 64GB | GPU | llama3.2:3b | 108.67 Tokens/s | N/A |
 | M1 Ultra (48 GPU Core) 64GB | GPU | llama3.1:8b | 62.28 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | llama3.1:8b | 37.83 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | llama3.2:8b | 122.38 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | llama2:7b | 62.51 Tokens/s | N/A |
 | Ryzen 9 7900X (Nvidia 4090) | GPU | llama3.2:3b | 237.05 Tokens/s | N/A |
 | Ryzen 9 7900X (Nvidia 4090) | GPU | llama3.1:8b | 148.09 Tokens/s | N/A |
 | Ryzen 9 7900X (Nvidia 4090) | GPU/CPU | llama3.1:70b | 3.10 Tokens/s | N/A |
@@ -114,6 +146,20 @@ Options:
 
 1 These GPUs were tested using `llama.cpp` with Vulkan support.
 
+### Qwen
+
+| System | CPU/GPU | Model | Eval Rate | Power (Peak) |
+| :--- | :--- | :--- | ---: | :--- |
+| Intel i5 13500 (AMD 7800XT) | GPU | qwen:14b | 3.56 Tokens/s | N/A |
+| Intel i5 13500 (AMD 7800XT) | GPU | qwen2.5:14b | 4.01 Tokens/s | N/A |
+
+### Phi
+
+| System | CPU/GPU | Model | Eval Rate | Power (Peak) |
+| :--- | :--- | :--- | ---: | :--- |
+| Intel i5 13500 (AMD 7800XT) | GPU | phi:14b | 41.33 Tokens/s | N/A |
+
 ## Further Reading
 
 This script is just a quick way of comparing _one aspect_ of generative AI performance. There are _many other_ aspects that are as important (or more important) this script does _not_ cover.
diff --git a/obench.ps1 b/obench.ps1
new file mode 100644
index 0000000..1b21dd3
--- /dev/null
+++ b/obench.ps1
@@ -0,0 +1,83 @@
+# PowerShell script to benchmark Ollama token generation rate
+# Inspired by https://taoofmac.com/space/blog/2024/01/20/1800
+
+param (
+    [switch]$Help,
+    [switch]$Default,
+    [string]$Model,
+    [int]$Count,
+    [string]$OllamaBin = "ollama",
+    [switch]$Markdown
+)
+
+function Show-Usage {
+    Write-Output "Usage: obench.ps1 [OPTIONS]"
+    Write-Output "Options:"
+    Write-Output "  -Help       Display this help message"
+    Write-Output "  -Default    Run a benchmark using some default small models"
+    Write-Output "  -Model      Specify a model to use"
+    Write-Output "  -Count      Number of times to run the benchmark"
+    Write-Output "  -OllamaBin  Point to ollama executable or command (e.g. if using Docker)"
+    Write-Output "  -Markdown   Format output as markdown"
+    exit 0
+}
+
+if ($Help) {
+    Show-Usage
+}
+
+# Default values
+if ($Default) {
+    $Count = 3
+    $Model = "llama3.2:3b"
+}
+
+# Ensure Ollama is available
+$baseCmd = ($OllamaBin -split " ")[0]
+if (-not (Get-Command $baseCmd -ErrorAction SilentlyContinue)) {
+    Write-Error "Error: $baseCmd could not be found. Please check the path or install it."
+    exit 1
+}
+
+# Prompt for benchmark count if not provided
+if (-not $Count) {
+    $Count = Read-Host "How many times to run the benchmark?"
+}
+
+# Prompt for model if not provided
+if (-not $Model) {
+    Write-Output "Current models available locally:"
+    & $OllamaBin list
+    $Model = Read-Host "Enter model you'd like to run (e.g. llama3.2)"
+}
+
+Write-Output "Running benchmark $Count times using model: $Model"
+Write-Output ""
+if ($Markdown) {
+    Write-Output "| Run | Eval Rate (Tokens/Second) |"
+    Write-Output "|-----|---------------------------|"
+}
+
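+# Each run pipes a fixed prompt into "ollama run --verbose" and scrapes the timing
+# stats printed after the response (hence the 2>&1 merge); the line of interest is
+# assumed to look like "eval rate:            35.89 tokens/s".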
| & $OllamaBin run $Model --verbose 2>&1 | Select-String "^eval rate:" + + if ($result) { + $evalRate = ($result -split " ")[1] + $tokenValue = ($evalRate -split " ")[0] + $totalEvalRate += [double]$tokenValue + if ($Markdown) { + Write-Output "| $run | $evalRate tokens/s |" + } else { + Write-Output $result + } + } +} + +$averageEvalRate = [math]::Round($totalEvalRate / $Count, 2) +if ($Markdown) { + Write-Output "|**Average Eval Rate**| $averageEvalRate tokens/second |" +} else { + Write-Output "Average Eval Rate: $averageEvalRate tokens/second" +}