golang测试模型的token输出速度

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os/exec"
	"runtime"
	"strconv"
	"strings"
	"time"
)

// Request is the JSON payload sent to Ollama's /api/generate endpoint.
type Request struct {
	Model   string         `json:"model"`             // model name, e.g. "qwen3:0.6b"
	Prompt  string         `json:"prompt"`            // prompt text sent to the model
	Stream  bool           `json:"stream"`            // false: wait for one complete JSON response
	Options map[string]any `json:"options,omitempty"` // extra generation options passed through verbatim
}

// Response is the subset of Ollama's /api/generate reply this tool reads.
type Response struct {
	EvalCount    int    `json:"eval_count"`    // number of tokens generated
	EvalDuration int64  `json:"eval_duration"` // generation time in nanoseconds (divided by 1e9 below)
	Response     string `json:"response"`      // generated text (unused here, kept for completeness)
}

// VersionResp models the reply of Ollama's /api/version endpoint.
type VersionResp struct {
	Version string `json:"version"` // Ollama server version, e.g. "0.18.2"
}

// main prints the system and Ollama environment info, then runs the
// token-throughput benchmark, separating each section with a blank line.
func main() {
	steps := []func(){printSystemInfo, printOllamaVersion, runBenchmark}
	for i, step := range steps {
		if i > 0 {
			fmt.Println()
		}
		step()
	}
}

// printSystemInfo prints a banner-delimited summary of CPU, memory and GPU.
func printSystemInfo() {
	fmt.Println("=== System Info ===")
	for _, section := range []func(){printCPU, printMemory, printGPU} {
		section()
	}
	fmt.Println("===================")
}

// printCPU prints the CPU model string and its core counts.
func printCPU() {
	cpuModel, coreInfo := getCPUInfo()
	rows := []struct{ label, value string }{
		{"CPU", cpuModel},
		{"Cores", coreInfo},
	}
	for _, row := range rows {
		fmt.Printf("%-8s: %s\n", row.label, row.value)
	}
}

// getCPUInfo dispatches to the platform-specific CPU probe for the
// current operating system; unsupported platforms report "unknown".
func getCPUInfo() (model, cores string) {
	probes := map[string]func() (string, string){
		"linux":   getCPUInfoLinux,
		"windows": getCPUInfoWindows,
		"darwin":  getCPUInfoDarwin,
	}
	probe, ok := probes[runtime.GOOS]
	if !ok {
		return "unknown", "unknown"
	}
	return probe()
}

// getCPUInfoLinux reads the CPU model from /proc/cpuinfo and derives core
// counts from nproc (logical) and lscpu (physical = cores-per-socket *
// sockets). Any value that cannot be determined is reported as "unknown"
// instead of an empty or malformed string.
func getCPUInfoLinux() (model, cores string) {
	// CPU model: the first "model name: ..." line of /proc/cpuinfo.
	model = "unknown"
	if out, err := exec.Command("grep", "-m1", "model name", "/proc/cpuinfo").Output(); err == nil {
		if _, value, ok := strings.Cut(string(out), ":"); ok {
			model = strings.TrimSpace(value)
		}
	}

	// Logical core count via nproc; tolerate failure (logical stays 0).
	logical := 0
	if out, err := exec.Command("nproc").Output(); err == nil {
		logical, _ = strconv.Atoi(strings.TrimSpace(string(out)))
	}

	physicalOut, err := exec.Command("bash", "-c", "lscpu | grep 'Core(s) per socket' | awk '{print $4}'").Output()
	if err != nil {
		// lscpu unavailable: fall back to the logical count alone.
		// (The original formatted an empty string here, yielding "L".)
		if logical > 0 {
			cores = fmt.Sprintf("%dL", logical)
		} else {
			cores = "unknown"
		}
		return
	}

	socketOut, _ := exec.Command("bash", "-c", "lscpu | grep 'Socket(s)' | awk '{print $2}'").Output()
	p, _ := strconv.Atoi(strings.TrimSpace(string(physicalOut)))
	s, _ := strconv.Atoi(strings.TrimSpace(string(socketOut)))
	switch {
	case p > 0 && s > 0:
		cores = fmt.Sprintf("%dP / %dL", p*s, logical)
	case logical > 0:
		cores = fmt.Sprintf("%dL", logical)
	default:
		cores = "unknown"
	}
	return
}

// getCPUInfoWindows queries wmic for the CPU name and core counts.
// Every failure path now yields "unknown" rather than an empty string
// (the original left model/cores as "" when output parsing fell through).
func getCPUInfoWindows() (model, cores string) {
	model, cores = "unknown", "unknown"

	// CPU model: wmic prints a header line followed by the data line.
	if out, err := exec.Command("wmic", "cpu", "get", "name").Output(); err == nil {
		lines := strings.Split(strings.TrimSpace(string(out)), "\n")
		if len(lines) >= 2 {
			model = strings.TrimSpace(lines[1])
		}
	}

	// Core counts: data line is "<NumberOfCores> <NumberOfLogicalProcessors>".
	if out, err := exec.Command("wmic", "cpu", "get", "NumberOfCores,NumberOfLogicalProcessors").Output(); err == nil {
		lines := strings.Split(strings.TrimSpace(string(out)), "\n")
		if len(lines) >= 2 {
			if parts := strings.Fields(lines[1]); len(parts) >= 2 {
				cores = fmt.Sprintf("%sP / %sL", parts[0], parts[1])
			}
		}
	}
	return
}

// getCPUInfoDarwin queries sysctl on macOS for the CPU brand string and
// the physical/logical core counts.
func getCPUInfoDarwin() (model, cores string) {
	// sysctl runs `sysctl -n <key>` and returns the trimmed value.
	sysctl := func(key string) (string, error) {
		out, err := exec.Command("sysctl", "-n", key).Output()
		return strings.TrimSpace(string(out)), err
	}

	brand, brandErr := sysctl("machdep.cpu.brand_string")
	if brandErr != nil {
		model = "unknown"
	} else {
		model = brand
	}

	physical, pErr := sysctl("hw.physicalcpu")
	logical, lErr := sysctl("hw.logicalcpu")
	if pErr != nil || lErr != nil {
		cores = "unknown"
	} else {
		cores = fmt.Sprintf("%sP / %sL", physical, logical)
	}
	return
}

// printMemory prints the total physical memory in gigabytes.
func printMemory() {
	fmt.Printf("%-8s: %s\n", "Memory", getMemoryTotal())
}

// getMemoryTotal dispatches to the platform-specific total-memory probe;
// unsupported platforms report "unknown".
func getMemoryTotal() string {
	probes := map[string]func() string{
		"linux":   getMemoryTotalLinux,
		"windows": getMemoryTotalWindows,
		"darwin":  getMemoryTotalDarwin,
	}
	probe, ok := probes[runtime.GOOS]
	if !ok {
		return "unknown"
	}
	return probe()
}

// getMemoryTotalLinux reads MemTotal from /proc/meminfo and formats it
// as gigabytes. Parse failures now return "unknown" instead of silently
// formatting 0.00 GB (the original discarded the ParseFloat error).
func getMemoryTotalLinux() string {
	out, err := exec.Command("grep", "MemTotal", "/proc/meminfo").Output()
	if err != nil {
		return "unknown"
	}
	// Expected line shape: "MemTotal:  27859632 kB".
	parts := strings.Fields(string(out))
	if len(parts) < 2 {
		return "unknown"
	}
	kb, err := strconv.ParseFloat(parts[1], 64)
	if err != nil {
		return "unknown"
	}
	return fmt.Sprintf("%.2f GB", kb/1024/1024)
}

// getMemoryTotalWindows queries wmic for TotalVisibleMemorySize (KB) and
// formats it as gigabytes. Parse failures now return "unknown" instead of
// silently formatting 0.00 GB (the original discarded the ParseFloat error).
func getMemoryTotalWindows() string {
	out, err := exec.Command("wmic", "os", "get", "TotalVisibleMemorySize").Output()
	if err != nil {
		return "unknown"
	}
	// wmic prints a header line followed by the value line.
	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
	if len(lines) < 2 {
		return "unknown"
	}
	kb, err := strconv.ParseFloat(strings.TrimSpace(lines[1]), 64)
	if err != nil {
		return "unknown"
	}
	return fmt.Sprintf("%.2f GB", kb/1024/1024)
}

// getMemoryTotalDarwin queries sysctl hw.memsize (total RAM in bytes) and
// formats it as gigabytes. Fixes: the local variable no longer shadows the
// imported "bytes" package, and a ParseFloat failure returns "unknown"
// instead of silently formatting 0.00 GB.
func getMemoryTotalDarwin() string {
	out, err := exec.Command("sysctl", "-n", "hw.memsize").Output()
	if err != nil {
		return "unknown"
	}
	memBytes, err := strconv.ParseFloat(strings.TrimSpace(string(out)), 64)
	if err != nil {
		return "unknown"
	}
	return fmt.Sprintf("%.2f GB", memBytes/1024/1024/1024)
}

// printGPU lists NVIDIA GPUs (name and VRAM) via nvidia-smi CSV output;
// prints a notice when nvidia-smi is unavailable or fails.
func printGPU() {
	out, err := exec.Command("nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader").Output()
	if err != nil {
		fmt.Printf("%-8s: No NVIDIA GPU detected or nvidia-smi not available\n", "GPU")
		return
	}
	for idx, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		fields := strings.Split(line, ",")
		if len(fields) < 2 {
			continue
		}
		name := strings.TrimSpace(fields[0])
		vram := strings.TrimSpace(strings.TrimSuffix(fields[1], " MiB"))
		if idx == 0 {
			// First GPU uses the same two-line layout as the other info rows.
			fmt.Printf("%-8s[%d]: %s\n", "GPU", idx, name)
			fmt.Printf("%-8s: %s MB\n", "VRAM", vram)
			continue
		}
		fmt.Printf("%-8s[%d]: %s | VRAM: %s MB\n", "GPU", idx, name, vram)
	}
}

// printOllamaVersion fetches and prints the local Ollama server version.
// Fixes: the closing "===" separator is now printed on every exit path
// (the original skipped it on connect/read/parse errors), and the request
// uses a client with a timeout instead of the timeout-less default client.
func printOllamaVersion() {
	fmt.Println("=== Ollama Info ===")
	defer fmt.Println("===================")

	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Get("http://localhost:11434/api/version")
	if err != nil {
		fmt.Printf("%-8s: failed to connect (%v)\n", "Version", err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("%-8s: failed to read response (%v)\n", "Version", err)
		return
	}

	var v VersionResp
	if err := json.Unmarshal(body, &v); err != nil {
		// Fall back to echoing the raw body so the user sees what came back.
		fmt.Printf("%-8s: unknown (%s)\n", "Version", string(body))
		return
	}
	fmt.Printf("%-8s: %s\n", "Version", v.Version)
}

// runBenchmark sends one non-streaming generate request to the local
// Ollama server and reports total wall time, token count, tokens/s and
// the server-side eval duration.
// Fixes: the request now has an HTTP timeout (the original could hang
// forever), and a non-200 status is reported instead of being decoded
// into a zero-valued Response that silently prints 0 tokens.
func runBenchmark() {
	const (
		model  = "qwen3.5:9b"
		prompt = "你好,请介绍一下你自己"
	)

	fmt.Println("=== Benchmark ===")
	fmt.Printf("%-12s: %s\n", "Model", model)
	fmt.Printf("%-12s: %s\n", "Prompt", prompt)
	fmt.Println("-------------------")

	payload := Request{
		Model:  model,
		Prompt: prompt,
		Stream: false,
		Options: map[string]any{
			"think": false,
		},
	}
	data, err := json.Marshal(payload)
	if err != nil {
		fmt.Println("Marshal error:", err)
		return
	}

	// CPU-only generation can take tens of minutes; keep the timeout generous.
	client := &http.Client{Timeout: 30 * time.Minute}

	start := time.Now()
	resp, err := client.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(data))
	if err != nil {
		fmt.Println("Request error:", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		fmt.Printf("Unexpected status %d: %s\n", resp.StatusCode, strings.TrimSpace(string(body)))
		return
	}

	var res Response
	if err := json.NewDecoder(resp.Body).Decode(&res); err != nil {
		fmt.Println("Decode error:", err)
		return
	}

	duration := time.Since(start)
	// eval_duration is reported in nanoseconds.
	evalSec := float64(res.EvalDuration) / 1e9
	var tokensPerSec float64
	if evalSec > 0 {
		tokensPerSec = float64(res.EvalCount) / evalSec
	}

	fmt.Printf("%-12s: %v\n", "Total Time", duration.Round(time.Millisecond))
	fmt.Printf("%-12s: %d\n", "Eval Tokens", res.EvalCount)
	fmt.Printf("%-12s: %.2f tokens/s\n", "Eval Speed", tokensPerSec)
	fmt.Printf("%-12s: %v\n", "Server Eval", time.Duration(res.EvalDuration)*time.Nanosecond)
	fmt.Println("=================")
}

测试结果

=== System Info ===
CPU     : AMD Ryzen 5 5600G with Radeon Graphics
Cores   : 6P / 12L
Memory  : 27.30 GB
GPU     : No NVIDIA GPU detected or nvidia-smi not available
===================

=== Ollama Info ===
Version : 0.18.2
===================

=== Benchmark ===
Model       : qwen3.5:9b
Prompt      : 你好,请介绍一下你自己
-------------------
Total Time  : 1m33.855s
Eval Tokens : 400
Eval Speed  : 4.32 tokens/s
Server Eval : 1m32.676335318s
=================

=== System Info ===
CPU     : AMD Ryzen 5 5600G with Radeon Graphics
Cores   : 6P / 12L
Memory  : 27.30 GB
GPU     : No NVIDIA GPU detected or nvidia-smi not available
===================

=== Ollama Info ===
Version : 0.18.2
===================

=== Benchmark ===
Model       : qwen3.5:0.8b
Prompt      : 你好,请介绍一下你自己
-------------------
Total Time  : 33.877s
Eval Tokens : 484
Eval Speed  : 15.66 tokens/s
Server Eval : 30.914075743s
=================


=== System Info ===
CPU     : AMD Ryzen 5 5600G with Radeon Graphics
Cores   : 6P / 12L
Memory  : 27.30 GB
GPU     : No NVIDIA GPU detected or nvidia-smi not available
===================

=== Ollama Info ===
Version : 0.18.2
===================

=== Benchmark ===
Model       : qwen3:0.6b
Prompt      : 你好,请介绍一下你自己
-------------------
Total Time  : 3.721s
Eval Tokens : 165
Eval Speed  : 58.01 tokens/s
Server Eval : 2.844187074s
=================

=== System Info ===
CPU     : AMD Ryzen 5 5600G with Radeon Graphics
Cores   : 6P / 12L
Memory  : 27.30 GB
GPU     : No NVIDIA GPU detected or nvidia-smi not available
===================

=== Ollama Info ===
Version : 0.18.2
===================

=== Benchmark ===
Model       : llama3.1:8b
Prompt      : 你好,请介绍一下你自己
-------------------
Total Time  : 6.548s
Eval Tokens : 47
Eval Speed  : 7.53 tokens/s
Server Eval : 6.24072249s
=================


模型性能对比

以下是将新提供的数据整合到原有对比表格中的结果:

指标 qwen3.5:9b qwen3.5:0.8b qwen3:0.6b llama3.1:8b qwen3.5:27b qwen2.5:0.5b
Model qwen3.5:9b qwen3.5:0.8b qwen3:0.6b llama3.1:8b qwen3.5:27b qwen2.5:0.5b
Prompt 你好,请介绍一下你自己 你好,请介绍一下你自己 你好,请介绍一下你自己 你好,请介绍一下你自己 你好,请介绍一下你自己 你好,请介绍一下你自己
Total Time 1m33.855s 33.877s 3.721s 6.548s 24m16.474s 2.969s
Eval Tokens 400 484 165 47 1622 70
Eval Speed (tokens/s) 4.32 15.66 58.01 7.53 1.13 55.88
Server Eval 1m32.676335318s 30.914075743s 2.844187074s 6.24072249s 23m59.117758659s 1.252627949s

表格说明:

  • Model:测试的模型名称。
  • Prompt:使用的提示文本。
  • Total Time:模型完成提示所需的总时间。
  • Eval Tokens:评估过程中处理的token数量。
  • Eval Speed:评估速度,以tokens/s为单位。
  • Server Eval:服务器端评估所需的时间。
posted @ 2026-03-24 23:36  jiftle  阅读(0)  评论(0)    收藏  举报