本文介绍的方式不是通过 HTTP 服务调用,而是直接调用 llama.cpp 动态库的 API:速度更快、更安全,无需网络组件,纯本机运行。
未标题-1

 

一、下载Llama.cpp

根据自己的需要编译 CPU、Vulkan+CPU 或 CUDA+CPU 等版本的动态库。将编译好的所有动态库与自己开发的可执行文件放在同一目录;如果使用了 GPU 加速,可能还需要其它库,例如 CUDA 版本还需要:

CUDART64_xxx.DLL
CUBLAS64_xx.DLL
cublasLt64_xx.dll

二、动态库引用单元

unit LlamaCppApi;

{$mode ObjFPC}{$H+}

interface

uses
  Classes, SysUtils,Dialogs,Math;
const
  // Largest index of the prompt token buffer declared in TLlamaClass.Chat.
  MAX_TOKENS = 8192;
  // Library name used by every `external` declaration in this unit.
  DLL_NAME = 'llama.dll';

type
  // ---- 32-bit scalar aliases -------------------------------------------
  llama_token  = Int32;
  llama_pos    = Int32;
  llama_seq_id = Int32;

  // ---- Integer-valued settings -----------------------------------------
  // NOTE(review): presumably C enums in llama.h / ggml.h - confirm their
  // underlying size against the headers of the DLL in use.
  llama_split_mode        = Integer;
  ggml_type               = Integer;
  llama_rope_scaling_type = Integer;
  llama_pooling_type      = Integer;
  llama_attention_type    = Integer;
  llama_flash_attn_type   = Integer;

  // ---- Handles and callbacks carried as untyped pointers ----------------
  ggml_backend_dev_t               = Pointer;
  llama_model_tensor_buft_override = Pointer;
  llama_model_kv_override          = Pointer;
  ggml_backend_sched_eval_callback = Pointer;
  ggml_abort_callback              = Pointer;
  llama_sampler_seq_config         = Pointer;
  llama_context                    = Pointer;
  // Distinct pointer type (declared with `type Pointer`).
  llama_vocab                      = type Pointer;

  // Progress callback signature stored in llama_model_params.
  llama_progress_callback = function(progress: Single;
    user_data: Pointer): Boolean; cdecl;

  // ---- Typed pointers used by llama_batch and the external functions ----
  PInt8          = ^Int8;
  PInt32         = ^Int32;
  PSingle        = ^Single;
  Pllama_token   = ^llama_token;
  Pllama_pos     = ^llama_pos;
  Pllama_seq_id  = ^llama_seq_id;
  // Pointer to pointer: required for llama_batch.seq_id.
  PPllama_seq_id = ^Pllama_seq_id;

  // Batch record exchanged by value with llama_batch_get_one / llama_decode.
  llama_batch = record
    n_tokens: Int32;          // int32_t
    token:    Pllama_token;   // llama_token*
    embd:     PSingle;        // float*
    pos:      Pllama_pos;     // llama_pos*
    n_seq_id: PInt32;         // int32_t*
    seq_id:   PPllama_seq_id; // llama_seq_id** (double pointer)
    logits:   PInt8;          // int8_t*
  end;
type
  { Record returned by llama_model_default_params and passed to
    llama_load_model_from_file.
    NOTE(review): field order and types must match the llama.h that the
    deployed llama.dll was built from - confirm whenever the DLL is upgraded. }
  llama_model_params = record
    devices: ^ggml_backend_dev_t;
    tensor_buft_overrides: ^llama_model_tensor_buft_override;
    n_gpu_layers: Int32;
    split_mode: llama_split_mode;
    main_gpu: Int32;
    tensor_split: ^Single;
    progress_callback: llama_progress_callback;
    progress_callback_user_data: Pointer;
    kv_overrides: ^llama_model_kv_override;
    vocab_only: Boolean;
    use_mmap: Boolean;
    use_direct_io: Boolean;
    use_mlock: Boolean;
    check_tensors: Boolean;
    use_extra_bufts: Boolean;
    no_host: Boolean;
    no_alloc: Boolean;
  end;

  { Record returned by llama_context_default_params and passed to
    llama_new_context_with_model.
    NOTE(review): layout must match the llama.h of the deployed DLL - confirm. }
  llama_context_params = record
    n_ctx: UInt32;
    n_batch: UInt32;
    n_ubatch: UInt32;
    n_seq_max: UInt32;
    n_threads: Int32;
    n_threads_batch: Int32;
    rope_scaling_type: llama_rope_scaling_type;
    pooling_type: llama_pooling_type;
    attention_type: llama_attention_type;
    flash_attn_type: llama_flash_attn_type;
    rope_freq_base: Single;
    rope_freq_scale: Single;
    yarn_ext_factor: Single;
    yarn_attn_factor: Single;
    yarn_beta_fast: Single;
    yarn_beta_slow: Single;
    yarn_orig_ctx: UInt32;
    defrag_thold: Single;
    cb_eval: ggml_backend_sched_eval_callback;
    cb_eval_user_data: Pointer;
    type_k: ggml_type;
    type_v: ggml_type;
    abort_callback: ggml_abort_callback;
    abort_callback_data: Pointer;
    embeddings: Boolean;
    offload_kqv: Boolean;
    no_perf: Boolean;
    op_offload: Boolean;
    swa_full: Boolean;
    kv_unified: Boolean;
    samplers: ^llama_sampler_seq_config;
    n_samplers: SizeInt;
  end;

  llama_sampler_chain_params = record
    no_perf: Byte; // C bool carried as a Byte (0/1)
  end;

type
  Pllama_chat_message = ^Tllama_chat_message;
  Tllama_chat_message = record
    role: PAnsiChar;    // const char*
    content: PAnsiChar; // const char*
  end;

// ==========================
// Functions imported from DLL_NAME
// ==========================
function llama_model_default_params: llama_model_params; cdecl; external DLL_NAME;
function llama_load_model_from_file(const filename: PChar;
  const params: llama_model_params): Pointer; cdecl; external DLL_NAME;
function llama_context_default_params: llama_context_params; cdecl; external DLL_NAME;
function llama_new_context_with_model(model: Pointer;
  const params: llama_context_params): Pointer; cdecl; external DLL_NAME;
procedure llama_free(ctx: Pointer); cdecl; external DLL_NAME;
procedure llama_free_model(model: Pointer); cdecl; external DLL_NAME;
function llama_model_get_vocab(model: Pointer): llama_vocab; cdecl; external DLL_NAME;
function llama_vocab_get_text(vocab: llama_vocab;
  token: llama_token): PChar; cdecl; external DLL_NAME;
function llama_tokenize(
  vocab: Pointer;        // const struct llama_vocab*
  text: PAnsiChar;       // PAnsiChar on purpose, not PChar
  text_len: Int32;
  tokens: PInt32;
  n_tokens_max: Int32;
  add_special: Byte;     // bool carried as Byte
  parse_special: Byte): Int32; cdecl; external DLL_NAME;
function llama_token_to_piece(vocab: llama_vocab; token: llama_token;
  buf: PChar; len: Int32; lstrip: Int32; special: Byte): Int32; cdecl; external DLL_NAME;
function llama_batch_get_one(tokens: PInt32;
  n_tokens: Int32): llama_batch; cdecl; external DLL_NAME;
function llama_decode(ctx: llama_context;
  batch: llama_batch): Int32; cdecl; external DLL_NAME;
function llama_sampler_init_dist(seed: UInt32): Pointer; cdecl; external DLL_NAME;
function llama_sampler_sample(smpl: Pointer; ctx: Pointer;
  idx: Int32): llama_token; cdecl; external DLL_NAME;
procedure llama_sampler_free(smpl: Pointer); cdecl; external DLL_NAME;
procedure llama_sampler_accept(smpl: Pointer; token: llama_token); cdecl; external DLL_NAME;
function llama_vocab_is_eog(vocab: llama_vocab;
  token: llama_token): Boolean; cdecl; external DLL_NAME;
function llama_get_memory(ctx: llama_context): Pointer; cdecl; external DLL_NAME;
procedure llama_memory_clear(mem: Pointer; data: Boolean); cdecl; external DLL_NAME;

//----------------------------------------------------------------
type
  // Invoked once at the end of TLlamaClass.Create with a status message and
  // the success flag.
  TCreateOverCallback = procedure(const Msg: string; isInit: Boolean) of object;
  // Receives generated text: elapsed seconds, tokens generated so far, and the
  // text produced since the previous call.
  TChatCallback = procedure(elapsedSec: Double; GenTokenCount: integer; msg: String) of object;
  // Invoked when a chat turn ends; msg is '' on a normal end or an error text.
  TChartOverCallback = procedure(msg: String) of object;

  { Thin wrapper around one llama.cpp model plus one context.
    Chat runs synchronously on the calling thread. }
  TLlamaClass = class
  private
    ctx: Pointer;              // context handle
    model: Pointer;            // model handle
    vocab: llama_vocab;
    FisInit: Boolean;          // True once model and context were created
    FGenTokenCount: integer;   // tokens generated in the current Chat call
    // timing
    startTick, endTick: UInt64;
    elapsedSec: Double;
    FStopRequested: Boolean;   // set by StopChat, polled by the Chat loop
  public
    {
      model_path:          path of the model file
      CreateOverCallback:  called once initialisation has finished (or failed)
      n_gpu_layers:        number of layers loaded into GPU memory; 0 for CPU only
      main_gpu:            which GPU to use
      n_ctx:               context length
      n_threads:           number of CPU threads
    }
    constructor Create(model_path: string; CreateOverCallback: TCreateOverCallback;
      n_gpu_layers: Int32 = 10; main_gpu: Int32 = 0; n_ctx: UInt32 = 1024;
      n_threads: UInt32 = 4);
    destructor Destroy; override;
    // Runs one prompt and streams the reply through the callbacks.
    procedure Chat(prompt: AnsiString; FChatCallback: TChatCallback;
      FChartOverCallback: TChartOverCallback);
    // Requests that a running Chat loop stop.
    procedure StopChat;
  end;

implementation

{ Loads the model and creates a context. Never raises on load failure:
  the outcome is reported through CreateOverCallback(msg, success) and
  remembered in FisInit. }
constructor TLlamaClass.Create(model_path: string; CreateOverCallback: TCreateOverCallback;
  n_gpu_layers: Int32 = 10; main_gpu: Int32 = 0; n_ctx: UInt32 = 1024;
  n_threads: UInt32 = 4);
var
  msg: string;
  m_params: llama_model_params;
  cp: llama_context_params;
begin
  inherited Create;
  // Mask every FPU exception.
  // NOTE(review): presumably so floating-point operations inside the native
  // library do not surface as Pascal exceptions - confirm.
  SetExceptionMask([exInvalidOp..exPrecision]);
  FisInit := False;

  m_params := llama_model_default_params;
  // Per the original author's note: reduce this when video memory is short,
  // otherwise llama_new_context_with_model fails.
  m_params.n_gpu_layers := n_gpu_layers;
  m_params.main_gpu := main_gpu;
  m_params.vocab_only := False;
  m_params.use_mmap := True;

  model := llama_load_model_from_file(PChar(model_path), m_params);
  if not Assigned(model) then
    msg := '模型加载失败,可能是显存不足,减小n_gpu_layers'
  else
  begin
    cp := llama_context_default_params;
    cp.n_ctx := n_ctx;
    cp.n_threads := n_threads; // CPU thread count
    cp.offload_kqv := True;    // GPU acceleration
    ctx := llama_new_context_with_model(model, cp);
    if not Assigned(ctx) then
      msg := '上下文创建失败,可能是显存不足,减小n_gpu_layers'
    else
    begin
      vocab := llama_model_get_vocab(model);
      msg := '完成模型初始化-可以开始对话';
      FisInit := True;
    end;
  end;

  if Assigned(CreateOverCallback) then
    CreateOverCallback(msg, FisInit);
end;

{ Releases the context, then the model, then runs the inherited destructor. }
destructor TLlamaClass.Destroy;
begin
  if Assigned(ctx) then
  begin
    llama_free(ctx);
    ctx := nil;
  end;
  if Assigned(model) then
  begin
    llama_free_model(model);
    model := nil;
  end;
  inherited Destroy;
end;

{ Tokenizes prompt, decodes it, then samples tokens until an end-of-generation
  token, a stop request, a non-positive piece length or a decode error.
  FChatCallback is called every third generated token with the text produced
  since the previous call, and once more at the end for any remaining text.
  FChartOverCallback is called last: with '' after generation, or with an
  error text when tokenization or the initial decode fails. }
procedure TLlamaClass.Chat(prompt: AnsiString; FChatCallback: TChatCallback;
  FChartOverCallback: TChartOverCallback);
var
  tokens: array[0..MAX_TOKENS] of llama_token;
  n_tokens: Int32;
  sampler: Pointer;
  batch: llama_batch;
  token: llama_token;
  piece_len: Int32;
  piece: array[0..255] of AnsiChar;
  ChatString: string;
begin
  if not FisInit then
  begin
    ShowMessage('请先成功初始化模型');
    Exit;
  end;
  if (not Assigned(FChatCallback)) or (not Assigned(FChartOverCallback)) then
  begin
    ShowMessage('请正确设置回调');
    Exit;
  end;

  // Every Chat call starts from an empty context memory.
  llama_memory_clear(llama_get_memory(ctx), True);
  FStopRequested := False;

  n_tokens := llama_tokenize(vocab, PAnsiChar(prompt), Length(prompt),
    @tokens[0], High(tokens), 1, 0);
  // A non-positive count cannot be decoded; report it instead of passing it on.
  // NOTE(review): a negative result presumably means the prompt needs more
  // than MAX_TOKENS tokens - confirm against llama.h.
  if n_tokens <= 0 then
  begin
    FChartOverCallback('Tokenize 失败');
    Exit;
  end;

  sampler := llama_sampler_init_dist($FFFFFFFF);
  try
    batch := llama_batch_get_one(@tokens[0], n_tokens);
    if llama_decode(ctx, batch) <> 0 then
    begin
      FChartOverCallback('Decode 失败');
      Exit; // the finally block frees the sampler
    end;

    FGenTokenCount := 0;
    startTick := GetTickCount64();
    ChatString := '';
    while not FStopRequested do
    begin
      token := llama_sampler_sample(sampler, ctx, -1);
      // NOTE(review): llama_sampler_sample may already accept the token
      // internally - confirm against llama.h before keeping this call.
      llama_sampler_accept(sampler, token);
      if llama_vocab_is_eog(vocab, token) then
        Break;

      // Reserve the last byte for the terminator written below.
      piece_len := llama_token_to_piece(vocab, token, @piece[0],
        Length(piece) - 1, 0, 0);
      if piece_len <= 0 then
        Break;
      piece[piece_len] := #0;
      ChatString := ChatString + string(PAnsiChar(@piece[0]));

      Inc(FGenTokenCount);
      endTick := GetTickCount64();
      elapsedSec := (endTick - startTick) / 1000.0; // milliseconds -> seconds
      if FGenTokenCount mod 3 = 0 then
      begin
        FChatCallback(elapsedSec, FGenTokenCount, ChatString);
        ChatString := '';
      end;

      batch := llama_batch_get_one(@token, 1);
      if llama_decode(ctx, batch) <> 0 then
        Break; // decode error ends generation
    end;

    // Deliver the one or two trailing tokens that did not reach the
    // every-third-token callback above.
    if ChatString <> '' then
      FChatCallback(elapsedSec, FGenTokenCount, ChatString);
  finally
    llama_sampler_free(sampler);
  end;
  FChartOverCallback('');
end;

{ Sets the flag polled by the Chat loop. Chat is synchronous, so this only has
  an effect when it is reached while Chat is running (for example re-entrantly
  from within a callback). }
procedure TLlamaClass.StopChat;
begin
  FStopRequested := True;
end;

end.

三、调用

1、在窗口中定义回调

uses
.....
LlamaCppApi


TForm1 = class(TForm)
private
    // Declared before the methods: Free Pascal does not accept a field that
    // follows a method within the same visibility section.
    Llama: TLlamaClass;
    procedure CreateOverCallback(const Msg: string;isInit:Boolean); // called when model initialisation has finished
    procedure ChatCallback(elapsedSec: Double;GenTokenCount:integer;msg:String); // receives generated reply text
    procedure ChartOverCallback(msg:String); // called when a chat turn has ended
end;


implementation 
procedure TForm1.CreateOverCallback(const Msg: string;isInit:Boolean); // Initialisation-finished callback
begin
  // TLlamaClass.Create calls this once with a status message (Msg) and a
  // success flag (isInit); put post-load handling here.
end;
procedure TForm1.ChatCallback(elapsedSec: Double;GenTokenCount:integer;msg:String); // Receives reply text
begin
  // Handle the returned reply text here: elapsedSec is the time since
  // generation started, GenTokenCount the tokens generated so far, msg the
  // newly generated text.
  // Chat runs synchronously, so the message queue is pumped here.
  // NOTE(review): presumably this keeps the UI responsive and lets a click
  // that calls StopChat be processed during generation - confirm.
  Application.ProcessMessages;
end;
procedure TForm1.ChartOverCallback(msg:String); // Chat-finished callback
begin
// Handle the end of a chat turn here, whether it completed or was interrupted.
// msg is '' after generation, or an error text such as 'Decode 失败'.
end; 

2、加载与退出模型

Llama:=TLlamaClass.Create(ExtractFilePath(Application.ExeName) +'Qwen3.5-4B.Q4_K_M-instruct.gguf',@CreateOverCallback); // Load the model; CreateOverCallback receives the status message and success flag

Llama.Free; // On exit: the destructor releases the context and the model

3、对话

Llama.Chat(AnsiString(Edit1.Text),@ChatCallback,@ChartOverCallback); // Send the prompt; ChatCallback receives the elapsed seconds, the number of tokens generated so far, and the text generated since its previous call

// May be called while the AI is replying
Llama.StopChat;
// Ends the current chat turn

这个只是一个基本调用,还有很多不完善的地方。

posted on 2026-04-20 20:11  禁卫军  阅读(110)  评论(0)    收藏  举报