这个不是 HTTP 版本,而是直接调用 llama.cpp 动态库的 API:速度更快、更安全,无需网络组件,纯本机运行。
一、下载并编译 llama.cpp
根据自己的需要编译 CPU、Vulkan+CPU 或 CUDA+CPU 等版本的动态库。将编译好的所有动态库与自己开发的可执行文件放在同一目录下。如果使用了 GPU 加速,可能还需要其它运行库,例如 CUDA 版本还需要:
CUDART64_xxx.DLL
CUBLAS64_xx.DLL
cublasLt64_xx.dll
二、动态库引用单元
unit LlamaCppApi;

{ Minimal Free Pascal binding for llama.dll (llama.cpp) plus a small helper
  class, TLlamaClass, that loads a model, creates a context and runs a simple
  single-turn, token-by-token generation loop.

  NOTE(review): every record below must match, field for field, the struct of
  the same name in the llama.h that llama.dll was built from. The layouts here
  are taken as-is from the original unit - confirm against your llama.cpp
  version before use. }

{$mode ObjFPC}{$H+}
// Lay records out the way a C compiler would, since they cross the DLL boundary.
{$PACKRECORDS C}

interface

uses
  Classes, SysUtils, Dialogs, Math;

const
  DLL_NAME = 'llama.dll';
  // Capacity (in tokens) of the prompt buffer used by TLlamaClass.Chat.
  MAX_TOKENS = 8192;

type
  // Opaque handles / enum stand-ins for types declared in llama.h / ggml.h.
  ggml_backend_dev_t = Pointer;
  llama_model_tensor_buft_override = Pointer;
  llama_split_mode = integer;
  llama_progress_callback = function(progress: single; user_data: Pointer): boolean; cdecl;
  llama_model_kv_override = Pointer;
  ggml_backend_sched_eval_callback = Pointer;
  ggml_abort_callback = Pointer;
  ggml_type = integer;
  llama_rope_scaling_type = integer;
  llama_pooling_type = integer;
  llama_attention_type = integer;
  llama_flash_attn_type = integer;
  llama_sampler_seq_config = Pointer;
  llama_token = int32;
  llama_vocab = type Pointer;
  llama_context = Pointer;
  llama_pos = int32;
  llama_seq_id = int32;

  Pllama_token = ^llama_token;
  Pllama_pos = ^llama_pos;
  PInt32 = ^int32;
  PInt8 = ^int8;
  PSingle = ^single;
  Pllama_seq_id = ^llama_seq_id;
  PPllama_seq_id = ^Pllama_seq_id; // important: pointer to pointer

  llama_batch = record
    n_tokens: int32;        // int32_t
    token: Pllama_token;    // llama_token*
    embd: PSingle;          // float*
    pos: Pllama_pos;        // llama_pos*
    n_seq_id: PInt32;       // int32_t*
    seq_id: PPllama_seq_id; // llama_seq_id** (double pointer)
    logits: PInt8;          // int8_t*
  end;

  llama_model_params = record
    devices: ^ggml_backend_dev_t;
    tensor_buft_overrides: ^llama_model_tensor_buft_override;
    n_gpu_layers: Int32;
    split_mode: llama_split_mode;
    main_gpu: Int32;
    tensor_split: ^Single;
    progress_callback: llama_progress_callback;
    progress_callback_user_data: Pointer;
    kv_overrides: ^llama_model_kv_override;
    vocab_only: Boolean;
    use_mmap: Boolean;
    use_direct_io: Boolean;
    use_mlock: Boolean;
    check_tensors: Boolean;
    use_extra_bufts: Boolean;
    no_host: Boolean;
    no_alloc: Boolean;
  end;

  llama_context_params = record
    n_ctx: UInt32;
    n_batch: UInt32;
    n_ubatch: UInt32;
    n_seq_max: UInt32;
    n_threads: Int32;
    n_threads_batch: Int32;
    rope_scaling_type: llama_rope_scaling_type;
    pooling_type: llama_pooling_type;
    attention_type: llama_attention_type;
    flash_attn_type: llama_flash_attn_type;
    rope_freq_base: Single;
    rope_freq_scale: Single;
    yarn_ext_factor: Single;
    yarn_attn_factor: Single;
    yarn_beta_fast: Single;
    yarn_beta_slow: Single;
    yarn_orig_ctx: UInt32;
    defrag_thold: Single;
    cb_eval: ggml_backend_sched_eval_callback;
    cb_eval_user_data: Pointer;
    type_k: ggml_type;
    type_v: ggml_type;
    abort_callback: ggml_abort_callback;
    abort_callback_data: Pointer;
    embeddings: Boolean;
    offload_kqv: Boolean;
    no_perf: Boolean;
    op_offload: Boolean;
    swa_full: Boolean;
    kv_unified: Boolean;
    samplers: ^llama_sampler_seq_config;
    n_samplers: SizeInt;
  end;

  llama_sampler_chain_params = record
    no_perf: Byte; // C bool mapped to Byte (0/1)
  end;

type
  Pllama_chat_message = ^Tllama_chat_message;
  Tllama_chat_message = record
    role: PAnsiChar;    // const char*
    content: PAnsiChar; // const char*
  end;

// ==========================
// Functions imported from the DLL
// ==========================
function llama_model_default_params: llama_model_params; cdecl; external DLL_NAME;
function llama_load_model_from_file(const filename: PChar; const params: llama_model_params): Pointer; cdecl; external DLL_NAME;
function llama_context_default_params: llama_context_params; cdecl; external DLL_NAME;
function llama_new_context_with_model(model: Pointer; const params: llama_context_params): Pointer; cdecl; external DLL_NAME;
procedure llama_free(ctx: Pointer); cdecl; external DLL_NAME;
procedure llama_free_model(model: Pointer); cdecl; external DLL_NAME;
function llama_model_get_vocab(model: Pointer): llama_vocab; cdecl; external DLL_NAME;
function llama_vocab_get_text(vocab: llama_vocab; token: llama_token): PChar; cdecl; external DLL_NAME;
function llama_tokenize(
  vocab: Pointer;        // const struct llama_vocab*
  text: PAnsiChar;       // note: PAnsiChar, not PChar
  text_len: Int32;
  tokens: PInt32;
  n_tokens_max: Int32;
  add_special: Byte;     // bool -> Byte
  parse_special: Byte): Int32; cdecl; external DLL_NAME;
function llama_token_to_piece(vocab: llama_vocab; token: llama_token; buf: PChar; len: Int32; lstrip: Int32; special: Byte): Int32; cdecl; external DLL_NAME;
function llama_batch_get_one(tokens: PInt32; n_tokens: Int32): llama_batch; cdecl; external DLL_NAME;
function llama_decode(ctx: llama_context; batch: llama_batch): Int32; cdecl; external DLL_NAME;
function llama_sampler_init_dist(seed: UInt32): Pointer; cdecl; external DLL_NAME;
function llama_sampler_sample(smpl: Pointer; ctx: Pointer; idx: Int32): llama_token; cdecl; external DLL_NAME;
procedure llama_sampler_free(smpl: Pointer); cdecl; external DLL_NAME;
procedure llama_sampler_accept(smpl: Pointer; token: llama_token); cdecl; external DLL_NAME;
function llama_vocab_is_eog(vocab: llama_vocab; token: llama_token): Boolean; cdecl; external DLL_NAME;
function llama_get_memory(ctx: llama_context): Pointer; cdecl; external DLL_NAME;
procedure llama_memory_clear(mem: Pointer; data: Boolean); cdecl; external DLL_NAME;

//----------------------------------------------------------------
type
  // Called once at the end of TLlamaClass.Create; isInit tells whether loading succeeded.
  TCreateOverCallback = procedure(const Msg: string; isInit: Boolean) of object;
  // Called during generation with the elapsed seconds, the number of tokens
  // generated so far and the text produced since the previous call.
  TChatCallback = procedure(elapsedSec: Double; GenTokenCount: integer; msg: String) of object;
  // Called when a chat ends: '' on normal end/stop, an error text otherwise.
  TChartOverCallback = procedure(msg: String) of object;

  TLlamaClass = class
  private
    ctx: Pointer;            // llama context handle
    model: Pointer;          // llama model handle
    vocab: llama_vocab;
    FisInit: Boolean;        // True once model and context were created
    FGenTokenCount: integer; // tokens generated in the current chat
    // timing
    startTick, endTick: UInt64;
    elapsedSec: Double;
    FStopRequested: Boolean; // set by StopChat to end the generation loop
  public
    { model_path:         path of the model file
      CreateOverCallback: invoked when initialisation has finished
      n_gpu_layers:       number of layers to load into GPU memory (0 = CPU only)
      main_gpu:           index of the GPU to use
      n_ctx:              context length
      n_threads:          number of CPU threads }
    constructor Create(model_path: string; CreateOverCallback: TCreateOverCallback;
      n_gpu_layers: Int32 = 10; main_gpu: Int32 = 0; n_ctx: UInt32 = 1024; n_threads: UInt32 = 4);
    destructor Destroy; override;
    // Run one prompt through the model and stream the reply via the callbacks.
    procedure Chat(prompt: AnsiString; FChatCallback: TChatCallback; FChartOverCallback: TChartOverCallback);
    // Ask a running Chat to stop after the current token.
    procedure StopChat;
  end;

implementation

{ Loads the model, creates the context and reports the outcome through
  CreateOverCallback (if assigned). On failure the instance is still created,
  but FisInit stays False and Chat refuses to run. }
constructor TLlamaClass.Create(model_path: string; CreateOverCallback: TCreateOverCallback;
  n_gpu_layers: Int32 = 10; main_gpu: Int32 = 0; n_ctx: UInt32 = 1024; n_threads: UInt32 = 4);
var
  msg: string;
  m_params: llama_model_params;
  cp: llama_context_params;
begin
  inherited Create;
  // Mask FPU exceptions before calling into the native library.
  SetExceptionMask([exInvalidOp..exPrecision]);
  FisInit := False;

  m_params := llama_model_default_params;
  // Number of layers offloaded to the GPU; lower it when video memory is
  // short, otherwise llama_new_context_with_model may fail.
  m_params.n_gpu_layers := n_gpu_layers;
  m_params.main_gpu := main_gpu;
  m_params.vocab_only := False;
  m_params.use_mmap := True;

  model := llama_load_model_from_file(PChar(model_path), m_params);
  if not Assigned(model) then
  begin
    msg := '模型加载失败,可能是显存不足,减小n_gpu_layers';
  end
  else
  begin
    cp := llama_context_default_params;
    cp.n_ctx := n_ctx;
    cp.n_threads := n_threads; // CPU thread count
    cp.offload_kqv := True;    // GPU acceleration
    ctx := llama_new_context_with_model(model, cp);
    if not Assigned(ctx) then
    begin
      msg := '上下文创建失败,可能是显存不足,减小n_gpu_layers';
    end
    else
    begin
      vocab := llama_model_get_vocab(model);
      msg := '完成模型初始化-可以开始对话';
      FisInit := True;
    end;
  end;

  if Assigned(CreateOverCallback) then
    CreateOverCallback(msg, FisInit);
end;

{ Releases the context and the model (in that order), then runs the
  inherited destructor. }
destructor TLlamaClass.Destroy;
begin
  if Assigned(ctx) then
  begin
    llama_free(ctx);
    ctx := nil;
  end;
  if Assigned(model) then
  begin
    llama_free_model(model);
    model := nil;
  end;
  inherited Destroy;
end;

{ Tokenizes prompt, decodes it and then samples tokens one at a time until an
  end-of-generation token is produced, StopChat is called, or decoding fails.

  prompt:             text to send to the model
  FChatCallback:      receives the text generated since the previous call;
                      fired every third token and once more at the end for
                      any remaining text
  FChartOverCallback: fired exactly once when the chat ends; receives '' on a
                      normal end or stop, or an error message on failure

  The context memory is cleared first, so every call is an independent
  conversation. }
procedure TLlamaClass.Chat(prompt: AnsiString; FChatCallback: TChatCallback;
  FChartOverCallback: TChartOverCallback);
var
  tokens: array[0..MAX_TOKENS] of llama_token;
  n_tokens: Int32;
  sampler: Pointer;
  batch: llama_batch;
  token: llama_token;
  piece_len: Int32;
  piece: array[0..255] of AnsiChar;
  pieceText: AnsiString;
  ChatString: string;
begin
  if not FisInit then
  begin
    ShowMessage('请先成功初始化模型');
    Exit;
  end;
  if (not Assigned(FChatCallback)) or (not Assigned(FChartOverCallback)) then
  begin
    ShowMessage('请正确设置回调');
    Exit;
  end;

  llama_memory_clear(llama_get_memory(ctx), True);
  FStopRequested := False;

  n_tokens := llama_tokenize(
    vocab,
    PAnsiChar(prompt),
    Length(prompt),
    @tokens[0],
    High(tokens),
    1,
    0
  );
  // llama_tokenize returns a negative count when the buffer is too small;
  // neither that nor an empty result may be handed to llama_batch_get_one.
  if n_tokens <= 0 then
  begin
    FChartOverCallback('Tokenize 失败');
    Exit;
  end;

  sampler := llama_sampler_init_dist($FFFFFFFF);
  try
    batch := llama_batch_get_one(@tokens[0], n_tokens);
    if llama_decode(ctx, batch) <> 0 then
    begin
      FChartOverCallback('Decode 失败');
      Exit; // the finally block below frees the sampler
    end;

    FGenTokenCount := 0;
    startTick := GetTickCount64();
    ChatString := '';

    while not FStopRequested do
    begin
      // llama_sampler_sample already accepts the sampled token, so no
      // separate llama_sampler_accept call is made here.
      token := llama_sampler_sample(sampler, ctx, -1);
      if llama_vocab_is_eog(vocab, token) then
        Break;

      piece_len := llama_token_to_piece(vocab, token, @piece[0], Length(piece), 0, 0);
      if piece_len < 0 then
        Break; // piece does not fit into the buffer
      if piece_len > 0 then
      begin
        // Copy exactly piece_len bytes; the buffer is not NUL-terminated and
        // writing a terminator at piece[piece_len] could overrun it.
        SetString(pieceText, PAnsiChar(@piece[0]), piece_len);
        ChatString := ChatString + pieceText;
      end;
      // piece_len = 0: the token renders as no text (special = 0); keep going.

      Inc(FGenTokenCount);
      endTick := GetTickCount64();
      elapsedSec := (endTick - startTick) / 1000.0; // milliseconds -> seconds

      if FGenTokenCount mod 3 = 0 then
      begin
        FChatCallback(elapsedSec, FGenTokenCount, ChatString);
        ChatString := '';
      end;

      batch := llama_batch_get_one(@token, 1);
      if llama_decode(ctx, batch) <> 0 then
        Break; // decode error: stop generating
    end;

    // Deliver whatever was generated since the last multiple-of-3 flush.
    if ChatString <> '' then
      FChatCallback(elapsedSec, FGenTokenCount, ChatString);
  finally
    llama_sampler_free(sampler);
  end;

  FChartOverCallback('');
end;

procedure TLlamaClass.StopChat;
begin
  FStopRequested := True;
end;

end.
三、调用
1、在窗口中定义回调
uses
  ....., LlamaCppApi; // "....." stands for the units the form already uses

type
  TForm1 = class(TForm)
  private
    // Fields have to come before methods inside one visibility section.
    Llama: TLlamaClass;
    procedure CreateOverCallback(const Msg: string; isInit: Boolean); // initialisation finished
    procedure ChatCallback(elapsedSec: Double; GenTokenCount: integer; msg: String); // reply text arrives here
    procedure ChartOverCallback(msg: String); // chat finished
  end;

implementation

// Called once model loading has finished (successfully or not).
procedure TForm1.CreateOverCallback(const Msg: string; isInit: Boolean);
begin
  // handle the end of model loading here
end;

// Called repeatedly while the reply is being generated.
procedure TForm1.ChatCallback(elapsedSec: Double; GenTokenCount: integer; msg: String);
begin
  // handle the returned reply text here
  Application.ProcessMessages;
end;

// Called when the chat has completed or was interrupted.
procedure TForm1.ChartOverCallback(msg: String);
begin
  // handle the end (or interruption) of the chat here
end;
2、加载与退出模型
// Load the model; CreateOverCallback is invoked when initialisation has finished.
Llama := TLlamaClass.Create(
  ExtractFilePath(Application.ExeName) + 'Qwen3.5-4B.Q4_K_M-instruct.gguf',
  @CreateOverCallback);

// On exit: release the model.
Llama.Free;
3、对话
// Talk to the model. The reply arrives in ChatCallback, which receives the
// elapsed time, the number of tokens returned so far and the newly generated text.
Llama.Chat(AnsiString(Edit1.Text), @ChatCallback, @ChartOverCallback);

// May be called while the model is replying, to end the current chat.
Llama.StopChat;
这个只是一个基本调用,还有很多不完善的地方。
浙公网安备 33010602011771号