第二节:如何理解Embedding以及基于内存库简单实操
一. 说明
Embedding可以理解为将一个内容进行向量化,便于比较相似度
这里使用的是阿里千问的“text-embedding-v4”模型
二. 实操
(1) 先将一些内容向量化,存入库中,这里使用List集合(内存中的元组列表)代替向量库,其中每个元素的text为原内容,embedding为调用embedding接口后得到的向量
(2) 输入一个内容,同样调用embedding接口获取向量结果
(3) 去库中逐个比对相似度,比对方法使用封装的CalculateCosineSimilarity(计算两个向量的余弦相似度)
(4) 获取相似度前三的内容,进行输出
代码分享:
Console.WriteLine("=== Embedding Similarity Demo ===\n");

// Connection settings for the embedding service: the Alibaba Qwen "text-embedding-v4"
// model, reached through an OpenAI-compatible endpoint.
// The API key is read from the DASHSCOPE_API_KEY environment variable when it is set, so a
// real key never has to be written into the source file. The "sk-xxx" placeholder is kept
// as the fallback, which preserves the previous behaviour when the variable is absent.
var apiKey = Environment.GetEnvironmentVariable("DASHSCOPE_API_KEY");
if (string.IsNullOrWhiteSpace(apiKey))
{
    apiKey = "sk-xxx";
}

var endpoint = "https://dashscope.aliyuncs.com/compatible-mode/v1/";
var model = "text-embedding-v4";

// Build an OpenAI client pointed at the custom endpoint, then obtain the embedding client
// bound to the chosen model. The rest of the script uses this client for every request.
var embeddingClient = new OpenAIClient(
    new ApiKeyCredential(apiKey),
    new OpenAIClientOptions { Endpoint = new Uri(endpoint) }
).GetEmbeddingClient(model);
// 1. Sample sentences on several different topics.
var sampleTexts = new[]
{
    "C# is a popular programming language for data science and machine learning.",
    "I love cooking Italian pasta with fresh tomatoes and basil.",
    "The football match was exciting, with the final score being 3-2.",
    "Machine learning algorithms can identify patterns in large datasets.",
    "The recipe calls for two cups of flour and three eggs.",
    "Basketball requires good coordination and teamwork skills.",
    "Neural networks are inspired by biological brain structures.",
    "Baking bread at home requires patience and the right temperature.",
    "The soccer team won the championship after months of training.",
    "Deep learning has revolutionized computer vision and natural language processing."
};
Console.WriteLine("Generating embeddings for sample texts...\n");

// 2. Embed each sample text and keep the original text paired with its vector in a list,
//    which serves as the in-memory "store" searched later.
var textEmbeddings = new List<(string text, float[] embedding)>();
for (int index = 0; index < sampleTexts.Length; index++)
{
    string sample = sampleTexts[index];

    // One request per text, awaited in order.
    var response = await embeddingClient.GenerateEmbeddingAsync(sample);
    textEmbeddings.Add((sample, response.Value.ToFloats().ToArray()));

    // Progress line numbered from 1.
    Console.WriteLine($"{index + 1}:{sample}");
}

// Report how many vectors were stored and the vector length of the first entry.
var dimension = textEmbeddings[0].embedding.Length;
Console.WriteLine($"\nStored {textEmbeddings.Count} text embeddings (dimension: {dimension})\n");
// 3. Interactive loop: read a query, embed it, and print the three stored texts that are
//    most similar to it. The loop ends on empty input, end of input, or "quit".
while (true)
{
    Console.WriteLine("\n" + new string('-', 70));
    Console.Write("Enter your query (or 'quit' to exit): ");

    // Console.ReadLine returns null at end of input. Surrounding whitespace is trimmed so
    // that input such as " quit " is also recognised as the exit command.
    var query = Console.ReadLine()?.Trim();

    // Compare with OrdinalIgnoreCase rather than ToLower(): ToLower() uses the current
    // culture, and under cultures such as tr-TR "QUIT" lower-cases to "quıt" (dotless i),
    // which would never equal "quit".
    if (string.IsNullOrWhiteSpace(query) || string.Equals(query, "quit", StringComparison.OrdinalIgnoreCase))
    {
        Console.WriteLine("Goodbye!");
        break;
    }

    Console.WriteLine($"\nSearching for: \"{query}\"");
    Console.WriteLine("Generating query embedding...");

    // 3.1 Embed the query through the same embedding client used for the stored texts.
    var queryEmbeddingResult = await embeddingClient.GenerateEmbeddingAsync(query);
    var queryEmbedding = queryEmbeddingResult.Value.ToFloats().ToArray();

    // Demo output: the vector length followed by every component, comma separated.
    Console.WriteLine(queryEmbedding.Length);
    Console.WriteLine(string.Join(',', queryEmbedding));

    // 3.2 Score every stored vector against the query vector with CalculateCosineSimilarity.
    //     The result list is presized because its final length is known up front.
    var similarities = new List<(string text, double similarity)>(textEmbeddings.Count);
    foreach (var (text, embedding) in textEmbeddings)
    {
        double similarity = CalculateCosineSimilarity(queryEmbedding, embedding);
        similarities.Add((text, similarity));
    }

    // 3.3 Keep the three highest scores and print them, best match first.
    var topResults = similarities.OrderByDescending(x => x.similarity).Take(3).ToList();
    Console.WriteLine("\nTop 3 Most Similar Texts:");
    Console.WriteLine(new string('=', 70));
    for (int i = 0; i < topResults.Count; i++)
    {
        var (text, similarity) = topResults[i];
        Console.WriteLine($"\n{i + 1}. Similarity: {similarity:F4} ({similarity * 100:F2}%)");
        Console.WriteLine($" Text: {text}");
    }
}
// Computes the cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|).
//
// Parameters:
//   vector1, vector2 - the vectors to compare; they must have the same length.
// Returns:
//   The cosine similarity as a double, or 0 when either vector has zero magnitude
//   (the ratio is undefined in that case).
// Throws:
//   ArgumentNullException - when either vector is null.
//   ArgumentException     - when the vectors differ in length.
static double CalculateCosineSimilarity(float[] vector1, float[] vector2)
{
    if (vector1 is null)
    {
        throw new ArgumentNullException(nameof(vector1));
    }
    if (vector2 is null)
    {
        throw new ArgumentNullException(nameof(vector2));
    }
    if (vector1.Length != vector2.Length)
    {
        throw new ArgumentException("Vectors must have the same dimension");
    }

    double dotProduct = 0;
    double magnitude1 = 0;
    double magnitude2 = 0;
    for (int i = 0; i < vector1.Length; i++)
    {
        // Widen to double BEFORE multiplying. A float * float product is evaluated in
        // single precision, which loses accuracy and overflows to Infinity for large
        // components, turning the final ratio into NaN.
        double a = vector1[i];
        double b = vector2[i];
        dotProduct += a * b;
        magnitude1 += a * a;
        magnitude2 += b * b;
    }

    magnitude1 = Math.Sqrt(magnitude1);
    magnitude2 = Math.Sqrt(magnitude2);

    // A zero-length vector has no direction; report 0 instead of dividing by zero.
    if (magnitude1 == 0 || magnitude2 == 0)
    {
        return 0;
    }

    return dotProduct / (magnitude1 * magnitude2);
}
!
- 作 者 : Yaopengfei(姚鹏飞)
- 博客地址 : http://www.cnblogs.com/yaopengfei/
- 声 明1 : 如有错误,欢迎讨论,请勿谩骂^_^。
- 声 明2 : 原创博客请在转载时保留原文链接或在文章开头加上本人博客地址,否则保留追究法律责任的权利。

浙公网安备 33010602011771号