qdrant向量数据库试用简单示例
Qdrant 是一个向量数据库:文本等数据先经向量化得到向量,再存入 Qdrant 并做相似度检索,方便与大模型集成实现 RAG(检索增强生成)。简单使用如下:
1. 安装:
参考官网:https://qdrant.tech/documentation/quickstart/
2.简单使用
安装依赖:
pip install qdrant-client scikit-learn jieba
使用代码
# Minimal Qdrant demo: build TF-IDF vectors for a handful of documents, store
# them in a Qdrant collection, then run a similarity search for a query string.
# A Qdrant server must be reachable at http://localhost:6333.
import requests  # NOTE(review): not referenced anywhere in this script
from qdrant_client import QdrantClient
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client.http.models import Distance, PointStruct, VectorParams
import jieba

# Initialise the Qdrant client.
client = QdrantClient(url='http://localhost:6333')


def tokenize(text):
    """Split ``text`` into word tokens with jieba.

    Whitespace-only tokens are dropped: jieba emits the blanks between
    English words as tokens of their own, and keeping them would turn " "
    into a TF-IDF feature shared by every English sentence.

    Args:
        text: The string to segment (Chinese, English, or mixed).

    Returns:
        A list of non-blank token strings in their original order.
    """
    return [token for token in jieba.cut(text) if token.strip()]


# Initialise the vectoriser.
# token_pattern=None silences scikit-learn's warning that the default pattern
# is ignored whenever a custom tokenizer is supplied.
# Alternative kept from the original notes: TfidfVectorizer(max_features=11)
vectorizer = TfidfVectorizer(
    max_features=100,
    tokenizer=tokenize,
    token_pattern=None,
)

# Sample documents.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "小明最喜欢红色",
    "小强最喜欢绿色",
    "cat is black",
]

# Vectorise the documents (dense array, one row per document).
vectors = vectorizer.fit_transform(documents).toarray()

# Actual vector size after fitting; it can be smaller than max_features when
# the vocabulary is small, so the collection is sized from the data.
actual_vector_size = vectors.shape[1]

# (Re)create the collection and declare its vector field.
# NOTE(review): newer qdrant-client releases mark recreate_collection as
# deprecated in favour of collection_exists + create_collection -- confirm
# against the installed client version.
collection_name = 'text_documents'
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=actual_vector_size,
        distance=Distance.COSINE,
    ),
)

# Write the vectorised documents to Qdrant, keeping the source text as payload.
points = []
for i, vector in enumerate(vectors):
    print(vector.tolist())
    points.append(
        PointStruct(
            id=i,
            vector=vector.tolist(),
            payload={'text': documents[i]},
        )
    )

operation_info = client.upsert(
    collection_name=collection_name,
    wait=True,
    points=points,
)


def query_text_documents(query_text):
    """Return the stored documents most similar to ``query_text``.

    The query is vectorised with the already-fitted module-level
    ``vectorizer`` and sent to the ``collection_name`` collection.

    Args:
        query_text: The free-text query.

    Returns:
        A list of up to three ``(doc_id, doc_text, score)`` tuples, in the
        order the search returned them.
    """
    # Vectorise the query text with the vocabulary learned from the documents.
    query_vector = vectorizer.transform([query_text]).toarray()[0]
    print(f"Query Vector: {query_vector.tolist()}")

    # Similarity search in Qdrant.
    # NOTE(review): newer qdrant-client releases mark search as deprecated in
    # favour of query_points -- confirm against the installed client version.
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_vector.tolist(),
        limit=3,  # return the three most similar documents
    )

    # Extract id, original text and similarity score from each hit.
    return [(hit.id, hit.payload['text'], hit.score) for hit in search_result]


# Example query.
query = "小明最喜欢什么颜色"
# Alternative query to try: "what's color of cat?"
results = query_text_documents(query)
print(query)
for result in results:
    print(f"ID: {result[0]}, Text: {result[1]}, Score: {result[2]}")
结合 Dify 等大模型工作流,可以实现 RAG 等能力,留待后面继续探索。