qdrant向量数据库试用简单示例

Qdrant 是一个向量数据库,用于存储文本等数据经向量化后得到的向量,并支持相似度检索,方便与大模型集成实现 RAG。简单使用如下:

1. 安装:

参考官网:https://qdrant.tech/documentation/quickstart/

2. 简单使用:

安装依赖:

pip install qdrant-client scikit-learn jieba

使用代码

import jieba
import requests
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the Qdrant client, pointed at a server on localhost port 6333.
QDRANT_URL = 'http://localhost:6333'
client = QdrantClient(url=QDRANT_URL)

# 分词函数
def tokenize(text):
    """Split ``text`` into word tokens with jieba.

    jieba.cut() yields the separators between words (for example the spaces
    in an English sentence) as tokens of their own. Those whitespace-only
    tokens are dropped here so they cannot end up as vocabulary features
    when this function is used as a vectorizer tokenizer.

    Args:
        text: The string to segment.

    Returns:
        A list of the non-blank tokens, in the order jieba produced them.
    """
    return [token for token in jieba.cut(text) if token.strip()]

# Build the TF-IDF vectorizer on top of the jieba-based tokenizer.
# token_pattern=None: the default regex is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# UserWarning about the unused parameter.
vectorizer = TfidfVectorizer(
    max_features=100,
    tokenizer=tokenize,
    token_pattern=None,
)

# Sample corpus (mixed English and Chinese sentences).
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "小明最喜欢红色",
    "小强最喜欢绿色",
    "cat is black"
]

# Fit the vectorizer on the corpus and get one dense row per document.
vectors = vectorizer.fit_transform(documents).toarray()
# Number of columns actually produced; max_features is only an upper bound,
# so the collection must be sized from the real matrix.
actual_vector_size = vectors.shape[1]

# (Re)create the collection so that every run starts from an empty index.
collection_name = 'text_documents'
# recreate_collection() is deprecated in qdrant-client; perform the
# drop-then-create explicitly instead.
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    # The vector size must match the width of the TF-IDF matrix.
    vectors_config=VectorParams(
        size=actual_vector_size,
        distance=Distance.COSINE,
    ),
)

# Build one Qdrant point per document: the point id is the document's index,
# the vector is its TF-IDF row, and the original text is kept in the payload.
points = []
for doc_id, (doc_text, doc_vector) in enumerate(zip(documents, vectors)):
    vector_values = doc_vector.tolist()
    print(vector_values)
    points.append(
        PointStruct(id=doc_id, vector=vector_values, payload={'text': doc_text})
    )

# Write all points in a single request; wait=True is passed so the call
# waits for the write before the script continues.
operation_info = client.upsert(
    collection_name=collection_name,
    points=points,
    wait=True,
)


# 定义查询函数
def query_text_documents(query_text):
    """Return the stored documents most similar to ``query_text``.

    The query is embedded with the module-level ``vectorizer`` and sent to
    Qdrant as a vector query against ``collection_name``. The query vector
    is also printed for inspection.

    Args:
        query_text: Raw query string to embed and search for.

    Returns:
        A list of at most three ``(id, text, score)`` tuples, in the order
        Qdrant returned them.
    """
    # Embed the query with the same vectorizer used for the documents.
    query_vector = vectorizer.transform([query_text]).toarray()[0].tolist()
    print(f"Query Vector: {query_vector}")

    # client.search() is deprecated in qdrant-client; query_points() is its
    # replacement and wraps the hits in a response object (.points).
    response = client.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=3,  # keep only the three closest documents
    )

    return [(hit.id, hit.payload['text'], hit.score) for hit in response.points]

# Example query: run the search, then print the query followed by each hit.
# An English alternative to try: "what's color of cat?"
query = "小明最喜欢什么颜色"
results = query_text_documents(query)
print(query)
for doc_id, doc_text, score in results:
    print(f"ID: {doc_id}, Text: {doc_text}, Score: {score}")

结合dify等大模型工作流,可以实现rag等能力,留待后面继续探索

posted on 2025-04-18 18:28  生活费  阅读(298)  评论(0)    收藏  举报

导航