数据库镜像:milvusdb/milvus:v2.6.9

部署

---
# ConfigMap "milvus-config": holds the two configuration files that the Milvus
# Deployment mounts (via subPath) under /milvus/configs/.
#   - embedEtcd.yaml: settings for the embedded etcd. Client URLs listen on
#     port 2379, quota-backend-bytes is 4294967296 (4 GiB), and auto-compaction
#     runs in "revision" mode with a retention value of 1000.
#   - user.yaml: placeholder for overrides of the default milvus.yaml; it
#     currently contains only a comment line.
# NOTE: everything below a "|" block-scalar marker is file content, not YAML
# commentary, so "#" lines there end up inside the mounted file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: milvus-config
  namespace: default
data:
  embedEtcd.yaml: |
    listen-client-urls: http://0.0.0.0:2379
    advertise-client-urls: http://0.0.0.0:2379
    quota-backend-bytes: 4294967296
    auto-compaction-mode: revision
    auto-compaction-retention: '1000'
  user.yaml: |
    # Extra config to override default milvus.yaml
---
# NodePort Service that exposes the pods labelled app=milvus outside the cluster.
# Port mapping (service port -> nodePort):
#   http  9091  -> 30091  (the port the Deployment's probes query for /healthz)
#   grpc  19530 -> 30030  (the Go client examples connect to <node-ip>:30030)
#   tcp   2379  -> 32379  (the embedded etcd client port from embedEtcd.yaml)
# NOTE(review): publishing the etcd client port on every node looks unnecessary
# for a standalone test deployment; confirm it is really needed externally.
kind: Service
apiVersion: v1
metadata:
  labels:
    app: milvus
  name: milvus
  namespace: default
spec:
  type: NodePort
  ports:
    - name: http
      port: 9091
      targetPort: 9091
      protocol: TCP
      nodePort: 30091
    - name: grpc
      port: 19530
      targetPort: 19530
      protocol: TCP
      nodePort: 30030
    - name: tcp
      port: 2379
      targetPort: 2379
      protocol: TCP
      nodePort: 32379
  selector:
    app: milvus
---
# Single-replica Deployment that runs Milvus in standalone mode
# ("milvus run standalone") from a private Harbor mirror of milvusdb/milvus:v2.6.9.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: milvus
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: milvus
  template:
    metadata:
      labels:
        app: milvus
    spec:
      containers:
        - name: milvus
          image: harbor.futongcloud.com.cn:15005/public/milvusdb/milvus:v2.6.9
          imagePullPolicy: Always
          command: 
            - milvus
            - run
            - standalone
          # Environment consumed by the Milvus image. The names suggest embedded
          # etcd/MinIO with local storage under /var/lib/milvus; the exact
          # semantics are defined by the image, so verify against the Milvus docs.
          env:
            - name: ETCD_USE_EMBED
              value: "true"
            - name: ETCD_DATA_DIR
              value: /var/lib/milvus/etcd
            - name: ETCD_CONFIG_PATH
              value: /milvus/configs/embedEtcd.yaml
            - name: COMMON_STORAGETYPE
              value: "local"
            - name: DEPLOY_MODE
              value: "STANDALONE"
            - name: MINIO_USE_EMBED
              value: "true"
            - name: MINIO_DATA_DIR
              value: /var/lib/milvus/minio
            - name: LOG_LEVEL
              value: "info"
            - name: TINI_SUBREAPER
              value: "1"
          # Container ports; names and numbers match the Service definition.
          ports:
            - name: http
              containerPort: 9091
              protocol: TCP
            - name: grpc
              containerPort: 19530
              protocol: TCP
            - name: tcp
              containerPort: 2379
              protocol: TCP
          # The two ConfigMap keys are mounted as individual files (subPath), so
          # the rest of /milvus/configs in the image stays visible.
          volumeMounts:
            - name: config
              mountPath: /milvus/configs/embedEtcd.yaml
              subPath: embedEtcd.yaml
            - name: config
              mountPath: /milvus/configs/user.yaml
              subPath: user.yaml
            - name: data
              mountPath: /var/lib/milvus
          # Both probes run curl inside the container against /healthz on 9091.
          # NOTE(review): this assumes the image ships curl; confirm.
          livenessProbe:
            exec:
              command:
                - curl
                - -f
                - http://localhost:9091/healthz
            initialDelaySeconds: 90
            periodSeconds: 30
            timeoutSeconds: 20
            failureThreshold: 3
          readinessProbe:
            exec:
              command:
                - curl
                - -f
                - http://localhost:9091/healthz
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
      volumes:
        - name: config
          configMap:
            name: milvus-config
        # NOTE(review): emptyDir storage lives only as long as the pod, so
        # everything under /var/lib/milvus (etcd, MinIO and vector data) is lost
        # when the pod is deleted or rescheduled. Use a PersistentVolumeClaim if
        # the data has to survive.
        - name: data
          emptyDir: {}
View Code

代码测试

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "strings"

    "github.com/milvus-io/milvus-sdk-go/v2/client"
    "github.com/milvus-io/milvus-sdk-go/v2/entity"
)

// InsertDocumentWithJSON stores one document in the "documents" collection.
//
// The content is embedded with textToVector into a 128-dimensional float
// vector, and metadata is marshaled to JSON and sent through a column built
// with entity.NewColumnJSONBytes.
//
// It returns a wrapped error when metadata cannot be marshaled; otherwise it
// returns whatever error conn.Insert reports (nil on success).
func InsertDocumentWithJSON(conn client.Client, content string, metadata map[string]interface{}) error {
    // Serialize the metadata up front so nothing is sent when it is invalid.
    encodedMeta, marshalErr := json.Marshal(metadata)
    if marshalErr != nil {
        return fmt.Errorf("序列化元数据失败: %w", marshalErr)
    }

    // One row per column: vector, raw text, JSON metadata.
    columns := []entity.Column{
        entity.NewColumnFloatVector("vector", 128, [][]float32{textToVector(content)}),
        entity.NewColumnVarChar("content", []string{content}),
        entity.NewColumnJSONBytes("metadata", [][]byte{encodedMeta}),
    }

    // The empty string is passed as the partition-name argument.
    if _, insertErr := conn.Insert(context.Background(), "documents", "", columns...); insertErr != nil {
        return insertErr
    }
    return nil
}

// BatchInsertDocuments inserts all documents into the "documents" collection
// with a single conn.Insert call.
//
// An empty slice is a no-op that returns nil. If any document's metadata cannot
// be marshaled to JSON, the function returns a wrapped error naming the index
// of that document and nothing is inserted. Otherwise the error from
// conn.Insert is returned unchanged.
func BatchInsertDocuments(conn client.Client, documents []Document) error {
    total := len(documents)
    if total == 0 {
        return nil
    }

    // Column-oriented buffers, one slot per document.
    var (
        embeddings = make([][]float32, total)
        texts      = make([]string, total)
        metaBlobs  = make([][]byte, total)
    )

    for idx := range documents {
        current := documents[idx]

        blob, marshalErr := json.Marshal(current.Metadata)
        if marshalErr != nil {
            return fmt.Errorf("序列化文档 %d 元数据失败: %w", idx, marshalErr)
        }

        embeddings[idx] = textToVector(current.Content)
        texts[idx] = current.Content
        metaBlobs[idx] = blob
    }

    columns := []entity.Column{
        entity.NewColumnFloatVector("vector", 128, embeddings),
        entity.NewColumnVarChar("content", texts),
        entity.NewColumnJSONBytes("metadata", metaBlobs),
    }

    _, insertErr := conn.Insert(context.Background(), "documents", "", columns...)
    return insertErr
}

// GetJSONFromColumn decodes the JSON value stored at row index of col.
//
// The raw bytes are read through col.FieldData().GetScalars().GetJsonData().
// An error is returned when col or any of those intermediate values is nil,
// or when index is negative or past the last row.
//
// If the bytes at index are not a JSON object, no error is returned: the
// result is a one-entry map {"raw": <bytes as string>} so the caller can
// still display the value.
func GetJSONFromColumn(col entity.Column, index int) (map[string]interface{}, error) {
    if col == nil {
        return nil, fmt.Errorf("列为空")
    }

    fieldData := col.FieldData()
    if fieldData == nil {
        return nil, fmt.Errorf("fieldData为空")
    }

    scalars := fieldData.GetScalars()
    if scalars == nil {
        return nil, fmt.Errorf("scalars为空")
    }

    jsonData := scalars.GetJsonData()
    if jsonData == nil {
        return nil, fmt.Errorf("jsonData为空")
    }

    // A negative index used to slip past the upper-bound check and panic on
    // the slice access below; reject it explicitly.
    if index < 0 {
        return nil, fmt.Errorf("索引不能为负数: %d", index)
    }
    if index >= len(jsonData.Data) {
        return nil, fmt.Errorf("索引超出范围: %d >= %d", index, len(jsonData.Data))
    }

    raw := jsonData.Data[index]

    var result map[string]interface{}
    if err := json.Unmarshal(raw, &result); err != nil {
        // Deliberate best effort: hand back the undecodable payload as text.
        return map[string]interface{}{
            "raw": string(raw),
        }, nil
    }

    return result, nil
}

// SearchDocuments embeds query with textToVector and runs an L2 vector search
// for the topK nearest rows of the "documents" collection, asking Milvus to
// return the "id", "content" and "metadata" fields.
//
// Each hit becomes a map with these keys:
//   - "score":          float32 value reported by Milvus for the hit
//   - "id":             int64, present when the id column can be read
//   - "content":        string, present when the content column can be read
//   - "metadata":       decoded JSON (see GetJSONFromColumn)
//   - "metadata_error": error text, set instead of "metadata" on decode failure
//
// It returns an error when the search parameter cannot be built or when
// conn.Search fails. With no hits the returned slice is nil.
func SearchDocuments(conn client.Client, query string, topK int) ([]map[string]interface{}, error) {
    ctx := context.Background()
    collectionName := "documents"

    // 1. Turn the query text into a vector.
    queryVector := textToVector(query)

    // 2. Build the IVF_FLAT search parameter. The constructor's error was
    // previously discarded, which could hand a nil parameter to Search.
    sp, err := entity.NewIndexIvfFlatSearchParam(8)
    if err != nil {
        return nil, fmt.Errorf("创建搜索参数失败: %w", err)
    }

    searchResult, err := conn.Search(
        ctx,
        collectionName,
        nil,
        "",
        []string{"id", "content", "metadata"},
        []entity.Vector{entity.FloatVector(queryVector)},
        "vector",
        entity.L2,
        topK,
        sp,
    )
    if err != nil {
        return nil, fmt.Errorf("搜索失败: %w", err)
    }

    // 3. Flatten the column-oriented response into one map per hit.
    var results []map[string]interface{}

    for _, res := range searchResult {
        // Locate the requested output columns by name.
        var idCol, contentCol, metaCol entity.Column

        for _, col := range res.Fields {
            switch col.Name() {
            case "id":
                idCol = col
            case "content":
                contentCol = col
            case "metadata":
                metaCol = col
            }
        }

        for i := 0; i < res.ResultCount; i++ {
            result := map[string]interface{}{
                "score": res.Scores[i],
            }

            // A column that cannot be read is simply left out of the hit.
            if idCol != nil {
                if id, err := idCol.GetAsInt64(i); err == nil {
                    result["id"] = id
                }
            }

            if contentCol != nil {
                if content, err := contentCol.GetAsString(i); err == nil {
                    result["content"] = content
                }
            }

            if metaCol != nil {
                metadata, err := GetJSONFromColumn(metaCol, i)
                if err == nil {
                    result["metadata"] = metadata
                } else {
                    result["metadata_error"] = err.Error()
                }
            }

            results = append(results, result)
        }
    }

    return results, nil
}

// Document is one record handed to BatchInsertDocuments: a piece of text plus
// free-form metadata that is marshaled to JSON before insertion.
type Document struct {
    // Content is the raw text; it is embedded and also stored verbatim.
    Content string
    // Metadata is arbitrary JSON-serializable data attached to the text.
    Metadata map[string]interface{}
}

// textToVector is a toy stand-in for a real embedding model. It always
// returns a 128-element vector derived from the lowercased text:
//
//   - position i < len(text) holds the i-th BYTE of the text times 0.01
//   - every remaining position i holds (len(text) + i) times 0.001
//
// Only the first 128 bytes influence the result, and len is a byte count.
func textToVector(text string) []float32 {
    const dim = 128

    lowered := strings.ToLower(text)
    byteLen := len(lowered)

    out := make([]float32, dim)
    for pos := range out {
        if pos < byteLen {
            out[pos] = float32(lowered[pos]) * 0.01
            continue
        }
        // Padding region: depends on the text length and the slot.
        out[pos] = float32(byteLen+pos) * 0.001
    }
    return out
}

// main is an end-to-end demo: it connects to Milvus, inserts two sample
// documents, reads one search term from standard input, and prints the hits
// returned by SearchDocuments (top 5). Any connection, insert or search
// failure terminates the program through log.Fatal.
func main() {
    conn, err := client.NewClient(context.Background(), client.Config{
        Address: "172.16.71.31:30030",
    })
    if err != nil {
        log.Fatal("连接失败:", err)
    }
    defer conn.Close()

    // Sample documents to insert.
    samples := []Document{
        {
            Content: "人工智能是计算机科学的一个分支",
            Metadata: map[string]interface{}{
                "category": "tech",
                "source":   "wikipedia",
                "tags":     []string{"AI", "计算机科学"},
            },
        },
        {
            Content: "机器学习是人工智能的核心技术",
            Metadata: map[string]interface{}{
                "category": "tech",
                "source":   "book",
                "author":   "Tom Mitchell",
                "year":     1997,
            },
        },
    }

    fmt.Println("插入文档...")
    if insertErr := BatchInsertDocuments(conn, samples); insertErr != nil {
        log.Fatal("插入失败:", insertErr)
    }
    fmt.Println("插入成功")

    // The original author left a disabled 2-second sleep here
    // ("wait for the index to build"); it stays disabled.

    // Scanln stores the first whitespace-delimited token of the input line.
    fmt.Print("\n请输入搜索内容: ")
    var query string
    fmt.Scanln(&query)

    hits, err := SearchDocuments(conn, query, 5)
    if err != nil {
        log.Fatal("搜索失败:", err)
    }

    fmt.Printf("\n搜索 '%s' 的结果:\n", query)
    fmt.Println(strings.Repeat("=", 50))

    for rank, hit := range hits {
        // SearchDocuments always stores "score" as a float32.
        fmt.Printf("\n%d. 相似度: %.4f\n", rank+1, hit["score"].(float32))

        // The remaining keys are optional, so each one is probed separately.
        if id, ok := hit["id"].(int64); ok {
            fmt.Printf("   ID: %d\n", id)
        }
        if text, ok := hit["content"].(string); ok {
            fmt.Printf("   内容: %s\n", text)
        }
        if meta, ok := hit["metadata"].(map[string]interface{}); ok {
            fmt.Printf("   元数据: %v\n", meta)
        }
    }
}
View Code

运行效果

GOROOT=D:\Program Files\Go #gosetup
GOPATH=C:\Users\admin\go #gosetup
"D:\Program Files\Go\bin\go.exe" build -o C:\Users\admin\AppData\Local\JetBrains\GoLand2025.2\tmp\GoLand\___1go_build_test1_go.exe D:\futong_project\model-test\mytest\test1.go #gosetup
C:\Users\admin\AppData\Local\JetBrains\GoLand2025.2\tmp\GoLand\___1go_build_test1_go.exe #gosetup
插入文档...
插入成功

请输入搜索内容: 人工智能是什么

搜索 '人工智能是什么' 的结果:
==================================================

1. 相似度: 69.8723
   ID: 463885158195399571
   内容: 机器学习是人工智能的核心技术
   元数据: map[author:Tom Mitchell category:tech source:book year:1997]

2. 相似度: 69.8723
   ID: 463885158195399573
   内容: 机器学习是人工智能的核心技术
   元数据: map[author:Tom Mitchell category:tech source:book year:1997]

进程 已完成,退出代码为 0

 1. 数据分类(大数据量下提高查询效率)

package main

import (
    "bufio"
    "context"
    "encoding/json"
    "fmt"
    "log"
    "math"
    "os"
    "sort"
    "strings"
    "unicode"

    "github.com/milvus-io/milvus-sdk-go/v2/client"
    "github.com/milvus-io/milvus-sdk-go/v2/entity"
)

// vocabulary is the word list behind buildVocabularyMap; a word's position in
// this slice is the vector dimension it maps to. Extend it as needed.
//
// NOTE(review): the list contains duplicates ("处理", "开发") and several empty
// strings. buildVocabularyMap keeps the last index for a duplicated key, and
// the empty entries all collapse into the single key "". The empty strings look
// like leftovers of removed words; confirm before cleaning up, because removing
// entries shifts the indices of everything after them.
var vocabulary = []string{
    // Artificial-intelligence terms
    "人工", "智能", "人工智能", "AI", "机器学习", "深度", "学习", "神经网络",
    "计算机", "科学", "数据", "算法", "模型", "训练", "预测", "分析",
    "语言", "处理", "视觉", "图像", "识别", "理解", "生成", "推理",
    "框架", "TensorFlow", "PyTorch", "Python", "编程", "代码", "开发",

    // Other common words
    "一个", "重要", "分支", "核心", "技术", "领域", "基于", "能够",
    "信息", "过程", "价值", "提取", "工作", "方式", "神经元", "人脑",
    "模仿", "处理", "最受", "欢迎", "之一", "Google", "开发", "Facebook",
    "另一个", "流行", "专注", "", "解释", "使", "", "",
}

// buildVocabularyMap returns a fresh word -> index lookup built from the
// package-level vocabulary slice. When a word occurs more than once, the
// index of its last occurrence wins.
func buildVocabularyMap() map[string]int {
    lookup := map[string]int{}
    for pos := 0; pos < len(vocabulary); pos++ {
        lookup[vocabulary[pos]] = pos
    }
    return lookup
}

// simpleChineseSegmentation is a minimal tokenizer for mixed Chinese/Latin text.
//
// Rules, applied rune by rune:
//   - a Han character always becomes a token of its own
//   - consecutive non-Han letters and digits are glued into one token
//   - any other rune (space, punctuation, ...) only terminates the current token
//
// The result is nil when the text yields no tokens.
func simpleChineseSegmentation(text string) []string {
    var (
        tokens  []string
        pending strings.Builder // run of letters/digits collected so far
    )

    // flush emits the pending alphanumeric run, if there is one.
    flush := func() {
        if pending.Len() == 0 {
            return
        }
        tokens = append(tokens, pending.String())
        pending.Reset()
    }

    for _, ch := range text {
        switch {
        case unicode.Is(unicode.Han, ch):
            // Han must be tested first: IsLetter is also true for Han runes.
            flush()
            tokens = append(tokens, string(ch))
        case unicode.IsLetter(ch), unicode.IsDigit(ch):
            pending.WriteRune(ch)
        default:
            flush()
        }
    }
    flush()

    return tokens
}

// textToVectorImproved builds a normalized 128-dimensional feature vector
// from text. The text is lowercased first. Layout before normalization:
//
//   - [idx of a vocabulary word]  occurrences of that word in the text * 0.1
//   - [64 + k]                    code point of the k-th character * 0.0001
//   - [125]                       byte length of the text * 0.001
//   - [126]                       number of tokens * 0.01
//   - [127]                       number of distinct tokens * 0.01
//
// The three length features are written last, so they replace any character
// feature that landed on slots 125-127.
//
// Two defects of the previous version are fixed here:
//
//  1. Vocabulary words were looked up among the tokens produced by
//     simpleChineseSegmentation, which emits every Han character on its own,
//     and mixed-case entries ("AI", "Python", ...) were compared against
//     lowercased text. Almost no vocabulary entry could ever match. Words are
//     now counted as lowercased substrings of the lowercased text.
//  2. The character features used the byte offset yielded by range as the
//     slot number, leaving gaps for multi-byte characters. A running
//     character counter is used instead.
func textToVectorImproved(text string) []float32 {
    dim := 128
    vector := make([]float32, dim)

    text = strings.ToLower(text)

    // Tokens are still needed for the token-count features below.
    words := simpleChineseSegmentation(text)

    // Vocabulary features: weighted occurrence count per vocabulary word.
    for word, idx := range buildVocabularyMap() {
        // Skip the placeholder entry "" (strings.Count would report a match
        // at every position) and anything outside the vector.
        if word == "" || idx >= dim {
            continue
        }
        if freq := strings.Count(text, strings.ToLower(word)); freq > 0 {
            vector[idx] = float32(freq) * 0.1
        }
    }

    // Character features: one slot per character in the upper half.
    half := dim / 2
    pos := 0
    for _, char := range text {
        if pos >= half {
            break
        }
        vector[half+pos] = float32(char) * 0.0001
        pos++
    }

    // Length features.
    vector[dim-3] = float32(len(text)) * 0.001
    vector[dim-2] = float32(len(words)) * 0.01
    vector[dim-1] = float32(len(set(words))) * 0.01 // distinct token count

    return normalizeVector(vector)
}

// set returns the distinct strings of items in order of first appearance.
// The input is not modified; the result is nil when items is empty.
func set(items []string) []string {
    var unique []string
    visited := map[string]bool{}

    for i := 0; i < len(items); i++ {
        candidate := items[i]
        if visited[candidate] {
            continue
        }
        visited[candidate] = true
        unique = append(unique, candidate)
    }
    return unique
}

// normalizeVector scales vector to unit Euclidean length.
//
// When the sum of squares is positive, a newly allocated slice is returned and
// the input is left untouched. Otherwise (all zeros, empty, or a sum that is
// not a positive number) the input slice itself is returned unchanged.
func normalizeVector(vector []float32) []float32 {
    var squares float32
    for i := range vector {
        squares += vector[i] * vector[i]
    }

    // Written as a negation so a NaN sum takes the pass-through branch too.
    if !(squares > 0) {
        return vector
    }

    length := float32(math.Sqrt(float64(squares)))
    scaled := make([]float32, len(vector))
    for i := range vector {
        scaled[i] = vector[i] / length
    }
    return scaled
}

// cosineSimilarity returns the cosine of the angle between vec1 and vec2
// (dot product divided by the product of the Euclidean norms).
//
// It returns 0 when either vector has zero norm, and also when the two
// vectors differ in length: cosine similarity is undefined across dimensions,
// and the previous code panicked with an index-out-of-range error when vec2
// was shorter and silently ignored the tail when it was longer.
func cosineSimilarity(vec1, vec2 []float32) float32 {
    if len(vec1) != len(vec2) {
        return 0
    }

    var dotProduct, norm1, norm2 float32
    for i := range vec1 {
        dotProduct += vec1[i] * vec2[i]
        norm1 += vec1[i] * vec1[i]
        norm2 += vec2[i] * vec2[i]
    }

    if norm1 == 0 || norm2 == 0 {
        return 0
    }

    return dotProduct / (float32(math.Sqrt(float64(norm1))) * float32(math.Sqrt(float64(norm2))))
}

// testVectorGeneration prints the cosine similarity of every unordered pair
// of four sample sentences, using vectors from textToVectorImproved.
//
// Changes against the previous version:
//   - Each sentence is labelled by its first 10 characters. The old code
//     sliced the first 10 BYTES, which cuts a 3-byte UTF-8 Chinese character
//     in half and prints an invalid byte sequence.
//   - Every sentence is embedded once up front instead of once per pair.
func testVectorGeneration() {
    texts := []string{
        "人工智能是计算机科学的一个重要分支",
        "机器学习是人工智能的核心技术",
        "数据分析是从数据中提取有价值信息的过程",
        "Python是最受欢迎的机器学习编程语言之一",
    }

    // preview returns at most the first 10 characters of s followed by "...".
    preview := func(s string) string {
        runes := []rune(s)
        if len(runes) > 10 {
            runes = runes[:10]
        }
        return string(runes) + "..."
    }

    vectors := make([][]float32, len(texts))
    for i, text := range texts {
        vectors[i] = textToVectorImproved(text)
    }

    fmt.Println("向量生成测试:")
    for i := range texts {
        // j starts after i so each pair is reported exactly once.
        for j := i + 1; j < len(texts); j++ {
            similarity := cosineSimilarity(vectors[i], vectors[j])
            fmt.Printf("'%s' 与 '%s' 的相似度: %.4f\n",
                preview(texts[i]),
                preview(texts[j]),
                similarity)
        }
    }
}

// min returns the smaller of the two ints a and b.
func min(a, b int) int {
    if b < a {
        return b
    }
    return a
}

// InsertDocumentImproved inserts one document into the "documents"
// collection, embedding content with textToVectorImproved and storing
// metadata as JSON.
//
// A metadata marshaling error is returned as-is, before anything is sent.
// Otherwise the result of conn.Insert is returned.
func InsertDocumentImproved(conn client.Client, content string, metadata map[string]interface{}) error {
    embedding := textToVectorImproved(content)

    encodedMeta, marshalErr := json.Marshal(metadata)
    if marshalErr != nil {
        return marshalErr
    }

    // One single-row column per schema field (the id is generated by Milvus).
    row := []entity.Column{
        entity.NewColumnFloatVector("vector", 128, [][]float32{embedding}),
        entity.NewColumnVarChar("content", []string{content}),
        entity.NewColumnJSONBytes("metadata", [][]byte{encodedMeta}),
    }

    _, insertErr := conn.Insert(context.Background(), "documents", "", row...)
    return insertErr
}

// recreateTestData rebuilds the "documents" collection from scratch:
// it drops the existing collection, creates it again with the schema below,
// inserts six sample documents through InsertDocumentImproved, creates an
// IVF_FLAT/L2 index on the "vector" field, loads the collection, and finally
// runs testVectorGeneration.
//
// Failures of the drop and of individual inserts are only logged; failures of
// collection creation, index creation and loading are returned as wrapped
// errors.
func recreateTestData(conn client.Client) error {
    ctx := context.Background()

    // Drop the old collection. The error is only logged so the function keeps
    // going (for example when the collection does not exist yet).
    if err := conn.DropCollection(ctx, "documents"); err != nil {
        log.Printf("删除集合失败: %v", err)
    }

    // Create the new collection: auto-generated int64 primary key, a
    // 128-dimensional float vector, the raw text and a JSON metadata field.
    schema := &entity.Schema{
        CollectionName: "documents",
        Description:    "文档向量存储",
        AutoID:         true,
        Fields: []*entity.Field{
            {
                Name:       "id",
                DataType:   entity.FieldTypeInt64,
                PrimaryKey: true,
                AutoID:     true,
            },
            {
                Name:       "vector",
                DataType:   entity.FieldTypeFloatVector,
                TypeParams: map[string]string{"dim": "128"},
            },
            {
                Name:     "content",
                DataType: entity.FieldTypeVarChar,
                TypeParams: map[string]string{
                    "max_length": "65535",
                },
            },
            {
                Name:     "metadata",
                DataType: entity.FieldTypeJSON,
            },
        },
    }

    // The third argument (2) is presumably the shard count; verify against the SDK.
    if err := conn.CreateCollection(ctx, schema, 2); err != nil {
        return fmt.Errorf("创建集合失败: %w", err)
    }

    // Sample documents; metadata carries category / subcategory / keywords.
    documents := []struct {
        content  string
        metadata map[string]interface{}
    }{
        {
            content: "人工智能是计算机科学的一个重要分支,专注于创建智能机器",
            metadata: map[string]interface{}{
                "category":    "人工智能",
                "subcategory": "基础概念",
                "keywords":    []string{"人工智能", "AI", "计算机科学", "智能机器"},
            },
        },
        {
            content: "机器学习是人工智能的核心技术,使计算机能从数据中学习和改进",
            metadata: map[string]interface{}{
                "category":    "人工智能",
                "subcategory": "机器学习",
                "keywords":    []string{"机器学习", "数据学习", "人工智能", "核心技术"},
            },
        },
        {
            content: "深度学习是机器学习的一个子领域,基于神经网络模型",
            metadata: map[string]interface{}{
                "category":    "人工智能",
                "subcategory": "深度学习",
                "keywords":    []string{"深度学习", "神经网络", "机器学习", "子领域"},
            },
        },
        {
            content: "自然语言处理是人工智能的重要应用,让计算机理解人类语言",
            metadata: map[string]interface{}{
                "category":    "人工智能",
                "subcategory": "自然语言处理",
                "keywords":    []string{"自然语言处理", "NLP", "语言理解", "人工智能应用"},
            },
        },
        {
            content: "数据分析是从大量数据中提取有价值信息和洞察的过程",
            metadata: map[string]interface{}{
                "category":    "数据科学",
                "subcategory": "数据分析",
                "keywords":    []string{"数据分析", "数据科学", "信息提取", "数据洞察"},
            },
        },
        {
            content: "Python在人工智能和数据科学领域非常流行",
            metadata: map[string]interface{}{
                "category":    "编程",
                "subcategory": "Python",
                "keywords":    []string{"Python", "编程语言", "人工智能", "数据科学"},
            },
        },
    }

    // Insert one document at a time; a failed insert is logged and skipped so
    // the remaining documents are still attempted.
    fmt.Println("插入测试数据...")
    for i, doc := range documents {
        if err := InsertDocumentImproved(conn, doc.content, doc.metadata); err != nil {
            log.Printf("插入文档 %d 失败: %v", i+1, err)
        } else {
            fmt.Printf("插入文档 %d: %s\n", i+1, doc.content)
        }
    }

    // Build the index description (L2 metric; 1024 is presumably nlist).
    index, err := entity.NewIndexIvfFlat(entity.L2, 1024)
    if err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    // Create the index on the "vector" field.
    if err := conn.CreateIndex(ctx, "documents", "vector", index, false); err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    // Load the collection after the index has been created.
    if err := conn.LoadCollection(ctx, "documents", false); err != nil {
        return fmt.Errorf("加载集合失败: %w", err)
    }

    fmt.Println("测试数据创建完成")

    // Print the pairwise similarity report for the sample sentences.
    fmt.Println("\n向量相似度测试:")
    testVectorGeneration()

    return nil
}

// searchHit is one decoded row of a Milvus search response.
type searchHit struct {
    ID       int64
    Content  string
    Score    float32 // value reported by Milvus; with the L2 metric, smaller is closer
    Metadata map[string]interface{}
}

// main connects to Milvus and runs an interactive menu:
//
//  1. recreate the test data (recreateTestData)
//  2. run a vector search for a line of text
//  3. print the vector-generation self test (testVectorGeneration)
//  4. quit
//
// Input is read line by line. Compared with the previous fmt.Scanln-based
// loop this fixes two problems: the menu no longer spins forever printing
// "无效选择" once standard input reaches EOF, and a search text containing
// spaces is used in full instead of leaking its remaining words into the next
// menu prompt.
func main() {
    ctx := context.Background()

    conn, err := client.NewClient(ctx, client.Config{
        Address: "172.16.71.31:30030",
    })
    if err != nil {
        log.Fatal("连接失败:", err)
    }
    defer conn.Close()

    fmt.Println("=== Milvus 向量搜索演示 ===")
    fmt.Println("1. 重新创建测试数据(使用改进的向量生成)")
    fmt.Println("2. 执行搜索")
    fmt.Println("3. 测试向量生成")
    fmt.Println("4. 退出")

    stdin := bufio.NewReader(os.Stdin)

    for {
        fmt.Print("\n请选择操作 (1/2/3/4): ")
        line, ok := readInputLine(stdin)
        if !ok {
            // Standard input is exhausted or broken: nothing more to read.
            return
        }

        // Anything that does not parse as an integer is an invalid choice.
        var choice int
        if _, err := fmt.Sscan(line, &choice); err != nil {
            choice = 0
        }

        switch choice {
        case 1:
            if err := recreateTestData(conn); err != nil {
                fmt.Printf("创建测试数据失败: %v\n", err)
            }

        case 2:
            fmt.Print("请输入搜索内容: ")
            query, _ := readInputLine(stdin)
            if query == "" {
                fmt.Println("搜索内容不能为空")
                continue
            }
            if err := searchAndPrint(ctx, conn, query); err != nil {
                fmt.Printf("搜索失败: %v\n", err)
            }

        case 3:
            fmt.Println("\n向量生成测试:")
            testVectorGeneration()

        case 4:
            fmt.Println("退出程序")
            return

        default:
            fmt.Println("无效选择")
        }
    }
}

// readInputLine reads one line from r and returns it with surrounding
// whitespace removed. ok is false only when nothing at all could be read
// (EOF or a read error); a final line without a trailing newline is still
// returned with ok == true.
func readInputLine(r *bufio.Reader) (line string, ok bool) {
    raw, err := r.ReadString('\n')
    if err != nil && raw == "" {
        return "", false
    }
    return strings.TrimSpace(raw), true
}

// searchAndPrint embeds query with textToVectorImproved, asks Milvus for the
// 10 nearest rows of "documents" by L2 distance, and prints the 5 closest
// ones followed by a short preview of the query vector. It returns an error
// when the search parameter cannot be built or the search call fails.
func searchAndPrint(ctx context.Context, conn client.Client, query string) error {
    queryVector := textToVectorImproved(query)

    // The constructor's error used to be discarded.
    sp, err := entity.NewIndexIvfFlatSearchParam(8)
    if err != nil {
        return fmt.Errorf("创建搜索参数失败: %w", err)
    }

    searchResult, err := conn.Search(
        ctx,
        "documents",
        nil,
        "",
        []string{"id", "content", "metadata"},
        []entity.Vector{entity.FloatVector(queryVector)},
        "vector",
        entity.L2,
        10,
        sp,
    )
    if err != nil {
        return err
    }

    // Flatten the column-oriented response into one searchHit per row.
    var hits []searchHit
    for _, res := range searchResult {
        var idCol, contentCol, metaCol entity.Column
        for _, col := range res.Fields {
            switch col.Name() {
            case "id":
                idCol = col
            case "content":
                contentCol = col
            case "metadata":
                metaCol = col
            }
        }

        for i := 0; i < res.ResultCount; i++ {
            hit := searchHit{Score: res.Scores[i]}

            // Unreadable columns leave the field at its zero value.
            if idCol != nil {
                if id, err := idCol.GetAsInt64(i); err == nil {
                    hit.ID = id
                }
            }
            if contentCol != nil {
                if content, err := contentCol.GetAsString(i); err == nil {
                    hit.Content = content
                }
            }
            hit.Metadata = decodeHitMetadata(metaCol, i)

            hits = append(hits, hit)
        }
    }

    // Closest first: ascending by the reported value.
    sort.Slice(hits, func(i, j int) bool {
        return hits[i].Score < hits[j].Score
    })

    fmt.Printf("\n搜索 '%s' 的结果 (显示前5个):\n", query)
    fmt.Println(strings.Repeat("=", 80))

    if len(hits) == 0 {
        fmt.Println("没有找到相关结果")
        return nil
    }

    for i, hit := range hits {
        if i >= 5 {
            break
        }
        fmt.Printf("\n%d. 距离: %.4f (越小越相似)\n", i+1, hit.Score)
        fmt.Printf("   ID: %d\n", hit.ID)
        fmt.Printf("   内容: %s\n", hit.Content)

        if hit.Metadata != nil {
            fmt.Printf("   分类: %v", hit.Metadata["category"])
            if subcat, ok := hit.Metadata["subcategory"]; ok {
                fmt.Printf(" / %v", subcat)
            }
            fmt.Println()
        }
    }

    // Show the dimension and the first few components of the query vector.
    shown := len(queryVector)
    if shown > 5 {
        shown = 5
    }
    fmt.Printf("\n查询向量信息: 维度=%d, 示例值: [", len(queryVector))
    for i := 0; i < shown; i++ {
        fmt.Printf("%.4f", queryVector[i])
        if i < 4 {
            fmt.Print(", ")
        }
    }
    fmt.Println("...]")

    return nil
}

// decodeHitMetadata returns the JSON object stored at row i of col, or nil
// when col is nil, the row is missing, or the bytes are not a JSON object.
func decodeHitMetadata(col entity.Column, i int) map[string]interface{} {
    if col == nil {
        return nil
    }
    fieldData := col.FieldData()
    if fieldData == nil || fieldData.GetScalars() == nil {
        return nil
    }
    jsonData := fieldData.GetScalars().GetJsonData()
    if jsonData == nil || i < 0 || i >= len(jsonData.Data) {
        return nil
    }

    var metadata map[string]interface{}
    if err := json.Unmarshal(jsonData.Data[i], &metadata); err != nil {
        return nil
    }
    return metadata
}

// textToVectorSimple is a simpler embedding based on keyword matching.
// It returns a normalized 128-dimensional vector with this layout before
// normalization:
//
//   - [0..11]   1.0 when any keyword mapped to that dimension occurs in the text
//   - [20..29]  (byte length of the text mod 10) * 0.1
//
// Matching is case-insensitive: the text is lowercased, and each keyword is
// lowercased before the comparison as well. Previously the mixed-case keys
// ("AI", "ML", "NN", "NLP", "Python") were compared as written against the
// lowercased text and therefore could never match.
func textToVectorSimple(text string) []float32 {
    dim := 128
    vector := make([]float32, dim)

    text = strings.ToLower(text)

    // Keyword -> dimension; several synonyms share one dimension.
    keywordMap := map[string]int{
        "人工智能": 0, "AI": 0, "智能": 0,
        "机器学习": 1, "ML": 1, "学习": 1,
        "深度": 2, "神经网络": 2, "NN": 2,
        "自然语言": 3, "NLP": 3, "语言": 3,
        "计算机": 4, "电脑": 4, "计算": 4,
        "数据": 5, "信息": 5,
        "科学": 6, "技术": 6,
        "分析": 7, "处理": 7,
        "Python": 8, "编程": 8, "代码": 8,
        "模型": 9, "算法": 9,
        "训练": 10, "预测": 10,
        "视觉": 11, "图像": 11, "识别": 11,
    }

    // Setting the slot to 1.0 is idempotent, so the random map iteration
    // order does not affect the result.
    for keyword, idx := range keywordMap {
        if idx < dim && strings.Contains(text, strings.ToLower(keyword)) {
            vector[idx] = 1.0
        }
    }

    // Length feature: the same value is written to slots 20..29.
    textLen := len(text)
    for i := 20; i < 30 && i < dim; i++ {
        vector[i] = float32(textLen%10) * 0.1
    }

    return normalizeVector(vector)
}
View Code

 2.优化上面的向量算法

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "math"
    "sort"
    "strings"

    "github.com/milvus-io/milvus-sdk-go/v2/client"
    "github.com/milvus-io/milvus-sdk-go/v2/entity"
)

// keywordDimensions maps each keyword to the vector dimensions that
// textToVectorSimple increments when the keyword occurs in the text.
// Related keywords deliberately share dimensions (for example "人工智能",
// "AI", "智能" and "人工" all feed dimensions 0 and 1), so a text containing
// several of them accumulates a larger value there.
var keywordDimensions = map[string][]int{
    // Artificial-intelligence terms
    "人工智能": {0, 1}, "AI": {0, 1}, "智能": {0, 1}, "人工": {0, 1},
    "机器学习": {2, 3}, "ML": {2, 3}, "学习": {2, 3},
    "深度学习": {4, 5}, "深度": {4, 5}, "神经网络": {4, 5, 6}, "NN": {4, 5},
    "自然语言": {7, 8}, "NLP": {7, 8}, "语言处理": {7, 8},
    "计算机视觉": {9, 10}, "CV": {9, 10}, "视觉": {9}, "图像": {10},

    // Computer-science basics
    "计算机": {11, 12}, "计算": {11}, "科学": {12},
    "数据": {13, 14}, "信息": {13}, "分析": {14},
    "算法": {15}, "模型": {16}, "训练": {17}, "预测": {18},

    // Programming and tooling
    "Python": {19, 20}, "编程": {20}, "代码": {20},
    "TensorFlow": {21}, "PyTorch": {22}, "框架": {21, 22},

    // Other common words
    "是什么": {23}, "什么": {23}, "定义": {24}, "概念": {24},
    "技术": {25}, "应用": {26}, "领域": {27},
    "重要": {28}, "核心": {29}, "基础": {30},
    "过程": {31}, "方法": {32}, "系统": {33},
}

// textToVectorSimple builds a normalized 128-dimensional vector from keyword
// matches. The text is lowercased first. Layout before normalization:
//
//   - dimensions listed in keywordDimensions: +1.0 per matching keyword
//   - [100]        byte length of the text * 0.01
//   - [101..120]   code point of each of the first 20 characters * 0.0001
//   - [121]        number of matching keywords * 0.1
//
// Changes against the previous version:
//   - The character features are indexed by character position. The old loop
//     used the byte offset yielded by range, so for Chinese text only every
//     third slot was filled and the loop stopped after 7 characters.
//   - Keyword hits and the keyword count are gathered in one pass over the
//     table instead of two identical scans.
func textToVectorSimple(text string) []float32 {
    dim := 128
    vector := make([]float32, dim)

    // Lowercase once; keywords are lowercased on comparison as well.
    text = strings.ToLower(text)

    // 1. Keyword features and keyword count. Each hit adds exactly 1.0, so
    // the random map iteration order does not change the sums.
    keywordCount := 0
    for keyword, dims := range keywordDimensions {
        if !strings.Contains(text, strings.ToLower(keyword)) {
            continue
        }
        keywordCount++
        for _, dimIdx := range dims {
            if dimIdx < dim {
                vector[dimIdx] += 1.0
            }
        }
    }

    // 2. Text length feature (bytes).
    vector[100] = float32(len(text)) * 0.01

    // 3. Character features for the first 20 characters.
    pos := 0
    for _, char := range text {
        if pos >= 20 {
            break
        }
        vector[101+pos] = float32(char) * 0.0001
        pos++
    }

    // 4. Keyword count feature.
    vector[121] = float32(keywordCount) * 0.1

    // 5. Scale to unit length.
    return normalizeVector(vector)
}

// normalizeVector returns vector divided by its Euclidean norm.
//
// A vector whose sum of squares is positive yields a new slice; the argument
// is not modified. In every other case (empty, all zeros, or a sum that does
// not compare greater than zero) the argument itself is returned as-is.
func normalizeVector(vector []float32) []float32 {
    var sumSq float32
    for idx := 0; idx < len(vector); idx++ {
        sumSq += vector[idx] * vector[idx]
    }

    // Negated comparison keeps the pass-through behavior for a NaN sum.
    if !(sumSq > 0) {
        return vector
    }

    magnitude := float32(math.Sqrt(float64(sumSq)))
    unit := make([]float32, len(vector))
    for idx, component := range vector {
        unit[idx] = component / magnitude
    }
    return unit
}

// cosineSimilarity returns the cosine similarity of vec1 and vec2: the dot
// product divided by the product of the two Euclidean norms. Larger values
// mean more similar directions.
//
// The result is 0 when either vector has zero norm. It is also 0 when the
// vectors have different lengths; the previous code indexed vec2 with vec1's
// indices and panicked when vec2 was the shorter one.
func cosineSimilarity(vec1, vec2 []float32) float32 {
    if len(vec1) != len(vec2) {
        return 0
    }

    var dotProduct, norm1, norm2 float32
    for i := range vec1 {
        dotProduct += vec1[i] * vec2[i]
        norm1 += vec1[i] * vec1[i]
        norm2 += vec2[i] * vec2[i]
    }

    if norm1 == 0 || norm2 == 0 {
        return 0
    }

    return dotProduct / (float32(math.Sqrt(float64(norm1))) * float32(math.Sqrt(float64(norm2))))
}

// testVectorGeneration prints a self test of textToVectorSimple: for six
// sample sentences it lists the vector components greater than 0.1, then
// prints the full matrix of pairwise cosine similarities.
//
// Changes against the previous version:
//   - The closing legend claimed that smaller numbers mean more similar. The
//     matrix holds cosine similarities, where LARGER means more similar (the
//     diagonal is 1), so the legend text was corrected.
//   - Row labels are truncated by characters rather than bytes, so a longer
//     Chinese name can no longer be cut in the middle of a UTF-8 sequence.
func testVectorGeneration() {
    fmt.Println("\n=== 向量生成测试 ===")

    testTexts := []struct {
        name string
        text string
    }{
        {"人工智能定义", "人工智能是计算机科学的一个重要分支"},
        {"机器学习定义", "机器学习是人工智能的核心技术"},
        {"数据分析定义", "数据分析是从数据中提取有价值信息的过程"},
        {"Python介绍", "Python是最受欢迎的机器学习编程语言之一"},
        {"自然语言处理", "自然语言处理是人工智能的重要应用"},
        {"深度学习介绍", "深度学习基于神经网络模型"},
    }

    // shorten keeps at most limit characters of s.
    shorten := func(s string, limit int) string {
        runes := []rune(s)
        if len(runes) > limit {
            return string(runes[:limit])
        }
        return s
    }

    // Embed every sentence once and report its dominant components.
    vectors := make([][]float32, len(testTexts))

    for i, item := range testTexts {
        vectors[i] = textToVectorSimple(item.text)
        fmt.Printf("%d. %s\n", i+1, item.name)
        fmt.Printf("   文本: %s\n", item.text)
        fmt.Printf("   向量特征: ")
        // Only components above 0.1 are listed and counted, so small
        // non-zero features are not shown.
        nonZeroCount := 0
        for j, v := range vectors[i] {
            if v > 0.1 {
                fmt.Printf("%d:%.2f ", j, v)
                nonZeroCount++
            }
        }
        fmt.Printf("(共%d个非零特征)\n\n", nonZeroCount)
    }

    // Similarity matrix: header row with column numbers, then one row per text.
    fmt.Println("=== 相似度矩阵 ===")
    fmt.Printf("%-20s", "文本")
    for i := range testTexts {
        fmt.Printf("%8d", i+1)
    }
    fmt.Println()

    for i, item1 := range testTexts {
        fmt.Printf("%-20s", shorten(item1.name, 18))
        for j := range testTexts {
            similarity := cosineSimilarity(vectors[i], vectors[j])
            fmt.Printf("%8.3f", similarity)
        }
        fmt.Println()
    }
    fmt.Println("(数字越大表示越相似)")
}

// min returns a when a is less than b, and b otherwise.
func min(a, b int) int {
    smaller := b
    if a < b {
        smaller = a
    }
    return smaller
}

// InsertDocument writes one document to the "documents" collection. The
// vector comes from textToVectorSimple(content) and metadata is stored as
// JSON bytes.
//
// If metadata cannot be marshaled, that error is returned unchanged and
// nothing is inserted; otherwise the error from conn.Insert is returned.
func InsertDocument(conn client.Client, content string, metadata map[string]interface{}) error {
    embedding := textToVectorSimple(content)

    payload, marshalErr := json.Marshal(metadata)
    if marshalErr != nil {
        return marshalErr
    }

    vectorCol := entity.NewColumnFloatVector("vector", 128, [][]float32{embedding})
    contentCol := entity.NewColumnVarChar("content", []string{content})
    metadataCol := entity.NewColumnJSONBytes("metadata", [][]byte{payload})

    // The empty string is passed as the partition-name argument.
    _, insertErr := conn.Insert(context.Background(), "documents", "", vectorCol, contentCol, metadataCol)
    return insertErr
}

// recreateTestData rebuilds the "documents" collection from scratch.
//
// It drops the existing collection (a drop failure is only logged), creates a
// new collection with an auto-ID primary key, a 128-dimensional float vector,
// a VarChar content field and a JSON metadata field, inserts ten sample
// documents, builds an IVF_FLAT/L2 index on the vector field and finally loads
// the collection.
//
// Individual insert failures are logged and counted but do not abort the run;
// failures while creating the collection, creating the index or loading the
// collection are returned wrapped.
func recreateTestData(conn client.Client) error {
    ctx := context.Background()

    // Drop the old collection; it may simply not exist, so only log.
    if err := conn.DropCollection(ctx, "documents"); err != nil {
        log.Printf("删除集合失败(可能不存在): %v", err)
    }

    // Define the new collection.
    schema := &entity.Schema{
        CollectionName: "documents",
        Description:    "文档向量存储",
        AutoID:         true,
        Fields: []*entity.Field{
            {
                Name:       "id",
                DataType:   entity.FieldTypeInt64,
                PrimaryKey: true,
                AutoID:     true,
            },
            {
                Name:       "vector",
                DataType:   entity.FieldTypeFloatVector,
                TypeParams: map[string]string{"dim": "128"},
            },
            {
                Name:     "content",
                DataType: entity.FieldTypeVarChar,
                TypeParams: map[string]string{
                    "max_length": "65535",
                },
            },
            {
                Name:     "metadata",
                DataType: entity.FieldTypeJSON,
            },
        },
    }

    if err := conn.CreateCollection(ctx, schema, 2); err != nil {
        return fmt.Errorf("创建集合失败: %w", err)
    }

    fmt.Println("✅ 集合创建成功")

    // Sample documents with hand-written metadata.
    documents := []struct {
        content  string
        metadata map[string]interface{}
    }{
        {
            content: "人工智能是计算机科学的一个重要分支,专注于创建智能机器",
            metadata: map[string]interface{}{
                "category":         "人工智能",
                "relevance":        5, // relevance score
                "matched_keywords": []string{"人工智能", "计算机", "科学", "智能"},
            },
        },
        {
            content: "机器学习是人工智能的核心技术,使计算机能从数据中学习和改进",
            metadata: map[string]interface{}{
                "category":         "机器学习",
                "relevance":        4,
                "matched_keywords": []string{"机器学习", "人工智能", "核心", "技术", "数据", "学习"},
            },
        },
        {
            content: "深度学习是机器学习的一个子领域,基于神经网络模型",
            metadata: map[string]interface{}{
                "category":         "深度学习",
                "relevance":        3,
                "matched_keywords": []string{"深度学习", "机器学习", "神经网络", "模型"},
            },
        },
        {
            content: "自然语言处理是人工智能的重要应用,让计算机理解人类语言",
            metadata: map[string]interface{}{
                "category":         "自然语言处理",
                "relevance":        3,
                "matched_keywords": []string{"自然语言", "处理", "人工智能", "应用", "计算机", "语言"},
            },
        },
        {
            content: "数据分析是从大量数据中提取有价值信息和洞察的过程",
            metadata: map[string]interface{}{
                "category":         "数据分析",
                "relevance":        2,
                "matched_keywords": []string{"数据", "分析", "信息", "过程"},
            },
        },
        {
            content: "Python在人工智能和数据科学领域非常流行",
            metadata: map[string]interface{}{
                "category":         "编程语言",
                "relevance":        1,
                "matched_keywords": []string{"Python", "人工智能", "数据", "科学"},
            },
        },
        {
            content: "TensorFlow是Google开发的深度学习框架",
            metadata: map[string]interface{}{
                "category":         "深度学习框架",
                "relevance":        1,
                "matched_keywords": []string{"TensorFlow", "Google", "深度学习", "框架"},
            },
        },
        {
            content: "PyTorch是Facebook开发的另一个流行的深度学习框架",
            metadata: map[string]interface{}{
                "category":         "深度学习框架",
                "relevance":        1,
                "matched_keywords": []string{"PyTorch", "Facebook", "深度学习", "框架"},
            },
        },
        {
            content: "计算机视觉用于图像识别和分析",
            metadata: map[string]interface{}{
                "category":         "计算机视觉",
                "relevance":        2,
                "matched_keywords": []string{"计算机", "视觉", "图像", "识别", "分析"},
            },
        },
        {
            content: "算法是解决问题的一系列步骤和规则",
            metadata: map[string]interface{}{
                "category":         "计算机基础",
                "relevance":        1,
                "matched_keywords": []string{"算法", "问题", "解决", "步骤", "规则"},
            },
        },
    }

    fmt.Println("插入测试数据...")
    successCount := 0
    for i, doc := range documents {
        if err := InsertDocument(conn, doc.content, doc.metadata); err != nil {
            log.Printf("❌ 插入文档 %d 失败: %v", i+1, err)
            continue
        }
        successCount++

        // Truncate the preview by runes, not bytes: slicing the UTF-8 string
        // at a byte offset can cut a Chinese character in half and print
        // garbage. The ellipsis is only added when something was cut off.
        preview := doc.content
        if runes := []rune(doc.content); len(runes) > 40 {
            preview = string(runes[:40]) + "..."
        }
        fmt.Printf("✅ 插入文档 %d: %s\n", i+1, preview)
    }

    fmt.Printf("\n✅ 成功插入 %d/%d 个文档\n", successCount, len(documents))

    // Build the vector index.
    index, err := entity.NewIndexIvfFlat(entity.L2, 1024)
    if err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    if err := conn.CreateIndex(ctx, "documents", "vector", index, false); err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    fmt.Println("✅ 索引创建成功")

    // Load the collection so it can be searched.
    if err := conn.LoadCollection(ctx, "documents", false); err != nil {
        return fmt.Errorf("加载集合失败: %w", err)
    }

    fmt.Println("✅ 集合加载成功")
    fmt.Println("\n🎉 测试数据准备完成!")

    return nil
}

// SearchDocuments embeds query with textToVectorSimple and runs an L2 vector
// search against the "vector" field of the "documents" collection, asking for
// topK hits and the "id", "content" and "metadata" output fields.
//
// Each hit is converted to a SearchResult. A column value that cannot be read
// or a metadata blob that is not valid JSON leaves the corresponding field at
// its zero value instead of failing the whole search. The results are returned
// sorted by ascending score (smaller L2 distance = more similar).
//
// An error is returned when the search parameter cannot be built or when the
// search call itself fails.
func SearchDocuments(conn client.Client, query string, topK int) ([]SearchResult, error) {
    ctx := context.Background()

    queryVector := textToVectorSimple(query)

    // Previously the constructor error was discarded, which would have passed
    // a nil search parameter on to Search.
    sp, err := entity.NewIndexIvfFlatSearchParam(8)
    if err != nil {
        return nil, fmt.Errorf("创建搜索参数失败: %w", err)
    }

    searchResult, err := conn.Search(
        ctx,
        "documents",
        nil,
        "",
        []string{"id", "content", "metadata"},
        []entity.Vector{entity.FloatVector(queryVector)},
        "vector",
        entity.L2,
        topK,
        sp,
    )
    if err != nil {
        return nil, err
    }

    var results []SearchResult

    for _, res := range searchResult {
        var idCol, contentCol, metaCol entity.Column

        // Pick out the requested output columns by name.
        for _, col := range res.Fields {
            switch col.Name() {
            case "id":
                idCol = col
            case "content":
                contentCol = col
            case "metadata":
                metaCol = col
            }
        }

        for i := 0; i < res.ResultCount; i++ {
            result := SearchResult{Score: res.Scores[i]}

            if idCol != nil {
                if id, err := idCol.GetAsInt64(i); err == nil {
                    result.ID = id
                }
            }

            if contentCol != nil {
                if content, err := contentCol.GetAsString(i); err == nil {
                    result.Content = content
                }
            }

            // The JSON column is read through its raw field data and decoded
            // row by row.
            if metaCol != nil {
                fieldData := metaCol.FieldData()
                if fieldData != nil && fieldData.GetScalars() != nil {
                    if jsonData := fieldData.GetScalars().GetJsonData(); jsonData != nil {
                        if i < len(jsonData.Data) {
                            var metadata map[string]interface{}
                            if err := json.Unmarshal(jsonData.Data[i], &metadata); err == nil {
                                result.Metadata = metadata
                            }
                        }
                    }
                }
            }

            results = append(results, result)
        }
    }

    // Smaller L2 distance means more similar. A stable sort keeps hits with
    // equal scores in the order the server returned them.
    sort.SliceStable(results, func(i, j int) bool {
        return results[i].Score < results[j].Score
    })

    return results, nil
}

// SearchResult is one hit produced by SearchDocuments.
type SearchResult struct {
    // ID is the value of the "id" output field for this hit.
    ID       int64
    // Content is the value of the "content" output field for this hit.
    Content  string
    // Score is the score reported by the search; SearchDocuments searches
    // with the L2 metric and sorts ascending, so smaller means more similar.
    Score    float32
    // Metadata is the decoded JSON object from the "metadata" output field;
    // it stays nil when the field is missing or cannot be decoded.
    Metadata map[string]interface{}
}

// main connects to Milvus and runs an interactive console menu that can
// recreate the test data, run a search, exercise the vector generator, or
// print the keyword-to-dimension table. It loops until option 5 is chosen.
func main() {
    ctx := context.Background()

    fmt.Println("=== Milvus 向量搜索演示(基于关键词的向量生成)===")
    fmt.Println("连接Milvus...")

    conn, err := client.NewClient(ctx, client.Config{
        Address: "172.16.71.31:30030",
    })
    if err != nil {
        log.Fatal("❌ 连接失败:", err)
    }
    defer conn.Close()

    fmt.Println("✅ 连接成功")

    for {
        fmt.Println("\n=== 主菜单 ===")
        fmt.Println("1. 重新创建测试数据")
        fmt.Println("2. 执行搜索")
        fmt.Println("3. 测试向量生成算法")
        fmt.Println("4. 查看关键词映射表")
        fmt.Println("5. 退出")
        fmt.Print("\n请选择操作 (1-5): ")

        // A failed scan leaves choice at 0, which lands in the default
        // ("invalid choice") branch below.
        var choice int
        fmt.Scanln(&choice)

        switch choice {
        case 1:
            fmt.Println("\n正在重新创建测试数据...")
            if err := recreateTestData(conn); err != nil {
                fmt.Printf("❌ 创建测试数据失败: %v\n", err)
            }

        case 2:
            fmt.Print("\n请输入搜索内容: ")
            // NOTE: Scanln splits on whitespace, so only a single
            // space-free token is read as the query.
            var query string
            fmt.Scanln(&query)

            if query == "" {
                fmt.Println("⚠️ 搜索内容不能为空")
                continue
            }

            fmt.Printf("正在搜索: %s\n", query)

            // Show which known keywords the query contains.
            fmt.Println("\n🔍 查询分析:")
            fmt.Printf("查询文本: %s\n", query)
            fmt.Printf("检测到的关键词: ")

            lowerQuery := strings.ToLower(query)
            var matchedKeywords []string
            for keyword := range keywordDimensions {
                if strings.Contains(lowerQuery, strings.ToLower(keyword)) {
                    matchedKeywords = append(matchedKeywords, keyword)
                }
            }
            // Map iteration order is random; sort for stable output.
            sort.Strings(matchedKeywords)

            if len(matchedKeywords) > 0 {
                fmt.Println(strings.Join(matchedKeywords, ", "))
            } else {
                // NOTE(review): the placeholder was an empty string in the
                // source; "无" (none) is a reconstruction, confirm.
                fmt.Println("无")
            }

            // Run the search.
            results, err := SearchDocuments(conn, query, 10)
            if err != nil {
                fmt.Printf("❌ 搜索失败: %v\n", err)
                continue
            }

            // Print the results.
            fmt.Printf("\n📊 搜索结果 (共 %d 个):\n", len(results))
            fmt.Println(strings.Repeat("=", 80))

            if len(results) == 0 {
                fmt.Println("没有找到相关结果")
                continue
            }

            for i, result := range results {
                if i >= 5 { // only show the top 5
                    break
                }

                // Map the distance to a 0-100 score: 100/(1+distance).
                similarityPercent := 100.0
                if result.Score > 0 {
                    similarityPercent = 100.0 / (1.0 + float64(result.Score))
                }

                // One star per 20 percentage points, capped at 5.
                stars := int(similarityPercent / 20)
                if stars > 5 {
                    stars = 5
                }

                fmt.Printf("\n%d. 匹配度: %.1f%% ", i+1, similarityPercent)
                // NOTE(review): both glyphs were empty strings in the source,
                // so the rating always printed "[]"; filled/hollow stars are a
                // reconstruction, confirm.
                fmt.Printf("[%s]\n", strings.Repeat("★", stars)+strings.Repeat("☆", 5-stars))
                fmt.Printf("   距离: %.4f (越小越好)\n", result.Score)
                fmt.Printf("   ID: %d\n", result.ID)
                fmt.Printf("   内容: %s\n", result.Content)

                if result.Metadata != nil {
                    if category, ok := result.Metadata["category"].(string); ok {
                        fmt.Printf("   分类: %s\n", category)
                    }
                    // Numbers decoded from JSON into interface{} are float64.
                    if relevance, ok := result.Metadata["relevance"].(float64); ok {
                        fmt.Printf("   相关性: %.0f/5\n", relevance)
                    }
                }
                fmt.Println()
            }

        case 3:
            testVectorGeneration()

        case 4:
            fmt.Println("\n📋 关键词映射表:")
            fmt.Println("关键词 -> 向量维度")
            fmt.Println(strings.Repeat("-", 40))

            // Print the table sorted by keyword.
            var keywords []string
            for keyword := range keywordDimensions {
                keywords = append(keywords, keyword)
            }
            sort.Strings(keywords)

            for _, keyword := range keywords {
                dims := keywordDimensions[keyword]
                fmt.Printf("%-15s -> %v\n", keyword, dims)
            }

            fmt.Printf("\n总关键词数: %d\n", len(keywords))
            fmt.Println("向量维度: 128")

        case 5:
            fmt.Println("\n👋 退出程序")
            return

        default:
            fmt.Println("❌ 无效选择,请输入1-5之间的数字")
        }
    }
}
View Code

 3.内容分类写入向量数据库

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "math"
    "regexp"
    "sort"
    "strings"
    "time"

    "github.com/milvus-io/milvus-sdk-go/v2/client"
    "github.com/milvus-io/milvus-sdk-go/v2/entity"
)

// textToVectorSimple turns text into a 128-dimensional feature vector using a
// fixed keyword table, then normalizes it with normalizeVector.
//
// Layout of the vector before normalization:
//   - slots listed in the keyword table: +1 for every keyword contained in
//     the lower-cased text;
//   - slot 100: byte length of the lower-cased text times 0.01;
//   - slots 101+offset: code point value times 0.0001 for every character
//     whose byte offset in the text is below 20;
//   - slot 121: number of matched keywords times 0.1.
func textToVectorSimple(text string) []float32 {
    const dims = 128

    lowered := strings.ToLower(text)
    features := make([]float32, dims)

    // Keyword -> vector slots it contributes to.
    keywordSlots := map[string][]int{
        "人工智能": {0, 1}, "AI": {0, 1}, "智能": {0, 1}, "人工": {0, 1},
        "机器学习": {2, 3}, "ML": {2, 3}, "学习": {2, 3},
        "深度学习": {4, 5}, "深度": {4, 5}, "神经网络": {4, 5, 6}, "NN": {4, 5},
        "自然语言": {7, 8}, "NLP": {7, 8}, "语言处理": {7, 8},
        "计算机视觉": {9, 10}, "CV": {9, 10}, "视觉": {9}, "图像": {10},
        "计算机": {11, 12}, "计算": {11}, "科学": {12},
        "数据": {13, 14}, "信息": {13}, "分析": {14},
        "算法": {15}, "模型": {16}, "训练": {17}, "预测": {18},
        "Python": {19, 20}, "编程": {20}, "代码": {20},
        "TensorFlow": {21}, "PyTorch": {22}, "框架": {21, 22},
        "是什么": {23}, "什么": {23}, "定义": {24}, "概念": {24},
        "技术": {25}, "应用": {26}, "领域": {27},
        "重要": {28}, "核心": {29}, "基础": {30},
        "过程": {31}, "方法": {32}, "系统": {33},
    }

    // Keyword features and the matched-keyword count in a single pass.
    hits := 0
    for kw, slots := range keywordSlots {
        if !strings.Contains(lowered, strings.ToLower(kw)) {
            continue
        }
        hits++
        for _, slot := range slots {
            if slot < dims {
                features[slot] += 1.0
            }
        }
    }

    // Text length feature (length in bytes).
    features[100] = float32(len(lowered)) * 0.01

    // Character features; ranging over a string yields byte offsets, so
    // multi-byte characters leave gaps between the slots that get written.
    for offset, r := range lowered {
        if offset >= 20 {
            break
        }
        features[101+offset] = float32(r) * 0.0001
    }

    // Keyword count feature.
    features[121] = float32(hits) * 0.1

    return normalizeVector(features)
}

// normalizeVector scales vector to unit Euclidean length and returns the
// result in a newly allocated slice. When the sum of squares is not positive
// (for example an all-zero vector) the input slice itself is returned
// unchanged.
func normalizeVector(vector []float32) []float32 {
    var sumSquares float32
    for i := range vector {
        sumSquares += vector[i] * vector[i]
    }

    // Nothing to scale by; hand the input back as is.
    if !(sumSquares > 0) {
        return vector
    }

    length := float32(math.Sqrt(float64(sumSquares)))
    unit := make([]float32, len(vector))
    for i := range vector {
        unit[i] = vector[i] / length
    }
    return unit
}

// extractMetadataFromContent derives a metadata map from the document text.
//
// The map contains: category, relevance, keywords, keyword_count,
// content_length (bytes), word_count (whitespace-separated fields),
// avg_word_length, created_at / updated_at (RFC 3339), quality_score,
// source_type ("text"), processed_at ("2006-01-02 15:04:05" layout), summary
// (at most the first 100 characters, followed by "..." when truncated) and
// sentiment.
func extractMetadataFromContent(content string) map[string]interface{} {
    metadata := make(map[string]interface{})

    // 1. Category and relevance.
    category := autoClassify(content)
    metadata["category"] = category
    metadata["relevance"] = calculateRelevance(content, category)

    // 2. Keywords.
    keywords := extractKeywords(content)
    metadata["keywords"] = keywords
    metadata["keyword_count"] = len(keywords)

    // 3. Basic statistics.
    metadata["content_length"] = len(content)
    metadata["word_count"] = len(strings.Fields(content))
    metadata["avg_word_length"] = calculateAvgWordLength(content)

    // 4. Timestamps; a single clock reading keeps all of them consistent.
    now := time.Now()
    createdAt := now.Format(time.RFC3339)
    metadata["created_at"] = createdAt
    metadata["updated_at"] = createdAt

    // 5. Quality score; reads "keywords" and "relevance" set above.
    metadata["quality_score"] = calculateQualityScore(content, metadata)

    // 6. Source information.
    metadata["source_type"] = "text"
    metadata["processed_at"] = now.Format("2006-01-02 15:04:05")

    // 7. Summary: first 100 characters. Truncating by runes instead of bytes
    // avoids cutting a multi-byte character in half, which would put invalid
    // UTF-8 into the stored JSON.
    summary := content
    if runes := []rune(content); len(runes) > 100 {
        summary = string(runes[:100]) + "..."
    }
    metadata["summary"] = summary

    // 8. Simple sentiment label.
    metadata["sentiment"] = analyzeSentimentSimple(content)

    return metadata
}

// autoClassify assigns content to one of a fixed set of categories by
// counting, per category, how many of its keywords occur in the lower-cased
// text. The category with the highest count wins; ties go to the category
// listed first in the table. When the best count is below 2 the result is
// "其他".
func autoClassify(content string) string {
    content = strings.ToLower(content)

    // An ordered table (instead of a map) makes tie-breaking deterministic.
    // Keywords are stored in lower case.
    categories := []struct {
        name     string
        keywords []string
    }{
        {"人工智能", []string{"人工智能", "ai", "机器学习", "深度学习", "神经网络", "自然语言", "计算机视觉", "智能", "算法", "模型"}},
        // NOTE(review): the last keyword was an empty string in the source,
        // which matches every text; "类" is a reconstruction, confirm.
        {"编程开发", []string{"编程", "代码", "python", "java", "golang", "javascript", "c++", "函数", "变量", "循环", "语法", "类"}},
        {"数据分析", []string{"数据", "分析", "统计", "excel", "sql", "数据库", "可视化", "图表", "清洗", "建模", "挖掘"}},
        {"计算机基础", []string{"计算机", "网络", "操作系统", "算法", "数据结构", "硬件", "软件", "存储", "内存", "处理器"}},
        {"科学技术", []string{"科学", "技术", "研究", "实验", "理论", "物理", "化学", "生物", "数学", "工程"}},
        {"商业经济", []string{"商业", "经济", "市场", "投资", "金融", "企业", "管理", "营销", "销售", "利润"}},
        {"教育学习", []string{"教育", "学习", "学校", "课程", "学生", "老师", "考试", "培训", "知识", "技能"}},
        {"医疗健康", []string{"医疗", "健康", "医生", "医院", "疾病", "治疗", "药物", "患者", "保健"}},
        {"文化娱乐", []string{"文化", "娱乐", "电影", "音乐", "艺术", "文学", "游戏", "体育", "旅游"}},
    }

    maxScore := 0
    bestCategory := "其他" // default category
    for _, cat := range categories {
        score := 0
        for _, keyword := range cat.keywords {
            if strings.Contains(content, keyword) {
                score++
            }
        }
        // Strictly greater: the earlier category keeps a tie.
        if score > maxScore {
            maxScore = score
            bestCategory = cat.name
        }
    }

    // Too little evidence for any category.
    if maxScore < 2 {
        return "其他"
    }

    return bestCategory
}

// calculateRelevance scores how relevant content is to category on a scale of
// 1 to 5. Starting from a base of 1 it adjusts for: category keyword matches
// (+2 for three or more, +1 for at least one), byte length (+1 above 500, -1
// below 100), structure markers (+1), citation markers (+1) and the number of
// technical terms (+1 for five or more, -1 for fewer than two). The result is
// clamped to the range [1, 5].
func calculateRelevance(content, category string) int {
    // Category keyword matches, case-insensitive.
    lowered := strings.ToLower(content)
    hits := 0
    for _, kw := range getCategoryKeywords(category) {
        if strings.Contains(lowered, strings.ToLower(kw)) {
            hits++
        }
    }

    score := 1 // base score
    switch {
    case hits >= 3:
        score += 2
    case hits >= 1:
        score++
    }

    // Content length in bytes.
    switch size := len(content); {
    case size > 500:
        score++
    case size < 100:
        score--
    }

    // Structure and citation markers.
    if isStructured(content) {
        score++
    }
    if hasCitations(content) {
        score++
    }

    // Technical vocabulary.
    switch terms := countTechnicalTerms(content); {
    case terms >= 5:
        score++
    case terms >= 2:
        // neutral: no adjustment
    default:
        score--
    }

    // Clamp to 1..5.
    switch {
    case score < 1:
        return 1
    case score > 5:
        return 5
    }
    return score
}

// getCategoryKeywords returns the keyword list used to judge relevance for
// the given category name. Unknown categories yield an empty, non-nil slice.
func getCategoryKeywords(category string) []string {
    keywordsMap := map[string][]string{
        "人工智能": {"人工智能", "ai", "机器学习", "深度学习", "神经网络", "算法", "模型", "训练", "预测", "识别"},
        // NOTE(review): the entry between "函数" and "对象" was an empty string
        // in the source, which is a substring of every text; "类" is a
        // reconstruction, confirm.
        "编程开发":  {"编程", "代码", "开发", "软件", "程序", "函数", "类", "对象", "接口", "测试", "调试"},
        "数据分析":  {"数据", "分析", "统计", "可视化", "图表", "报表", "挖掘", "处理", "清洗", "转换", "建模"},
        "计算机基础": {"计算机", "硬件", "软件", "网络", "系统", "存储", "内存", "处理器", "操作系统", "数据库"},
        "科学技术":  {"科学", "技术", "研究", "实验", "理论", "物理", "化学", "生物", "数学", "工程", "发现"},
        "商业经济":  {"商业", "经济", "市场", "投资", "金融", "企业", "管理", "营销", "销售", "利润", "成本"},
        "教育学习":  {"教育", "学习", "学校", "课程", "学生", "老师", "考试", "培训", "知识", "技能", "教学"},
        "医疗健康":  {"医疗", "健康", "医生", "医院", "疾病", "治疗", "药物", "患者", "保健", "疫苗", "诊断"},
        "文化娱乐":  {"文化", "娱乐", "电影", "音乐", "艺术", "文学", "游戏", "体育", "旅游", "演出", "表演"},
    }

    if keywords, ok := keywordsMap[category]; ok {
        return keywords
    }
    return []string{}
}

// keywordPunctRe matches every character that is not a letter, a digit or
// whitespace; extractKeywords replaces such characters with spaces.
var keywordPunctRe = regexp.MustCompile(`[^\p{L}\p{N}\s]`)

// keywordStopWords lists words that extractKeywords never reports.
//
// NOTE(review): the single-character Chinese entries were empty strings in
// the source (duplicate "" keys, which do not even compile). They are restored
// here from the common Chinese stop-word list; confirm against the original.
var keywordStopWords = map[string]bool{
    "的": true, "了": true, "在": true, "是": true, "我": true, "有": true,
    "和": true, "就": true, "不": true, "人": true, "都": true, "一": true,
    "一个": true, "上": true, "也": true, "很": true, "到": true, "说": true,
    "要": true, "去": true, "你": true, "会": true, "没有": true, "看": true,
    "自己": true, "这": true, "那": true, "他": true, "她": true, "它": true,
    "这个": true, "那个": true, "什么": true, "怎么": true, "为什么": true,
    "the": true, "and": true, "a": true, "an": true, "in": true, "on": true,
    "at": true, "to": true, "for": true, "of": true, "with": true, "by": true,
    "as": true, "it": true, "that": true, "this": true, "but": true, "or": true,
}

// extractKeywords returns up to ten of the most frequent words in content.
//
// The text is lower-cased, punctuation is replaced by spaces and the result is
// split on whitespace. Words of a single byte and stop words are ignored.
// Words are ordered by descending frequency; words with equal frequency are
// ordered alphabetically so the output is deterministic. The result is nil
// when no word qualifies.
func extractKeywords(content string) []string {
    cleaned := keywordPunctRe.ReplaceAllString(strings.ToLower(content), " ")

    // Count word frequencies.
    wordFreq := make(map[string]int)
    for _, word := range strings.Fields(cleaned) {
        if len(word) > 1 && !keywordStopWords[word] {
            wordFreq[word]++
        }
    }

    type wordFreqPair struct {
        word  string
        count int
    }

    pairs := make([]wordFreqPair, 0, len(wordFreq))
    for word, count := range wordFreq {
        pairs = append(pairs, wordFreqPair{word, count})
    }

    // Map iteration order is random, so ties need an explicit order.
    sort.Slice(pairs, func(i, j int) bool {
        if pairs[i].count != pairs[j].count {
            return pairs[i].count > pairs[j].count
        }
        return pairs[i].word < pairs[j].word
    })

    // Keep the top ten.
    var keywords []string
    for i := 0; i < 10 && i < len(pairs); i++ {
        keywords = append(keywords, pairs[i].word)
    }

    return keywords
}

// calculateAvgWordLength returns the mean length, in bytes, of the
// whitespace-separated words in content. It returns 0 when content contains
// no words.
func calculateAvgWordLength(content string) float64 {
    tokens := strings.Fields(content)
    count := len(tokens)
    if count == 0 {
        return 0
    }

    byteTotal := 0
    for i := 0; i < count; i++ {
        byteTotal += len(tokens[i])
    }

    return float64(byteTotal) / float64(count)
}

// citationMarkerRe matches any of the textual hints that hasCitations treats
// as a citation or source reference. It is compiled once instead of on every
// call.
var citationMarkerRe = regexp.MustCompile(
    `参考文献|引用|\[[1-9][0-9]*\]|http|www\.|来源|出处|作者|论文|期刊|杂志|网站|链接|参考`)

// hasCitations reports whether content contains a citation hint: one of a
// fixed set of Chinese words (such as 参考文献, 来源, 论文), "http", "www." or a
// bracketed reference number like "[3]" or "[12]".
//
// The bracket pattern used to be `\[1-9\]`, which only matched the literal
// text "[1-9]" and therefore never matched a real reference number.
func hasCitations(content string) bool {
    return citationMarkerRe.MatchString(content)
}

// isStructured reports whether content shows any sign of explicit structure:
// numbered or bulleted list items, Markdown headings, chapter headings, a
// blank line, or Chinese ordering words and ordinals. The first matching
// marker decides; a marker whose check fails is treated as not matching.
func isStructured(content string) bool {
    markers := [...]string{
        // list items: "1. item", "* item", "- item"
        `\d+\.\s`,
        `\* `,
        `- `,
        // Markdown heading
        `#{1,6}\s`,
        // chapter heading
        `第[一二三四五六七八九十]+章`,
        // blank line between paragraphs
        `\n\n`,
        // ordering words
        `首先`, `其次`, `然后`, `最后`,
        // Chinese ordinals
        `一、`, `二、`, `三、`, `四、`,
        `第一`, `第二`, `第三`, `第四`,
    }

    for i := 0; i < len(markers); i++ {
        if found, err := regexp.MatchString(markers[i], content); err == nil && found {
            return true
        }
    }
    return false
}

// countTechnicalTerms returns how many entries of a fixed technical glossary
// occur in the lower-cased content. Each glossary entry counts at most once,
// and entries that are substrings of one another (for example 网络 and
// 网络协议) are counted independently.
func countTechnicalTerms(content string) int {
    content = strings.ToLower(content)
    technicalTerms := []string{
        "人工智能", "机器学习", "深度学习", "神经网络", "算法", "模型", "函数", "变量",
        // NOTE(review): the entry after "对象" was an empty string in the
        // source, so every text was counted as containing one term; "类" is a
        // reconstruction, confirm.
        "数据库", "网络", "协议", "接口", "对象", "类", "继承", "封装", "多态",
        "数据结构", "时间复杂度", "空间复杂度", "递归", "迭代", "排序", "查找",
        "统计分析", "概率", "统计", "回归", "分类", "聚类", "降维",
        "操作系统", "进程", "线程", "内存", "存储", "文件系统", "网络协议",
        "编译器", "解释器", "语法", "语义", "词法分析", "语法分析",
    }

    count := 0
    for _, term := range technicalTerms {
        if strings.Contains(content, strings.ToLower(term)) {
            count++
        }
    }
    return count
}

// calculateQualityScore rates content between 0 and 1 by adding up partial
// scores for byte length, number of extracted keywords (metadata["keywords"],
// a []string), relevance (metadata["relevance"], an int), structure markers,
// citation markers and technical vocabulary. Metadata entries that are
// missing or of another type contribute nothing. The sum is capped at 1.0.
func calculateQualityScore(content string, metadata map[string]interface{}) float64 {
    var total float64

    // Length in bytes.
    switch size := len(content); {
    case size > 1000:
        total += 0.3
    case size > 500:
        total += 0.2
    case size > 200:
        total += 0.1
    }

    // Number of extracted keywords.
    if kws, ok := metadata["keywords"].([]string); ok {
        switch n := len(kws); {
        case n >= 5:
            total += 0.2
        case n >= 3:
            total += 0.1
        }
    }

    // Relevance score.
    if rel, ok := metadata["relevance"].(int); ok {
        total += float64(rel) * 0.1
    }

    // Structure markers.
    if isStructured(content) {
        total += 0.1
    }

    // Citation markers.
    if hasCitations(content) {
        total += 0.2
    }

    // Technical vocabulary.
    switch terms := countTechnicalTerms(content); {
    case terms >= 5:
        total += 0.2
    case terms >= 3:
        total += 0.1
    }

    // Cap at 1.
    if total > 1.0 {
        return 1.0
    }
    return total
}

// analyzeSentimentSimple labels content as "积极" (positive), "消极"
// (negative) or "中性" (neutral) by counting how many words from three small
// word lists occur in the lower-cased text. A polarity wins only when its
// count is strictly greater than both other counts; everything else,
// including text that matches no word at all, is "中性".
func analyzeSentimentSimple(content string) string {
    // NOTE(review): the first positive word was an empty string in the
    // source, which matches every text and made word-free text "积极";
    // "好" is a reconstruction, confirm.
    positiveWords := []string{"好", "优秀", "成功", "优势", "进步", "发展", "创新", "突破"}
    negativeWords := []string{"问题", "困难", "挑战", "不足", "缺点", "失败", "错误", "缺陷"}
    neutralWords := []string{"说明", "介绍", "解释", "描述", "定义", "概念", "理论", "方法"}

    content = strings.ToLower(content)

    // countHits returns how many of the given words occur in content.
    countHits := func(words []string) int {
        hits := 0
        for _, word := range words {
            if strings.Contains(content, word) {
                hits++
            }
        }
        return hits
    }

    posCount := countHits(positiveWords)
    negCount := countHits(negativeWords)
    neuCount := countHits(neutralWords)

    switch {
    case posCount > negCount && posCount > neuCount:
        return "积极"
    case negCount > posCount && negCount > neuCount:
        return "消极"
    default:
        return "中性"
    }
}

// InsertDocumentAuto inserts content into the "documents" collection with
// metadata derived automatically by extractMetadataFromContent and a vector
// produced by textToVectorSimple. It returns a wrapped error when the metadata
// cannot be serialized to JSON or when the insert call fails.
func InsertDocumentAuto(conn client.Client, content string) error {
    // Derive and serialize the metadata.
    encodedMeta, err := json.Marshal(extractMetadataFromContent(content))
    if err != nil {
        return fmt.Errorf("序列化元数据失败: %w", err)
    }

    // Build a single-row column set: one value per column.
    embedding := textToVectorSimple(content)
    row := []entity.Column{
        entity.NewColumnFloatVector("vector", 128, [][]float32{embedding}),
        entity.NewColumnVarChar("content", []string{content}),
        entity.NewColumnJSONBytes("metadata", [][]byte{encodedMeta}),
    }

    // Insert the row.
    if _, err = conn.Insert(context.Background(), "documents", "", row...); err != nil {
        return fmt.Errorf("插入失败: %w", err)
    }
    return nil
}

// createCollectionIfNotExists makes sure a collection named "documents"
// exists. It lists the existing collections and returns early when one with
// that name is found; otherwise it creates the collection with an auto-ID
// int64 primary key, a 128-dimensional float vector, a VarChar content field
// (max_length 65535) and a JSON metadata field. Listing and creation errors
// are returned wrapped.
func createCollectionIfNotExists(conn client.Client) error {
    const name = "documents"
    ctx := context.Background()

    existing, err := conn.ListCollections(ctx)
    if err != nil {
        return fmt.Errorf("获取集合列表失败: %w", err)
    }
    for i := range existing {
        if existing[i].Name == name {
            fmt.Println("✅ 集合 'documents' 已存在")
            return nil
        }
    }

    // Field definitions for the new collection.
    idField := &entity.Field{
        Name:       "id",
        DataType:   entity.FieldTypeInt64,
        PrimaryKey: true,
        AutoID:     true,
    }
    vectorField := &entity.Field{
        Name:       "vector",
        DataType:   entity.FieldTypeFloatVector,
        TypeParams: map[string]string{"dim": "128"},
    }
    contentField := &entity.Field{
        Name:       "content",
        DataType:   entity.FieldTypeVarChar,
        TypeParams: map[string]string{"max_length": "65535"},
    }
    metadataField := &entity.Field{
        Name:     "metadata",
        DataType: entity.FieldTypeJSON,
    }

    schema := &entity.Schema{
        CollectionName: name,
        Description:    "文档向量存储",
        AutoID:         true,
        Fields:         []*entity.Field{idField, vectorField, contentField, metadataField},
    }

    err = conn.CreateCollection(ctx, schema, 2)
    if err != nil {
        return fmt.Errorf("创建集合失败: %w", err)
    }

    fmt.Println("✅ 集合 'documents' 创建成功")
    return nil
}

// createIndex builds an IVF_FLAT index (L2 metric, constructor argument 1024)
// on the "vector" field of the "documents" collection. Both a failure to
// construct the index description and a failure of the create call are
// returned wrapped with the same message.
func createIndex(conn client.Client) error {
    ivfFlat, err := entity.NewIndexIvfFlat(entity.L2, 1024)
    if err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    err = conn.CreateIndex(context.Background(), "documents", "vector", ivfFlat, false)
    if err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    fmt.Println("✅ 索引创建成功")
    return nil
}

// loadCollection asks Milvus to load the "documents" collection and prints a
// confirmation on success. A load failure is returned wrapped.
func loadCollection(conn client.Client) error {
    err := conn.LoadCollection(context.Background(), "documents", false)
    if err != nil {
        return fmt.Errorf("加载集合失败: %w", err)
    }

    fmt.Println("✅ 集合加载成功")
    return nil
}

// searchDocuments embeds query with textToVectorSimple and runs an L2 vector
// search against the "vector" field of the "documents" collection, asking for
// topK hits and the "id", "content" and "metadata" output fields.
//
// Every hit becomes a map with the key "score" and, when the value could be
// read, "id" (int64), "content" (string) and "metadata" (decoded JSON object).
// Hits are returned in the order delivered by the search call.
//
// An error is returned when the search parameter cannot be built or when the
// search call itself fails.
func searchDocuments(conn client.Client, query string, topK int) ([]map[string]interface{}, error) {
    ctx := context.Background()

    // Embed the query.
    queryVector := textToVectorSimple(query)

    // Previously the constructor error was discarded, which would have passed
    // a nil search parameter on to Search.
    sp, err := entity.NewIndexIvfFlatSearchParam(8)
    if err != nil {
        return nil, fmt.Errorf("创建搜索参数失败: %w", err)
    }

    searchResult, err := conn.Search(
        ctx,
        "documents",
        nil,
        "",
        []string{"id", "content", "metadata"},
        []entity.Vector{entity.FloatVector(queryVector)},
        "vector",
        entity.L2,
        topK,
        sp,
    )
    if err != nil {
        return nil, err
    }

    // Convert the column-oriented result into one map per hit.
    var results []map[string]interface{}

    for _, res := range searchResult {
        var idCol, contentCol, metaCol entity.Column

        for _, col := range res.Fields {
            switch col.Name() {
            case "id":
                idCol = col
            case "content":
                contentCol = col
            case "metadata":
                metaCol = col
            }
        }

        for i := 0; i < res.ResultCount; i++ {
            result := map[string]interface{}{
                "score": res.Scores[i],
            }

            if idCol != nil {
                if id, err := idCol.GetAsInt64(i); err == nil {
                    result["id"] = id
                }
            }

            if contentCol != nil {
                if content, err := contentCol.GetAsString(i); err == nil {
                    result["content"] = content
                }
            }

            // The JSON column is read through its raw field data and decoded
            // row by row; rows that fail to decode are left without metadata.
            if metaCol != nil {
                fieldData := metaCol.FieldData()
                if fieldData != nil && fieldData.GetScalars() != nil {
                    if jsonData := fieldData.GetScalars().GetJsonData(); jsonData != nil {
                        if i < len(jsonData.Data) {
                            var metadata map[string]interface{}
                            if err := json.Unmarshal(jsonData.Data[i], &metadata); err == nil {
                                result["metadata"] = metadata
                            }
                        }
                    }
                }
            }

            results = append(results, result)
        }
    }

    return results, nil
}

// main runs the document-processing demo: it connects to Milvus, ensures the
// collection exists, extracts metadata for and inserts a fixed set of sample
// documents, builds the index, loads the collection and then serves an
// interactive search prompt until the user types "exit".
func main() {
    fmt.Println("=== Milvus 文档自动处理和搜索演示 ===")
    fmt.Println("连接 Milvus...")

    // Connect to Milvus.
    conn, err := client.NewClient(context.Background(), client.Config{
        Address: "172.16.71.31:30030", // adjust to match your deployment
    })
    if err != nil {
        log.Fatal("❌ 连接失败:", err)
    }
    defer conn.Close()

    fmt.Println("✅ 连接成功")

    // Create the collection; a failure is logged but does not abort the demo.
    if err := createCollectionIfNotExists(conn); err != nil {
        log.Printf("⚠️ 创建集合失败: %v", err)
    }

    // Sample texts.
    documents := []string{
        `人工智能是计算机科学的一个重要分支,专注于创建能够执行通常需要人类智能的任务的机器。
        这些任务包括视觉感知、语音识别、决策制定和语言翻译等。人工智能的研究领域包括机器学习、
        深度学习、自然语言处理和计算机视觉等。`,

        `Python是一种高级编程语言,以其简洁的语法和强大的库支持而闻名。
        它广泛用于Web开发、数据分析、人工智能和科学计算等领域。Python的设计哲学强调代码的可读性,
        其语法允许程序员用更少的代码行表达概念。`,

        `数据分析是从原始数据中提取有意义信息的过程。
        它包括数据清洗、转换、建模和可视化等步骤,帮助组织做出更好的决策。
        常用的数据分析工具有Python的pandas、numpy库,以及R语言、SQL等。`,

        `机器学习是人工智能的核心,使计算机能够在没有明确编程的情况下学习。
        监督学习、无监督学习和强化学习是机器学习的三种主要类型。
        机器学习应用广泛,包括推荐系统、图像识别、自然语言处理等。`,

        `深度学习是机器学习的一个子领域,它使用称为神经网络的算法。
        神经网络受人脑结构启发,能够从大量数据中学习复杂模式。
        深度学习在图像识别、语音识别和自然语言处理方面取得了显著成果。`,

        `TensorFlow是Google开发的一个开源深度学习框架。
        它提供了一个全面的生态系统,包含各种工具、库和社区资源,
        使研究人员和开发人员能够轻松构建和部署机器学习模型。`,

        `PyTorch是Facebook开发的另一个流行的深度学习框架。
        它以动态计算图和易用性而闻名,深受研究人员喜爱。
        PyTorch提供了强大的GPU加速和自动微分功能。`,
    }

    fmt.Printf("\n📄 准备处理 %d 个文档...\n", len(documents))

    // Process and insert every document.
    for i, content := range documents {
        fmt.Printf("\n--- 处理文档 %d/%d ---\n", i+1, len(documents))

        // Extract metadata automatically (printed here for inspection only).
        metadata := extractMetadataFromContent(content)

        fmt.Printf("📊 自动提取的元数据:\n")
        fmt.Printf("  分类: %s\n", metadata["category"])
        fmt.Printf("  相关性: %d/5\n", metadata["relevance"])
        fmt.Printf("  关键词: %v\n", metadata["keywords"])
        fmt.Printf("  质量分: %.2f/1\n", metadata["quality_score"])
        fmt.Printf("  字数: %d\n", metadata["content_length"])
        fmt.Printf("  词数: %d\n", metadata["word_count"])
        fmt.Printf("  平均词长: %.1f\n", metadata["avg_word_length"])
        fmt.Printf("  关键词数: %d\n", metadata["keyword_count"])
        fmt.Printf("  情感: %s\n", metadata["sentiment"])
        fmt.Printf("  处理时间: %s\n", metadata["processed_at"])
        fmt.Printf("  摘要: %s\n", metadata["summary"])

        // Insert into Milvus.
        fmt.Print("💾 插入到Milvus... ")
        if err := InsertDocumentAuto(conn, content); err != nil {
            log.Printf("❌ 插入文档 %d 失败: %v", i+1, err)
        } else {
            fmt.Println("✅ 成功")
        }
    }

    fmt.Println("\n🎉 所有文档处理完成!")

    // Build the index.
    fmt.Print("\n🔧 创建索引... ")
    if err := createIndex(conn); err != nil {
        log.Printf("❌ 创建索引失败: %v", err)
    } else {
        fmt.Println("✅ 成功")
    }

    // Load the collection.
    fmt.Print("📂 加载集合... ")
    if err := loadCollection(conn); err != nil {
        log.Printf("❌ 加载集合失败: %v", err)
    } else {
        fmt.Println("✅ 成功")
    }

    // Interactive search loop.
    fmt.Println("\n🔍 开始交互式搜索")
    fmt.Println("输入 'exit' 退出搜索")
    fmt.Println("=" + strings.Repeat("=", 50))

    for {
        fmt.Print("\n请输入搜索内容: ")
        var query string
        // NOTE: fmt.Scanln stores a single space-delimited token, so only
        // queries without spaces are supported by this prompt.
        fmt.Scanln(&query)

        query = strings.TrimSpace(query)
        if query == "" {
            continue
        }

        if strings.ToLower(query) == "exit" {
            fmt.Println("👋 退出搜索")
            break
        }

        fmt.Printf("正在搜索: %s\n", query)

        // Run the search.
        results, err := searchDocuments(conn, query, 5)
        if err != nil {
            fmt.Printf("❌ 搜索失败: %v\n", err)
            continue
        }

        fmt.Printf("\n📈 找到 %d 个结果:\n", len(results))
        fmt.Println(strings.Repeat("-", 80))

        if len(results) == 0 {
            fmt.Println("没有找到相关结果")
            continue
        }

        for i, result := range results {
            // The score is expected to be a float32 distance; skip a malformed
            // entry instead of panicking on a failed type assertion.
            score, ok := result["score"].(float32)
            if !ok {
                continue
            }
            // Map the distance to a percentage: 0 distance -> 100%.
            similarityPercent := 100.0 / (1.0 + float64(score))

            fmt.Printf("\n%d. 匹配度: %.1f%%\n", i+1, similarityPercent)
            fmt.Printf("   距离: %.4f\n", score)

            if content, ok := result["content"].(string); ok {
                // Show the first 100 characters. Truncate on rune boundaries:
                // slicing bytes would split multi-byte UTF-8 characters.
                displayContent := content
                if runes := []rune(content); len(runes) > 100 {
                    displayContent = string(runes[:100]) + "..."
                }
                fmt.Printf("   内容: %s\n", displayContent)
            }

            if metadata, ok := result["metadata"].(map[string]interface{}); ok {
                fmt.Printf("   元数据: ")
                if category, ok := metadata["category"].(string); ok {
                    fmt.Printf("分类: %s, ", category)
                }
                // encoding/json decodes JSON numbers into float64 when the
                // target is interface{}, so an int assertion would never match.
                if relevance, ok := metadata["relevance"].(float64); ok {
                    fmt.Printf("相关性: %.0f/5, ", relevance)
                }
                if quality, ok := metadata["quality_score"].(float64); ok {
                    fmt.Printf("质量: %.2f/1", quality)
                }
                fmt.Println()

                if keywords, ok := metadata["keywords"].([]interface{}); ok && len(keywords) > 0 {
                    fmt.Printf("   关键词: ")
                    for j, kw := range keywords {
                        if j >= 5 { // show at most 5 keywords
                            fmt.Print("...")
                            break
                        }
                        fmt.Printf("%v ", kw)
                    }
                    fmt.Println()
                }
            }
        }
    }

    fmt.Println("\n🎯 演示结束")
}
View Code

 4. 第三方向量(调用外部 embedding 服务生成向量)

package main

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "sort"
    "strings"

    "github.com/milvus-io/milvus-sdk-go/v2/client"
    "github.com/milvus-io/milvus-sdk-go/v2/entity"
)

// keywordDimensions maps a keyword to a list of dimension indices. It is used
// only for UI analysis hints and does not affect vector generation.
var keywordDimensions = map[string][]int{
    // Artificial-intelligence related terms
    "人工智能": {0, 1}, "AI": {0, 1}, "智能": {0, 1}, "人工": {0, 1},
    "机器学习": {2, 3}, "ML": {2, 3}, "学习": {2, 3},
    "深度学习": {4, 5}, "深度": {4, 5}, "神经网络": {4, 5, 6}, "NN": {4, 5},
    "自然语言": {7, 8}, "NLP": {7, 8}, "语言处理": {7, 8},
    "计算机视觉": {9, 10}, "CV": {9, 10}, "视觉": {9}, "图像": {10},

    // Computer-science fundamentals
    "计算机": {11, 12}, "计算": {11}, "科学": {12},
    "数据": {13, 14}, "信息": {13}, "分析": {14},
    "算法": {15}, "模型": {16}, "训练": {17}, "预测": {18},

    // Programming and technology
    "Python": {19, 20}, "编程": {20}, "代码": {20},
    "TensorFlow": {21}, "PyTorch": {22}, "框架": {21, 22},

    // Other common terms
    "是什么": {23}, "什么": {23}, "定义": {24}, "概念": {24},
    "技术": {25}, "应用": {26}, "领域": {27},
    "重要": {28}, "核心": {29}, "基础": {30},
    "过程": {31}, "方法": {32}, "系统": {33},
}

// textToVectorFromService posts text to the embedding HTTP service (model
// "m3e-large") and returns the embedding of the first item in the response.
// It returns an error when the request cannot be built or sent, when the
// service answers with a non-200 status, when the response cannot be decoded,
// or when no non-empty vector is present.
func textToVectorFromService(text string) ([]float32, error) {
    const endpoint = "http://172.16.71.31:9997/v1/embeddings"

    requestBody, err := json.Marshal(map[string]interface{}{
        "input": []string{text},
        "model": "m3e-large",
    })
    if err != nil {
        return nil, fmt.Errorf("构建请求体失败: %w", err)
    }

    resp, err := http.Post(endpoint, "application/json", bytes.NewBuffer(requestBody))
    if err != nil {
        return nil, fmt.Errorf("调用 embedding 服务失败: %w", err)
    }
    defer resp.Body.Close()

    if status := resp.StatusCode; status != http.StatusOK {
        return nil, fmt.Errorf("embedding 服务返回非200状态码: %d", status)
    }

    // Only the fields needed from the response are declared.
    type embeddingItem struct {
        Embedding []float32 `json:"embedding"`
    }
    var parsed struct {
        Data []embeddingItem `json:"data"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
        return nil, fmt.Errorf("解析 embedding 响应失败: %w", err)
    }

    var embedding []float32
    if len(parsed.Data) > 0 {
        embedding = parsed.Data[0].Embedding
    }
    if len(embedding) == 0 {
        return nil, fmt.Errorf("embedding 服务未返回有效向量")
    }

    // Per the original author's note, m3e-large returns 1024-dimensional
    // float32 vectors that are usually L2-normalized (not verified here).
    return embedding, nil
}

// cosineSimilarity returns the dot product of vec1 and vec2 (used for testing).
// The result equals the cosine similarity only when both vectors are already
// L2-normalized, which the caller assumes for m3e embeddings.
// vec2 must be at least as long as vec1.
func cosineSimilarity(vec1, vec2 []float32) float32 {
    var sum float32
    for idx := 0; idx < len(vec1); idx++ {
        sum += vec1[idx] * vec2[idx]
    }
    return sum
}

// testVectorGeneration exercises the embedding service with a few fixed
// sentences: it prints each vector's dimension and leading components, then a
// pairwise similarity matrix computed with cosineSimilarity. It stops at the
// first embedding failure after logging it.
func testVectorGeneration() {
    fmt.Println("\n=== 向量生成测试(通过 embedding 服务)===")

    testTexts := []string{
        "人工智能是计算机科学的一个重要分支",
        "机器学习是人工智能的核心技术",
        "数据分析是从数据中提取有价值信息的过程",
        "Python是最受欢迎的机器学习编程语言之一",
    }

    vectors := make([][]float32, len(testTexts))

    for i, text := range testTexts {
        vec, err := textToVectorFromService(text)
        if err != nil {
            log.Printf("❌ 生成向量失败: %v", err)
            return
        }
        vectors[i] = vec

        // Format up to the first five components; guarding the length avoids
        // an index-out-of-range panic if the service returns a short vector.
        preview := make([]string, 0, 5)
        for _, v := range vec {
            if len(preview) == 5 {
                break
            }
            preview = append(preview, fmt.Sprintf("%.3f", v))
        }

        fmt.Printf("%d. 文本: %s\n", i+1, text)
        fmt.Printf("   向量维度: %d\n", len(vec))
        fmt.Printf("   前5维: [%s]\n\n", strings.Join(preview, ", "))
    }

    // Similarity matrix.
    fmt.Println("=== 余弦相似度矩阵(近似)===")
    fmt.Printf("%-40s", "文本")
    for i := range testTexts {
        fmt.Printf("  T%d", i+1)
    }
    fmt.Println()

    for i, text1 := range testTexts {
        // Truncate on rune boundaries: slicing bytes would cut a multi-byte
        // UTF-8 character in half and print garbage.
        label := text1
        if runes := []rune(text1); len(runes) > 38 {
            label = string(runes[:35]) + "..."
        }
        fmt.Printf("%-40s", label)
        for j := range testTexts {
            sim := cosineSimilarity(vectors[i], vectors[j])
            fmt.Printf(" %5.3f", sim)
        }
        fmt.Println()
    }
    fmt.Println("(值越接近1.0表示越相似)")
}

// InsertDocument embeds content via the embedding service and inserts one row
// (vector, content, JSON-encoded metadata) into the "documents" collection.
// It returns an error if embedding, metadata encoding or the insert fails.
func InsertDocument(conn client.Client, content string, metadata map[string]interface{}) error {
    embedding, err := textToVectorFromService(content)
    if err != nil {
        return fmt.Errorf("生成 embedding 失败: %w", err)
    }

    encodedMeta, err := json.Marshal(metadata)
    if err != nil {
        return err
    }

    // One single-row column per schema field; the vector column is declared
    // with the collection's fixed dimension of 1024.
    columns := make([]entity.Column, 0, 3)
    columns = append(columns, entity.NewColumnFloatVector("vector", 1024, [][]float32{embedding}))
    columns = append(columns, entity.NewColumnVarChar("content", []string{content}))
    columns = append(columns, entity.NewColumnJSONBytes("metadata", [][]byte{encodedMeta}))

    _, insertErr := conn.Insert(context.Background(), "documents", "", columns...)
    return insertErr
}

// recreateTestData rebuilds the demo data set from scratch: it drops the
// "documents" collection (a failure is only logged, since the collection may
// not exist), recreates it with a 1024-dimensional vector field, inserts the
// sample documents together with their metadata, builds an IVF_FLAT index and
// loads the collection. Individual insert failures are logged and counted but
// do not abort the run; collection, index and load failures are returned.
func recreateTestData(conn client.Client) error {
    ctx := context.Background()

    // Drop the old collection.
    if err := conn.DropCollection(ctx, "documents"); err != nil {
        log.Printf("删除集合失败(可能不存在): %v", err)
    }
    // Create the new collection (1024 dimensions).
    schema := &entity.Schema{
        CollectionName: "documents",
        Description:    "文档向量存储(m3e-large 1024维)",
        AutoID:         true,
        Fields: []*entity.Field{
            {
                Name:       "id",
                DataType:   entity.FieldTypeInt64,
                PrimaryKey: true,
                AutoID:     true,
            },
            {
                Name:       "vector",
                DataType:   entity.FieldTypeFloatVector,
                TypeParams: map[string]string{"dim": "1024"},
            },
            {
                Name:     "content",
                DataType: entity.FieldTypeVarChar,
                TypeParams: map[string]string{
                    "max_length": "65535",
                },
            },
            {
                Name:     "metadata",
                DataType: entity.FieldTypeJSON,
            },
        },
    }

    if err := conn.CreateCollection(ctx, schema, 2); err != nil {
        return fmt.Errorf("创建集合失败: %w", err)
    }

    fmt.Println("✅ 集合创建成功(1024维)")

    // Test documents to insert.
    documents := []struct {
        content  string
        metadata map[string]interface{}
    }{
        {
            content: "人工智能是计算机科学的一个重要分支,专注于创建智能机器",
            metadata: map[string]interface{}{
                "category":         "人工智能",
                "relevance":        5,
                "matched_keywords": []string{"人工智能", "计算机", "科学", "智能"},
            },
        },
        {
            content: "机器学习是人工智能的核心技术,使计算机能从数据中学习和改进",
            metadata: map[string]interface{}{
                "category":         "机器学习",
                "relevance":        4,
                "matched_keywords": []string{"机器学习", "人工智能", "核心", "技术", "数据", "学习"},
            },
        },
        {
            content: "深度学习是机器学习的一个子领域,基于神经网络模型",
            metadata: map[string]interface{}{
                "category":         "深度学习",
                "relevance":        3,
                "matched_keywords": []string{"深度学习", "机器学习", "神经网络", "模型"},
            },
        },
        {
            content: "自然语言处理是人工智能的重要应用,让计算机理解人类语言",
            metadata: map[string]interface{}{
                "category":         "自然语言处理",
                "relevance":        3,
                "matched_keywords": []string{"自然语言", "处理", "人工智能", "应用", "计算机", "语言"},
            },
        },
        {
            content: "数据分析是从大量数据中提取有价值信息和洞察的过程",
            metadata: map[string]interface{}{
                "category":         "数据分析",
                "relevance":        2,
                "matched_keywords": []string{"数据", "分析", "信息", "过程"},
            },
        },
        {
            content: "Python在人工智能和数据科学领域非常流行",
            metadata: map[string]interface{}{
                "category":         "编程语言",
                "relevance":        1,
                "matched_keywords": []string{"Python", "人工智能", "数据", "科学"},
            },
        },
        {
            content: "TensorFlow是Google开发的深度学习框架",
            metadata: map[string]interface{}{
                "category":         "深度学习框架",
                "relevance":        1,
                "matched_keywords": []string{"TensorFlow", "Google", "深度学习", "框架"},
            },
        },
        {
            content: "PyTorch是Facebook开发的另一个流行的深度学习框架",
            metadata: map[string]interface{}{
                "category":         "深度学习框架",
                "relevance":        1,
                "matched_keywords": []string{"PyTorch", "Facebook", "深度学习", "框架"},
            },
        },
        {
            content: "计算机视觉用于图像识别和分析",
            metadata: map[string]interface{}{
                "category":         "计算机视觉",
                "relevance":        2,
                "matched_keywords": []string{"计算机", "视觉", "图像", "识别", "分析"},
            },
        },
        {
            content: "算法是解决问题的一系列步骤和规则",
            metadata: map[string]interface{}{
                "category":         "计算机基础",
                "relevance":        1,
                "matched_keywords": []string{"算法", "问题", "解决", "步骤", "规则"},
            },
        },
    }

    fmt.Println("插入测试数据...")
    successCount := 0
    for i, doc := range documents {
        // Insert the metadata prepared above. It must not be replaced with an
        // empty map here, otherwise category/relevance are never stored and
        // the search output cannot display them.
        if err := InsertDocument(conn, doc.content, doc.metadata); err != nil {
            log.Printf("❌ 插入文档 %d 失败: %v", i+1, err)
            continue
        }
        successCount++

        // Truncate on rune boundaries so multi-byte characters stay intact.
        short := doc.content
        if runes := []rune(doc.content); len(runes) > 50 {
            short = string(runes[:47]) + "..."
        }
        fmt.Printf("✅ 插入文档 %d: %s\n", i+1, short)
    }

    fmt.Printf("\n✅ 成功插入 %d/%d 个文档\n", successCount, len(documents))

    // Build an IVF_FLAT index (suited to small and medium data sets).
    index, err := entity.NewIndexIvfFlat(entity.L2, 128)
    if err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    if err := conn.CreateIndex(ctx, "documents", "vector", index, false); err != nil {
        return fmt.Errorf("创建索引失败: %w", err)
    }

    fmt.Println("✅ 索引创建成功 (IVF_FLAT, nlist=128)")

    // Load the collection.
    if err := conn.LoadCollection(ctx, "documents", false); err != nil {
        return fmt.Errorf("加载集合失败: %w", err)
    }

    fmt.Println("✅ 集合加载成功")
    fmt.Println("\n🎉 测试数据准备完成!")

    return nil
}

// SearchDocuments embeds query via the embedding service and runs an L2
// vector search for the topK nearest rows in the "documents" collection.
// Each hit carries its id, content, decoded JSON metadata (when available)
// and distance; hits are returned sorted by ascending distance. An error is
// returned if the embedding, the search parameters or the search itself fail.
func SearchDocuments(conn client.Client, query string, topK int) ([]SearchResult, error) {
    ctx := context.Background()

    queryVector, err := textToVectorFromService(query)
    if err != nil {
        return nil, fmt.Errorf("生成查询向量失败: %w", err)
    }

    // nprobe=16. The constructor's error was previously discarded; surface it
    // rather than passing a possibly-nil search parameter to Search.
    sp, err := entity.NewIndexIvfFlatSearchParam(16)
    if err != nil {
        return nil, fmt.Errorf("创建搜索参数失败: %w", err)
    }

    searchResult, err := conn.Search(
        ctx,
        "documents",
        nil,
        "",
        []string{"id", "content", "metadata"},
        []entity.Vector{entity.FloatVector(queryVector)},
        "vector",
        entity.L2,
        topK,
        sp,
    )
    if err != nil {
        return nil, err
    }

    var results []SearchResult

    for _, res := range searchResult {
        // Locate the requested output columns by name.
        var idCol, contentCol, metaCol entity.Column

        for _, col := range res.Fields {
            switch col.Name() {
            case "id":
                idCol = col
            case "content":
                contentCol = col
            case "metadata":
                metaCol = col
            }
        }

        for i := 0; i < res.ResultCount; i++ {
            result := SearchResult{Score: res.Scores[i]}

            // A field that is missing or cannot be read is left at its zero
            // value; the hit itself is still returned.
            if idCol != nil {
                if id, err := idCol.GetAsInt64(i); err == nil {
                    result.ID = id
                }
            }

            if contentCol != nil {
                if content, err := contentCol.GetAsString(i); err == nil {
                    result.Content = content
                }
            }

            if metaCol != nil {
                // Read the raw JSON bytes of row i from the column's field
                // data and decode them into a generic map.
                fieldData := metaCol.FieldData()
                if fieldData != nil && fieldData.GetScalars() != nil {
                    if jsonData := fieldData.GetScalars().GetJsonData(); jsonData != nil {
                        if i < len(jsonData.Data) {
                            var metadata map[string]interface{}
                            if err := json.Unmarshal(jsonData.Data[i], &metadata); err == nil {
                                result.Metadata = metadata
                            }
                        }
                    }
                }
            }

            results = append(results, result)
        }
    }

    // A smaller L2 distance means a closer match.
    sort.Slice(results, func(i, j int) bool {
        return results[i].Score < results[j].Score
    })

    return results, nil
}

// SearchResult is a single hit returned by SearchDocuments.
type SearchResult struct {
    ID       int64                  // value of the "id" column for this hit
    Content  string                 // value of the "content" column for this hit
    Score    float32                // search score; SearchDocuments searches with the L2 metric, so smaller is closer
    Metadata map[string]interface{} // decoded "metadata" JSON; nil when absent or undecodable
}

// main connects to Milvus and runs an interactive menu that can recreate the
// test data, run a vector search, test the embedding service, print the
// keyword table, or exit.
func main() {
    ctx := context.Background()

    fmt.Println("=== Milvus 向量搜索演示(使用 m3e-large embedding 服务)===")
    fmt.Println("连接 Milvus...")

    conn, err := client.NewClient(ctx, client.Config{
        Address: "172.16.71.31:30030",
    })
    if err != nil {
        log.Fatal("❌ 连接失败:", err)
    }
    defer conn.Close()

    fmt.Println("✅ 连接成功")

    for {
        fmt.Println("\n=== 主菜单 ===")
        fmt.Println("1. 重新创建测试数据")
        fmt.Println("2. 执行搜索")
        fmt.Println("3. 测试 embedding 服务")
        fmt.Println("4. 查看关键词映射表(仅用于分析)")
        fmt.Println("5. 退出")
        fmt.Print("\n请选择操作 (1-5): ")

        // Input that does not parse as an integer leaves choice at 0, which
        // falls through to the default branch below.
        var choice int
        fmt.Scanln(&choice)

        switch choice {
        case 1:
            fmt.Println("\n正在重新创建测试数据...")
            if err := recreateTestData(conn); err != nil {
                fmt.Printf("❌ 创建测试数据失败: %v\n", err)
            }

        case 2:
            fmt.Print("\n请输入搜索内容: ")
            var query string
            // NOTE: fmt.Scanln stores a single space-delimited token, so only
            // queries without spaces are supported by this prompt.
            fmt.Scanln(&query)

            if strings.TrimSpace(query) == "" {
                fmt.Println("⚠️ 搜索内容不能为空")
                continue
            }

            fmt.Printf("正在搜索: %s\n", query)

            // Keyword analysis (informational only).
            fmt.Println("\n🔍 查询分析:")
            fmt.Printf("查询文本: %s\n", query)
            fmt.Printf("检测到的关键词: ")

            matchedKeywords := []string{}
            lowerQuery := strings.ToLower(query)
            for keyword := range keywordDimensions {
                if strings.Contains(lowerQuery, strings.ToLower(keyword)) {
                    matchedKeywords = append(matchedKeywords, keyword)
                }
            }
            // Map iteration order is random; sort so the same query always
            // prints its keywords in the same order.
            sort.Strings(matchedKeywords)

            if len(matchedKeywords) > 0 {
                fmt.Println(strings.Join(matchedKeywords, ", "))
            } else {
                fmt.Println("")
            }

            // Run the search.
            results, err := SearchDocuments(conn, query, 10)
            if err != nil {
                fmt.Printf("❌ 搜索失败: %v\n", err)
                continue
            }

            fmt.Printf("\n📊 搜索结果 (共 %d 个):\n", len(results))
            fmt.Println(strings.Repeat("=", 80))

            if len(results) == 0 {
                fmt.Println("没有找到相关结果")
                continue
            }

            for i, result := range results {
                // Display at most the five best hits.
                if i >= 5 {
                    break
                }

                // Convert the L2 distance to a 0-100 similarity score. The
                // distance range is unbounded, so a simple mapping is used:
                // score = 1/(1+dist) * 100.
                similarity := 100.0 / (1.0 + float64(result.Score))
                if similarity > 100 {
                    similarity = 100
                }

                stars := int(similarity / 20)
                if stars > 5 {
                    stars = 5
                }

                fmt.Printf("\n%d. 匹配度: %.1f%% ", i+1, similarity)
                // Five-slot rating bar: filled stars followed by empty ones.
                // The glyph literals were empty strings, which always
                // rendered the bar as "[]".
                fmt.Printf("[%s]\n", strings.Repeat("★", stars)+strings.Repeat("☆", 5-stars))
                fmt.Printf("   L2距离: %.4f (越小越好)\n", result.Score)
                fmt.Printf("   ID: %d\n", result.ID)
                fmt.Printf("   内容: %s\n", result.Content)

                if result.Metadata != nil {
                    if category, ok := result.Metadata["category"].(string); ok {
                        fmt.Printf("   分类: %s\n", category)
                    }
                    // JSON numbers decode to float64 in a map[string]interface{}.
                    if relevance, ok := result.Metadata["relevance"].(float64); ok {
                        fmt.Printf("   相关性: %.0f/5\n", relevance)
                    }
                }
                fmt.Println()
            }

        case 3:
            testVectorGeneration()

        case 4:
            fmt.Println("\n📋 关键词映射表(仅用于 UI 分析):")
            fmt.Println("关键词 -> 向量维度(模拟用)")
            fmt.Println(strings.Repeat("-", 40))

            // Sort the keys for a stable listing.
            var keywords []string
            for keyword := range keywordDimensions {
                keywords = append(keywords, keyword)
            }
            sort.Strings(keywords)

            for _, keyword := range keywords {
                dims := keywordDimensions[keyword]
                fmt.Printf("%-15s -> %v\n", keyword, dims)
            }

            fmt.Printf("\n总关键词数: %d\n", len(keywords))
            fmt.Println("注意:实际向量来自 m3e-large 服务(1024维),与此表无关")

        case 5:
            fmt.Println("\n👋 退出程序")
            return

        default:
            fmt.Println("❌ 无效选择,请输入1-5之间的数字")
        }
    }
}
View Code

Python 转向量方案

pip install transformers sentence-transformers torch modelscope

git clone https://github.com/ggerganov/llama.cpp.git

cd llama.cpp


apt install cmake -y

mkdir -p build && cd build


cmake ..

cmake --build . -j4


build/bin 目录下生成的主要可执行文件:
llama-embedding:生成文本嵌入向量(验证模型可用性的核心工具)
llama-quantize: 对 GGUF 模型进行量化(INT4/INT8 核心工具)
llama-convert(或直接用 llama-gguf)

转换模型文件(在 llama.cpp 根目录下执行;需先将 MODEL_PATH 设置为本地模型目录):
python convert_sentence_bert_to_ggml.py \
  ${MODEL_PATH} \
  --outtype f16 \
  --outfile ./m3e-large-ggml-f16.bin

 

python convert_hf_to_gguf.py \
/root/.cache/modelscope/hub/models/BAAI/bge-reranker-v2-m3 \
--outfile /app/models/bge-reranker-v2-m3.gguf \
--outtype f16

 

image

 

posted on 2026-02-05 20:06  可口_可乐  阅读(1)  评论(0)    收藏  举报