自己动手写搜索引擎

radic 是我用go语言实现搜索引擎的索引部分,包括:
倒排索引
正排索引
分布式索引

使用举例

首先需要go get github.com/Orisun/radic

package main

import (
	"radic/types"
	"bytes"
	"encoding/gob"
	"fmt"
	"radic"
	"time"
)

//Book 以图书搜索为例
type Book struct {
	Id      uint32
	Name    string
	Price   float32
	Chinese bool //是否为汉语版
}

//Serialize 图书序列化。序列化和反序列化由调用方决定,这不是radic负责的范畴。
func (self *Book) Serialize() []byte {
	var value bytes.Buffer
	encoder := gob.NewEncoder(&value) //gob是go自带的序列化方法,当然也可以用protobuf等其它方式
	err := encoder.Encode(self)
	if err == nil {
		return value.Bytes()
	} else {
		fmt.Println(err)
		return []byte{}
	}
}

//DeserializeBook  图书反序列化
func DeserializeBook(v []byte) *Book {
	buf := bytes.NewReader(v)
	dec := gob.NewDecoder(buf)
	var data = Book{}
	err := dec.Decode(&data)
	if err == nil {
		return &data
	} else {
		fmt.Println(err)
		return nil
	}
}

func main() {
	/**
	初始化索引
	 */
	options := types.IndexerOpts{}
	engine := radic.IndexEngine{}
	engine.Init(options, types.DistOpts{}, "radic.log") //DistOpts为空,则采用单机索引
	defer engine.Destroy()
	
	/**
	往索引上添加数据
	 */
	book1 := Book{Id: 1, Name: "工业机器学习算法详解与实战", Price: 100.0, Chinese: true}
	book2 := Book{Id: 2, Name: "effective go", Price: 200.0, Chinese: false}
	kw1 := &types.Keyword{Field: "field", Word: "算法"}
	kw2 := &types.Keyword{Field: "title", Word: "工业"}
	kw3 := &types.Keyword{Field: "title", Word: "机器学习"}
	kw4 := &types.Keyword{Field: "field", Word: "编程"}
	kw5 := &types.Keyword{Field: "title", Word: "go"}
	docInfo1 := types.DocInfo{
		DocId:            uint32(book1.Id),
		Keyword:          []*types.Keyword{kw1, kw2, kw3},
		RankScore:        book1.Price,
		Entry:            book1.Serialize(),
		CompositeFeature: 1, //转成二进制,倒数第1位上是1
	}
	docInfo2 := types.DocInfo{
		DocId:            uint32(book2.Id),
		Keyword:          []*types.Keyword{kw4, kw5},
		RankScore:        book2.Price,
		Entry:            book2.Serialize(),
		CompositeFeature: 3, //转成二进制,倒数第1位和第2位上都是1
	}
	engine.IndexDoc(docInfo1) //往索引上添加数据是异步执行的
	engine.IndexDoc(docInfo2)
	time.Sleep(100 * time.Millisecond) //稍等一会儿,等Add操作执行完成
	engine.InvertIndexStatistics("")   //建好索引后最好调用一次InvertIndexStatistics,有利于搜索速度的提升
	
	/**
	按关键词检索
	 */
	keyword := &types.Keyword{Field: "title", Word: "go"}
	request := types.SearchRequest{
		Must:          []*types.Keyword{keyword},
		Should:        nil,
		Not:           nil,
		OutputOffset:  0,
		OnFlag:        1, //要求倒数第1位上是1
		OffFlag:       4, //要求倒数第3位上不能是1
		Orderless:     false,
		CountDocsOnly: false,
		Timeout:       200,
	}
	response := engine.Search(request)
	if !response.Timeout {
		fmt.Printf("一共有%d条搜索结果:\n", response.Total)
		for _, doc := range response.Docs {
			book := DeserializeBook(doc)
			fmt.Printf("book name %s\n", book.Name)
		}
	}
	fmt.Println()
	
	/**
	按doc id检索
	 */
	doc := engine.GetDoc(book1.Id, 1) //根据ID获取一个doc的详情。flag=1仅获取detail,flag=0都获取
	book := DeserializeBook(doc.Entry)
	fmt.Printf("book name %s\n", book.Name)
	fmt.Printf("book keywords %v\n", doc.Keyword) //由于flag=1,所以获取不到Keyword
	fmt.Println()
	
	/**
	删除doc
	 */
	engine.RemoveDoc(book2.Id)
	fmt.Println()
	
	/**
	遍历索引上的数据
	 */
	engine.IterIndex(func(docId uint32, docInfo types.DocInfo) error {
		book := DeserializeBook(docInfo.Entry)
		fmt.Printf("book name %s\n", book.Name)
		return nil
	})
}

更多API参见radic.go

posted @ 2019-08-25 17:06  张朝阳  阅读(696)  评论(0编辑  收藏