三、14万条数据
Product.java
准备实体类来存放产品信息
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
package com.how2java;public class Product { int id; String name; String category; float price; String place; String code; public int getId() { return id; } public void setId(int id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public String getCategory() { return category; } public void setCategory(String category) { this.category = category; } public float getPrice() { return price; } public void setPrice(float price) { this.price = price; } public String getPlace() { return place; } public void setPlace(String place) { this.place = place; } public String getCode() { return code; } public void setCode(String code) { this.code = code; } @Override public String toString() { return "Product [id=" + id + ", name=" + name + ", category=" + category + ", price=" + price + ", place=" + place + ", code=" + code + "]"; }} |
准备工具类,把140k_products.txt 文本文件,转换为泛型是Product的集合
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
package com.how2java;import java.awt.AWTException;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.HashSet;import java.util.List;import java.util.Set;import org.apache.commons.io.FileUtils; public class ProductUtil { public static void main(String[] args) throws IOException, InterruptedException, AWTException { String fileName = "140k_products.txt"; List<Product> products = file2list(fileName); System.out.println(products.size()); } public static List<Product> file2list(String fileName) throws IOException { File f = new File(fileName); List<String> lines = FileUtils.readLines(f,"UTF-8"); List<Product> products = new ArrayList<>(); for (String line : lines) { Product p = line2product(line); products.add(p); } return products; } private static Product line2product(String line) { Product p = new Product(); String[] fields = line.split(","); p.setId(Integer.parseInt(fields[0])); p.setName(fields[1]); p.setCategory(fields[2]); p.setPrice(Float.parseFloat(fields[3])); p.setPlace(fields[4]); p.setCode(fields[5]); return p; }} |
TestLucene.java
在入门中 TestLucene.java 的基础上进行修改。 主要做了三个方面的修改:
1. 索引的增加,以前是10条数据,现在是14万条数据
注: 因为数据量比较大, 所以加入到索引的时间也比较久,请耐心等待。
2. Document以前只有name字段,现在有6个字段
3. 查询关键字从控制台输入,这样每次都可以输入不同的关键字进行查询。 因为索引建立时间比较久,采用这种方式,可以建立一次索引,进行多次查询,否则的话,每次使用不同的关键字,都要耗时建立索引,测试效率会比较低
1. 索引的增加,以前是10条数据,现在是14万条数据
注: 因为数据量比较大, 所以加入到索引的时间也比较久,请耐心等待。
2. Document以前只有name字段,现在有6个字段
3. 查询关键字从控制台输入,这样每次都可以输入不同的关键字进行查询。 因为索引建立时间比较久,采用这种方式,可以建立一次索引,进行多次查询,否则的话,每次使用不同的关键字,都要耗时建立索引,测试效率会比较低
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
package com.how2java;import java.io.IOException;import java.io.StringReader;import java.util.List;import java.util.Scanner;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexableField;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.RAMDirectory;import org.wltea.analyzer.lucene.IKAnalyzer;public class TestLucene { public static void main(String[] args) throws Exception { // 1. 准备中文分词器 IKAnalyzer analyzer = new IKAnalyzer(); // 2. 索引 Directory index = createIndex(analyzer); // 3. 查询器 Scanner s = new Scanner(System.in); while(true){ System.out.print("请输入查询关键字:"); String keyword = s.nextLine(); System.out.println("当前关键字是:"+keyword); Query query = new QueryParser( "name", analyzer).parse(keyword); // 4. 搜索 IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher=new IndexSearcher(reader); int numberPerPage = 10; ScoreDoc[] hits = searcher.search(query, numberPerPage).scoreDocs; // 5. 显示查询结果 showSearchResults(searcher, hits,query,analyzer); // 6. 
关闭查询 reader.close(); } } private static void showSearchResults(IndexSearcher searcher, ScoreDoc[] hits, Query query, IKAnalyzer analyzer)throws Exception { System.out.println("找到 " + hits.length + " 个命中."); SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); System.out.println("找到 " + hits.length + " 个命中."); System.out.println("序号\t匹配度得分\t结果"); for (int i = 0; i < hits.length; ++i) { ScoreDoc scoreDoc= hits[i]; int docId = scoreDoc.doc; Document d = searcher.doc(docId); List<IndexableField> fields= d.getFields(); System.out.print((i + 1) ); System.out.print("\t" + scoreDoc.score); for (IndexableField f : fields) { if("name".equals(f.name())){ TokenStream tokenStream = analyzer.tokenStream(f.name(), new StringReader(d.get(f.name()))); String fieldContent = highlighter.getBestFragment(tokenStream, d.get(f.name())); System.out.print("\t"+fieldContent); } else{ System.out.print("\t"+d.get(f.name())); } } System.out.println("<br>"); } } private static Directory createIndex(IKAnalyzer analyzer) throws IOException { Directory index = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(index, config); String fileName = "140k_products.txt"; List<Product> products = ProductUtil.file2list(fileName); int total = products.size(); int count = 0; int per = 0; int oldPer =0; for (Product p : products) { addDoc(writer, p); count++; per = count*100/total; if(per!=oldPer){ oldPer = per; System.out.printf("索引中,总共要添加 %d 条记录,当前添加进度是: %d%% %n",total,per); } } writer.close(); return index; } private static void addDoc(IndexWriter w, Product p) throws IOException { Document doc = new Document(); doc.add(new TextField("id", String.valueOf(p.getId()), Field.Store.YES)); doc.add(new TextField("name", p.getName(), Field.Store.YES)); doc.add(new TextField("category", p.getCategory(), 
Field.Store.YES)); doc.add(new TextField("price", String.valueOf(p.getPrice()), Field.Store.YES)); doc.add(new TextField("place", p.getPlace(), Field.Store.YES)); doc.add(new TextField("code", p.getCode(), Field.Store.YES)); w.addDocument(doc); }} |

下载地址:http://how2j.cn/k/search-engine/search-engine-14k/1674.html#nowhere
140k_products.rar http://download.how2j.cn/1680/140k_products.rar
lucene.rar http://download.how2j.cn/1710/lucene.rar
浙公网安备 33010602011771号