字符匹配
单模式串匹配:KMP算法
多模式串匹配:
AC自动机算法
package lwk;
import java.io.*;
import java.util.*;
/**
 * Multi-pattern dictionary matcher built on an Aho-Corasick automaton.
 *
 * <p>Input:
 * <ol>
 *   <li>A dictionary resource with one word per line, e.g. "浙江", "浙江大学", "美国",
 *       "美国政府". The dictionary may contain about one million words.</li>
 *   <li>A document string, typically around 2000 characters long.</li>
 * </ol>
 *
 * <p>Output: every dictionary word that occurs in the document, mapped to the list of
 * start offsets at which it occurs, e.g.
 * <pre>
 * {
 *   "美国"     : [0, 16, ...],
 *   "浙江"     : [28, ...],
 *   "浙江大学" : [28, ...]
 * }
 * </pre>
 *
 * <p>The design goal is to keep the time complexity of the lookup as low as possible.
 */
public class DictionarySearcher {
    /** Root of the trie; the only node whose fail link stays {@code null}. */
    private final AcNode root = new AcNode();

    /**
     * Builds the automaton from a classpath resource containing one word per line.
     * The whole dictionary is kept in memory. The resource is decoded as UTF-8.
     *
     * @param filename resource name, resolved from the classpath root ("/" + filename)
     * @throws IllegalArgumentException if the resource cannot be found
     * @throws IllegalStateException    if reading the resource fails
     */
    public DictionarySearcher(String filename) {
        long loadStart = System.currentTimeMillis();
        InputStream inputStream = this.getClass().getResourceAsStream("/" + filename);
        if (inputStream == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of an anonymous NullPointerException further down.
            throw new IllegalArgumentException(
                    "Dictionary resource not found on classpath: /" + filename);
        }
        // Closing the reader also closes the wrapped input stream.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                insertAcNode(root, line);
            }
        } catch (IOException e) {
            // A partially loaded dictionary would silently miss words, so do not continue.
            throw new IllegalStateException(
                    "Failed to read dictionary resource: /" + filename, e);
        }
        long loadEnd = System.currentTimeMillis();
        buildFailPath(root);
        long failEnd = System.currentTimeMillis();
        System.out.println("字典构建耗时:" + (loadEnd - loadStart) + " ms");
        System.out.println("fail指针构建耗时:" + (failEnd - loadEnd) + " ms");
    }

    /**
     * Builds the automaton from an in-memory collection of words.
     * Empty and {@code null} entries are ignored.
     *
     * @param words dictionary words
     */
    public DictionarySearcher(Collection<String> words) {
        for (String word : words) {
            insertAcNode(root, word);
        }
        buildFailPath(root);
    }

    /**
     * Finds every occurrence of every dictionary word in {@code document}.
     *
     * @param document text to scan; must not be {@code null}
     * @return map from matched word to the list of its start offsets in the document,
     *         in ascending order; words that never occur are absent
     */
    public Map<String, List<Integer>> search(String document) {
        long start = System.currentTimeMillis();
        Map<String, List<Integer>> matches = new HashMap<>();
        AcNode state = root;
        char[] chars = document.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];
            // Follow fail links until some state accepts ch, or we are stuck at the root.
            while (state.children.get(ch) == null && state.failNode != null) {
                state = state.failNode;
            }
            AcNode next = state.children.get(ch);
            if (next == null) {
                // Only reachable at the root: no dictionary word starts with ch.
                continue;
            }
            state = next;
            // wordLengthList holds the length of every dictionary word ending at this state
            // (the word spelled by the state itself plus all of its suffix words).
            for (Integer wordLen : state.wordLengthList) {
                int startIndex = i - wordLen + 1;
                String matchWord = document.substring(startIndex, i + 1);
                List<Integer> positions = matches.get(matchWord);
                if (positions == null) {
                    positions = new ArrayList<>();
                    matches.put(matchWord, positions);
                }
                positions.add(startIndex);
            }
        }
        long end = System.currentTimeMillis();
        System.out.println("查询耗时:" + (end - start) + " ms");
        return matches;
    }

    /** Demo entry point: loads {@code inputfile.txt} and prints the matches of a sample text. */
    public static void main(String[] args) {
        DictionarySearcher dictionarySearcher = new DictionarySearcher("inputfile.txt");
        String document = "美国规划协会中国办公室揭牌仪式及美国规划领域合作研讨会在浙江大学城乡规划设计研究院208会议室举行。美国规划协会CEO James Drinan,国际项目及外联主任Jeffrey Soule先生,浙江大学党委副书记任少波,浙江大学控股集团领导杨其和,西湖区政府代表应权英副主任";
        Map<String, List<Integer>> search = dictionarySearcher.search(document);
        printMap(search);
        System.out.println();
    }

    /** A single state of the automaton (one trie node). */
    public static class AcNode {
        /** Outgoing trie edges keyed by character. */
        private Map<Character, AcNode> children;
        /** State to fall back to on a mismatch; {@code null} only for the root. */
        private AcNode failNode;
        /** Lengths of all dictionary words that end at this state. */
        private Set<Integer> wordLengthList;

        public AcNode() {
            this.children = new HashMap<>();
            this.failNode = null;
            this.wordLengthList = new HashSet<>();
        }

        public Map<Character, AcNode> getChildren() {
            return children;
        }

        public void setChildren(Map<Character, AcNode> children) {
            this.children = children;
        }

        public AcNode getFailNode() {
            return failNode;
        }

        public void setFailNode(AcNode failNode) {
            this.failNode = failNode;
        }

        public Set<Integer> getWordLengthList() {
            return wordLengthList;
        }

        public void setWordLengthList(Set<Integer> wordLengthList) {
            this.wordLengthList = wordLengthList;
        }
    }

    /**
     * Inserts one word into the trie rooted at {@code root}.
     * Empty or {@code null} words are skipped: registering length 0 on the root would be
     * propagated along the fail links and produce empty-string matches everywhere.
     *
     * @param root trie root
     * @param s    word to insert
     */
    public static void insertAcNode(AcNode root, String s) {
        if (s == null || s.isEmpty()) {
            return;
        }
        AcNode current = root;
        for (char ch : s.toCharArray()) {
            AcNode child = current.children.get(ch);
            if (child == null) {
                child = new AcNode();
                // Provisional fail link; buildFailPath computes the real one.
                child.setFailNode(root);
                current.children.put(ch, child);
            }
            current = child;
        }
        current.wordLengthList.add(s.length());
    }

    /**
     * Computes the fail link of every node by a breadth-first traversal and merges the
     * word lengths of each fail target into the node, so that a state reports all words
     * ending at it, including proper suffixes of the word it spells.
     *
     * @param root trie root; its own fail link is left untouched
     */
    public static void buildFailPath(AcNode root) {
        Queue<AcNode> queue = new ArrayDeque<>();
        // Depth-1 nodes always fall back to the root.
        for (AcNode child : root.children.values()) {
            child.failNode = root;
            queue.offer(child);
        }
        while (!queue.isEmpty()) {
            AcNode parent = queue.poll();
            for (Map.Entry<Character, AcNode> edge : parent.children.entrySet()) {
                Character key = edge.getKey();
                AcNode child = edge.getValue();
                // Walk the parent's fail chain until a state with an edge for key is found.
                AcNode fallback = parent.failNode;
                while (fallback != null && !fallback.children.containsKey(key)) {
                    fallback = fallback.failNode;
                }
                // Chain exhausted (walked past the root) -> fall back to the root;
                // otherwise the fail target is that state's child, at whatever depth.
                child.failNode = (fallback == null) ? root : fallback.children.get(key);
                // The fail target is shallower, hence already finalized by the BFS order.
                if (child.failNode.wordLengthList != null) {
                    child.wordLengthList.addAll(child.failNode.wordLengthList);
                }
                queue.offer(child);
            }
        }
    }

    /**
     * Prints a map to standard output, one {@code "key":value} entry per line,
     * wrapped in braces. A {@code null} map prints as an empty pair of braces.
     *
     * @param map map to print, may be {@code null}
     */
    public static void printMap(Map<?, ?> map) {
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("{").append("\n");
        if (map != null) {
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                stringBuilder
                        .append("\"").append(entry.getKey()).append("\"")
                        .append(":")
                        .append(entry.getValue())
                        .append("\n");
            }
        }
        stringBuilder.append("}").append("\n");
        System.out.println(stringBuilder.toString());
    }
}
/**
* 字典构建耗时:21 ms
* fail指针构建耗时:1 ms
* 查询耗时:1 ms
* {
* "美国":[0, 16, 50]
* "浙江大学":[28, 98, 111]
* "浙江":[28, 98, 111]
* }
*/
浙公网安备 33010602011771号