package com.pachira.d;
import java.math.BigInteger;
/**
* SimHash简介
* 1.基本思想
* LSH: The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability;
* Hamming distance
*
* 2.具体步骤
* a). 对于给定的一段语句,进行分词,得到有效的特征向量;
* b). 为每一个特征向量设置一个权值;
* c). 对每一个特征向量计算hash值,为01组成的n-bit签名;
* d). 所有特征向量进行加权(1则为正,0则为负),然后累加;
* e). 对于n-bit签名的累加结果,如果>0置1,否则置0;
* f). 得到该语句的simhash值;
* g). 根据不同语句simhash的Hamming Distance就来判断相似程度;
*
* 3.算法优势
* 用于比较大文本,如500字以上效果挺好,距离小于3的基本都是相似,误判率也比较低;
* ps: 对于短句或者标题,内容过少,离散性较强,并不足以判断相似性;
* 对于短句或者标题,可以尝试使用ED等(个人理解)
* Jaccard 相似度、余弦相似度 (cos θ)
*
* 4.具体流程
* 中国人民解放军解放中国
* |
* |分词
* |
* --------- hash ------- weight ---------------
* | 中 国 | --> | 10010 | ---> | 3 -3 -3 3 -3 |
* | 人 民 | --> | 11001 | ---> | 2 2 -2 -2 2 |
* | 解放军 | --> | 11100 | ---> | 5 5 5 -5 -5 |
* | 解 放 | --> | 11011 | ---> | 2 2 -2 2 2 |
* | 中 国 | --> | 10001 | ---> | 1 -1 -1 -1 1 |
* --------- ------- ---------------
* sign |
* | 1 1 0 0 0 | <--- | 13 5 -3 -3 -3 |
*
* 5.具体实现:
*/
public class SimHash {
    /** Multiplier applied to the running value for every character in {@link #hash(String, int)}. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /**
     * Computes the initial hash of a single feature string.
     *
     * <p>The running value is seeded from the first character shifted left by 7 bits, then for
     * every character it is multiplied by 1000003, XORed with the character code and truncated
     * to {@code hashbits} bits. Finally the string length is XORed in and the result is
     * truncated once more, so the returned value never has more than {@code hashbits}
     * significant bits.
     *
     * @param source   the feature string to hash; {@code null} or empty yields zero
     * @param hashbits the number of bits of the resulting hash
     * @return the non-negative hash of the feature, at most {@code hashbits} bits wide
     * @throws ArithmeticException if {@code hashbits} is negative
     */
    public static BigInteger hash(String source, int hashbits) {
        if (source == null || source.isEmpty()) {
            return BigInteger.ZERO;
        }
        // 2^hashbits - 1: keeps only the low `hashbits` bits of a value.
        BigInteger mask = BigInteger.valueOf(2).pow(hashbits).subtract(BigInteger.ONE);
        BigInteger hashcode = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (int i = 0; i < source.length(); i++) {
            BigInteger charValue = BigInteger.valueOf((long) source.charAt(i));
            hashcode = hashcode.multiply(HASH_MULTIPLIER).xor(charValue).and(mask);
        }
        // Mask again after mixing in the length: for a small `hashbits` the length itself
        // may need more bits than the requested width.
        return hashcode.xor(BigInteger.valueOf(source.length())).and(mask);
    }

    /**
     * Folds one feature hash into the accumulated feature vector.
     *
     * <p>For every dimension {@code i} of {@code features}, the weight is added when bit
     * {@code i} of {@code hash} is 1 and subtracted when it is 0.
     *
     * @param hash     the hash value of one feature
     * @param features the accumulated vector, modified in place
     * @param weight   the weight of the feature
     */
    public static void updatefeatures(BigInteger hash, int[] features, int weight) {
        for (int i = 0; i < features.length; i++) {
            if (hash.testBit(i)) {
                features[i] += weight;
            } else {
                features[i] -= weight;
            }
        }
    }

    /**
     * Converts the accumulated feature vector into its fingerprint string.
     *
     * <p>Each dimension contributes one character: {@code '1'} when the accumulated value is
     * greater than or equal to zero, {@code '0'} when it is negative. Characters are emitted
     * in index order, so the first character corresponds to {@code features[0]}.
     *
     * @param features the accumulated feature vector
     * @return a string of {@code '0'}/{@code '1'} characters with length {@code features.length}
     */
    public static String fingerprint(int[] features) {
        StringBuilder bits = new StringBuilder(features.length);
        for (int value : features) {
            bits.append(value >= 0 ? '1' : '0');
        }
        return bits.toString();
    }

    /**
     * Computes the Hamming distance between two fingerprint strings, i.e. the number of
     * positions at which their characters differ.
     *
     * @param str1 first fingerprint
     * @param str2 second fingerprint
     * @return the number of differing positions, or {@code -1} when the lengths differ
     */
    public static int hammingDistance(String str1, String str2) {
        if (str1.length() != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < str1.length(); i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Demo: fingerprints two nearly identical sentences and prints both fingerprints followed
     * by their Hamming distance.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String s = "My thesis work focuses on large scale copy detection of digital objects such as textual documents";
        String t = "My thesis job focuses on large scale copy detection of digital objects such as textual documents";
        int hashbits = 64;
        // One accumulator vector of hashbits dimensions per document.
        int[] features = new int[hashbits];
        int[] featuret = new int[hashbits];
        // Hash each sentence as a single feature.
        BigInteger bs = hash(s, hashbits);
        BigInteger bt = hash(t, hashbits);
        // Only one feature per document is used here for simplicity; a real document would
        // normally contribute many features.
        updatefeatures(bs, features, 1);
        updatefeatures(bt, featuret, 1);
        // Derive the fingerprint of each document.
        String fingers = fingerprint(features);
        String fingert = fingerprint(featuret);
        // Compare the two fingerprints.
        int dis = hammingDistance(fingers, fingert);
        System.out.println(fingers + "\t" + s);
        System.out.println(fingert + "\t" + t);
        System.out.println(dis);
    }
}