【算法】SimHash

package com.pachira.d;

import java.math.BigInteger;
/**
 * SimHash简介
 * 1.基本思想
 *  LSH: The basic idea is to hash the input items so that similar items are mapped to the same buckets with high probability;
 *  Hamming distance
 * 
 * 2.具体步骤
 *  a). 对于给定的一段语句，进行分词，得到有效的特征向量;
 *  b). 为每一个特征向量设置一个权值;
 *  c). 对每一个特征向量计算hash值，为01组成的n-bit签名;
 *  d). 所有特征向量进行加权（1则为正，0则为负），然后累加;
 *  e). 对于n-bit签名的累加结果，如果>0置1，否则置0;
 *  f). 得到该语句的simhash值;
 *  g). 根据不同语句simhash的Hamming Distance就来判断相似程度;
 * 
 * 3.算法优势
 *  用于比较大文本，如500字以上效果挺好，距离小于3的基本都是相似，误判率也比较低;
 *  ps: 对于短句或者标题，内容过少，离散性较强，并不足以判断相似性; 
 *      对于短句或者标题，可以尝试使用ED等（个人理解）
 *      Jaccard 相似度、余弦相似度（cos θ）
 * 
 * 4.具体流程
 *  中国人民解放军解放中国 
 *     | 
 *     |分词
 *     |
 *   ---------  hash   -------  weight ---------------  
 *  | 中  国   |  -->  | 10010 |  ---> | 3 -3 -3  3 -3 | 
 *  | 人  民   |  -->  | 11001 |  ---> | 2  2 -2 -2  2 |
 *  | 解放军   |  -->  | 11100 |  ---> | 5  5  5 -5 -5 |
 *  | 解  放   |  -->  | 11011 |  ---> | 2  2 -2  2  2 |
 *  | 中  国   |  -->  | 10001 |  ---> | 1 -1 -1 -1  1 |
 *   ---------         -------         --------------- 
 *                                sign        |
 *                | 1 1 0 0 0 |  <--- | 13 5 -3 -3 -3 |
 *               
 * 5.具体实现：
 */
public class SimHash {
    /**
     * 计算初始的hash值
     * @param source 要计算hash的特征字符串
     * @param hashbits 指定特定位数的hash
     * @return 特征的hash值
     */
    public static BigInteger hash(String source, int hashbits) {
        if (source == null || source.length() == 0) {
            return new BigInteger("0");
        } else {
            char[] sourceArray = source.toCharArray();
            BigInteger hashcode = BigInteger.valueOf(((long) sourceArray[0]) << 7);
            BigInteger m = new BigInteger("1000003");
            BigInteger mask = new BigInteger("2").pow(hashbits).subtract(new BigInteger("1"));
            for (char item : sourceArray) {
                BigInteger temp = BigInteger.valueOf((long) item);
                hashcode = hashcode.multiply(m).xor(temp).and(mask);
            }
            hashcode = hashcode.xor(new BigInteger(String.valueOf(source.length())));
            if (hashcode.equals(new BigInteger("-1"))) {
                hashcode = new BigInteger("-2");
            }
            return hashcode;
        }
    }
    /**
     * 更新向量的维度值信息
     * @param hash 某一个特征的hash值
     * @param features 基础向量
     * @param weight 某一个特征的权重
     */
    public static void updatefeatures(BigInteger hash, int[] features, int weight) {
        for (int i = 0; i < features.length; i++) {
            BigInteger bitmask = new BigInteger("1").shiftLeft(i);
            /*
             *对某一特征的hash数列进行判断,如果是1000...1,那么数组的第一位和末尾一位加1,
             *中间的62位减一,也就是说,逢1加1,逢0减1.一直到把所有的分词hash数列全部判断完毕.
             */
            if (hash.and(bitmask).signum() != 0) {
                //将该特征的信息更新到基础向量中，如果向量的第i维是大于0，+wegiht，否者-weight
                features[i] += weight;
            } else {
                features[i] -= weight;
            }
        }
    }

    /**
     * 最后对数组进行判断,大于0的记为1,小于等于0的记为0,得到一个 64bit 的数字指纹/签名.
     * 
     * @param features
     * @return
     */
    public static String fingerprint(int[] features) {
        BigInteger fingerprint = new BigInteger("0");
        StringBuffer simHashBuffer = new StringBuffer();
        for (int i = 0; i < features.length; i++) {
            if (features[i] >= 0) {
                fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
                simHashBuffer.append("1");
            } else {
                simHashBuffer.append("0");
            }
        }
        return simHashBuffer.toString();
    }
    /**
     * hamming distance: 获得simHash串中同位置不同的数字的总数
     * @param str1 simHash 1
     * @param str2 simHash 2
     * @return hamming distance
     */
    public static int hammingDistance(String str1, String str2) {
        int distance;
        if (str1.length() != str2.length()) {
            distance = -1;
        } else {
            distance = 0;
            for (int i = 0; i < str1.length(); i++) {
                if (str1.charAt(i) != str2.charAt(i)) {
                    distance++;
                }
            }
        }
        return distance;
    }
    public static void main(String[] args) {
        String s = "My thesis work focuses on large scale copy detection of digital objects such as textual documents";
        String t = "My thesis job focuses on large scale copy detection of digital objects such as textual documents";
        int hashbits = 64;
        //定义n-bit向量
        int[] features = new int[hashbits];
        int[] featuret = new int[hashbits];
        //获得特征的hash值
        BigInteger bs = hash(s, hashbits);
        BigInteger bt = hash(t, hashbits);
        //该处只是简单表示一个特征，如果对于一个文档一般会提取多个特征
        updatefeatures(bs, features, 1);
        updatefeatures(bt, featuret, 1);
        //获得文档的指纹信息
        String fingers = fingerprint(features);
        String fingert = fingerprint(featuret);
        //计算hamming distance
        int dis = hammingDistance(fingers, fingert);
        System.out.println(fingers + "\t" + s);
        System.out.println(fingert + "\t" + t);
        System.out.println(dis);
    }
}
posted on 2014-12-12 14:07 有个姑娘叫小芳阅读(386) 评论(0) 收藏举报
刷新页面返回顶部
atom_ye

【算法】SimHash

导航

公告