#-*- coding: utf-8 -*-
import jieba.analyse
import numpy as np
import hashlib
cts = 128
# number of bits in the SimHash fingerprint string
def word_hash(t, w):
    """Map one hash bit to a signed weight.

    Returns +w when the bit character/value *t* equals 1, otherwise -w.
    Used to accumulate the weighted bit vector of a SimHash.
    """
    return w if int(t) == 1 else -w
def content_hash(t):
    """Collapse an accumulated weight into one fingerprint bit.

    A strictly positive weight becomes '1'; zero or negative becomes '0'.
    """
    if float(t) > 0:
        return '1'
    return '0'
def simhash(content):
    """Compute a `cts`-bit (128-bit) SimHash fingerprint of *content*.

    Extracts up to 100 weighted keywords with jieba, hashes each keyword
    with MD5 into a fixed-width binary string, accumulates the weighted
    bit vector, and collapses it to a '0'/'1' string of length `cts`.

    Returns:
        str: the fingerprint, e.g. '0110...'(length == cts).
    """
    tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)
    hash_init = np.zeros(cts)
    for t, w in tags:
        md5 = hashlib.md5()
        md5.update(t.encode('utf-8'))
        # Fixed-width binary representation of the MD5 digest,
        # left-padded with zeros and truncated to the low `cts` bits.
        hash_str = bin(int(md5.hexdigest(), 16)).replace('0b', '').zfill(cts)[-cts:]
        # BUG FIX: on Python 3 `map` returns an iterator, so the original
        # `map(...) + hash_init` raised TypeError. Materialize the weighted
        # bits as a list and add via numpy; behavior on Python 2 is unchanged.
        hash_str_deal = [word_hash(x, w) for x in hash_str]
        hash_init = hash_init + np.array(hash_str_deal)
    return ''.join(content_hash(x) for x in hash_init)
# ======================================================================
# File boundary: simhash module above; demo/comparison script below.
# ======================================================================
#-*- coding: utf-8 -*-
from simhash import simhash
import Levenshtein
if __name__ == '__main__':
    content1 = '而在感怀的热潮过后,关于人人网以及人人直播的价值讨论,开始浮出水面。'
    content2 = '陈一舟还认为,相比年轻创业者,老江湖更适合做产业互联网。'
    hash1 = simhash(content1)
    hash2 = simhash(content2)
    # Normalized Hamming distance between the two 128-bit fingerprints:
    # the fraction of differing bits (0.0 = identical, 1.0 = all differ).
    # BUG FIX: parenthesized `print` so the script runs on both Python 2
    # (where it was a statement) and Python 3 (where it is a function).
    print(Levenshtein.hamming(hash1, hash2) / 128.0)
    # NOTE: the builtin hash(str) is salted per interpreter run, so it
    # would yield different values after a restart — unusable for stable
    # fingerprints, which is why MD5-based SimHash is used instead.