import urllib.request

import jieba
import jieba.analyse
# Download the reference document and print its top-20 TF-IDF keywords.
# NOTE(review): assumes a local HTTP server is serving the corpus files.
data = urllib.request.urlopen("http://127.0.0.1/txt1.txt").read().decode("utf-8", "ignore")
# BUG FIX: jieba.analyse was used here before any `import jieba` ran (and
# `jieba.analyse` was never imported at all) — imports now live at the top
# of the file.  topK=20 keeps the 20 highest-weighted keywords.
word10 = jieba.analyse.extract_tags(data, topK=20)
print(word10)
import gensim
from gensim import corpora,models,similarities
import jieba
import urllib.request
# Normalise both raw texts (drop tabs, newlines and spaces), segment them
# with jieba, and space-join the tokens so each document becomes a single
# "word word word ..." string ready for split() downstream.
data1 = data.replace('\t', '').replace('\n', '').replace(' ', '')
data2 = urllib.request.urlopen("http://127.0.0.1/comment.txt").read().decode("utf-8", "ignore").replace('\t', '').replace('\n', '').replace(' ', '')
# " ".join over the token stream replaces the original quadratic
# `s += token + " "` loops; stripping the ASCII comma afterwards mirrors
# the original cleanup (the downstream split() ignores extra spaces).
data11 = " ".join(jieba.cut(data1)).replace(",", "")
data22 = " ".join(jieba.cut(data2)).replace(",", "")
documents = [data11, data22]
print(documents)
# Tokenise both documents and build the gensim Dictionary that maps each
# word to an integer id, then persist it to disk.
from collections import defaultdict

# Each document is already a space-joined token string, so split() yields
# its word list directly (the original identity comprehension added nothing).
texts = [document.split() for document in documents]
print(texts)

# Token frequencies — currently informational only, kept to support the
# disabled rare-word filter below.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# To drop rare words, re-enable (original used frequency[token], which
# read a leaked loop variable instead of the word being filtered):
# texts = [[word for word in text if frequency[word] > 3] for text in texts]

dictionary = corpora.Dictionary(texts)
# NOTE(review): hard-coded absolute Windows path — consider parameterising.
dictionary.save("C:/Users/Administrator/Desktop/tripadvisor_gm/tripadvisor_code_python/test_dict1.txt")
# Fetch the query document (txt2.txt), segment it, and convert it to a
# bag-of-words vector against the dictionary built above.
data3 = urllib.request.urlopen("http://127.0.0.1/txt2.txt").read().decode("utf-8", "ignore")
# BUG FIX: the original segmented data2 (comment.txt) here, so the freshly
# downloaded txt2.txt was never actually used as the query document.
# (Also dropped the `data3 = data = ...` double assignment that silently
# clobbered `data`.)
data31 = " ".join(jieba.cut(data3)).replace(",", "")
new_doc = data31
new_vec = dictionary.doc2bow(new_doc.split())
# Convert every training document to a bag-of-words vector, persist the
# corpus, then rank the training documents by TF-IDF cosine similarity
# against the query vector computed above.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(
    "C:/Users/Administrator/Desktop/tripadvisor_gm/tripadvisor_code_python/test_corpus1.txt",
    corpus,
)
tfidf = models.TfidfModel(corpus)
feature_num = len(dictionary.token2id)  # vocabulary size
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_num)
sim = index[tfidf[new_vec]]  # similarity of the query against each document
print(sim)
# word1 word2 word3 ... wordn
# TODO: novel (fiction) recommendation...
# TODO: automatic matching-based recommendation...