文本的简单表示：Boolean Representation、Count-based Representation 与 Tf-Idf 的 Python 实现
1. Boolean representation
# Fixed vocabulary; its order defines the key order of the returned mapping.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']


def booleanRepresent(user_input):
    """Return the boolean (presence/absence) representation of a token list.

    Args:
        user_input: Iterable of hashable tokens making up one sentence.

    Returns:
        A dict with exactly one key per word of the module-level
        ``word_dict`` (in vocabulary order). The value is 1 when the word
        occurs at least once in ``user_input`` and 0 otherwise.

    Tokens that are not part of ``word_dict`` are ignored, so the result
    always has the same keys regardless of the input.
    """
    # Build the set once so each vocabulary lookup is O(1).
    present = set(user_input)
    return {word: int(word in present) for word in word_dict}
# Demo: two pre-tokenized sentences, each printed as its boolean encoding
# on a separate line.
user_input1 = ['我们', '今天', '去', '爬山']
user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']
print(booleanRepresent(user_input1), booleanRepresent(user_input2), sep='\n')
输出结果:
{'我们': 1, '又': 0, '去': 1, '爬山': 1, '今天': 1, '你们': 0, '昨天': 0, '跑步': 0}
{'我们': 0, '又': 1, '去': 1, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}
2. Count-based Representation
# Fixed vocabulary; its order defines the key order of the returned mapping.
word_dict = ['我们', '又', '去', '爬山', '今天', '你们', '昨天', '跑步']
# Sample pre-tokenized sentence used by the demo call.
user_input2 = ['你们', '又', '去', '爬山', '又', '去', '跑步']


def countRepresent(user_input):
    """Return the count-based representation of a token list.

    Args:
        user_input: Iterable of hashable tokens making up one sentence.

    Returns:
        A dict with exactly one key per word of the module-level
        ``word_dict`` (in vocabulary order), mapped to the number of times
        that word occurs in ``user_input``.

    Tokens that are not part of ``word_dict`` are ignored, so the result
    always has the same keys regardless of the input.
    """
    count = dict.fromkeys(word_dict, 0)
    # Count the tokens of the argument itself, not of a module-level sample.
    for word in user_input:
        if word in count:
            count[word] += 1
    return count
# Print the result: a bare call expression discards it when run as a script.
print(countRepresent(user_input2))
输出结果:
{'我们': 0, '又': 2, '去': 2, '爬山': 1, '今天': 0, '你们': 1, '昨天': 0, '跑步': 1}
3. Tf-Idf表示
import math
# Demo corpus: three short pre-tokenized texts.
text1 = ['今天', '上', 'NLP', '课程']
text2 = ['今天', '的', '课程', '也', '有', '意思']
text3 = ['数据', '课程', '也', '有', '意思']
document = [text1, text2, text3]

# Vocabulary: every distinct token that appears in the three texts above.
word_dict = [
    '今天', '上', 'NLP', '课程', '的',
    '有', '意思', '数据', '也',
]
def getIDF(word_dict, document):
    """Compute the inverse document frequency (IDF) of each vocabulary word.

    The value for a word ``w`` is ``ln(N / df(w))`` where ``N`` is the
    number of texts in ``document`` and ``df(w)`` is the number of those
    texts that contain ``w``.

    Args:
        word_dict: Iterable of vocabulary words.
        document: Sized collection of texts; each text must support the
            ``in`` operator (e.g. a list of tokens).

    Returns:
        A dict mapping every word of ``word_dict`` to its IDF as a float.
        A word that occurs in no text is mapped to 0.0.
    """
    total_texts = len(document)
    idf_of_word = {}
    for word in word_dict:
        texts_with_word = sum(1 for text in document if word in text)
        # df == 0 would divide by zero; such a word also has tf == 0 in
        # every text of the corpus, so a weight of 0.0 keeps tf-idf at zero.
        if texts_with_word:
            idf_of_word[word] = math.log(total_texts / texts_with_word)
        else:
            idf_of_word[word] = 0.0
    return idf_of_word
# Show the IDF of every vocabulary word over the demo corpus.
print(getIDF(word_dict=word_dict, document=document))
IDF输出结果:
{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 1.0986122886681098, '有': 0.4054651081081644, '意思': 0.4054651081081644, '数据': 1.0986122886681098, '也': 0.4054651081081644}
def getTfIdf(word_dict, text, idf_of_word=None):
    """Compute the tf-idf weight of every vocabulary word for one text.

    The term frequency is the raw number of occurrences of the word in
    ``text``; it is multiplied by the word's IDF.

    Args:
        word_dict: Iterable of vocabulary words.
        text: The tokenized text to encode; must provide ``count()``
            (e.g. a list of tokens).
        idf_of_word: Optional mapping from word to IDF. When omitted, it is
            computed with ``getIDF(word_dict, document)`` using the
            module-level ``document`` corpus.

    Returns:
        A dict mapping every word of ``word_dict`` to ``tf * idf``.

    Raises:
        KeyError: If ``idf_of_word`` has no entry for a vocabulary word.
    """
    if idf_of_word is None:
        idf_of_word = getIDF(word_dict, document)
    # count() yields 0 for absent words, so no membership test is needed.
    return {w: text.count(w) * idf_of_word[w] for w in word_dict}


if __name__ == "__main__":
    print(getTfIdf(word_dict, text1))
Tf-Idf输出结果:
{'今天': 0.4054651081081644, '上': 1.0986122886681098, 'NLP': 1.0986122886681098, '课程': 0.0, '的': 0.0, '有': 0.0, '意思': 0.0, '数据': 0.0, '也': 0.0}

浙公网安备 33010602011771号