tf*idf、lcs
#tf*idf
import os
import math

# Directory of pre-segmented documents (one space-separated token stream per line).
file_path = './allfiles'

# Load the stop-word list: one word per line.
# BUG FIX: the original did `word = word_list[0]`, which added only the FIRST
# CHARACTER of each line to the stop set instead of the whole word.
stop_list = set()
with open('stop_list.txt', 'r', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        if word:
            stop_list.add(word)

# doc_words maps filename -> {word: tf} for each document (dict of dicts).
doc_words = dict()
# Number of documents processed so far.
doc_num = 0

for filename in os.listdir(file_path):
    # Progress indicator every 100 documents.
    if doc_num % 100 == 0:
        print('%s docs finished' % doc_num)
    with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as f:
        word_freq = dict()  # raw term counts for this document
        sum_cnt = 0         # total non-stop tokens in this document
        for line in f:
            for word in line.strip().split(' '):
                # Skip empty tokens and stop words.
                if len(word.strip()) < 1 or word in stop_list:
                    continue
                word_freq[word] = word_freq.get(word, 0) + 1
                sum_cnt += 1
        # Normalise raw counts into term frequencies (share of the doc's tokens).
        # Guard against empty documents to avoid ZeroDivisionError.
        if sum_cnt > 0:
            for word in word_freq:
                word_freq[word] /= sum_cnt
        doc_words[filename] = word_freq
        doc_num += 1

# Document frequency: in how many documents each word appears.
doc_freq = dict()
for doc in doc_words:
    for word in doc_words[doc]:
        doc_freq[word] = doc_freq.get(word, 0) + 1

# IDF, base-10 log. NOTE(review): this preserves the original expression
# log10(N/df + 1) — the "+1" is OUTSIDE the division. Confirm whether the
# intended smoothing was log10(N/(df+1)) or 1+log10(N/df) instead.
for word in doc_freq:
    doc_freq[word] = math.log(doc_num / doc_freq[word] + 1, 10)

# TF * IDF per document, in place.
for doc in doc_words:
    for word in doc_words[doc]:
        doc_words[doc][word] *= doc_freq[word]

# 10 lowest- and 10 highest-scoring terms of one sample document.
print(sorted(doc_words['3business.seg.cln.txt'].items(), key=lambda x: x[1], reverse=False)[:10])
print(sorted(doc_words['3business.seg.cln.txt'].items(), key=lambda x: x[1], reverse=True)[:10])
lcs:(Longest Common Subsequence):最长公共子序列
1.子序列:特性 他是可以断开的,未必连接在一起,但是顺序需要保证。
2.作用:描述两段文字之间的“相似度”。
3.jaccard distance set(s1)&set(s2) :求公共交集的数量。
4.x{1,2,....,m}有2^m个子序列。
5.动态规划求子序列。
'''LCS(X,Y) Xm = Yn abc, bfc LCS(abc, bfc) = LCS(ab, bf)+c = b+c = bc #递归调用 LCS(ab, bf) = max(LCS(ab, b), LCS(a, bf)) = b LCS(ab, b) = LCS(a,'')+b = b LCS(a, bf) = max(LCS(a, b), LCS('', bf)) = '' '''
1 # a = 'ABCBDAB' 2 # b = 'BDCABA' 3 4 # n = len(a) 5 # m = len(b) 6 # 7 # l =[[0]*(m+1) for x in range(n+1)] #都生成在一行 8 # for x in l: 9 # print(x) 10 """ 11 [0, 0, 0, 0, 0, 0, 0] 12 [0, 0, 0, 0, 0, 0, 0] 13 [0, 0, 0, 0, 0, 0, 0] 14 [0, 0, 0, 0, 0, 0, 0] 15 [0, 0, 0, 0, 0, 0, 0] 16 [0, 0, 0, 0, 0, 0, 0] 17 [0, 0, 0, 0, 0, 0, 0] 18 [0, 0, 0, 0, 0, 0, 0] 19 """ 20 21 # for i in range(1, n+1): 22 # for j in range(1, m+1): 23 # if a[i-1] == b[j-1]: 24 # l[i][j] = l[i-1][j-1] + 1 25 # else: 26 # l[i][j] = max(l[i-1][j],l[i][j-1]) 27 28 # for line in l: 29 # print(line) 30 ''' 31 [0, 0, 0, 0, 0, 0, 0] 32 [0, 0, 0, 0, 1, 1, 1] 33 [0, 1, 1, 1, 1, 2, 2] 34 [0, 1, 1, 2, 2, 2, 2] 35 [0, 1, 1, 2, 2, 3, 3] 36 [0, 1, 2, 2, 2, 3, 3] 37 [0, 1, 2, 2, 3, 3, 4] 38 [0, 1, 2, 2, 3, 4, 4] 39 ''' 40 41 #封装成一个函数 42 def lcs(a,b): 43 n = len(a) 44 m = len(b) 45 l = [[0] * (m + 1) for x in range(n + 1)] # 都生成在一行 46 for i in range(1, n + 1): 47 for j in range(1, m + 1): 48 if a[i - 1] == b[j - 1]: 49 l[i][j] = l[i - 1][j - 1] + 1 50 else: 51 l[i][j] = max(l[i - 1][j], l[i][j - 1]) 52 return l 53 54 a = 'ABCBDAB' 55 b = 'BDCABA' 56 l = lcs(a,b) 57 for line in l: 58 print(line)
# pyspark: score pairs of text columns by LCS length.
df = spark.sql("select * from badou.lcs_data")
df.show()

from pyspark.sql.functions import udf
from pyspark.sql.types import *

# FIXES: `IntergerType` was a typo (NameError) for IntegerType, and
# `WithColumn` must be lowercase `withColumn`. Also, lcs() returns the full
# DP table, not an int, so the UDF extracts the bottom-right cell (the LCS
# length) to match the declared IntegerType return type.
lcs_udf = udf(lambda x, y: lcs(x, y)[-1][-1], IntegerType())
df_score = df.withColumn("score", lcs_udf(df.a, df.b))
df_score.show()
浙公网安备 33010602011771号