wuyijia

导航

tf*idf、lcs

 #tf*idf
import math
import os
from collections import Counter

# Directory holding one pre-segmented (space-separated) text file per document.
file_path = './allfiles'

# Stop words, one per line. The whole stripped line is the stop word; indexing
# the line with [0] would keep only its first character and silently fail to
# filter any multi-character stop word.
stop_list = set()
with open('stop_list.txt', 'r', encoding='utf-8') as f:
    for line in f:
        stop_word = line.strip()
        if stop_word:
            stop_list.add(stop_word)

# filename -> {word: tf}; the last pass below turns the values into tf*idf.
doc_words = dict()
# Number of documents processed.
doc_num = 0

for filename in os.listdir(file_path):
    if doc_num % 100 == 0:
        print('%s docs finished' % doc_num)
    with open(file_path + '/' + filename, 'r', encoding='utf-8') as f:
        # Raw occurrence count of every kept word in this document.
        word_counts = Counter()
        for line in f:
            for word in line.strip().split(' '):
                if not word.strip() or word in stop_list:
                    continue
                word_counts[word] += 1

    # tf = occurrences of the word / total kept words in the document.
    sum_cnt = sum(word_counts.values())
    doc_words[filename] = {
        word: count / sum_cnt for word, count in word_counts.items()
    }
    doc_num += 1
# Blank line between the progress log and the result output.
print()

# Document frequency: in how many documents each word occurs at least once.
doc_counts = Counter()
for word_freq in doc_words.values():
    doc_counts.update(word_freq.keys())

# idf = log10(doc_num / document_frequency + 1); rarer words score higher.
doc_freq = {
    word: math.log(doc_num / count + 1, 10)
    for word, count in doc_counts.items()
}

# tf * idf, written back in place over the tf values.
for word_freq in doc_words.values():
    for word in word_freq:
        word_freq[word] *= doc_freq[word]

# The 10 lowest-scoring words of one sample document.
print(sorted(doc_words['3business.seg.cln.txt'].items(),
             key=lambda x: x[1], reverse=False)[:10])
print(sorted(doc_words['3business.seg.cln.txt'].items(), key=lambda x:x[1],reverse=True)[:10])

 LCS(Longest Common Subsequence):最长公共子序列

1.子序列的特性:它可以是断开的(元素未必连在一起),但元素的先后顺序必须保持不变。

2.作用:描述两段文字之间的“相似度”。

3.Jaccard distance:通过 set(s1) & set(s2) 求两段文字公共元素(交集)的数量。

4.长度为 m 的序列 X = {x1, x2, ..., xm} 共有 2^m 个子序列,因此逐个枚举比较不可行。

5.动态规划求子序列。

'''LCS(X,Y)
Xm = Yn

abc, bfc
LCS(abc, bfc) = LCS(ab, bf)+c = b+c = bc  #递归调用 
LCS(ab, bf) = max(LCS(ab, b), LCS(a, bf)) = b
LCS(ab, b) = LCS(a,'')+b = b
LCS(a, bf) = max(LCS(a, b), LCS('', bf)) = ''
'''
 1 # a = 'ABCBDAB'
 2 # b = 'BDCABA'
 3 
 4 # n = len(a)
 5 # m = len(b)
 6 #
 7 # l =[[0]*(m+1) for x in range(n+1)] #都生成在一行
 8 # for x in l:
 9 #     print(x)
10 """
11 [0, 0, 0, 0, 0, 0, 0]
12 [0, 0, 0, 0, 0, 0, 0]
13 [0, 0, 0, 0, 0, 0, 0]
14 [0, 0, 0, 0, 0, 0, 0]
15 [0, 0, 0, 0, 0, 0, 0]
16 [0, 0, 0, 0, 0, 0, 0]
17 [0, 0, 0, 0, 0, 0, 0]
18 [0, 0, 0, 0, 0, 0, 0]
19 """
20 
21 # for i in range(1, n+1):
22 #     for j in range(1, m+1):
23 #         if a[i-1] == b[j-1]:
24 #             l[i][j] = l[i-1][j-1] + 1
25 #         else:
26 #             l[i][j] = max(l[i-1][j],l[i][j-1])
27 
28 # for line in l:
29 #     print(line)
30 '''
31 [0, 0, 0, 0, 0, 0, 0]
32 [0, 0, 0, 0, 1, 1, 1]
33 [0, 1, 1, 1, 1, 2, 2]
34 [0, 1, 1, 2, 2, 2, 2]
35 [0, 1, 1, 2, 2, 3, 3]
36 [0, 1, 2, 2, 2, 3, 3]
37 [0, 1, 2, 2, 3, 3, 4]
38 [0, 1, 2, 2, 3, 4, 4]
39 '''
40 
41 #封装成一个函数
def lcs(a, b):
    """Build the dynamic-programming table for the longest common subsequence.

    Args:
        a: First sequence (for example a string).
        b: Second sequence.

    Returns:
        A list of len(a) + 1 rows, each with len(b) + 1 integers, where
        entry [i][j] is the LCS length of a[:i] and b[:j]. The LCS length
        of the full inputs is the bottom-right entry, result[-1][-1].
    """
    n = len(a)
    m = len(b)
    # Row 0 and column 0 stay 0: the LCS with an empty prefix is empty.
    # A fresh row is created per iteration so that rows are not aliased.
    table = [[0] * (m + 1) for _ in range(n + 1)]
    for i, item_a in enumerate(a, 1):
        for j, item_b in enumerate(b, 1):
            if item_a == item_b:
                # Matching last elements extend the LCS of both shorter prefixes.
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                # Otherwise drop the last element of one side and keep the best.
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table
53 
# Demo: print the LCS length table of two sample strings, one row per line.
a = 'ABCBDAB'
b = 'BDCABA'
l = lcs(a, b)
for row in l:
    print(row)

 pyspark:

from pyspark.sql.functions import udf
from pyspark.sql.types import *  # provides IntegerType

# Assumes an active SparkSession bound to `spark` (e.g. inside the pyspark shell).
df = spark.sql("select * from badou.lcs_data")

df.show()


def _lcs_length(a, b):
    """Return the LCS length of a and b, or None when either input is NULL."""
    if a is None or b is None:
        return None
    # lcs() returns the whole DP table; the LCS length is its bottom-right cell.
    return lcs(a, b)[-1][-1]


# The declared return type must match what the wrapped function returns (an int).
lcs_udf = udf(_lcs_length, IntegerType())

# Append the LCS length of columns a and b as a new "score" column.
df_score = df.withColumn("score", lcs_udf(df.a, df.b))

df_score.show()

 

posted on 2023-05-19 14:44  小吴要努力  阅读(36)  评论(0)    收藏  举报