个人项目
| 作业所属班级 | 首页 - 软件工程2024 - 广东工业大学 - 班级博客 - 博客园 (cnblogs.com) |
|---|---|
| 作业的要求 | https://edu.cnblogs.com/campus/gdgy/SoftwareEngineering2024/homework/13136 |
| 我理解的作业目标 | 了解编码规范,学习使用Github管理代码,学习代码的性能测试与优化,学习代码的测试 |
| GitHub地址 | https://github.com/Sherry146/test/tree/main/3122004793 |
需求分析
设计一个程序,可以通过命令行参数传递原文文件地址、抄袭文件地址和答案文件地址,计算抄袭文件相对于原文文件的重复率,并将结果以保留小数点后两位的形式输出到答案文件。
计算模块接口的设计与实现过程
使用余弦相似度来计算文本相似度
分词并去除空白词(仅含空白字符的分词结果)
def tokenize(text):
    """Segment ``text`` with jieba and discard whitespace-only tokens.

    Args:
        text: The string to segment.

    Returns:
        A list of the tokens produced by ``jieba.cut`` whose stripped form
        is non-empty, in the order they were produced.
    """
    tokens = []
    for piece in jieba.cut(text):
        # A piece made only of whitespace strips down to "" and is skipped.
        if piece.strip():
            tokens.append(piece)
    return tokens
构建词袋模型
def build_bag_of_words(text):
    """Build a bag-of-words model for ``text``.

    Args:
        text: The string to tokenize and count.

    Returns:
        A ``Counter`` mapping each token returned by ``tokenize`` to the
        number of times it occurs.
    """
    return Counter(tokenize(text))
计算余弦相似度
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vector1: Mapping from token to its count (e.g. a ``Counter``).
        vector2: Mapping from token to its count (e.g. a ``Counter``).

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, as a float. Returns ``0.0`` when either vector has
        zero norm (for example when it is empty).
    """
    # Only tokens present in both vectors contribute to the dot product.
    shared = vector1.keys() & vector2.keys()
    numerator = sum(vector1[key] * vector2[key] for key in shared)

    sum1 = sum(count * count for count in vector1.values())
    sum2 = sum(count * count for count in vector2.values())

    # Take a single square root of the product instead of multiplying two
    # rounded square roots: for identical vectors the product is a perfect
    # square, so the result is exactly 1.0 rather than 0.9999999999999998.
    denominator = math.sqrt(sum1 * sum2)
    if not denominator:
        return 0.0
    return numerator / denominator
def calculate_cosine_similarity(text1, text2):
    """Compute the cosine similarity between two texts.

    Each text is converted to a bag-of-words vector with
    ``build_bag_of_words`` and the two vectors are compared with
    ``cosine_similarity``.

    Args:
        text1: The first text.
        text2: The second text.

    Returns:
        The value returned by ``cosine_similarity`` for the two vectors.
    """
    bag1 = build_bag_of_words(text1)
    bag2 = build_bag_of_words(text2)
    return cosine_similarity(bag1, bag2)
main函数入口
# Ask the user which file should be compared against the reference text.
file_name = input("请输入文件的绝对路径:")

# Context managers guarantee the input files are closed even if a read fails.
with open(file="orig.txt", mode="r", encoding="utf-8") as file_sample:
    original = file_sample.read()
with open(file=file_name, mode="r", encoding="utf-8") as file1:
    text1 = file1.read()

# Compute the similarity and express it as a percentage.
similarity = calculate_cosine_similarity(original, text1)
similarity1 = round(similarity * 100, 2)
# Format with exactly two decimals so float artefacts such as
# 57.379999999999995 never reach the output.
percent_text = f"{similarity1:.2f}"

# Append the result to anser.txt and echo it on the console.
with open(file="anser.txt", mode="a", encoding="utf-8") as file_ans:
    file_ans.write(str(file_name) + "相似度:" + percent_text + "%" + "\n")
print(file_name, "相似度:", percent_text, "%")
在命令行中输入需要查重的文件地址后,程序会读取原文文件和抄袭文件的内容,然后将内容交由calculate_cosine_similarity()处理并返回该文本的重复率,最后将重复率以保留小数点后两位的形式输出到目标文件anser.txt。
程序的执行逻辑为:main函数入口->calculate_cosine_similarity()->build_bag_of_words()->tokenize()->cosine_similarity()
计算模块接口部分的性能改进
通过line_profiler,测试程序
{
请输入文件的绝对路径:orig_0.8_add.txt
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\XIEJIA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.521 seconds.
Prefix dict has been built successfully.
orig_0.8_add.txt 相似度: 99.65 %
Timer unit: 1e-07 s
Total time: 0.827692 s
File: d:\code\code_py\personal project\test.py
Function: tokenize at line 9
Line # Hits Time Per Hit % Time Line Contents
==============================================================
9 def tokenize(text):
10 16197 8276911.0 511.0 100.0 words = [word for word in jieba.cut(text) if word.strip()]
11 2 13.0 6.5 0.0 return words
Total time: 0.832111 s
File: d:\code\code_py\personal project\test.py
Function: build_bag_of_words at line 14
Line # Hits Time Per Hit % Time Line Contents
==============================================================
14 def build_bag_of_words(text):
15 2 8311985.0 4e+06 99.9 words = tokenize(text)
16 2 9118.0 4559.0 0.1 word_counts = Counter(words)
17 2 10.0 5.0 0.0 return word_counts
Total time: 0.0037579 s
File: d:\code\code_py\personal project\test.py
Function: cosine_similarity at line 20
Line # Hits Time Per Hit % Time Line Contents
==============================================================
20 def cosine_similarity(vector1, vector2):
21 1 3338.0 3338.0 8.9 intersection = set(vector1.keys()) & set(vector2.keys())
22 1 8837.0 8837.0 23.5 numerator = sum(vector1[x] * vector2[x] for x in intersection)
23
24 1 9072.0 9072.0 24.1 sum1 = sum(vector1[x] ** 2 for x in vector1.keys())
25 1 16255.0 16255.0 43.3 sum2 = sum(vector2[x] ** 2 for x in vector2.keys())
26 1 52.0 52.0 0.1 denominator = math.sqrt(sum1) * math.sqrt(sum2)
27
28 1 7.0 7.0 0.0 if not denominator:
29 return 0.0
30 else:
31 1 18.0 18.0 0.0 return float(numerator) / denominator
Total time: 0.836009 s
File: d:\code\code_py\personal project\test.py
Function: calculate_cosine_similarity at line 33
Line # Hits Time Per Hit % Time Line Contents
==============================================================
33 def calculate_cosine_similarity(text1, text2):
34 1 6513932.0 7e+06 77.9 vector1 = build_bag_of_words(text1)
35 1 1808326.0 2e+06 21.6 vector2 = build_bag_of_words(text2)
36
37 1 37825.0 37825.0 0.5 similarity = cosine_similarity(vector1, vector2)
38 1 5.0 5.0 0.0 return similarity
Total time: 12.678 s
File: d:\code\code_py\personal project\test.py
Function: mian at line 40
Line # Hits Time Per Hit % Time Line Contents
==============================================================
40 def mian():
41 1 118387700.0 1e+08 93.4 file_name = input("请输入文件的绝对路径:")
42 1 5988.0 5988.0 0.0 file_sample = open(file="orig.txt",mode="r",encoding="utf-8")
43 1 6106.0 6106.0 0.0 file1 = open(file=file_name,mode="r",encoding="utf-8")
44
45 1 1191.0 1191.0 0.0 original = file_sample.read()
46 1 4091.0 4091.0 0.0 text1 = file1.read()
47
48
49 1 8360976.0 8e+06 6.6 similarity = calculate_cosine_similarity(original, text1)
50 1 86.0 86.0 0.0 similarity1 =round(similarity,4)
51 1 8.0 8.0 0.0 similarity1 *= 100
52
53 1 2.0 2.0 0.0 """
54 print("文本1:", original)
55 print("文本2:", text1)
56 """
57 1 5566.0 5566.0 0.0 file_ans = open(file="anser.txt",mode="a",encoding="utf-8")
58 1 97.0 97.0 0.0 file_ans.write(str(file_name)+"相似度:"+str(similarity1)+"%"+"\n")
59 1 5190.0 5190.0 0.0 print(file_name,"相似度:", similarity1,"%")
60
61 1 360.0 360.0 0.0 file_sample.close()
62 1 454.0 454.0 0.0 file1.close()
63 1 2010.0 2010.0 0.0 file_ans.close()
}
计算模块部分单元测试展示
单元测试代码
import unittest
from main import *
class TestMyfuctions(unittest.TestCase):
    """Unit tests for ``calculate_cosine_similarity``."""

    def setUp(self):
        # Bounds that every similarity score is expected to fall within.
        self.min_value = 0
        self.max_value = 1

    def tearDown(self):
        del self.min_value
        del self.max_value

    def _assert_in_range(self, value):
        """Assert that ``value`` lies within [min_value, max_value]."""
        self.assertTrue(
            self.min_value <= value <= self.max_value,
            f"{value} is not within the range [{self.min_value}, {self.max_value}]",
        )

    def test_calculate_cosine_similarity(self):
        # Identical texts: compare approximately, since the score is a float
        # and may differ from 1 by a rounding error.
        outcome = calculate_cosine_similarity("谢建豪", "谢建豪")
        self.assertAlmostEqual(outcome, 1)

        outcome = calculate_cosine_similarity("谢建豪", "谢老王")
        self.assertEqual(outcome, 0)

        outcome = calculate_cosine_similarity("好好好", "好好好好好好")
        self._assert_in_range(outcome)

        outcome = calculate_cosine_similarity("天气好好", "我先去吃饭")
        self.assertEqual(outcome, 0)

        outcome = calculate_cosine_similarity("软件测试", "软件分析")
        self.assertEqual(outcome, 0)

        outcome = calculate_cosine_similarity("你好", "Hello")
        self.assertEqual(outcome, 0)

        # Each range check below validates the score just computed; the
        # original checked a stale value from an earlier case instead.
        outcome = calculate_cosine_similarity("五年级", "八年级")
        self._assert_in_range(outcome)

        outcome = calculate_cosine_similarity("你在哪里吃饭", "你在哪里")
        self._assert_in_range(outcome)

        outcome = calculate_cosine_similarity("你生意还好吗", "你好吗")
        self._assert_in_range(outcome)


if __name__ == "__main__":
    unittest.main()
使用coverage获得测试覆盖率
(xjh) PS D:\code\code_py\personal project> coverage report
Name Stmts Miss Cover
---------------------------------------
main.py 39 1 97%
test.py 43 32 26%
test_functions.py 30 1 97%
---------------------------------------
TOTAL 112 34 70%
(test.py 为代码内print测试)
计算模块部分异常处理说明
TextIOWrapper.write() takes exactly one argument (4 given)
AttributeError: 'list' object has no attribute 'lower'
cannot import name 'Vocabulary' from 'sklearn.feature_extraction.text'
PSP表格
| PSP | Personal Software Process Stages | 预估耗时(分钟) | 实际耗时(分钟) |
|---|---|---|---|
| Planning | 计划 | ||
| · Estimate | · 估计这个任务需要多少时间 | 30 | 50 |
| Development | 开发 | ||
| · Analysis | · 需求分析 (包括学习新技术) | 120 | 180 |
| · Design Spec | · 生成设计文档 | 60 | 60 |
| · Design Review | · 设计复审 | 30 | 45 |
| · Coding Standard | · 代码规范 (为目前的开发制定合适的规范) | 20 | 10 |
| · Design | · 具体设计 | 30 | 50 |
| · Coding | · 具体编码 | 90 | 120 |
| · Code Review | · 代码复审 | 30 | 60 |
| · Test | · 测试(自我测试,修改代码,提交修改) | 60 | 90 |
| Reporting | 报告 | ||
| · Test Report | · 测试报告 | 60 | 60 |
| · Size Measurement | · 计算工作量 | 60 | 60 |
| · Postmortem & Process Improvement Plan | · 事后总结, 并提出过程改进计划 | 60 | 60 |
| | · 合计 | 620 | 795 |
浙公网安备 33010602011771号