第一次个人编程作业
一、github链接
github地址:https://github.com/sjz000/031804125
二、计算模块接口的设计与实现过程
算法流程图

模块介绍
- jieba和jieba.analyse : 用于分词并根据权重获取关键词
- sys : 用于获取命令行输入参数
- math : 用于计算根号
- 用类Similarity进行封装
- 构造函数
def __init__(self, file1, file2, topK):
    """Store the two texts to compare and the keyword budget.

    Args:
        file1: Text of the first document.
        file2: Text of the second document.
        topK: Number of highest-weighted keywords to extract per document.
    """
    self.file1 = file1
    self.file2 = file2
    self.topK = topK
- 获取两篇文章的关键词以及分词结果
def keywords(self):
    """Extract the top-weighted keywords of both documents.

    Each text is segmented with jieba in full mode, the tokens are joined
    with spaces, and ``jieba.analyse.extract_tags`` is asked for ``self.topK``
    keywords without weights. The results are stored in ``self.keywords1``
    and ``self.keywords2``.
    """
    def top_keywords(text):
        # Same pipeline for both documents: full-mode cut, re-join, rank.
        tokens = jieba.lcut(text, cut_all=True)
        return jieba.analyse.extract_tags(" ".join(tokens), topK=self.topK, withWeight=False)

    self.keywords1 = top_keywords(self.file1)
    self.keywords2 = top_keywords(self.file2)
- 建立关键词字典
def mix_dicts(self):
    """Build the shared vocabulary of both keyword lists.

    Sets ``self.dicts`` to a mapping from each distinct keyword to a unique
    index in ``range(self.length)`` and ``self.length`` to the vocabulary
    size.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # index given to each keyword is reproducible from run to run (iterating
    # a set of strings is not, because of hash randomization).
    vocabulary = dict.fromkeys([*self.keywords1, *self.keywords2])
    self.dicts = {word: index for index, word in enumerate(vocabulary)}
    self.length = len(self.dicts)
- 将分词结果转换为出现在字典中的位置
def list_codes(self):
    """Translate each keyword list into vocabulary indices.

    Looks every keyword of ``self.keywords1`` / ``self.keywords2`` up in
    ``self.dicts`` and stores the resulting index lists in ``self.codes1``
    and ``self.codes2``.

    Raises:
        KeyError: If a keyword is missing from ``self.dicts``.
    """
    self.codes1 = [self.dicts[word] for word in self.keywords1]
    self.codes2 = [self.dicts[word] for word in self.keywords2]
- 进行oneHot编码
def oneHot(self):
    """Turn the index lists into fixed-length vectors.

    Builds ``self.listoneHot1`` and ``self.listoneHot2``, each of length
    ``self.length``; position ``i`` holds how many times index ``i`` occurs
    in ``self.codes1`` / ``self.codes2`` respectively.
    """
    def encode(codes):
        vector = [0] * self.length
        for position in codes:
            vector[position] += 1
        return vector

    self.listoneHot1 = encode(self.codes1)
    self.listoneHot2 = encode(self.codes2)
- 相似度计算
def similar(self):
    """Return the cosine similarity of the two documents' keyword vectors.

    The method first strips whitespace, common punctuation and a few
    particles from ``self.file1`` / ``self.file2`` (the attributes are
    overwritten), then runs the pipeline ``keywords`` -> ``mix_dicts`` ->
    ``list_codes`` -> ``oneHot`` and compares the resulting vectors.

    Returns:
        float: The cosine of the angle between ``self.listoneHot1`` and
        ``self.listoneHot2``, or ``0.0`` when either vector has zero norm
        (for example when a text yields no keywords).
    """
    # Function-scope import so the method does not rely on bare ``sqrt`` /
    # ``reduce`` names having been imported at module level.
    import math

    # Punctuation and particles removed before segmentation. Every entry is
    # a single character, so one translate() pass is equivalent to replacing
    # them one after another.
    noise_chars = (' ', '\n', '“', ',', '。', '《', '》', ':', '”', '、',
                   '的', '了', '是', '说', '呀', '啦', '啊')
    strip_noise = str.maketrans('', '', ''.join(noise_chars))
    self.file1 = self.file1.translate(strip_noise)
    self.file2 = self.file2.translate(strip_noise)

    # Extract keywords, build the shared vocabulary, then encode both texts.
    self.keywords()
    self.mix_dicts()
    self.list_codes()
    self.oneHot()

    # sum() returns 0 for empty vectors, whereas reduce() without an initial
    # value raises TypeError there; this keeps the empty-input case on the
    # zero-division path handled below.
    dot = sum(a * b for a, b in zip(self.listoneHot1, self.listoneHot2))
    norm1 = math.sqrt(sum(a * a for a in self.listoneHot1))
    norm2 = math.sqrt(sum(b * b for b in self.listoneHot2))
    try:
        return dot / (norm1 * norm2)
    except ZeroDivisionError as e:
        # At least one vector is all zeros: report it and treat the texts as
        # having nothing in common.
        print(e)
        return 0.0
- 主函数
if __name__ == '__main__':
    # Expected arguments: original text path, suspected-copy path, and the
    # path of the file that receives the similarity score.
    if len(sys.argv) < 4:
        print("参数错误")
        sys.exit(1)
    oriPath, copyPath, ansPath = sys.argv[1:4]

    try:
        with open(oriPath, encoding='UTF-8') as fp:
            file1 = fp.read()
        with open(copyPath, encoding='UTF-8') as fp:
            file2 = fp.read()
    except (OSError, UnicodeDecodeError):
        # Stop here: without both texts the code below would only fail with
        # a NameError on the undefined variables.
        print("路径错误")
        sys.exit(1)

    # The keyword budget is 15% of the number of non-empty full-mode tokens
    # in the original text.
    words = [i for i in jieba.lcut(file1, cut_all=True) if i != '']
    topK = int(len(words)*0.15)

    s = Similarity(file1, file2, topK)
    similarity = round(s.similar(), 2)

    try:
        with open(ansPath, "w+", encoding='UTF-8') as fp:
            fp.write(str(similarity))
    except OSError:
        print("路径错误")
        sys.exit(1)
三、计算模块接口部分的性能改进
- 代码中各模块的性能

- 代码总耗时

四、计算模块部分单元测试展示
- 进行10次测试,其中9次的文本为老师所给的,还有一个文本为空文本

- 测试代码
import unittest
import compute_similarity
class Test(unittest.TestCase):
    """Compare orig.txt against each sample text and sanity-check the score."""

    # Directory that holds orig.txt and the derived sample texts.
    DATA_DIR = r"E:\python\软工实践\第一次编程作业"

    def _check_against_orig(self, copy_name):
        """Compute, print and range-check the similarity of one sample.

        Args:
            copy_name: File name (inside ``DATA_DIR``) of the text compared
                against ``orig.txt``.
        """
        with open(self.DATA_DIR + "\\" + "orig.txt", "r", encoding='UTF-8') as fp:
            orig_text = fp.read()
        with open(self.DATA_DIR + "\\" + copy_name, "r", encoding='UTF-8') as fp:
            copy_text = fp.read()
        similarity = compute_similarity.Similarity(orig_text, copy_text, int(len(orig_text)*0.15))
        similarity = round(similarity.similar(), 2)
        print(copy_name + " 与 orig.txt的相似度:" + " " + str(similarity))
        # The vectors being compared hold non-negative counts, so their
        # cosine must lie in [0, 1]; anything else means the computation is
        # broken.
        self.assertGreaterEqual(similarity, 0.0)
        self.assertLessEqual(similarity, 1.0)

    def test_add(self):
        self._check_against_orig("orig_0.8_add.txt")

    def test_del(self):
        self._check_against_orig("orig_0.8_del.txt")

    def test_dis_1(self):
        self._check_against_orig("orig_0.8_dis_1.txt")

    def test_dis_3(self):
        self._check_against_orig("orig_0.8_dis_3.txt")

    def test_dis_7(self):
        self._check_against_orig("orig_0.8_dis_7.txt")

    def test_dis_10(self):
        self._check_against_orig("orig_0.8_dis_10.txt")

    def test_dis_15(self):
        self._check_against_orig("orig_0.8_dis_15.txt")

    def test_mix(self):
        self._check_against_orig("orig_0.8_mix.txt")

    def test_rep(self):
        self._check_against_orig("orig_0.8_rep.txt")

    def test_0(self):
        # orig_0.txt is the empty-text case.
        self._check_against_orig("orig_0.txt")


if __name__ == '__main__':
    unittest.main()
- 代码单元测试覆盖率

五、计算模块部分异常处理说明
- 除零异常处理
当输入的两篇文本中有任意一篇为空时,对应向量的模为0,会出现除零的情况
# Excerpt from the similarity computation: the division is guarded so that a
# failure prints the exception and the similarity falls back to zero.
try:
    result = sum_/(A*B)
    return result
except Exception as e:
    print(e)
    return 0.00
- 读取文件的路径错误
# Excerpt from the entry script: both input files are read as UTF-8 text.
try:
    with open(oriPath,encoding='UTF-8') as fp:
        file1 = fp.read()
    with open(copyPath,encoding='UTF-8') as fp:
        file2 = fp.read()
except:
    # Any exception raised while opening or reading is reported as a path
    # error.
    # NOTE(review): this excerpt only prints; confirm the caller stops
    # afterwards, since file1/file2 may be undefined at that point.
    print("路径错误")
六、PSP表格
| PSP2.1 | Personal Software Process Stages | 预估耗时(分钟) | 实际耗时(分钟) |
|---|---|---|---|
| Planning | 计划 | 40 | 40 |
| · Estimate | · 估计这个任务需要多少时间 | 90 | 90 |
| Development | 开发 | 420 | 420 |
| · Analysis | · 需求分析 (包括学习新技术) | 420 | 540 |
| · Design Spec | · 生成设计文档 | 30 | 30 |
| · Design Review | · 设计复审 | 30 | 40 |
| · Coding Standard | · 代码规范 (为目前的开发制定合适的规范) | 15 | 15 |
| · Design | · 具体设计 | 60 | 60 |
| · Coding | · 具体编码 | 120 | 120 |
| · Code Review | · 代码复审 | 30 | 30 |
| · Test | · 测试(自我测试,修改代码,提交修改) | 90 | 60 |
| Reporting | 报告 | 60 | 60 |
| · Test Report | · 测试报告 | 30 | 30 |
| · Size Measurement | · 计算工作量 | 15 | 15 |
| · Postmortem & Process Improvement Plan | · 事后总结, 并提出过程改进计划 | 30 | 30 |
| | · 合计 | 1480 | 1580 |
七、总结
- 在完成这次作业的过程中,我认识到了自己在编程方面还有很大的不足。但是在完成作业、实现程序的过程中,我也学习到了很多新知识,同时学会了很多之前没听过、没用过的工具,也学会了如何建立github仓库、如何将文件上传到github。在之后的作业中,我会付出更多的时间,希望能学习到更多的知识和技能。

浙公网安备 33010602011771号