简单查重系统

作业要求

作业所属班级 软件工程2024链接
作业要求 作业要求链接
作业目标 设计一个论文查重算法,给出一个原文文件和一个在这份原文上经过了增删改的抄袭版论文的文件,在答案文件中输出其重复率。

PSP

PSP2.1 Personal Software Process Stages 预估耗时(分钟) 实际耗时(分钟)
Planning 计划 60 60
· Estimate · 估计这个任务需要多少时间 60 60
Development 开发 1120 1280
· Analysis · 需求分析 (包括学习新技术) 300 370
· Design Spec · 生成设计文档 70 60
· Design Review · 设计复审 30 30
· Coding Standard · 代码规范 (为目前的开发制定合适的规范) 30 30
· Design · 具体设计 90 90
· Coding · 具体编码 300 360
· Code Review · 代码复审 60 60
· Test · 测试(自我测试,修改代码,提交修改) 240 300
Reporting 报告 240 240
· Test Report · 测试报告 60 70
· Size Measurement · 计算工作量 60 60
· Postmortem & Process Improvement Plan · 事后总结, 并提出过程改进计划 120 120
·  合计 1420 1590

接口设计

从命令行输入的路径名读取对应的文件,将文件的内容转化为对应的字符串,由字符串得出对应的 simHash值,由 simHash值求出相似度,把相似度写入最后的结果文件中。

点击查看代码
package Utils;

/**
 * Helpers for comparing two simHash fingerprints that are encoded as strings
 * of {@code '0'} / {@code '1'} characters.
 */
public class HammingUtils {

    /**
     * Computes the Hamming distance between two simHash strings, i.e. the
     * number of positions at which their characters differ.
     *
     * @param simHash1 first fingerprint
     * @param simHash2 second fingerprint
     * @return the number of differing positions, or {@code -1} when either
     *         argument is {@code null} or the two lengths differ
     */
    public static int getHammingDistance(String simHash1, String simHash2) {
        if (simHash1 == null || simHash2 == null || simHash1.length() != simHash2.length()) {
            // Error sentinel: the two values cannot be compared position by position.
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < simHash1.length(); i++) {
            if (simHash1.charAt(i) != simHash2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Converts the Hamming distance between two simHash strings into a
     * similarity ratio: the fraction of positions that are equal.
     *
     * <p>The ratio is computed in floating point. Integer arithmetic would
     * truncate small distances to zero and report a similarity of 1.0 for
     * fingerprints that actually differ.
     *
     * @param simHash1 first fingerprint
     * @param simHash2 second fingerprint
     * @return a value in {@code [0.0, 1.0]}; {@code 1.0} means identical fingerprints
     * @throws IllegalArgumentException if either argument is {@code null} or
     *         the two fingerprints have different lengths
     */
    public static double getSimilarity(String simHash1, String simHash2) {
        int distance = getHammingDistance(simHash1, simHash2);
        if (distance < 0) {
            // Never turn the -1 sentinel into a plausible-looking similarity value.
            throw new IllegalArgumentException(
                    "simHash values must be non-null and of equal length");
        }
        int bits = simHash1.length();
        if (bits == 0) {
            // Two empty fingerprints are trivially identical; also avoids 0/0.
            return 1.0;
        }
        return 1.0 - (double) distance / bits;
    }

}

通过海明距离计算相似度
点击查看代码
package Utils;

/**
 * Checked exception signalling that a text is too short to be processed.
 *
 * <p>It mirrors the four standard {@link Exception} constructors and adds no
 * state of its own.
 */
public class ShortStringException extends Exception {

    /** Creates an exception with neither a detail message nor a cause. */
    public ShortStringException() {
    }

    /**
     * Creates an exception that wraps another throwable.
     *
     * @param cause the underlying cause
     */
    public ShortStringException(final Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message the detail message
     */
    public ShortStringException(final String message) {
        super(message);
    }

    /**
     * Creates an exception with a detail message and an underlying cause.
     *
     * @param message the detail message
     * @param cause   the underlying cause
     */
    public ShortStringException(final String message, final Throwable cause) {
        super(message, cause);
    }

}

点击查看代码
import java.math.BigInteger;
import java.security.MessageDigest;
import java.util.List;

/**
 * Computes 128-bit simHash fingerprints for text, using MD5 as the per-keyword
 * hash and HanLP keyword extraction as the tokenizer.
 */
public class SimHashUtils {

    /** Width, in bits, of the MD5 digest and of the resulting simHash. */
    private static final int HASH_BITS = 128;

    /** Texts shorter than this many characters are rejected by {@link #getSimHash(String)}. */
    private static final int MIN_TEXT_LENGTH = 200;

    /** Number of weight levels; the first keywords in the list get this weight. */
    private static final int WEIGHT_LEVELS = 10;

    /**
     * Hashes a string with MD5 and returns the digest as a binary string.
     *
     * <p>The digest is rendered with {@code BigInteger.toString(2)}, so leading
     * zero bits are dropped and the result may be shorter than 128 characters.
     *
     * @param str the string to hash
     * @return the MD5 digest of the UTF-8 bytes of {@code str} in base 2; if
     *         hashing fails for any reason the stack trace is printed and
     *         {@code str} itself is returned
     */
    public static String getHash(String str) {
        try {
            MessageDigest messageDigest = MessageDigest.getInstance("MD5");
            return new BigInteger(1, messageDigest.digest(str.getBytes("UTF-8"))).toString(2);
        } catch (Exception e) {
            // NOTE(review): falling back to the raw input is kept for
            // compatibility; the returned value is then not a real hash.
            e.printStackTrace();
            return str;
        }
    }

    /**
     * Computes the simHash of a text and returns it as a 128-character string
     * of {@code '0'} / {@code '1'}, most significant bit first.
     *
     * <p>Steps: extract keywords, hash each keyword, add or subtract the
     * keyword's weight for every bit position, then keep the sign of each sum.
     * Weights go from 10 for the first tenth of the keyword list down to 1 for
     * the last tenth.
     *
     * @param str the text to fingerprint
     * @return the simHash, or {@code null} (after printing a
     *         {@code ShortStringException} stack trace) when {@code str} is
     *         {@code null} or shorter than 200 characters
     */
    public static String getSimHash(String str) {
        // Very short texts are rejected: keyword extraction is unreliable on them.
        if (str == null || str.length() < MIN_TEXT_LENGTH) {
            new ShortStringException("文本过短,难以判断!").printStackTrace();
            return null;
        }
        // Per-bit accumulator; index 0 is the most significant bit.
        int[] v = new int[HASH_BITS];
        // 1. Tokenize: ask HanLP for up to str.length() keywords.
        List<String> keywordList = HanLP.extractKeyword(str, str.length());
        int size = keywordList.size();
        int rank = 0;
        for (String keyword : keywordList) {
            // 2. Hash the keyword and restore the leading zeros that
            //    BigInteger.toString(2) dropped, so every bit keeps its position.
            String keywordHash = leftPadWithZeros(getHash(keyword), HASH_BITS);
            // 3. Weight by position in the keyword list. Multiplying before
            //    dividing keeps the weight in 10..1 for every list size; the
            //    former i / (size / 10) divided by zero for fewer than ten
            //    keywords and could go to zero or negative otherwise.
            int weight = WEIGHT_LEVELS - (rank * WEIGHT_LEVELS / size);
            for (int j = 0; j < v.length; j++) {
                if (keywordHash.charAt(j) == '1') {
                    v[j] += weight;
                } else {
                    v[j] -= weight;
                }
            }
            rank++;
        }
        // 4. Reduce: a positive sum becomes '1', anything else '0'.
        StringBuilder simHash = new StringBuilder(v.length);
        for (int sum : v) {
            simHash.append(sum <= 0 ? '0' : '1');
        }
        return simHash.toString();
    }

    /** Left-pads {@code bits} with {@code '0'} up to {@code width} characters. */
    private static String leftPadWithZeros(String bits, int width) {
        if (bits.length() >= width) {
            return bits;
        }
        StringBuilder padded = new StringBuilder(width);
        for (int k = bits.length(); k < width; k++) {
            padded.append('0');
        }
        return padded.append(bits).toString();
    }
}
传入String,计算出它的hash值,并以字符串形式输出
点击查看代码
package Utils;

import java.io.*;
import java.util.ArrayList;

/**
 * Minimal text-file helpers: read a whole UTF-8 file into a string and append
 * a similarity result to an answer file.
 */
public class TxtIOUtil {

    /**
     * Reads a UTF-8 text file and returns its lines concatenated together.
     * Line terminators are not kept.
     *
     * <p>I/O failures are not propagated: the stack trace is printed and
     * whatever was read so far (possibly the empty string) is returned.
     *
     * @param txtPath path of the file to read
     * @return the file content without line terminators
     */
    public static String readTxt(String txtPath) {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes every stream even when reading fails.
        try (FileInputStream fileInputStream = new FileInputStream(new File(txtPath));
             InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "UTF-8");
             BufferedReader bufferedReader = new BufferedReader(inputStreamReader)) {
            String strLine;
            while ((strLine = bufferedReader.readLine()) != null) {
                content.append(strLine);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Appends one result line to a text file, creating the file if needed.
     *
     * <p>The line is a fixed label, then the first four characters of
     * {@code Double.toString(txtElem)} (for example {@code 0.99}), then CRLF.
     * The value is truncated, not rounded. The file is written as UTF-8 so it
     * matches the encoding {@link #readTxt(String)} expects.
     *
     * <p>I/O failures are not propagated; the stack trace is printed instead.
     *
     * @param txtElem the similarity value to record
     * @param txtPath path of the file to append to
     */
    public static void writeTxt(double txtElem, String txtPath) {
        String str = Double.toString(txtElem);
        String shown = str.substring(0, Math.min(str.length(), 4));
        try (FileOutputStream fileOutputStream = new FileOutputStream(new File(txtPath), true);
             Writer writer = new OutputStreamWriter(fileOutputStream, "UTF-8")) {
            writer.write("相似度比例为: ");
            writer.write(shown);
            writer.write("\r\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}




读取文本

性能改进





单元测试

点击查看代码
import Util.HammingUtils;
import Util.SimHashUtils;
import Util.TxtIOUtil;
import org.junit.Test;

/**
 * Runs the similarity pipeline (read, simHash, Hamming similarity, write)
 * against the sample texts under {@code D:/checkDuplicate}.
 *
 * <p>The tests make no assertions: each appends its similarity value to an
 * answer file for manual inspection.
 */
public class MainTest {

    /** Compares the original text with itself and with each of the five variants. */
    @Test
    public void origAndAllTest() {
        String[] paths = {
            "D:/checkDuplicate/orig.txt",
            "D:/checkDuplicate/orig_0.8_add.txt",
            "D:/checkDuplicate/orig_0.8_del.txt",
            "D:/checkDuplicate/orig_0.8_dis_1.txt",
            "D:/checkDuplicate/orig_0.8_dis_10.txt",
            "D:/checkDuplicate/orig_0.8_dis_15.txt"
        };
        // Load every text up front, then compare each one with the first.
        String[] texts = new String[paths.length];
        for (int k = 0; k < paths.length; k++) {
            texts[k] = TxtIOUtil.readTxt(paths[k]);
        }
        String answerPath = "D:/checkDuplicate/ansAll.txt";
        for (String candidate : texts) {
            String originalHash = SimHashUtils.getSimHash(texts[0]);
            String candidateHash = SimHashUtils.getSimHash(candidate);
            double similarity = HammingUtils.getSimilarity(originalHash, candidateHash);
            TxtIOUtil.writeTxt(similarity, answerPath);
        }
    }

    /** Original text against itself. */
    @Test
    public void origAndOrigTest() {
        compareWithOriginal("D:/checkDuplicate/orig.txt",
                "D:/checkDuplicate/ansOrigAndOrigTest.txt");
    }

    /** Original text against the variant with additions. */
    @Test
    public void origAndAddTest() {
        compareWithOriginal("D:/checkDuplicate/orig_0.8_add.txt",
                "D:/checkDuplicate/ansOrigAndAddTest.txt");
    }

    /** Original text against the variant with deletions. */
    @Test
    public void origAndDelTest() {
        compareWithOriginal("D:/checkDuplicate/orig_0.8_del.txt",
                "D:/checkDuplicate/ansOrigAndDelTest.txt");
    }

    /** Original text against the {@code dis_1} variant. */
    @Test
    public void origAndDis1Test() {
        compareWithOriginal("D:/checkDuplicate/orig_0.8_dis_1.txt",
                "D:/checkDuplicate/ansOrigAndDis1Test.txt");
    }

    /** Original text against the {@code dis_10} variant. */
    @Test
    public void origAndDis10Test() {
        compareWithOriginal("D:/checkDuplicate/orig_0.8_dis_10.txt",
                "D:/checkDuplicate/ansOrigAndDis10Test.txt");
    }

    /** Original text against the {@code dis_15} variant. */
    @Test
    public void origAndDis15Test() {
        compareWithOriginal("D:/checkDuplicate/orig_0.8_dis_15.txt",
                "D:/checkDuplicate/ansOrigAndDis15Test.txt");
    }

    /**
     * Reads the original text and one other text, computes their similarity
     * and appends it to the given answer file.
     *
     * @param otherPath  path of the text compared against the original
     * @param answerPath path of the file the similarity is appended to
     */
    private static void compareWithOriginal(String otherPath, String answerPath) {
        String original = TxtIOUtil.readTxt("D:/checkDuplicate/orig.txt");
        String other = TxtIOUtil.readTxt(otherPath);
        String originalHash = SimHashUtils.getSimHash(original);
        String otherHash = SimHashUtils.getSimHash(other);
        double similarity = HammingUtils.getSimilarity(originalHash, otherHash);
        TxtIOUtil.writeTxt(similarity, answerPath);
    }

}


测试覆盖率

异常处理

读取的文本过短时(少于200个字符),HanLP无法取得关键字,需要throw异常。
使用ShortStringException这个类处理此异常。

github地址

github地址

posted @ 2024-03-14 14:01  冬日吃西瓜  阅读(33)  评论(0编辑  收藏  举报