搜索引擎的预处理

预处理
/**
 * Converts full-width (CJK) punctuation in a line to its half-width ASCII form.
 *
 * <p>Handled characters: ideographic full stop (U+3002) becomes {@code '.'},
 * full-width comma (U+FF0C) becomes {@code ','} and full-width question mark
 * (U+FF1F) becomes {@code '?'}. Every other character is copied unchanged, so
 * the result always has the same length as the input.
 *
 * @param line the text to normalise; must not be {@code null}
 * @return a copy of {@code line} with the punctuation above replaced
 */
public static String replace(String line){
        StringBuilder converted = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            char current = line.charAt(i);
            // Unicode escapes keep the full-width literals from being silently
            // flattened to ASCII by editors or copy/paste.
            switch (current) {
                case '\u3002': // ideographic full stop
                    converted.append('.');
                    break;
                case '\uFF0C': // full-width comma
                    converted.append(',');
                    break;
                case '\uFF1F': // full-width question mark
                    converted.append('?');
                    break;
                default:
                    converted.append(current);
                    break;
            }
        }
        return converted.toString();
}

读取文件并进行预处理  文件分割

import java.io.*;

/**
 * Splits a large UTF-8 text file into a sequence of smaller files.
 */
public class Main {

    /** Encoding used for reading the input and writing every chunk. */
    private static final java.nio.charset.Charset UTF8 = java.nio.charset.StandardCharsets.UTF_8;

    /** A chunk is flushed to disk once it holds at least this many bytes. */
    private static final int MAX_SIZE = 10240;

    public static void main(String[] args) throws  Exception{

        // The input file must be UTF-8 encoded.

        splitToSmallFiles(new File("C:/Users/15354/Downloads/钢铁是怎样炼成的(全).txt"),"C:/Users/15354/Downloads/Split/");

    }

    /**
     * Reads {@code file} line by line and writes it out as {@code output0.txt},
     * {@code output1.txt}, ... under {@code outputpath}.
     *
     * <p>Lines are never split: a chunk is written as soon as its UTF-8 size
     * reaches {@link #MAX_SIZE} bytes, so a chunk may exceed the limit by up to
     * one line. Every line is terminated with {@code "\r\n"} in the output.
     * At least one output file is always produced, even for an empty input.
     *
     * @param file       the UTF-8 encoded file to split
     * @param outputpath prefix prepended to each output file name; it is used
     *                   as-is, so it should end with a path separator, and the
     *                   directory is not created by this method
     * @throws IOException if the input cannot be read or an output file cannot be written
     */
    public static void splitToSmallFiles(File file , String outputpath)throws IOException {
        StringBuilder chunk = new StringBuilder();
        // Running UTF-8 size of the chunk, so the buffer is not re-encoded on every line.
        int chunkBytes = 0;
        int fileIndex = 0;

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), UTF8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                chunk.append(line).append("\r\n");
                chunkBytes += line.getBytes(UTF8).length + 2; // +2 for "\r\n"
                if (chunkBytes >= MAX_SIZE) {
                    writeChunk(outputpath, fileIndex, chunk);
                    fileIndex++;
                    chunk.setLength(0);
                    chunkBytes = 0;
                }
            }
        }

        // Flush the remainder; skip it when empty unless nothing was written yet,
        // so an exact-fit input does not leave a trailing empty file.
        if (chunk.length() > 0 || fileIndex == 0) {
            writeChunk(outputpath, fileIndex, chunk);
        }
    }

    /** Writes one chunk to {@code <outputpath>output<index>.txt} and closes the file. */
    private static void writeChunk(String outputpath, int index, CharSequence content) throws IOException {
        String target = outputpath + "output" + index + ".txt";
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target), UTF8))) {
            writer.append(content);
        }
    }
}

 

 1 public static File characterProcess(File file , String descFile)throws Exception{
 2         //写入流
 3         BufferedWriter writer = new BufferedWriter(new FileWriter(descFile));
 4         //读取流
 5         BufferedReader reader = new BufferedReader(new FileReader(file));
 6 
 7         String line = reader.readLine();
 8         while (line != null){
 9             String validString = replace(line);
10             writer.write(validString);
11             //写入行分隔符
12             writer.newLine();
13             line = reader.readLine();
14         }
15         writer.close();
16         reader.close();
17         return new File(descFile);
18     }

 

posted @ 2018-07-03 21:22  Bockpecehhe  阅读(332)  评论(0)    收藏  举报