搜索引擎的预处理
预处理
/**
 * Normalizes full-width CJK punctuation in a line to its half-width ASCII equivalent.
 *
 * <p>Handled characters: ideographic full stop (U+3002) becomes '.', full-width comma
 * (U+FF0C) becomes ',', and full-width question mark (U+FF1F) becomes '?'. Every other
 * character is copied through unchanged, so the result always has the same length as
 * the input.
 *
 * @param line the text to normalize; must not be null
 * @return the text with the punctuation above replaced by its ASCII form
 * @throws NullPointerException if {@code line} is null
 */
public static String replace(String line) {
    StringBuilder normalized = new StringBuilder(line.length());
    // Single pass: every mapping is one char to one char, so no rescans are needed.
    for (int i = 0; i < line.length(); i++) {
        char c = line.charAt(i);
        switch (c) {
            case '\u3002': // ideographic full stop
                normalized.append('.');
                break;
            case '\uFF0C': // full-width comma
                normalized.append(',');
                break;
            case '\uFF1F': // full-width question mark
                normalized.append('?');
                break;
            default:
                normalized.append(c);
                break;
        }
    }
    return normalized.toString();
}
读取文件并进行预处理 文件分割
import java.io.*;
import java.nio.charset.StandardCharsets;

/**
 * Splits a large UTF-8 text file into a sequence of smaller files so that each piece can
 * be preprocessed independently.
 */
public class Main {

    /** A chunk is flushed to disk once it holds at least this many UTF-8 bytes. */
    private static final int MAX_CHUNK_BYTES = 10240;

    public static void main(String[] args) throws Exception {
        // The input file must be UTF-8 encoded.
        splitToSmallFiles(
                new File("C:/Users/15354/Downloads/钢铁是怎样炼成的(全).txt"),
                "C:/Users/15354/Downloads/Split/");
    }

    /**
     * Reads {@code file} line by line and writes it out as consecutive chunk files named
     * {@code output0.txt}, {@code output1.txt}, ... under {@code outputpath}.
     *
     * <p>Lines are never split across files: a chunk is closed after the first line that
     * brings it to {@link #MAX_CHUNK_BYTES} or more, so chunks may slightly exceed that
     * size. Every line is terminated with CRLF in the output. Input is decoded and output
     * is encoded as UTF-8 regardless of the platform default charset. No empty trailing
     * chunk is produced.
     *
     * @param file the UTF-8 text file to split
     * @param outputpath prefix prepended to each chunk file name; it is concatenated
     *     as-is, so it should end with a path separator when it denotes a directory
     * @throws IOException if the input cannot be read or a chunk cannot be written
     */
    public static void splitToSmallFiles(File file, String outputpath) throws IOException {
        StringBuilder chunk = new StringBuilder();
        // Running UTF-8 size of the chunk, so the buffer is not re-encoded on every line.
        int chunkBytes = 0;
        int filePointer = 0;

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                chunk.append(line).append("\r\n");
                chunkBytes += line.getBytes(StandardCharsets.UTF_8).length + 2; // +2 for CRLF
                if (chunkBytes >= MAX_CHUNK_BYTES) {
                    writeChunk(outputpath, filePointer, chunk);
                    filePointer++;
                    chunk.setLength(0);
                    chunkBytes = 0;
                }
            }
        }

        // Flush whatever is left after the last full chunk.
        if (chunk.length() > 0) {
            writeChunk(outputpath, filePointer, chunk);
        }
    }

    /** Writes one chunk as UTF-8 to {@code <outputpath>output<index>.txt}. */
    private static void writeChunk(String outputpath, int index, CharSequence content)
            throws IOException {
        String target = outputpath + "output" + index + ".txt";
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8))) {
            writer.append(content);
        }
    }
}
1 public static File characterProcess(File file , String descFile)throws Exception{ 2 //写入流 3 BufferedWriter writer = new BufferedWriter(new FileWriter(descFile)); 4 //读取流 5 BufferedReader reader = new BufferedReader(new FileReader(file)); 6 7 String line = reader.readLine(); 8 while (line != null){ 9 String validString = replace(line); 10 writer.write(validString); 11 //写入行分隔符 12 writer.newLine(); 13 line = reader.readLine(); 14 } 15 writer.close(); 16 reader.close(); 17 return new File(descFile); 18 }
逃不过逝水流年 没有时间可以浪费了!!!

浙公网安备 33010602011771号