Java: stream-style (line-by-line) reading of XML, as preprocessing of a large file for data mining. The class below extracts the author list of each paper from dblp.xml, drops single-author papers, prunes authors below a minimum support count of 100, repeats the pruning, and finally writes a Weka-readable ARFF file.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * @author gjf
 * db_pre.arff stores the author information extracted from the XML file.
 */
public class ElmAuth {
    Map<String, Integer> map = new HashMap<String, Integer>();

    //Step 1
    //Extract the author information from the XML file: read the <author> elements, write them to db_pre.arff, and replace special characters.
    public void settleXml(String src, String dst){ //src=dblp.xml dst=db_pre.arff
        File file = new File(src);
        File fl = new File(dst);
        FileReader fr;
        try {
            fr = new FileReader(file);
            FileWriter fw = new FileWriter(fl);
            BufferedReader br = new BufferedReader(fr);
            BufferedWriter bw = new BufferedWriter(fw);
            String line = null;
            boolean flag = true;
            int loc_st;
            int loc_end;
            int len = 0, max = 0;
            while((line = br.readLine()) != null){
                loc_st = line.indexOf("<author>");
                if(loc_st != -1){
                    loc_end = line.indexOf("</author>");
                    line = line.substring(loc_st + 8, loc_end); //the text between <author> and </author>: one author name
                    line = line.replace('&', ' ');
                    line = line.replace('$', ' ');
                    line = line.replace("' ", " ");
                    line = line.replace("'", " ");
                    /*flag marks the paper boundary: within the same paper, flag == false and the authors are written on the same line*/
                    if(flag){
                        bw.write("\n");
                        bw.write(line);
                    } else {
                        bw.write(",");
                        bw.write(line);
                    }
                    len++; //one more author written for the current paper
                    flag = false;
                } else {
                    flag = true;
                    if(max < len) max = len; //keep the largest author count seen so far
                    len = 0;
                    bw.flush();
                }
            }
            bw.close();
            br.close();
            System.out.println("Step 1 - largest number of authors on a single paper: " + max);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
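
    /* Illustration of settleXml (hypothetical record, not taken from the real dblp.xml): the lines
     *   <author>Alice Example</author>
     *   <author>Bob Example</author>
     *   <title>Some Paper</title>
     * produce one line in db_pre.arff:
     *   Alice Example,Bob Example
     */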

    //Step 2: remove items (papers) that have only a single author.
    //Reads db_pre.arff and drops every line that contains just one author.
    public void elimate_one(String src, String dst){ //src=db_pre.arff dst=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            String line = null;
            int res = 0;
            while((line = br.readLine()) != null){
                String[] arrLine = line.split(",");
                //authors are separated by ","; if splitting yields fewer than two fields, the line has a single author and is not written
                if(arrLine.length > 1){
                    bw.write(line);
                    bw.write("\n");
                    res ++;
                }
            } 
            bw.flush();
            br.close();
            bw.close();
            fr.close();
            System.out.println("Number of lines left after removing single-author papers: " + res);
        }catch (IOException e) {
            e.printStackTrace();
        }
    }

    //Store the remaining authors in the HashMap: key = author name, value = number of occurrences (support count)
    public void createMap(String src){ //src=db_elone.arff
        try {
            File file = new File(src);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);

            String line = null;
            while((line = br.readLine()) != null){
                String[] arrLine = line.split(",");
                for(int i = 0; i < arrLine.length; ++i){
                    if(map.get(arrLine[i]) == null){
                        map.put(arrLine[i], 1);
                    } else {
                        map.put(arrLine[i], map.get(arrLine[i]) + 1);
                    }
                }
            }
            fr.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        } 
    }

    //Remove from the map every author whose support count is below minsup (here minsup = 100).
    public void settleMap(int minsup){
        Iterator<String> it = map.keySet().iterator();
        while(it.hasNext()){
            String str = it.next();
            if(map.get(str) < minsup){
                it.remove();
            }
        }
        System.out.println("Map的大小,支持度大于100的作者个数:" + map.size());
    }

    //Write to dst the authors that meet minsup: for each paper line, keep only the authors that are still in the map.
    public void updateMap(String src, String dst){ //src=db_elone.arff dst=db_minsup.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);

            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);

            String line = null;
            int res = 0;
            boolean flag = true;
            while((line = br.readLine()) != null){
                String[] arrLine = line.split(",");
                if(!flag) res++; //the previous line contributed at least one surviving author
                flag = true;
                for(int i = 0; i < arrLine.length; ++i){
                    if(map.get(arrLine[i]) != null){ //keep only authors that passed the support filter
                        if(flag){
                            bw.write("\n" + arrLine[i]);
                            flag = false;
                        } else {
                            bw.write("," + arrLine[i]);
                        }
                    }
                }
            }
            if(!flag) res++; //count the last written line as well
            bw.flush();
            System.out.println("Number of papers kept after the author filter: " + res);
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        } 
    }
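
    /* Illustration of updateMap (hypothetical names, not from the real data): if only
     * "Alice Example" and "Bob Example" remain in the map after settleMap, a db_elone.arff line
     *   Alice Example,Carol Example,Bob Example
     * is written to db_minsup.arff as
     *   Alice Example,Bob Example
     */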

    //Generate the ARFF file that Weka can read.
    public void createWekaFile(String src, String dst){ //src=db_minsup.arff dst=db.arff
        try {
            File filer = new File(src);
            FileReader fr = new FileReader(filer);
            BufferedReader br = new BufferedReader(fr);

            File filew = new File(dst);
            FileWriter fw = new FileWriter(filew);
            BufferedWriter bw = new BufferedWriter(fw);
            bw.write("@relation db" + "\n");
            Iterator<String> it = map.keySet().iterator();
            while(it.hasNext()){
                String str = it.next();
                str = str.replace("'", "\\'"); //escape single quotes inside the quoted attribute name
                bw.write("@attribute '" + str + "' { t}\n");
            }
            bw.write("@data" + "\n");
            
            String line = null;
            boolean flag = true;
            while((line = br.readLine()) != null){
                flag = true;
                char ch;
                it = map.keySet().iterator();
                while(it.hasNext()){
                    String str = it.next();
                    //note: indexOf does substring matching, so an author name contained in another name can match extra lines
                    if(line.indexOf(str) >= 0){
                        ch = 't';
                    } else {
                        ch = '?';
                    }
                    if(flag == true){
                        bw.write(ch);
                    } else {
                        bw.write("," + ch);
                    }
                    flag = false;
                }
                bw.write("\n");
            }
            bw.flush();
            bw.close();
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        } 
    }
    
    public void clearMap(){
        map.clear();
    }
    
    public static void main(String args[]){
        ElmAuth elmauth = new ElmAuth();
        elmauth.settleXml("dblp.xml", "db_pre.arff");
        elmauth.elimate_one("db_pre.arff", "db_elone.arff");
        elmauth.createMap("db_elone.arff");
        elmauth.settleMap(100); //minimum support count
        elmauth.updateMap("db_elone.arff", "db_minsup.arff");
        
        //repeatedly drop single-author lines and re-apply the support filter; 20 passes is the fixed bound used here
        for(int i = 0; i < 20; ++i){
            System.out.println();
            elmauth.elimate_one("db_minsup.arff", "db_minsup_elone.arff");
            elmauth.clearMap();
            elmauth.createMap("db_minsup_elone.arff");
            elmauth.settleMap(100);
            elmauth.updateMap("db_minsup_elone.arff", "db_minsup.arff");
        }
                
        elmauth.createWekaFile("db_minsup.arff", "db.arff");
    }
}
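
For reference, here is a minimal sketch of what the generated db.arff looks like, assuming (hypothetically) that only two authors, "Alice Example" and "Bob Example", survive the support filter. Each @data row corresponds to one paper line: 't' means the author appears on that line, '?' (missing value) means the author does not.

    @relation db
    @attribute 'Alice Example' { t}
    @attribute 'Bob Example' { t}
    @data
    t,t
    t,?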

 
