Fork me on GitHub

java解析超大xml(1G),一般数据挖掘dblp.xml文件的解析

在网上找了很多关于解析超大 xml 的例子,都说 Java 自带的 jar 包中有相关的 SAXParser 类可以解析 xml,但是试了好多次还是不行;dom4j.jar 等也一样,都无法解析数据量太大的 xml,文件大概超过 30M 就会解析报错。

不过后来偶然看到可以借助 xercesImpl.jar、sax2.jar、jaxen-1.1.1.jar 这几个 jar 包来解析,代码如下:

import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import org.xml.sax.SAXException;

/**
 * Command-line entry point that runs a JAXP SAX parse over an XML file.
 *
 * <p>The parser is created namespace-aware and validating, and a {@code SAXparse}
 * instance (declared outside this class) is passed to it as the event handler.
 */
public class SAX {

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_XML_PATH = "D:\\dblp.xml";

	/**
	 * Parses the XML file named by the first command-line argument, falling back to
	 * {@link #DEFAULT_XML_PATH} when no argument is given.
	 *
	 * <p>Configuration, parse and I/O failures are reported by printing their stack trace.
	 *
	 * @param args optional; {@code args[0]} is the path of the XML file to parse
	 */
	public static void main(String[] args) {
		// Let the caller choose the input instead of forcing one machine-specific path.
		String xmlPath = (args != null && args.length > 0) ? args[0] : DEFAULT_XML_PATH;
		try {
			SAXParserFactory factory = SAXParserFactory.newInstance();
			factory.setNamespaceAware(true);
			factory.setValidating(true);
			SAXParser parser = factory.newSAXParser();
			SAXparse handler = new SAXparse();
			parser.parse(new File(xmlPath), handler);
		} catch (ParserConfigurationException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

  或者

import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/**
 * Reads an XML document with dom4j and exports the text of selected child elements
 * to a text file, one output line per top-level record.
 *
 * <p>The constructor loads the whole document named by {@code configName} through
 * {@link SAXReader#read(String)} and keeps its root element for later queries.
 */
public class XMLParse {
	private String configName = "dblp_little.xml";
	private SAXReader saxReader;
	private Document doc;
	private Element root;

	/**
	 * Loads the document named by {@code configName} and caches its root element.
	 *
	 * @throws IllegalStateException if the document cannot be read or parsed
	 */
	public XMLParse() {
		saxReader = new SAXReader();
		try {
			doc = saxReader.read(configName);
		} catch (DocumentException e) {
			// Fail with the real cause instead of a NullPointerException on the next line.
			throw new IllegalStateException("Failed to parse XML document: " + configName, e);
		}
		root = doc.getRootElement();
	}

	/**
	 * Writes, for every child of the root element, the texts of its direct children
	 * whose element name equals {@code attribute} to the file {@code attribute + ".txt"}.
	 *
	 * <p>The matching texts of one record are joined with {@code ","} and terminated by
	 * {@code "\r\n"}; records without any non-empty match produce no line. Progress and
	 * the final record count are printed to standard output. I/O failures are reported
	 * by printing the stack trace, and the success message is then not printed.
	 *
	 * @param attribute name of the child element to export; also the base name of the output file
	 */
	public void getModelElement(String attribute) {
		List list = root.elements();
		FileWriter fileWriter = null;
		boolean succeeded = false;
		try {
			// NOTE(review): FileWriter encodes with the default charset; confirm that is
			// acceptable if the exported text contains non-ASCII characters.
			fileWriter = new FileWriter(attribute + ".txt");
			StringBuffer sb = new StringBuffer();
			int temp = 0;
			for (Iterator it = list.iterator(); it.hasNext();) {
				Element model = (Element) it.next();
				temp++;
				System.out.println("temp:" + temp);

				for (Iterator ite = model.elements().iterator(); ite.hasNext();) {
					Element modelEle = (Element) ite.next();
					if (attribute.equals(modelEle.getName())) {
						// Separate from whatever was collected before; the former "> 1" check
						// dropped the comma after a single-character first value.
						if (sb.length() > 0) {
							sb.append(",");
						}
						sb.append(modelEle.getText());
					}
				}

				// Skip writing to the txt file when the record has no value.
				if (sb.length() > 0) {
					fileWriter.write(sb.toString());
					fileWriter.write("\r\n");
					// Push each record out immediately so lines written so far reach the file
					// even if a later step fails.
					fileWriter.flush();
				}
				sb.setLength(0);
			}
			succeeded = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always release the file handle, including when the loop aborted.
			if (fileWriter != null) {
				try {
					fileWriter.close();
				} catch (IOException e) {
					succeeded = false;
					e.printStackTrace();
				}
			}
		}
		if (succeeded) {
			System.out.println("xml解析成功,并成功写入到"+attribute+".txt 文件中!");
		}
		System.out.println("list.size:" + list.size());
	}

	/**
	 * Exports every {@code author} element of the configured document to {@code author.txt}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		XMLParse parse = new XMLParse();
		String attribute = "author";
		parse.getModelElement(attribute);
	}
}

  

posted @ 2013-02-18 22:29  symbolJerry²º¹³  阅读(2352)  评论(0)  编辑  收藏  举报