java解析CSV文件三种方法(openCSV)
一、简介
1、pom.xml
<!-- csv文件解析依赖 -->
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.4</version>
</dependency>
二、手动解析CSV文件
/**
 * Parses an uploaded CSV file by hand and maps every line to a {@link CsvFile} (method 1).
 *
 * <p>Each non-blank line is split on {@code ','}; the first six columns are copied, with
 * double quotes stripped, into name, title, number, type, personnel and time. The first
 * line is not treated specially, so a header row is returned as a bean as well. Because
 * this is a plain comma split, quoted fields that themselves contain commas are not
 * supported.
 *
 * @param file uploaded CSV file, decoded as UTF-8
 * @return the beans that were read; after an I/O error, the rows read before the failure
 */
public static List<CsvFile> getCsvDataMethod1(MultipartFile file) {
    List<CsvFile> csvFileList = new ArrayList<>();
    // try-with-resources closes the reader (and the underlying upload stream) on every path.
    try (BufferedReader bufferedReader =
            new BufferedReader(new InputStreamReader(file.getInputStream(), "utf-8"))) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            // A blank line (typically the trailing one) must not abort the whole import.
            if (line.trim().isEmpty()) {
                continue;
            }
            // Limit -1 keeps trailing empty columns, so "a,b,c,d,e," still has six fields.
            String[] split = line.split(",", -1);
            CsvFile csvFile = new CsvFile();
            csvFile.setName(columnOrEmpty(split, 0));
            csvFile.setTitle(columnOrEmpty(split, 1));
            csvFile.setNumber(columnOrEmpty(split, 2));
            csvFile.setType(columnOrEmpty(split, 3));
            csvFile.setPersonnel(columnOrEmpty(split, 4));
            csvFile.setTime(columnOrEmpty(split, 5));
            csvFileList.add(csvFile);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return csvFileList;
}

/** Returns the unquoted column at {@code index}, or an empty string when the row is too short. */
private static String columnOrEmpty(String[] columns, int index) {
    return index < columns.length ? splitResult(columns[index]) : "";
}
去除引号""
/**
 * Removes every double-quote character from a raw CSV field.
 *
 * @param once raw field text, must not be {@code null}
 * @return the field text without any {@code "} characters
 */
private static String splitResult(String once) {
    // String.replace does this in one pass; appending to a String in a loop is quadratic.
    return once.replace("\"", "");
}
三、openCSV解析CSV文件
/**
 * Parses an uploaded CSV file with openCSV and returns the raw rows (method 2).
 *
 * <p>The first record is treated as the header and is not included in the result.
 *
 * @param file uploaded CSV file, decoded as UTF-8
 * @return every record after the header as a string array; after a failure, the rows read
 *         before it (possibly an empty list)
 */
public static List<String[]> getCsvDataMethod2(MultipartFile file) {
    List<String[]> list = new ArrayList<String[]>();
    // CSVReader is Closeable; try-with-resources releases it and the upload stream.
    try (CSVReader csvReader = new CSVReaderBuilder(
            new BufferedReader(
                    new InputStreamReader(file.getInputStream(), "utf-8"))).build()) {
        boolean headerPending = true;
        for (String[] row : csvReader) {
            // Skip the header record and collect everything from the second record on.
            if (headerPending) {
                headerPending = false;
                continue;
            }
            list.add(row);
        }
    } catch (Exception e) {
        // Include the exception so the failure can actually be diagnosed.
        System.out.println("CSV文件读取异常: " + e);
    }
    return list;
}
四、openCSV解析CSV文件(结果为实体类)
工具类:
/**
 * Parses an uploaded CSV file with openCSV and maps each record to a bean (method 3).
 *
 * <p>Columns are matched to bean fields by header name through a
 * {@link HeaderColumnNameMappingStrategy}.
 *
 * @param file  uploaded CSV file, decoded as UTF-8
 * @param clazz bean class to map each record to
 * @param <T>   bean type
 * @return the mapped beans, or {@code null} when reading or mapping fails
 */
public static <T> List<T> getCsvDataMethod3(MultipartFile file, Class<T> clazz) {
    // The reader has to stay open until parse() has consumed it, so parsing happens
    // inside the try-with-resources block that owns the reader.
    try (InputStreamReader in = new InputStreamReader(file.getInputStream(), "utf-8")) {
        HeaderColumnNameMappingStrategy<T> strategy = new HeaderColumnNameMappingStrategy<>();
        strategy.setType(clazz);
        CsvToBean<T> csvToBean = new CsvToBeanBuilder<T>(in).withMappingStrategy(strategy).build();
        return csvToBean.parse();
    } catch (Exception e) {
        // Pass the exception along so the log shows the cause, not just the message.
        logger.error("数据转化失败", e);
        return null;
    }
}
实体类:
import com.opencsv.bean.CsvBindByName;
import lombok.Data;
/**
 * Bean for one CSV record. Every field is bound to the CSV header column named in its
 * {@code @CsvBindByName} annotation; all values are kept as plain strings.
 * Accessors are supplied by Lombok's {@code @Data}.
 */
@Data
public class CsvFile {
// Bound to the "name" header column.
@CsvBindByName(column = "name")
private String name;
// Bound to the "title" header column.
@CsvBindByName(column = "title")
private String title;
// Bound to the "number" header column.
@CsvBindByName(column = "number")
private String number;
// Bound to the "type" header column.
@CsvBindByName(column = "type")
private String type;
// Bound to the "personnel" header column.
@CsvBindByName(column = "personnel")
private String personnel;
// Bound to the "time" header column.
@CsvBindByName(column = "time")
private String time;
}
五、整理完成的CsvUtils
import com.lydms.testopencsv.domain.CsvFile;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import com.opencsv.bean.CsvToBean;
import com.opencsv.bean.CsvToBeanBuilder;
import com.opencsv.bean.HeaderColumnNameMappingStrategy;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.web.multipart.MultipartFile;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * CSV parsing helpers for uploaded files: one hand-written parser and two openCSV based ones.
 */
public class CsvUtils {

    private static final Logger logger = LogManager.getLogger(CsvUtils.class);

    /**
     * Parses an uploaded CSV file with openCSV and returns the raw rows (method 2).
     *
     * <p>The first record is treated as the header and is not included in the result.
     *
     * @param file uploaded CSV file, decoded as UTF-8
     * @return every record after the header as a string array; after a failure, the rows
     *         read before it (possibly an empty list)
     */
    public static List<String[]> getCsvDataMethod2(MultipartFile file) {
        List<String[]> list = new ArrayList<String[]>();
        // CSVReader is Closeable; try-with-resources releases it and the upload stream.
        try (CSVReader csvReader = new CSVReaderBuilder(
                new BufferedReader(
                        new InputStreamReader(file.getInputStream(), "utf-8"))).build()) {
            boolean headerPending = true;
            for (String[] row : csvReader) {
                // Skip the header record and collect everything from the second record on.
                if (headerPending) {
                    headerPending = false;
                    continue;
                }
                list.add(row);
            }
        } catch (Exception e) {
            logger.error("CSV文件读取异常", e);
        }
        return list;
    }

    /**
     * Parses an uploaded CSV file with openCSV and maps each record to a bean (method 3).
     *
     * <p>Columns are matched to bean fields by header name through a
     * {@link HeaderColumnNameMappingStrategy}.
     *
     * @param file  uploaded CSV file, decoded as UTF-8
     * @param clazz bean class to map each record to
     * @param <T>   bean type
     * @return the mapped beans, or {@code null} when reading or mapping fails
     */
    public static <T> List<T> getCsvDataMethod3(MultipartFile file, Class<T> clazz) {
        // The reader has to stay open until parse() has consumed it, so parsing happens
        // inside the try-with-resources block that owns the reader.
        try (InputStreamReader in = new InputStreamReader(file.getInputStream(), "utf-8")) {
            HeaderColumnNameMappingStrategy<T> strategy = new HeaderColumnNameMappingStrategy<>();
            strategy.setType(clazz);
            CsvToBean<T> csvToBean =
                    new CsvToBeanBuilder<T>(in).withMappingStrategy(strategy).build();
            return csvToBean.parse();
        } catch (Exception e) {
            logger.error("数据转化失败", e);
            return null;
        }
    }

    /**
     * Parses an uploaded CSV file by hand and maps every line to a {@link CsvFile} (method 1).
     *
     * <p>Each non-blank line is split on {@code ','}; the first six columns are copied, with
     * double quotes stripped, into name, title, number, type, personnel and time. The first
     * line is not treated specially, so a header row is returned as a bean as well. Because
     * this is a plain comma split, quoted fields that themselves contain commas are not
     * supported.
     *
     * @param file uploaded CSV file, decoded as UTF-8
     * @return the beans that were read; after an I/O error, the rows read before the failure
     */
    public static List<CsvFile> getCsvDataMethod1(MultipartFile file) {
        List<CsvFile> csvFileList = new ArrayList<>();
        try (BufferedReader bufferedReader =
                new BufferedReader(new InputStreamReader(file.getInputStream(), "utf-8"))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // A blank line (typically the trailing one) must not abort the whole import.
                if (line.trim().isEmpty()) {
                    continue;
                }
                // Limit -1 keeps trailing empty columns, so "a,b,c,d,e," still has six fields.
                String[] split = line.split(",", -1);
                CsvFile csvFile = new CsvFile();
                csvFile.setName(columnOrEmpty(split, 0));
                csvFile.setTitle(columnOrEmpty(split, 1));
                csvFile.setNumber(columnOrEmpty(split, 2));
                csvFile.setType(columnOrEmpty(split, 3));
                csvFile.setPersonnel(columnOrEmpty(split, 4));
                csvFile.setTime(columnOrEmpty(split, 5));
                csvFileList.add(csvFile);
            }
        } catch (IOException e) {
            logger.error("CSV文件读取异常", e);
        }
        return csvFileList;
    }

    /** Returns the unquoted column at {@code index}, or an empty string when the row is too short. */
    private static String columnOrEmpty(String[] columns, int index) {
        return index < columns.length ? splitResult(columns[index]) : "";
    }

    /**
     * Removes every double-quote character from a raw CSV field.
     *
     * @param once raw field text, must not be {@code null}
     * @return the field text without any {@code "} characters
     */
    private static String splitResult(String once) {
        // String.replace does this in one pass; appending to a String in a loop is quadratic.
        return once.replace("\"", "");
    }
}
六、相关地址
参考地址:https://www.cnblogs.com/xhj99/p/13536465.html
git地址:https://github.com/li395092734/test-opencsv
csv地址:https://files.cnblogs.com/files/blogs/604830/csvfile.zip
------------------------------------------------------------------------------------------------------------------------
Java解压缩.gz .zip .tar.gz等格式的压缩包方法总结
1、.gz文件是linux下常见的压缩格式。解压使用 java.util.zip.GZIPInputStream 即可,压缩使用 java.util.zip.GZIPOutputStream
/**
 * Decompresses a single gzip file next to the source file.
 *
 * <p>The output path is the source path with its last extension removed, for example
 * {@code /data/a.txt.gz} is written to {@code /data/a.txt}. Failures are reported on
 * {@code System.err} and are not thrown.
 *
 * @param sourcedir path of the gzip file to decompress
 */
public static void unGzipFile(String sourcedir) {
    if (sourcedir == null) {
        System.err.println("unGzipFile: source path is null");
        return;
    }
    // Look for the extension inside the file name only, never in a directory name.
    int nameStart = Math.max(sourcedir.lastIndexOf('/'), sourcedir.lastIndexOf('\\')) + 1;
    int dot = sourcedir.lastIndexOf('.');
    if (dot <= nameStart) {
        System.err.println("unGzipFile: cannot derive an output name from " + sourcedir);
        return;
    }
    // Strip exactly one extension: stripping two would turn "a.txt.gz" into "a"
    // and would fail outright on "a.gz".
    String ouputfile = sourcedir.substring(0, dot);
    // Resources are opened in this order so the output file is only created once the
    // source has been opened and recognised as gzip; all three are closed on every path.
    try (FileInputStream fin = new FileInputStream(sourcedir);
         GZIPInputStream gzin = new GZIPInputStream(fin);
         FileOutputStream fout = new FileOutputStream(ouputfile)) {
        byte[] buf = new byte[1024];
        int num;
        while ((num = gzin.read(buf, 0, buf.length)) != -1) {
            fout.write(buf, 0, num);
        }
    } catch (Exception ex) {
        System.err.println(ex.toString());
    }
}
2、zip文件,使用java.util.zip.ZipEntry 和 java.util.zip.ZipFile
/**
 * Extracts every entry of a zip archive into the given directory.
 *
 * <p>Entry names are decoded as CP866, exactly as before. NOTE(review): archives whose
 * entry names use another encoding (UTF-8, GBK, ...) need a different charset here.
 *
 * <p>Entries whose name would resolve outside {@code outputDir} (for example
 * {@code ../x}) are rejected instead of being written.
 *
 * @param file      zip archive to extract
 * @param outputDir directory to extract into; created when missing
 * @throws IOException when the archive cannot be read, an entry cannot be written, or an
 *                     entry tries to escape {@code outputDir}
 */
public static void unZip(File file, String outputDir) throws IOException {
    Charset charset = Charset.forName("CP866"); // alternative (non UTF-8) charset for entry names
    // try-with-resources closes the archive and every per-entry stream on all paths.
    try (ZipFile zipFile = new ZipFile(file, charset)) {
        File root = new File(outputDir);
        if (!root.isDirectory() && !root.mkdirs()) {
            throw new IOException("Cannot create output directory " + root);
        }
        String rootPath = root.getCanonicalPath();
        String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;

        Enumeration<? extends ZipEntry> entries = zipFile.entries();
        while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            System.out.println("解压." + entry.getName());

            File target = new File(root, entry.getName());
            // Zip Slip guard: the canonical target must stay below the output directory.
            String targetPath = target.getCanonicalPath();
            if (!targetPath.equals(rootPath) && !targetPath.startsWith(rootPrefix)) {
                throw new IOException("Zip entry is outside of the target dir: " + entry.getName());
            }

            if (entry.isDirectory()) {
                target.mkdirs();
                continue;
            }
            // Archives do not always contain explicit entries for parent directories.
            File parent = target.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            try (InputStream in = zipFile.getInputStream(entry);
                 OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[2048];
                int length;
                while ((length = in.read(buffer)) != -1) {
                    out.write(buffer, 0, length);
                }
            }
        }
    } catch (IOException e) {
        throw new IOException("解压缩文件出现异常", e);
    }
}
/**
 * Creates a directory, including any missing parents.
 *
 * @param outputDir base directory
 * @param subDir    optional sub directory below {@code outputDir}; when {@code null} or
 *                  blank, {@code outputDir} itself is created
 */
public static void createDirectory(String outputDir, String subDir) {
    File file = new File(outputDir);
    if (!(subDir == null || subDir.trim().equals(""))) { // a sub directory was given
        file = new File(outputDir, subDir);
    }
    if (!file.exists()) {
        // mkdirs() creates missing parents itself. Creating the parent explicitly failed
        // with a NullPointerException for a single-segment relative path such as "out",
        // because getParentFile() is null there.
        file.mkdirs();
    }
}
3、.tar.gz文件可以看做先用tar打包,再使用gz进行压缩。
使用org.apache.tools.tar.TarEntry; org.apache.tools.tar.TarInputStream 和 org.apache.tools.tar.TarOutputStream
需要导入pom文件:
<dependency>
<groupId>org.apache.ant</groupId>
<artifactId>ant</artifactId>
<version>1.10.7</version>
</dependency>
package com.asiainfo.utils;
import org.apache.tools.tar.TarEntry;
import org.apache.tools.tar.TarInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.zip.GZIPInputStream;
/**
 * Extracts {@code .tar.gz} archives using Ant's tar classes.
 */
public class TarGzipParser {

    private static final Logger LOGGER = LoggerFactory.getLogger(TarGzipParser.class);

    /**
     * Extracts a {@code .tar.gz} archive into the given directory.
     *
     * <p>Paths are built with {@link File} rather than a hard-coded {@code "\\"}, so the
     * result is correct on every platform. Entries whose name would resolve outside
     * {@code outputDir} are rejected.
     *
     * @param file      tar.gz archive to extract
     * @param outputDir directory to extract into; created when missing
     * @return the last regular file that was extracted, or {@code null} when the archive
     *         contains no regular file
     * @throws IOException when the archive cannot be read, an entry cannot be written, or
     *                     an entry tries to escape {@code outputDir}
     */
    public static File unTarGz(File file, String outputDir) throws IOException {
        File tmpFile = null;
        // try-with-resources closes the whole stream chain on every path.
        try (TarInputStream tarIn = new TarInputStream(new GZIPInputStream(
                new BufferedInputStream(new FileInputStream(file))),
                1024 * 2)) {
            createDirectory(outputDir, null); // create the output directory
            File root = new File(outputDir);
            TarEntry entry;
            while ((entry = tarIn.getNextEntry()) != null) {
                File target = resolveInside(root, entry.getName());
                if (entry.isDirectory()) {
                    target.mkdirs(); // empty directory entry
                    continue;
                }
                tmpFile = target;
                // Archives do not always contain explicit entries for parent directories.
                File parent = target.getParentFile();
                if (parent != null) {
                    parent.mkdirs();
                }
                try (OutputStream out = new FileOutputStream(target)) {
                    byte[] buffer = new byte[2048];
                    int length;
                    // TarInputStream.read stops at the end of the current entry.
                    while ((length = tarIn.read(buffer)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }
            }
        } catch (IOException ex) {
            throw new IOException("解压归档文件出现异常", ex);
        }
        return tmpFile;
    }

    /**
     * Creates a directory, including any missing parents.
     *
     * @param outputDir base directory
     * @param subDir    optional sub directory below {@code outputDir}; when {@code null}
     *                  or blank, {@code outputDir} itself is created
     */
    public static void createDirectory(String outputDir, String subDir) {
        File file = new File(outputDir);
        if (!(subDir == null || subDir.trim().equals(""))) { // a sub directory was given
            file = new File(outputDir, subDir);
        }
        if (!file.exists()) {
            // mkdirs() creates missing parents; getParentFile() may be null for a
            // single-segment relative path, so it is not dereferenced here.
            file.mkdirs();
        }
    }

    /** Resolves an entry name below {@code root} and rejects names that escape it. */
    private static File resolveInside(File root, String entryName) throws IOException {
        File target = new File(root, entryName);
        String rootPath = root.getCanonicalPath();
        String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;
        String targetPath = target.getCanonicalPath();
        if (!targetPath.equals(rootPath) && !targetPath.startsWith(rootPrefix)) {
            throw new IOException("Tar entry is outside of the target dir: " + entryName);
        }
        return target;
    }
}
------------------------------------------------------------------------------------------------------------------------
应用场景
在大数据的工作中,每天必不可少的就是和数据打交道,我们需要从我们的业务方将数据采集过来,然后根据我们的业务逻辑将数据解析并转换成我们所需要的格式!大数据分析往往数据量都是非常大的,一天几十T都是很正常,如果按正常的来采集的话,估计就是采集都要花费不少时间,最常用的方式就是将数据进行压缩之后再进行传输,这样的效率是比较高的,也节省了带宽资源!举一个简单的例子,我们的逻辑是xml原始文件先压缩成一个gz文件,再将上百个gz文件再二次压缩成一个tar.gz文件!一个压缩文件的大小大概是200M,但是解压出来就差不多20G!!!此篇文章就记录一下实现功能需求的过程!!!!
依赖
1
2
3
4
5
|
<dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> <version>1.5</version> </dependency> |
实现代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
|
import lombok.extern.slf4j.Slf4j; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.utils.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.util.zip.GZIPInputStream; @Slf4j public class FileUtils { private static final Logger LOGGER = LoggerFactory.getLogger(FileUtils. class ); public static void main(String[] args) { deCompressGZipFile( "path1" , "dir1" ); } /** * Tar文件解压方法 * * @param tarGzFile 要解压的压缩文件名称(绝对路径名称) * @param destDir 解压后文件放置的路径名(绝对路径名称)当路径不存在,会自动创建 * @return 解压出的文件列表 */ public static void deCompressGZipFile(String tarGzFile, String destDir) { // 建立输出流,用于将从压缩文件中读出的文件流写入到磁盘 TarArchiveEntry entry = null ; TarArchiveEntry[] subEntries = null ; File subEntryFile = null ; try (FileInputStream fis = new FileInputStream(tarGzFile); GZIPInputStream gis = new GZIPInputStream(fis); TarArchiveInputStream taris = new TarArchiveInputStream(gis);) { while ((entry = taris.getNextTarEntry()) != null ) { StringBuilder entryFileName = new StringBuilder(); entryFileName.append(destDir).append(File.separator).append(entry.getName()); File entryFile = new File(entryFileName.toString()); if (entry.isDirectory()) { if (!entryFile.exists()) { entryFile.mkdir(); } subEntries = entry.getDirectoryEntries(); for ( int i = 0 ; i < subEntries.length; i++) { try (OutputStream out = new FileOutputStream(subEntryFile)) { subEntryFile = new File(entryFileName + File.separator + subEntries[i].getName()); IOUtils.copy(taris, out); } catch (Exception e) { LOGGER.error( "deCompressing file failed:" + subEntries[i].getName() + "in" + tarGzFile); } } } else { checkFileExists(entryFile); OutputStream out = new FileOutputStream(entryFile); IOUtils.copy(taris, out); out.close(); //如果是gz文件进行递归解压 if (entryFile.getName().endsWith( ".gz" )) { String namepath = entryFile.getAbsolutePath(); compressGZ(namepath); } } } 
//如果需要刪除之前解压的gz文件,在这里进行 File dir = new File( "dir1" ); File[] files = dir.listFiles(); for (File f:files) { if (f.getName().split( "\\." ).length== 3 ){ f.delete(); } } } catch (Exception e) { LOGGER.warn( "decompress failed" , e); } } /** * 解压GZ文件 * @param pwd */ public static void compressGZ(String pwd){ if (!getExtension(pwd).equalsIgnoreCase( "gz" )) { System.err.println( "File name must have extension of \".gz\"" ); System.exit( 1 ); } GZIPInputStream in = null ; try { in = new GZIPInputStream( new FileInputStream(pwd)); } catch (FileNotFoundException e) { System.err.println( "File not found. " + pwd); System.exit( 1 ); } catch (IOException e) { e.printStackTrace(); } String outFileName = getFileName(pwd); FileOutputStream out = null ; try { out = new FileOutputStream(outFileName); } catch (FileNotFoundException e) { System.err.println( "Could not write to file. " + outFileName); System.exit( 1 ); } try { byte [] buf = new byte [ 1024 ]; int len; while ((len = in.read(buf)) > 0 ) { out.write(buf, 0 , len); } in.close(); out.close(); } catch (IOException e) { e.printStackTrace(); } } /** * Used to extract and return the extension of a given file. * @param f Incoming file to get the extension of * @return <code>String</code> representing the extension of the incoming * file. */ public static String getExtension(String f) { String ext = "" ; int i = f.lastIndexOf( '.' ); if (i > 0 && i < f.length() - 1 ) { ext = f.substring(i+ 1 ); } return ext; } /** * Used to extract the filename without its extension. * @param f Incoming file to get the filename * @return <code>String</code> representing the filename without its * extension. */ public static String getFileName(String f) { String fname = "" ; int i = f.lastIndexOf( '.' 
); if (i > 0 && i < f.length() - 1 ) { fname = f.substring( 0 ,i); } return fname; } public static void checkFileExists(File file) { //判断是否是目录 if (file.isDirectory()) { if (!file.exists()) { file.mkdir(); } } else { //判断父目录是否存在,如果不存在,则创建 if (file.getParentFile() != null && !file.getParentFile().exists()) { file.getParentFile().mkdirs(); } try { file.createNewFile(); } catch (IOException e) { e.printStackTrace(); } } } } |
这样就实现了将tar.gz文件中的xml原始文件全部解压出来!之后就可以将xml文件中的数据拿出来做解析和分析了!java解析xml文件的方式有多种,这里使用dom4j!!
依赖
1
2
3
4
5
|
<dependency> <groupId>org.dom4j</groupId> <artifactId>dom4j</artifactId> <version>2.1.3</version> </dependency> |
实现代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; /** * java DOM4j xml解析 */ public class JavaXMLTest { public static void main(String[] args) throws IOException, DocumentException { SAXReader reader = new SAXReader(); Document xmlDocument = reader.read( new FileInputStream( new File( "2.xml" ))); //System.out.println(doc==null?"未读取到xml文件":"已读取到xml文件"); //获取根节点 Element rootElement = xmlDocument.getRootElement(); //获取根节点下的直接子节点的个数和名字 List<Element> list = rootElement.elements( "fileHeader" ); //System.out.println("根节点下有"+list.size()+"直接子节点"); //获取根节点下 fileHeader节点得value值 for (Element elemet:list) { String reportTime = elemet.attributeValue( "reportTime" ); String startTime = elemet.attributeValue( "startTime" ); String endTime = elemet.attributeValue( "endTime" ); } //获取根节点下所有得 子节点 List<Element> list1 = rootElement.elements( "header" ); //System.out.println("根节点下header有"+list1.size()+"直接子节点"); //由于只有一个节点所以取get(0) Element header = list1.get( 0 ); //获取header节点得value值 String id = header.attributeValue( "id" ); //System.out.println("id是"+id); //获取header节点下所有 measure 节点 Element measure = header.elements( "measure" ).get( 0 ); //获取measurement节点下 smr节点 Element sm = measure.elements( "sm" ).get( 0 ); //获取smr节点的value值 String stringValue = sm.getStringValue(); //按照空格进行拆分 String[] objj = stringValue.split( " " ); //System.out.println("stringvalue===="+stringValue); //List<Element> smlist = smr.elements("obj"); //获取measure节点下所有的 obj 节点 List<Element> objlist = measurement.elements( "obj" ); //Map<String,String> map = new HashMap(); //遍历所有 obj节点 for (Element ob:objectlist) { //System.out.println(objj.length); //获取所有 obj节点 下的 v 节点 List<Element> vlist = ob.elements( "v" ); //遍历 v 节点 for (Element v:vlist) { //System.out.println("v得value值是"+v.getStringValue()); //获取v节点的value值 String[] vv = v.getStringValue().split( 
" " ); //System.out.println(vv.length); StringBuilder sb = new StringBuilder(); for ( int i= 0 ;i<objj.length;i++){ sb.append(objj[i]+ "=" +vv[i]); } System.out.println(sb.toString()); sb= null ; } } } } |
至此,利用java就完成了对tar.gz文件的解压,并对xml原始文件进行数据解析!!!这个方法对于小数据量是可以实现的!但是大数据都是基于分布式文件系统Hadoop构建的!我们的数据都是存储在hdfs上的,而且数据也非常大,这样解压文件写到本地文件系统中,再解析其中的数据上传至hdfs!同时也是要消耗带宽的!最终在测试的时候是不行的!这个方案就被否定了!