Requirement:
/**
* before:
* file A1.csv {1,2,3,4,5}
* file A2.csv {2,3,9,10,11}
* file B1.csv {5,12,13,14,15}
* file B2.csv {16,14,15,4,9,20,30}
* The files A1.csv, A2.csv, B1.csv and B2.csv must not share any ID.
*
* after:
* file A1.csv {1,4}
* file A2.csv {2,3,10,11}
* file B1.csv {12,13}
* file B2.csv {16,9,20,30}
*/
tangxin@tangxin:~/csvrepeat$ ls
A1.csv A2.csv B1.csv B2.csv
CSVUtilVersion2.java
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.springframework.util.StringUtils;
import java.io.*;
import java.lang.reflect.Array;
import java.util.*;
/**
* before:
* file A1.csv {1,2,3,4,5}
* file A2.csv {2,3,9,10,11}
* file B1.csv {5,12,13,14,15}
* file B2.csv {16,14,15,4,9,20,30}
* A1.csv A2.csv A3.csv A4.csv cant not repeat
*
* after:
* file A1.csv {1,4}
* file A2.csv {2,3,10,11}
* file B1.csv {12,13}
* file B2.csv {16,9,20,30}
*/
@Slf4j
public class CSVUtilVersion2 {
private static final String CSV_PATH = "/home/tangxin/csvrepeat/";
private static final boolean CREATE_SWITCH = true;
/**
* read single column data list
* @param path
* @return
*/
public static List<String> ids(String path) {
List<String> result = new ArrayList<>();
File csv = new File(path); // CSV文件路径
LineIterator it = null;
try {
it = FileUtils.lineIterator(csv);
while (it.hasNext()) {
String line = it.nextLine();
if (line.trim().contains("ID")) {
continue;
}
String[] arr = line.split(",");
String ID = arr[0];
ID = ID.replaceAll("\"", "").trim();
if (!StringUtils.isEmpty(ID)) {
result.add(ID);
}
}
} catch (Exception e) {
log.error("读取ID csv文件失败:{}", e.getMessage());
} finally {
LineIterator.closeQuietly(it);
}
return result;
}
/**
* from src delete oth
* @param src
* @param oth
* @return
*/
public static List removeAll(List src, List oth) {
LinkedList result = new LinkedList(src);
HashSet othHash = new HashSet(oth);
Iterator iter = result.iterator();
while (iter.hasNext()) {
if (othHash.contains(iter.next())) {
iter.remove();
}
}
return result;
}
/**
* -Xms1g -Xmx1g -XX:PermSize=128m -XX:SurvivorRatio=2 -XX:+UseParallelGC
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
//∑=1+2+3+...+(n-1) group
LinkedList<String> fileList = new LinkedList<>();
fileList.add("A1.csv");
fileList.add("A2.csv");
fileList.add("B1.csv");
fileList.add("B2.csv");
// fileList.add("C1.csv");
DescartesRepeat(fileList);
ded(fileList);
}
private static void DescartesRepeat(LinkedList<String> fileList) {
Set<String> repeatList = new HashSet<>();
Set<String> groupSet = new HashSet<>();
Set<String> goONList = new HashSet<>();
//A1->A2,B1,B2
for (int i = 0; i < fileList.size(); i++) {
String itemI = fileList.get(i);
for (int j = 0; j < fileList.size(); j++) {
String itemJ = fileList.get(j);
if (!itemI.equals(itemJ)) {
String groupR1 = itemI + "->" + itemJ;
String groupR2 = itemJ + "->" + itemI;
if (groupSet.contains(groupR1) || groupSet.contains(groupR2)){
continue;
}
groupSet.add(groupR1);
String repeatT = repeat(CSV_PATH + itemI, CSV_PATH + itemJ);
if(!StringUtils.isEmpty(repeatT)){
repeatList.add(repeatT);
//System.out.println(groupR1+"->"+repeatT);
}
}
}
}
if (CollectionUtils.isNotEmpty(repeatList)) {
// System.out.println(repeatList);
for (String repeatItem : repeatList) {
Iterator<String> iterator = fileList.iterator();
while (iterator.hasNext()) {
String oldItem = iterator.next();
String oldS = oldItem.replace(".csv", "").replace("-new","");
String repeatS = repeatItem.replace(".csv","").replace("-new","");
if (repeatS.contains(oldS)) {
iterator.remove();
goONList.add(repeatItem);
}
}
}
fileList.addAll(goONList);
System.out.println(fileList);
DescartesRepeat(fileList);
}
}
public static void ded(List<String> args) {
//保证指定csv列表每组都不能有重复数据
for (int i = 0; i < args.size(); i++) {
// if(i>0){
// continue;
// }
String source = CSV_PATH + args.get(i);
for (int j = 0; j < args.size(); j++) {
if (i == j) {
continue;
}
String target = CSV_PATH + args.get(j);
intersection(source, target);
}
}
}
public static void intersection(String sourcePath, String targetPath) {
List<String> ids1 = ids(sourcePath);
List<String> ids2 = ids(targetPath);
List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2);
System.out.println(sourcePath + "和" + targetPath + "的重复数据大小" + inter.size());
}
public static String repeat(String source, String target){
//cdd fund xyd
List<String> ids1 = ids(source);
List<String> ids2 = ids(target);
// System.out.println(source + "集合大小" + ids1.size());
// System.out.println(target + "集合大小" + ids2.size());
List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2);
// System.out.println("去重数据大小:" + inter.size());
if (inter != null && inter.size() > 0) {
if (ids1.size() > ids2.size()) {
return repeatInner(source, ids1, inter);
} else if (ids2.size() > ids1.size()) {
return repeatInner(target, ids2, inter);
} else {
return repeatInner(source, ids1, inter);
}
}
return "";
}
private static String repeatInner(String source, List<String> ids, List<String> inter) {
String newPath = source.replace(".csv", "-new.csv");
List<String> ids1new = removeAll(ids, inter);
createCSV(ids1new, newPath);
return newPath.replace(CSV_PATH,"");
}
/**
* 创建CSV文件
*/
public static void createCSV(List<String> list, String fileName) {
if(!CREATE_SWITCH){
// System.out.println("创建csv开关关闭");
return;
}else{
// System.out.println("创建csv开关开启");
}
// 表格头
Object[] head = {"ID"};
List<Object> headList = Arrays.asList(head);
//数据
List<List<Object>> dataList = new ArrayList<>();
List<Object> rowList = null;
for (int i = 0; i < list.size(); i++) {
rowList = new ArrayList<>();
rowList.add(list.get(i));
dataList.add(rowList);
}
File csvFile;
BufferedWriter csvWtriter = null;
try {
csvFile = new File(fileName);
File parent = csvFile.getParentFile();
if (parent != null && !parent.exists()) {
parent.mkdirs();
}
csvFile.createNewFile();
// GB2312使正确读取分隔符","
csvWtriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile), "GB2312"), 1024);
// 写入文件头部
writeRow(headList, csvWtriter);
// 写入文件内容
for (List<Object> row : dataList) {
writeRow(row, csvWtriter);
}
csvWtriter.flush();
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
csvWtriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 写一行数据
*
* @param row 数据列表
* @param csvWriter
* @throws IOException
*/
private static void writeRow(List<Object> row, BufferedWriter csvWriter) throws IOException {
for (Object data : row) {
StringBuffer sb = new StringBuffer();
String rowStr = sb.append("\"").append(data).append("\",").toString();
csvWriter.write(rowStr);
}
csvWriter.newLine();
}
}
tangxin@tangxin:~/csvrepeat$ ls
A1.csv A1-new.csv A1-new-new.csv A2.csv A2-new.csv B1.csv B2.csv B2-new.csv B2-new-new.csv