HDFS API operations: uploading large files, merging, deleting, modifying, viewing, copying, and moving files.
Covers file merging, large-file streaming with IOUtils, obtaining the local file system, merge-on-upload, and merge-on-download.
package com.byd.bigdata.spark.job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.io.*;
import java.net.URL;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.UUID;
public class RoadCsvRead2Ods {
public static void main(String[] args) {
// String filePath =args[0];
// if(filePath == null){
// filePath = "/data/bigdatamgr/blims/20220816/HCEC-2335-20220719-153805-164659.csv";
// }
// read2Ods(filePath);
getAllFiles();
}
public static void getAllFiles() {
// 1. Kerberos authentication
URL resource = Thread.currentThread().getContextClassLoader().getResource("");
String basePath = resource.getPath();
Configuration conf = new Configuration();
String keyTab = basePath + "kerberos/prod/ic.bigdatamgr.keytab";
String krb5 = basePath + "kerberos/prod/krb5.conf";
String principal = "ic.bigdatamgr";
System.setProperty("java.security.krb5.conf", krb5);
conf.set("hadoop.security.authentication", "kerberos");
UserGroupInformation.setConfiguration(conf);
try {
UserGroupInformation.loginUserFromKeytab(principal, keyTab);
UserGroupInformation loginUser = UserGroupInformation.getLoginUser();
loginUser.doAs(new PrivilegedAction<Object>() {
@Override
public Object run() {
conf.addResource(basePath + "hadoop/prod/core-site.xml");
conf.addResource(basePath + "hadoop/prod/hdfs-site.xml");
conf.addResource(basePath + "hadoop/prod/yarn-site.xml");
try {
FileSystem fs = FileSystem.newInstance(conf);
FileContext fc = FileContext.getFileContext(conf);
LocalFileSystem localFileSystem = FileSystem.getLocal(conf);
FileContext.Util util = fc.util(); // copy or move files within the cluster
// 1. Download
downLoad(fs, localFileSystem, fc);
fs.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
return null;
}
});
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public void cat(FileSystem fs) throws Exception {
FSDataInputStream inputStream = fs.open(new Path("/hdfsapi/test3/a.txt"));
IOUtils.copyBytes(inputStream, System.out, 1024);
inputStream.close();
}
/**
* Rename a file.
*
* @throws Exception
*/
public void rename(FileSystem fs) throws Exception {
Path oldPath = new Path("/hdfsapi/test3/a.txt");
Path newPath = new Path("/hdfsapi/test3/b.txt");
fs.rename(oldPath, newPath);
}
// Upload a large file, printing a progress indicator
public void copyFromLocalFileWithProgress(FileSystem fs) throws Exception {
InputStream in = new BufferedInputStream(new FileInputStream(new File("D:\\迅雷下载\\用1.mp4")));
FSDataOutputStream outputStream = fs.create(new Path("/hdfsapi/test/gakki.mp4"),
new Progressable() {
@Override
public void progress() {
System.out.print("."); // progress indicator
}
});
IOUtils.copyBytes(in, outputStream, 4096);
in.close();
outputStream.close();
}
public static void upload(FileSystem fs, FileSystem local) {
Path targetPath = new Path("/data/bigdatamgr/blims_ori/blf/20220804/SA3HF-8697-20220619-235730-235800.blf");
Path srcPath = new Path("D:\\test\\a.txt");
//1. Simplest: copyFromLocalFile
try {
fs.copyFromLocalFile(false, false, srcPath, targetPath);
} catch (IOException e) {
throw new RuntimeException(e);
}
//2. Walk the directory and copy sequentially with IOUtils.copyBytes(); the input can come from the local FileSystem or directly from new FileInputStream() (two variants)
try {
// get the local file system
LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
if (localFs.isDirectory(srcPath)) {
RemoteIterator<LocatedFileStatus> lsfr = localFs.listFiles(srcPath, true);
while (lsfr.hasNext()) {
LocatedFileStatus next = lsfr.next();
FSDataInputStream fin = localFs.open(next.getPath()); // read from the local FS, not the remote one
FSDataOutputStream fout = fs.create(new Path("/tmp/data/" + next.getPath().getName()));
IOUtils.copyBytes(fin, fout, 4096);
fin.close();
fout.close();
}
} else {
// InputStream fin = localFs.open(srcPath);
InputStream fin = new FileInputStream("D:\\test\\a.txt");
FSDataOutputStream fout = fs.create(targetPath);
IOUtils.copyBytes(fin, fout, 4096);
fin.close();
fout.close();
}
} catch (IOException e) {
throw new RuntimeException(e);
}
//3. Large-file upload with a progress callback
InputStream in = null;
try {
in = new BufferedInputStream(new FileInputStream(new File("D:\\迅雷下载\\用1.mp4")));
FSDataOutputStream outputStream = fs.create(new Path("/hdfsapi/test/gakki.mp4"),
new Progressable() {
@Override
public void progress() {
System.out.print("."); // progress indicator
}
});
IOUtils.copyBytes(in, outputStream, 4096);
} catch (IOException e) { // FileNotFoundException is a subclass of IOException
throw new RuntimeException(e);
}
}
// Download
public static void downLoad(FileSystem fs, FileSystem local, FileContext fc) {
Path srcPath = new Path("/data/bigdatamgr/blims_ori/blf/20220804/SA3HF-8697-20220619-235730-235800.blf");
Path targetPath = new Path("D:\\test\\a.txt");
// 1. copyToLocalFile
try {
fs.copyToLocalFile(false, srcPath, targetPath);
} catch (IOException e) {
throw new RuntimeException(e);
}
// 2. Byte-stream copy
FileOutputStream localFileOut = null;
FSDataInputStream remoteFin = null;
try {
remoteFin = fs.open(srcPath);
localFileOut = new FileOutputStream("D:\\test\\a.txt");
IOUtils.copyBytes(remoteFin, localFileOut, 2048);
remoteFin.close();
localFileOut.close();
} catch (IOException e) { // FileNotFoundException is a subclass of IOException
throw new RuntimeException(e);
}
// 3. Merge before download: concatenate every file under the directory into a single local file
try {
LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
FSDataOutputStream fsdOut = localFs.create(targetPath);
if (fs.isDirectory(srcPath)) {
RemoteIterator<LocatedFileStatus> lfsr = fs.listFiles(srcPath, true);
while (lfsr.hasNext()) {
LocatedFileStatus next = lfsr.next();
FSDataInputStream fdIn = fs.open(next.getPath());
IOUtils.copyBytes(fdIn, fsdOut, 2048);
fdIn.close();
}
fsdOut.close();
} else {
FSDataInputStream fsIn = fs.open(srcPath);
IOUtils.copyBytes(fsIn, fsdOut, 2048);
fsIn.close();
fsdOut.close();
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
// Merge small files on the cluster
public static void mergeFile(FileSystem fs) {
Path targetPath = new Path("/tmp/merger");
Path srcPath = new Path("/tmp/files");
try {
FSDataOutputStream fout = fs.create(targetPath);
if (fs.isDirectory(srcPath)) {
RemoteIterator<LocatedFileStatus> lsfr = fs.listFiles(srcPath, true);
while (lsfr.hasNext()) {
LocatedFileStatus next = lsfr.next();
FSDataInputStream fin = fs.open(next.getPath());
IOUtils.copyBytes(fin, fout, 4096);
fin.close();
}
}
fout.close();
// note: fs is passed in by the caller, so it is not closed here
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public static void copyOrMoveFiles() {
Path targetPath = new Path("/tmp/merger");
Path srcPath = new Path("/tmp/files");
Configuration conf = new Configuration();
conf.addResource("basedir/hdfs-site.xml");
conf.addResource("basedir/core-site.xml");
try {
// FileSystem fs = FileSystem.newInstance(conf);
FileContext fc = FileContext.getFileContext(conf);
FileContext.Util util = fc.util();
util.copy(srcPath, targetPath, false, false);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public static void read2Ods(String filePath) {
SparkSession spark = SparkSession.builder().appName("readCsv2OdsJob")
.master("yarn")
.enableHiveSupport()
.getOrCreate();
Dataset<Row> ds = spark.read()
.option("header", "true")
.option("delimiter", ",")
.csv(filePath);
ArrayList<StructField> resStructType = new ArrayList<>();
resStructType.add(DataTypes.createStructField("id", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("car_vin", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("s_id", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("channel", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("millisecond", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("micro_sec", DataTypes.StringType, true));
resStructType.add(DataTypes.createStructField("s_data", DataTypes.StringType, true));
StructType structType = DataTypes.createStructType(resStructType);
Dataset<Row> transDs = ds.map(new MapFunction<Row, Row>() {
@Override
public Row call(Row row) throws Exception {
String id = UUID.randomUUID().toString();
String car_vin = row.getAs("car_vin");
String s_id = row.getAs("s_id");
String channel = row.getAs("channel");
String millisecond = row.getAs("millisecond");
String micro_sec = row.getAs("micro_sec");
String s_data = row.getAs("s_data");
return RowFactory.create(id, car_vin, s_id, channel, millisecond, micro_sec, s_data);
}
}, RowEncoder.apply(structType));
transDs.show(10);
// transDs.write().mode(SaveMode.Append).saveAsTable("");
// transDs.write().mode(SaveMode.Overwrite).insertInto("");
}
public static void loadCsv() {
SparkSession spark = SparkSession.builder().appName("Move").enableHiveSupport().getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
Configuration conf = jsc.hadoopConfiguration();
try {
FileSystem fs = FileSystem.get(conf);
Path path = new Path("/tmp/blims");
// boolean directory = fs.isDirectory(path);
// getExistsFile(fs,path);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}