/**
 * Produces the input splits for a job from the files returned by
 * {@link #listStatus(JobConf)}, cutting files into several pieces when
 * they are too big.
 *
 * <p>The number of listed files is stored in the job under
 * {@code NUM_INPUT_FILES}. Each file is then handled as follows:
 * <ul>
 *   <li>a zero-length file yields one empty split with no hosts;</li>
 *   <li>a file for which {@code isSplitable} returns false yields one split
 *       covering the whole file;</li>
 *   <li>any other file is cut into pieces of the size returned by
 *       {@code computeSplitSize(goalSize, minSize, blockSize)}, followed by
 *       one last split holding whatever is left over.</li>
 * </ul>
 *
 * @param job the job configuration; also used to resolve each file's
 *            file system
 * @param numSplits requested number of splits; used to derive the goal size
 *                  per split (total bytes / numSplits, with 0 treated as 1)
 *                  and as the initial capacity of the result list
 * @return the generated splits, in file-listing order
 * @throws IOException if a listed entry is a directory, or if a call made
 *                     while listing files or looking up block locations
 *                     fails
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  // Times the whole computation; only reported in the debug log below.
  StopWatch sw = new StopWatch().start();

  FileStatus[] inputs = listStatus(job);
  // Save the number of input files for metrics/loadgen.
  job.setLong(NUM_INPUT_FILES, inputs.length);

  // Sum the lengths of all inputs, rejecting directories along the way.
  long totalBytes = 0;
  for (FileStatus input : inputs) {
    if (input.isDirectory()) {
      throw new IOException("Not a file: "+ input.getPath());
    }
    totalBytes += input.getLen();
  }

  // Target bytes per split; a request for 0 splits is treated as 1 so the
  // division is always defined.
  long divisor;
  if (numSplits == 0) {
    divisor = 1;
  } else {
    divisor = numSplits;
  }
  long goalSize = totalBytes / divisor;

  // Lower bound for a split: the larger of the configured minimum
  // (defaulting to 1 when unset) and this format's own minSplitSize.
  long configuredMin = job.getLong(
      org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1);
  long minSize = Math.max(configuredMin, minSplitSize);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  NetworkTopology clusterMap = new NetworkTopology();

  for (FileStatus input : inputs) {
    Path path = input.getPath();
    long length = input.getLen();

    if (length == 0) {
      // Create empty hosts array for zero length files.
      splits.add(makeSplit(path, 0, length, new String[0]));
      continue;
    }

    FileSystem fs = path.getFileSystem(job);
    // Reuse block locations already attached to the status when present;
    // otherwise ask the file system for them.
    BlockLocation[] locations = (input instanceof LocatedFileStatus)
        ? ((LocatedFileStatus) input).getBlockLocations()
        : fs.getFileBlockLocations(input, 0, length);

    if (!isSplitable(fs, path)) {
      // The file cannot be cut: one split spans all of it.
      String[][] wholeFileHosts =
          getSplitHostsAndCachedHosts(locations, 0, length, clusterMap);
      splits.add(makeSplit(path, 0, length,
          wholeFileHosts[0], wholeFileHosts[1]));
      continue;
    }

    long blockSize = input.getBlockSize();
    // NOTE(review): computeSplitSize is not visible here; presumably it
    // clamps min(goalSize, blockSize) from below by minSize -- confirm.
    long splitSize = computeSplitSize(goalSize, minSize, blockSize);

    // Emit full-size splits while the unread tail is more than SPLIT_SLOP
    // times the split size, so a slightly oversized tail is not broken off
    // into a tiny extra split.
    long offset = 0;
    while (((double) (length - offset)) / splitSize > SPLIT_SLOP) {
      String[][] chunkHosts = getSplitHostsAndCachedHosts(
          locations, offset, splitSize, clusterMap);
      splits.add(makeSplit(path, offset, splitSize,
          chunkHosts[0], chunkHosts[1]));
      offset += splitSize;
    }

    // Whatever is left becomes the final split for this file.
    long tail = length - offset;
    if (tail != 0) {
      String[][] tailHosts = getSplitHostsAndCachedHosts(
          locations, offset, tail, clusterMap);
      splits.add(makeSplit(path, offset, tail,
          tailHosts[0], tailHosts[1]));
    }
  }

  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits.toArray(new FileSplit[splits.size()]);
}