hadoop源码剖析--RawLocalFileSystem

RawLocalFileSystem是hadoop中实现的本地文件系统,在该类中与文件元数据和目录相关的操作,都是通过适配方式适配到java.io.File的对应API来完成的,适配过程简单,代码清晰。

1.文件元数据和目录相关的操作分析

下面主要以mkdirs()方法为例来窥探该类的实现和一些独到之处。

/****************************************************************
* Implement the FileSystem API for the raw local filesystem.
*
* 本地文件系统实现,文件元数据和目录相关的操作都是通过适配到java.io.File的对应API完成的。
*****************************************************************/
public class RawLocalFileSystem extends FileSystem {
  static final URI NAME = URI.create("file:///"); // URI (scheme "file") that identifies the local file system
  private Path workingDir; // NOTE(review): presumably the working directory used for relative paths — confirm against its uses outside this excerpt

 /**
   * Recursively creates the directory {@code f} together with any missing
   * ancestors. A directory that already exists counts as success, so
   * calling this repeatedly with the same path gives the same answer.
   *
   * @param f directory to create
   * @return {@code true} if every ancestor could be created (or already was
   *         a directory) and {@code f} itself was created or already is a
   *         directory; {@code false} otherwise
   * @throws IOException part of the declared signature; propagated from the
   *         recursive call
   */
  public boolean mkdirs(Path f) throws IOException {
    Path ancestor = f.getParent();
    File target = pathToFile(f);

    // A path that has a parent needs that parent in place first; stop
    // early when the parent chain cannot be built.
    if (ancestor != null && !mkdirs(ancestor)) {
      return false;
    }

    // File.mkdir() reports false when nothing was created, which includes
    // the "already there" case, so fall back to a directory check.
    if (target.mkdir()) {
      return true;
    }
    return target.isDirectory();
  }

  /**
   * {@inheritDoc}
   *
   * <p>Creates the directory hierarchy with {@link #mkdirs(Path)} and, only
   * when that succeeded, applies {@code permission} to {@code f}. If the
   * hierarchy could not be created, whatever currently exists at {@code f}
   * is left untouched.
   *
   * @param f directory to create
   * @param permission permission to apply to {@code f} after creation
   * @return the result of {@link #mkdirs(Path)}
   * @throws IOException if directory creation or the permission change fails
   */
  // Why not use java.io.File for the permission change? It does provide
  // setReadOnly/setWritable/setReadable/setExecutable, but those only
  // distinguish "owner" from "everybody", which is far coarser than chmod.
  // NOTE(review): the original author states that setPermission shells out
  // to "chmod"; that implementation is outside this excerpt — confirm.
  @Override
  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    boolean created = mkdirs(f);
    if (created) {
      // Skip the permission change on failure so that a pre-existing
      // non-directory at f does not get its mode rewritten.
      setPermission(f, permission);
    }
    return created;
  }

再看一下RawLocalFileSystem中的一个内部类RawLocalFileStatus:

 static class RawLocalFileStatus extends FileStatus {
    /*
     * No dedicated field records whether the permission data has been
     * fetched (the original note says extra fields here break at least
     * CopyFiles.FilePair()). Instead, an owner that is still the empty
     * string is taken to mean "not loaded yet".
     */
    private boolean isPermissionLoaded() {
      boolean ownerStillBlank = super.getOwner().equals("");
      return !ownerStillBlank;
    }
    
    /**
     * Builds a status from the local file {@code f}.
     *
     * <p>Length, directory flag and modification time are read from
     * {@code f}; the path is {@code f}'s path qualified against {@code fs}.
     * No owner, group or permission is passed to the superclass here.
     *
     * @param f local file to describe
     * @param defaultBlockSize block size forwarded to the superclass
     * @param fs file system used to qualify the path
     */
    RawLocalFileStatus(File f, long defaultBlockSize, FileSystem fs) {
      // NOTE(review): the literal 1 is presumably the replication factor —
      // confirm against the FileStatus constructor.
      super(f.length(), f.isDirectory(), 1, defaultBlockSize,
            f.lastModified(), new Path(f.getPath()).makeQualified(fs));
    }
    
    /**
     * Returns the permission of this file, running
     * {@code loadPermissionInfo()} first whenever
     * {@code isPermissionLoaded()} reports that nothing has been loaded.
     *
     * @return the permission held by the superclass after the optional load
     */
    @Override
    public FsPermission getPermission() {
      boolean alreadyLoaded = isPermissionLoaded();
      if (alreadyLoaded) {
        return super.getPermission();
      }
      loadPermissionInfo();
      return super.getPermission();
    }

  //使用'ls -ld'命令来获取权限信息
    /**
     * Fills in permission, owner and group by running the command returned
     * by {@code Shell.getGET_PERMISSION_COMMAND()} on this file and parsing
     * its whitespace-separated output (first token: mode string, third:
     * owner, fourth: group).
     *
     * <p>If the command exits with code 1, all three values are set to
     * {@code null}. Any other exit code, or any other {@code IOException},
     * is rethrown as a {@code RuntimeException}.
     */
    private void loadPermissionInfo() {
      try {
        String listing = FileUtil.execCommand(new File(getPath().toUri()),
                                              Shell.getGET_PERMISSION_COMMAND());
        StringTokenizer fields = new StringTokenizer(listing);
        // expected format
        // -rw-------    1 username groupname ...
        String mode = fields.nextToken();
        if (mode.length() > 10) {
          // files with ACLs might carry a trailing '+'
          mode = mode.substring(0, 10);
        }
        setPermission(FsPermission.valueOf(mode));
        fields.nextToken(); // second column is not used
        setOwner(fields.nextToken());
        setGroup(fields.nextToken());
      } catch (Shell.ExitCodeException exitFailure) {
        if (exitFailure.getExitCode() == 1) {
          // Exit code 1 is tolerated: clear the values instead of failing.
          setPermission(null);
          setOwner(null);
          setGroup(null);
          return;
        }
        throw permissionCommandFailure(exitFailure);
      } catch (IOException ioFailure) {
        throw permissionCommandFailure(ioFailure);
      }
    }

    /** Wraps a failure of the permission command in an unchecked exception. */
    private RuntimeException permissionCommandFailure(IOException failure) {
      return new RuntimeException("Error while running command to get " +
                                  "file permissions : " +
                                  StringUtils.stringifyException(failure));
    }

通过以上两段代码可以看出,hadoop本地文件系统的实现是在java语言提供的File类的基础上,做了一些适合自身的调整来达到目标。需要注意的是,获取权限信息时要调用linux的shell命令,这需要由java虚拟机创建一个新的子进程来执行,会消耗较多的资源。

2. 文件的读分析

RawLocalFileSystem使用LocalFSFileInputStream和LocalFSFileOutputStream进行读写。

/*******************************************************
   * For open()'s FSInputStream
   *******************************************************/
  // Input stream for the local file system; open() wraps it before handing it out.
  class LocalFSFileInputStream extends FSInputStream {
    FileInputStream fis; // underlying file input stream
    private long position; // offset maintained by seek(), skip() and the sequential read() methods

    /**
     * Opens the local file that {@code f} maps to. The stream actually
     * used underneath is a TrackingFileInputStream.
     *
     * @param f path of the file to read
     * @throws IOException if the underlying stream cannot be opened
     */
    public LocalFSFileInputStream(Path f) throws IOException {
      this.fis = new TrackingFileInputStream(pathToFile(f));
    }

    /** Moves the underlying channel to {@code pos} and records it as the current position. */
    public void seek(long pos) throws IOException {
      fis.getChannel().position(pos);
      this.position = pos;
    }

    /** Returns the position tracked by this stream. */
    public long getPos() throws IOException {
      return this.position;
    }

    /**
     * Always returns {@code false}. Per the original author's note, the
     * local file system has no notion of switching to another block source.
     */
    public boolean seekToNewSource(long targetPos) throws IOException {
      return false;
    }

    /*
     * Just forward to the fis
     */
    /** Forwards to the underlying stream's {@code available()}. */
    public int available() throws IOException { return fis.available(); }

    /** Closes the underlying stream. */
    public void close() throws IOException { fis.close(); }

    /**
     * Mark/reset is not supported. This is the correctly spelled method
     * that matches {@code InputStream.markSupported()}; the original class
     * only declared the misspelled {@code markSupport()}.
     */
    public boolean markSupported() { return false; }

    /**
     * Misspelled predecessor of {@link #markSupported()}, kept so existing
     * callers keep compiling.
     *
     * @deprecated use {@link #markSupported()}
     */
    @Deprecated
    public boolean markSupport() { return markSupported(); }

    /**
     * Reads one byte and advances the tracked position when a byte was
     * actually read, so that getPos() stays correct.
     *
     * @return the byte read, or a negative value at end of stream
     * @throws FSError if the underlying read throws an IOException
     */
    public int read() throws IOException {
      try {
        int value = fis.read();
        if (value >= 0) {
          this.position++; // keep the tracked position in step
        }
        return value;
      } catch (IOException e) {                 // unexpected exception
        throw new FSError(e);                   // assume native fs error
      }
    }

    /**
     * Reads up to {@code len} bytes into {@code b} starting at {@code off}
     * and advances the tracked position by the number of bytes read.
     *
     * @return the value returned by the underlying read
     * @throws FSError if the underlying read throws an IOException
     */
    public int read(byte[] b, int off, int len) throws IOException {
      try {
        int value = fis.read(b, off, len);
        if (value > 0) {
          this.position += value;
        }
        return value;
      } catch (IOException e) {                 // unexpected exception
        throw new FSError(e);                   // assume native fs error
      }
    }

    /**
     * Positional read through the file channel; the tracked position is
     * not modified by this method.
     *
     * NOTE(review): this path goes through the channel rather than the
     * read() overrides of TrackingFileInputStream, so these bytes do not
     * appear to be counted in the read statistics — confirm whether that
     * is intended.
     *
     * @param position file offset to read from
     * @return the value returned by the channel read
     * @throws FSError if the channel read throws an IOException
     */
    public int read(long position, byte[] b, int off, int len)
      throws IOException {
      ByteBuffer bb = ByteBuffer.wrap(b, off, len);
      try {
        return fis.getChannel().read(bb, position);
      } catch (IOException e) {
        throw new FSError(e);
      }
    }

    /**
     * Skips via the underlying stream and advances the tracked position by
     * the number of bytes it reports as skipped.
     *
     * @return the value returned by the underlying skip
     */
    public long skip(long n) throws IOException {
      long value = fis.skip(n);
      if (value > 0) {
        this.position += value;
      }
      return value;
    }
  }

可以看到LocalFSFileInputStream类实际使用的读流是TrackingFileInputStream

// Overrides the read methods of FileInputStream so that the number of bytes read is added to the statistics.
  // The counting is layered on by subclassing FileInputStream; each override delegates to super and then records the result.
  class TrackingFileInputStream extends FileInputStream {
    public TrackingFileInputStream(File f) throws IOException {
      super(f);
    }

    /** Reads a single byte; counts one byte unless end of stream (-1) was returned. */
    public int read() throws IOException {
      int singleByte = super.read();
      if (singleByte == -1) {
        return singleByte;
      }
      statistics.incrementBytesRead(1);
      return singleByte;
    }

    /** Reads into the whole array; counts the returned byte count unless it is -1. */
    public int read(byte[] data) throws IOException {
      int count = super.read(data);
      if (count == -1) {
        return count;
      }
      statistics.incrementBytesRead(count);
      return count;
    }

    /** Reads into a slice of the array; counts the returned byte count unless it is -1. */
    public int read(byte[] data, int offset, int length) throws IOException {
      int count = super.read(data, offset, length);
      if (count == -1) {
        return count;
      }
      statistics.incrementBytesRead(count);
      return count;
    }
  }

那么RawLocalFileSystem和LocalFSFileInputStream是如何对接起来进行读操作的呢?做法和java的api类似:RawLocalFileSystem通过open()和create()方法来创建对应的读写流,其中open()方法创建LocalFSFileInputStream。下面以RawLocalFileSystem的open()方法为例进行分析:

/**
 * Opens {@code f} for reading.
 *
 * <p>The raw LocalFSFileInputStream is wrapped in a BufferedFSInputStream
 * of {@code bufferSize} and then in an FSDataInputStream.
 *
 * @param f file to open
 * @param bufferSize buffer size handed to the buffering wrapper
 * @return the wrapped input stream
 * @throws FileNotFoundException if {@code exists(f)} is false
 * @throws IOException if creating any of the streams fails
 */
public FSDataInputStream open(Path f, int bufferSize) throws IOException {
  if (exists(f)) {
    LocalFSFileInputStream raw = new LocalFSFileInputStream(f);
    BufferedFSInputStream buffered = new BufferedFSInputStream(raw, bufferSize);
    return new FSDataInputStream(buffered);
  }
  throw new FileNotFoundException(f.toString());
}
/**
 * A {@link DataInputStream} that additionally exposes seeking and
 * positioned reads by forwarding them to the wrapped stream, which must
 * itself implement both {@code Seekable} and {@code PositionedReadable}.
 */
public class FSDataInputStream extends DataInputStream
    implements Seekable, PositionedReadable, Closeable {

  /**
   * Wraps {@code in}.
   *
   * @param in stream to wrap
   * @throws IllegalArgumentException if {@code in} lacks either of the
   *         two required interfaces
   * @throws IOException part of the declared signature
   */
  public FSDataInputStream(InputStream in)
    throws IOException {
    super(in);
    boolean canSeek = in instanceof Seekable;
    boolean canReadAt = in instanceof PositionedReadable;
    if (!(canSeek && canReadAt)) {
      throw new IllegalArgumentException(
          "In is not an instance of Seekable or PositionedReadable");
    }
  }

  /** Views the wrapped stream through its Seekable interface. */
  private Seekable seekableIn() {
    return (Seekable) in;
  }

  /** Views the wrapped stream through its PositionedReadable interface. */
  private PositionedReadable positionedIn() {
    return (PositionedReadable) in;
  }

  /** Forwards the seek to the wrapped stream. */
  public synchronized void seek(long desired) throws IOException {
    seekableIn().seek(desired);
  }

  /** Returns the position reported by the wrapped stream. */
  public long getPos() throws IOException {
    return seekableIn().getPos();
  }

  /** Forwards the positioned read to the wrapped stream. */
  public int read(long position, byte[] buffer, int offset, int length)
    throws IOException {
    return positionedIn().read(position, buffer, offset, length);
  }

  /** Forwards the positioned full read of a buffer slice to the wrapped stream. */
  public void readFully(long position, byte[] buffer, int offset, int length)
    throws IOException {
    positionedIn().readFully(position, buffer, offset, length);
  }

  /** Positioned full read of the entire buffer, forwarded as a slice starting at 0. */
  public void readFully(long position, byte[] buffer)
    throws IOException {
    positionedIn().readFully(position, buffer, 0, buffer.length);
  }

  /** Forwards the request to switch source to the wrapped stream. */
  public boolean seekToNewSource(long targetPos) throws IOException {
    return seekableIn().seekToNewSource(targetPos);
  }
}

获取到读流后就可以调用流的读取方法进行读取了。

3. 文件的写分析

至于写操作,还是和java中的写保持一致的,支持append和随机写两种方式。

posted @ 2015-03-13 16:42  JerryShao  阅读(1115)  评论(0)  编辑  收藏  举报