read & write

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
  struct fd f = fdget_pos(fd);
  ......
  loff_t pos = file_pos_read(f.file);
  ret = vfs_read(f.file, buf, count, &pos);
  ......
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
    size_t, count)
{
  struct fd f = fdget_pos(fd);
  ......
  loff_t pos = file_pos_read(f.file);
  ret = vfs_write(f.file, buf, count, &pos);
  ......
}
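For context before descending into the kernel, here is a minimal user-space program that drives both syscalls; the file name and sizes are arbitrary example values:

// Write a few bytes, rewind, and read them back.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
  char buf[16] = {0};
  int fd = open("demo.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (fd < 0) { perror("open"); return 1; }
  // Enters SYSCALL_DEFINE3(write, ...) above
  if (write(fd, "hello", 5) != 5) perror("write");
  // Reset the file position that fdget_pos/file_pos_read manage
  lseek(fd, 0, SEEK_SET);
  // Enters SYSCALL_DEFINE3(read, ...) above
  if (read(fd, buf, sizeof(buf) - 1) > 0) printf("%s\n", buf);
  close(fd);
  return 0;
}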

// vfs_read->__vfs_read
// vfs_write->__vfs_write

ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
       loff_t *pos)
{
  if (file->f_op->read)
    return file->f_op->read(file, buf, count, pos);
  else if (file->f_op->read_iter) // e.g. ext4_file_read_iter
    return new_sync_read(file, buf, count, pos);
  else
    return -EINVAL;
}

ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
        loff_t *pos)
{
  if (file->f_op->write)
    return file->f_op->write(file, p, count, pos);
  else if (file->f_op->write_iter) // e.g. ext4_file_write_iter
    return new_sync_write(file, p, count, pos);
  else
    return -EINVAL;
}
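When only ->read_iter/->write_iter exist, new_sync_read and new_sync_write bridge the flat-buffer interface to the iter-based one: they wrap the user buffer in a single-segment iov_iter, build a synchronous kiocb, and call the iter method. Roughly, in kernels of this era (read side shown; the write side is symmetric, and details vary by version):

static ssize_t new_sync_read(struct file *filp, char __user *buf,
    size_t len, loff_t *ppos)
{
  // Wrap the flat user buffer in a one-segment iov_iter
  struct iovec iov = { .iov_base = buf, .iov_len = len };
  struct kiocb kiocb;
  struct iov_iter iter;
  ssize_t ret;

  init_sync_kiocb(&kiocb, filp);  // synchronous kiocb: no completion callback
  kiocb.ki_pos = *ppos;
  iov_iter_init(&iter, READ, &iov, 1, len);

  ret = filp->f_op->read_iter(&kiocb, &iter);
  *ppos = kiocb.ki_pos;           // propagate the updated file position
  return ret;
}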

const struct file_operations ext4_file_operations = {
  ......
  .read_iter  = ext4_file_read_iter,
  .write_iter  = ext4_file_write_iter,
  ......
};

// ext4_file_read_iter -> generic_file_read_iter
// ext4_file_write_iter -> __generic_file_write_iter

ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
    ......
    if (iocb->ki_flags & IOCB_DIRECT) {
    ......
        struct address_space *mapping = file->f_mapping;
    ......
        retval = mapping->a_ops->direct_IO(iocb, iter);
    }
    ......
    retval = generic_file_buffered_read(iocb, iter, retval);
}

ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    ......
    if (iocb->ki_flags & IOCB_DIRECT) {
    ......
        written = generic_file_direct_write(iocb, from);
    ......
    } else {
    ......
        written = generic_perform_write(file, from, iocb->ki_pos);
    ......
    }
}

generic_file_read_iter and __generic_file_write_iter share the same basic logic: decide whether the page cache is used.

If IOCB_DIRECT is set, the address_space's direct_IO function is called to read or write the disk directly, bypassing the cache.

We met address_space when discussing mapping files into memory with mmap: it is what associates a file with memory pages during memory mapping.

The cache needs the same association between a file and memory pages, so it uses address_space as well. The operations on an address_space are defined in struct address_space_operations.

For ext4, the address_space operations are defined in ext4_aops, and the direct_IO entry corresponds to ext4_direct_IO.

ext4_direct_IO eventually reaches __blockdev_direct_IO->do_blockdev_direct_IO, which steps over the cache layer into the generic block layer and finally down to the device driver.

Since the file system sits on a block device, the blockdev-related functions are what get called here.

static const struct address_space_operations ext4_aops = {
  ......
  .direct_IO    = ext4_direct_IO,
  ......
};

/*
 * This is a library function for use by filesystem drivers.
 */
static inline ssize_t
do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
          struct block_device *bdev, struct iov_iter *iter,
          get_block_t get_block, dio_iodone_t end_io,
          dio_submit_t submit_io, int flags)
{......}
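From user space, direct I/O is requested by opening the file with O_DIRECT, which is how IOCB_DIRECT ends up set on the kiocb. O_DIRECT generally requires the buffer, file offset, and transfer length to be aligned, typically to the logical block size; the 4096 below is an assumption that suits most disks:

#define _GNU_SOURCE   // for O_DIRECT
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
  void *buf;
  if (posix_memalign(&buf, 4096, 4096)) return 1;  // block-aligned buffer
  memset(buf, 'x', 4096);

  int fd = open("direct.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);
  if (fd < 0) { perror("open"); return 1; }
  // This write takes the IOCB_DIRECT branch above and bypasses the page cache
  if (write(fd, buf, 4096) != 4096) perror("write");
  close(fd);
  free(buf);
  return 0;
}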

 

Buffered writes

ssize_t generic_perform_write(struct file *file,
        struct iov_iter *i, loff_t pos)
{
  struct address_space *mapping = file->f_mapping;
  const struct address_space_operations *a_ops = mapping->a_ops;
  do {
    struct page *page;
    unsigned long offset;  /* Offset into pagecache page */
    unsigned long bytes;  /* Bytes to write to page */
    // 1. For each page, first call the address_space's write_begin to do some preparation
    status = a_ops->write_begin(file, mapping, pos, bytes, flags,
            &page, &fsdata);
    // 2. Copy the data to be written from user space into the kernel page
    copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
    flush_dcache_page(page);
    // 3. Call the address_space's write_end to complete the write
    status = a_ops->write_end(file, mapping, pos, bytes, copied,
            page, fsdata);
    pos += copied;
    written += copied;

    // 4. Check whether dirty pages have piled up and need writing back to disk.
    //    A dirty page is one that has been written into the cache but not yet to disk.
    balance_dirty_pages_ratelimited(mapping);
  } while (iov_iter_count(i));
}

static const struct address_space_operations ext4_aops = {
......
  .write_begin    = ext4_write_begin,
  .write_end    = ext4_write_end,
......
};

struct page *grab_cache_page_write_begin(struct address_space *mapping,
          pgoff_t index, unsigned flags)
{
  struct page *page;
  int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
  page = pagecache_get_page(mapping, index, fgp_flags,
      mapping_gfp_mask(mapping));
  if (page)
    wait_for_stable_page(page);
  return page;
}

struct address_space {
  struct inode    *host;    /* owner: inode, block_device */
  struct radix_tree_root  page_tree;  /* radix tree of all pages */
  spinlock_t    tree_lock;  /* and lock protecting it */
......
}

Step 1: for ext4, write_begin is ext4_write_begin. It calls ext4_journal_start to do the journaling work, then calls grab_cache_page_write_begin to obtain the cache page to write into. Cache pages are kept in a radix tree; pagecache_get_page looks a page up in that tree by the long integer pgoff_t index, and creates one if it cannot be found.
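The index here is just the file position divided by the page size. A tiny user-space illustration of the same arithmetic (assuming 4 KiB pages, which is typical but architecture-dependent):

// How a file position maps to a page-cache index and an in-page offset;
// PAGE_SIZE is assumed to be 4096 here.
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
  unsigned long pos = 10000;                     // example file offset
  unsigned long index = pos / PAGE_SIZE;         // pgoff_t index -> page 2
  unsigned long offset = pos & (PAGE_SIZE - 1);  // offset in that page -> 1808
  unsigned long bytes = PAGE_SIZE - offset;      // at most this many bytes fit
  printf("index=%lu offset=%lu bytes=%lu\n", index, offset, bytes);
  return 0;
}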

Step 3: ext4_write_end completes the write. It calls ext4_journal_stop to finish the journal record, then block_write_end->__block_commit_write->mark_buffer_dirty to mark the modified cache page dirty. Note that "completing" the write does not touch the disk at all: the data has merely been written into the cache and marked dirty.

This leaves the data at risk: a crash would lose it. So a mechanism is needed that eventually writes the dirtied pages to disk for real, which we call writeback.
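Before looking at steps 2 and 4 of the loop in code, note that an application that cannot afford to lose data does not have to wait for writeback: it can force the flush itself with fsync(2) (or fdatasync(2), which skips non-essential metadata). A minimal sketch; the file name is arbitrary:

// Force the dirty pages of one file to disk instead of waiting for
// background writeback.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
  int fd = open("journal.log", O_WRONLY | O_CREAT | O_APPEND, 0644);
  if (fd < 0) { perror("open"); return 1; }
  if (write(fd, "commit\n", 7) != 7) perror("write");
  // The page dirtied by the write above is flushed now; when fsync
  // returns, the data has been handed to the device.
  if (fsync(fd) < 0) perror("fsync");
  close(fd);
  return 0;
}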

size_t iov_iter_copy_from_user_atomic(struct page *page,
    struct iov_iter *i, unsigned long offset, size_t bytes)
{
  // 1. Map the allocated page to a virtual address inside the kernel
  char *kaddr = kmap_atomic(page), *p = kaddr + offset;
  // 2. Copy the user-space data into the kernel page through that virtual address
  iterate_all_kinds(i, bytes, v,
    copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
    memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
         v.bv_offset, v.bv_len),
    memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
  )
  // 3. Remove the kernel mapping again
  kunmap_atomic(kaddr);
  return bytes;
}

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
  */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
  struct inode *inode = mapping->host;
  struct backing_dev_info *bdi = inode_to_bdi(inode);
  struct bdi_writeback *wb = NULL;
  int ratelimit;
......
  if (unlikely(current->nr_dirtied >= ratelimit))
    balance_dirty_pages(mapping, wb, current->nr_dirtied);
......
}
// When the number of dirty pages exceeds the configured threshold,
// balance_dirty_pages->wb_start_background_writeback is called to wake a
// background thread that starts the writeback.

void wb_start_background_writeback(struct bdi_writeback *wb)
{
  /*
   * We just wake up the flusher thread. It will perform background
   * writeback as soon as there is no other work to do.
   */
  wb_wakeup(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
  spin_lock_bh(&wb->work_lock);
  if (test_bit(WB_registered, &wb->state))
    mod_delayed_work(bdi_wq, &wb->dwork, 0);
  spin_unlock_bh(&wb->work_lock);
}

// bdi_wq is a global variable; all writeback tasks hang on this queue.
// mod_delayed_work queues a writeback task, a bdi_writeback, on it.

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * mod_delayed_work_on() on local CPU.
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
    struct delayed_work *dwork, unsigned long delay)
{....}

// The bdi_writeback hangs on bdi_wq as its dwork, with delay = 0, meaning
// it should not wait at all but run right away.

/* bdi means backing device info and describes the backing storage.
 * Every block device has one; it is initialized by bdi_init when the
 * block device is initialized, and initializing the bdi in turn calls
 * wb_init to initialize the bdi_writeback.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
    int blkcg_id, gfp_t gfp)
{
  wb->bdi = bdi;
  wb->last_old_flush = jiffies;
  INIT_LIST_HEAD(&wb->b_dirty);
  INIT_LIST_HEAD(&wb->b_io);
  INIT_LIST_HEAD(&wb->b_more_io);
  INIT_LIST_HEAD(&wb->b_dirty_time);
  wb->bw_time_stamp = jiffies;
  wb->balanced_dirty_ratelimit = INIT_BW;
  wb->dirty_ratelimit = INIT_BW;
  wb->write_bandwidth = INIT_BW;
  wb->avg_write_bandwidth = INIT_BW;
  spin_lock_init(&wb->work_lock);
  INIT_LIST_HEAD(&wb->work_list);
  // Initialize a timer; when it fires, wb_workfn is executed.
  INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
  wb->dirty_sleep = jiffies;
  ......
}

#define __INIT_DELAYED_WORK(_work, _func, _tflags)      \
  do {                                                  \
    INIT_WORK(&(_work)->work, (_func));                 \
    __setup_timer(&(_work)->timer, delayed_work_timer_fn,  \
        (unsigned long)(_work),                         \
        (_tflags) | TIMER_IRQSAFE);                     \
  } while (0)

The call chain from here on is:

wb_workfn->wb_do_writeback->wb_writeback->writeback_sb_inodes->__writeback_single_inode->do_writepages, which writes the pages to disk.
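When background writeback kicks in is governed by the vm.dirty_* sysctls: dirty_background_ratio is the percentage of memory that may be dirty before the flusher is woken, and dirty_ratio the point at which writers themselves are throttled. A small program that inspects the two thresholds (standard procfs paths; values are system-dependent):

// Print the two main writeback thresholds from procfs.
#include <stdio.h>

static void show(const char *path)
{
  char line[64];
  FILE *f = fopen(path, "r");
  if (f && fgets(line, sizeof(line), f))
    printf("%s: %s", path, line);
  if (f)
    fclose(f);
}

int main(void)
{
  show("/proc/sys/vm/dirty_background_ratio");  // wakes the flusher thread
  show("/proc/sys/vm/dirty_ratio");             // throttles the writers
  return 0;
}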

 

Buffered reads

static ssize_t generic_file_buffered_read(struct kiocb *iocb,
    struct iov_iter *iter, ssize_t written)
{
  struct file *filp = iocb->ki_filp;
  struct address_space *mapping = filp->f_mapping;
  struct inode *inode = mapping->host;
  for (;;) {
    struct page *page;
    pgoff_t end_index;
    loff_t isize;
    // 1. First look in the page cache for a cached page
    page = find_get_page(mapping, index);
    if (!page) {
      if (iocb->ki_flags & IOCB_NOWAIT)
        goto would_block;
      // 2. If none was found, read this page and do readahead as well
      page_cache_sync_readahead(mapping,
          ra, filp,
          index, last_index - index);
      // 3. After the readahead, retry the lookup; the page should be there now
      page = find_get_page(mapping, index);
      if (unlikely(page == NULL))
        goto no_cached_page;
    }
    // 4. Even if the first lookup found the page, check whether readahead should continue
    if (PageReadahead(page)) {
      // 5. If so, kick off an asynchronous readahead
      page_cache_async_readahead(mapping,
          ra, filp, page,
          index, last_index - index);
    }
    /*
     * Ok, we have the page, and it's up-to-date, so
     * now we can copy it to user space...
     * 6. Copy the contents from the kernel cache page to user space
     */
    ret = copy_page_to_iter(page, offset, nr, iter);
  }
}
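An application can steer this readahead logic from user space with posix_fadvise(2): POSIX_FADV_SEQUENTIAL enlarges the readahead window, and POSIX_FADV_WILLNEED starts readahead for a range immediately. A sketch; the file name and the 1 MiB range are arbitrary:

// Hint the kernel's readahead before a sequential scan.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
  char buf[4096];
  int fd = open("big.dat", O_RDONLY);
  if (fd < 0) { perror("open"); return 1; }
  // Declare sequential access: Linux enlarges the readahead window
  posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
  // Start readahead for the first 1 MiB right away
  posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
  while (read(fd, buf, sizeof(buf)) > 0)
    ;  // reads now mostly hit pages populated by readahead
  close(fd);
  return 0;
}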

 
