Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

0. 引言
1. open() syscall
2. close() syscall

0. 引言

在linux的哲学中，所有的磁盘文件、目录、外设设备、驱动设备全部被抽象为了"文件"这个概念，所以本文提到的"File IO"适用于linux下所有的IO操作。需要明白的是，本文分析的是linux下的IO系统调用对应的内核源代码：linux下每一个系统调用都有对应的内核源代码，而我们在ring3编程时常用的glibc等C库API只是对系统调用的一层封装，最终还是要通过系统调用实现功能

0x1: SYSCALL_DEFINE宏定义

我们在学习内核源代码的时候经常会遇到一个宏定义: SYSCALL_DEFINE，所有的系统调用的声明都通过它来实现

\linux-2.6.32.63\include\linux\syscalls.h

/*
 * SYSCALL_DEFINE0(name) - declare a system call that takes no arguments.
 *
 * With CONFIG_FTRACE_SYSCALLS enabled, the macro first emits the syscall
 * enter/exit trace events and a struct syscall_metadata record (stored in
 * the "__syscalls_metadata" section, with .nb_args = 0), and then the
 * "asmlinkage long sys_<name>(void)" prototype.  Without it, only the
 * prototype is produced.
 *
 * The opening conditional is required: without it the #else/#endif pair
 * below is unbalanced and the two definitions of SYSCALL_DEFINE0 collide.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINE0(sname)                    \
    SYSCALL_TRACE_ENTER_EVENT(_##sname);            \
    SYSCALL_TRACE_EXIT_EVENT(_##sname);            \
    static const struct syscall_metadata __used        \
      __attribute__((__aligned__(4)))            \
      __attribute__((section("__syscalls_metadata")))    \
      __syscall_meta_##sname = {                \
        .name         = "sys_"#sname,            \
        .nb_args     = 0,                \
        .enter_event    = &event_enter__##sname,    \
        .exit_event    = &event_exit__##sname,        \
    };                            \
    asmlinkage long sys_##sname(void)
#else
    #define SYSCALL_DEFINE0(name)       asmlinkage long sys_##name(void)
#endif

/*
 * SYSCALL_DEFINE1..SYSCALL_DEFINE6 - declare a system call taking 1 to 6
 * arguments.  Each variant forwards to SYSCALL_DEFINEx with the argument
 * count, the syscall name prefixed with '_', and the (type, name) pairs
 * passed through unchanged in __VA_ARGS__.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

...

/*
 * SYSCALL_DEFINEx(x, sname, ...) - common back end of SYSCALL_DEFINE1..6.
 *
 * x is the argument count and sname the '_'-prefixed syscall name.
 * With CONFIG_FTRACE_SYSCALLS the macro additionally emits two string
 * tables (types_<sname> and args_<sname>, built by __SC_STR_TDECLx and
 * __SC_STR_ADECLx) plus SYSCALL_METADATA(sname, x) before handing over to
 * __SYSCALL_DEFINEx.  Without it, it is a plain alias for
 * __SYSCALL_DEFINEx.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
    #define SYSCALL_DEFINEx(x, sname, ...)                \
        static const char *types_##sname[] = {            \
            __SC_STR_TDECL##x(__VA_ARGS__)            \
        };                            \
        static const char *args_##sname[] = {            \
            __SC_STR_ADECL##x(__VA_ARGS__)            \
        };                            \
        SYSCALL_METADATA(sname, x);                \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
    #define SYSCALL_DEFINEx(x, sname, ...)                \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif

/*
 * __SYSCALL_DEFINEx(x, name, ...) - emit the actual function header.
 *
 * With CONFIG_HAVE_SYSCALL_WRAPPERS three symbols are produced:
 *   - sys<name>:  prototype with the declared argument types
 *                 (__SC_DECLx);
 *   - SyS<name>:  asmlinkage wrapper taking the __SC_LONGx form of the
 *                 arguments; it runs __SC_TESTx, converts the arguments
 *                 with __SC_CASTx and calls SYSC<name>;
 *   - SYSC<name>: static inline function whose body is the block that
 *                 follows the macro invocation.
 * SYSCALL_ALIAS then makes sys<name> an alias of SyS<name>.
 * NOTE(review): the wrapper presumably exists so that arguments arrive
 * in full-width registers before being narrowed - confirm against the
 * architecture documentation.
 *
 * Without the option the macro expands directly to
 * "asmlinkage long sys<name>(args)", so the block following the
 * invocation becomes the body of sys<name> itself.
 *
 * SYSCALL_DEFINE(name) is the variant used when the argument list is
 * written out by hand after the macro.
 */
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
    #define SYSCALL_DEFINE(name) static inline long SYSC_##name
    #define __SYSCALL_DEFINEx(x, name, ...)                    \
    asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));        \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));    \
    asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))        \
    {                                \
        __SC_TEST##x(__VA_ARGS__);                \
        return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));    \
    }                                \
    SYSCALL_ALIAS(sys##name, SyS##name);                \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
    #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
    #define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

所以对函数定义

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等于
asmlinkage long sys_socket(int family, int type, int protocol)

Relevant Link:

http://blog.csdn.net/p_panyuch/article/details/5648007

1. open() syscall

open()系统调用在kernel中对应的是sys_open()

\linux-2.6.32.63\fs\open.c

/*
 * sys_open - kernel entry point of the open(2) system call.
 *
 * @filename: user-space pointer to the path to open
 * @flags:    open flags supplied by the caller
 * @mode:     mode supplied by the caller, forwarded unchanged
 *
 * Returns whatever do_sys_open() returns.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long ret;

    /* O_LARGEFILE is OR-ed in whenever force_o_largefile() is true. */
    flags |= force_o_largefile() ? O_LARGEFILE : 0;

    /* The real work happens in do_sys_open(), relative to AT_FDCWD. */
    ret = do_sys_open(AT_FDCWD, filename, flags, mode);

    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, ret, filename, flags, mode);

    return ret;
}

继续跟进do_sys_open()函数

/*
 * do_sys_open - common implementation behind the open-style syscalls.
 *
 * @dfd:      directory descriptor the lookup is relative to
 * @filename: user-space pointer to the path
 * @flags:    open flags
 * @mode:     mode handed on to do_filp_open()
 *
 * Returns the new file descriptor on success, or a negative error code
 * taken from getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
    struct file *filp;
    char *kname;
    int fd;

    /* getname() hands back the path for kernel use, or an ERR_PTR. */
    kname = getname(filename);
    fd = PTR_ERR(kname);
    if (IS_ERR(kname))
        return fd;

    /* Reserve a descriptor number first; negative means failure. */
    fd = get_unused_fd_flags(flags);
    if (fd < 0)
        goto out_putname;

    /* The actual open; this is where the path gets resolved. */
    filp = do_filp_open(dfd, kname, flags, mode, 0);
    if (IS_ERR(filp)) {
        /* Open failed: give the reserved descriptor back. */
        put_unused_fd(fd);
        fd = PTR_ERR(filp);
        goto out_putname;
    }

    /* Open succeeded: send the open notification for the dentry ... */
    fsnotify_open(filp->f_path.dentry);
    /* ... and bind the struct file to the reserved descriptor. */
    fd_install(fd, filp);

out_putname:
    /* Release the name buffer obtained from getname(). */
    putname(kname);
    return fd;
}

继续跟进do_filp_open()函数

/*
 * do_filp_open - resolve a pathname and open it as a struct file.
 *
 * @dfd:       directory descriptor handed to the path lookup
 * @pathname:  path to open (already in kernel space)
 * @open_flag: open(2)-style flags as passed by the caller
 * @mode:      creation mode, stored in the open intent
 * @acc_mode:  access mask to check; when 0 it is derived from the flags
 *
 * Returns the opened struct file on success or ERR_PTR(error) on failure.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode)
{
    /* Local state */
    struct file *filp;
    struct nameidata nd;
    int error;
    struct path path;
    struct dentry *dir;
    int count = 0;      /* number of symlinks followed by hand in do_link */
    int will_write;
    /*
     * Convert the open(2) flags into the namei-internal form.
     * NOTE(review): the original annotation says the conversion
     * increments the flag value - confirm in open_to_namei_flags().
     */
    int flag = open_to_namei_flags(open_flag);
    /* No explicit access mode given: derive it from the flags */
    if (!acc_mode)
    {
        acc_mode = MAY_OPEN | ACC_MODE(flag);
    } 

    /* O_TRUNC implies we need access checks for write permissions */
    if (flag & O_TRUNC)
    {
        acc_mode |= MAY_WRITE;
    } 

    /* Allow the LSM permission hook to distinguish append access from general write access. */
    if (flag & O_APPEND)
    {
        acc_mode |= MAY_APPEND;
    } 

    /* The simplest case - just a plain lookup. */
    /* Taken when O_CREAT is not set */
    if (!(flag & O_CREAT)) 
    { 
        /*
        Before the kernel can access a file it has to find it.  In the VFS
        that lookup is done by path_lookup() or path_lookup_open(), which
        turn the path string into a dentry.
        NOTE(review): the original annotation adds that these helpers also
        set up the matching inode and file structures - their bodies are
        not shown here, confirm before relying on it.
        */
        error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);
        if (error)
        {
            return ERR_PTR(error);
        } 
        goto ok;
    }

    /*
     * Create - we need to know the parent.
     */
    // path_init() prepares the lookup and path_walk() performs it; together
    // they resolve the path (here with LOOKUP_PARENT) into nd
    error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
    if (error)
    {
        return ERR_PTR(error);
    } 
    /*
    path_walk() is the VFS name-resolution routine.
    NOTE(review): per the original annotation it advances through the path
    one directory level at a time until it reaches the end or fails,
    building the dentries and inodes along the way - its body is not shown
    here.
    */
    error = path_walk(pathname, &nd);
    if (error) 
    {
        if (nd.root.mnt)
        {
            /* Drop the reference held on nd.root */
            path_put(&nd.root);
        } 
        return ERR_PTR(error);
    }
    if (unlikely(!audit_dummy_context()))
    {
        /* Hand the parent's dentry to the audit subsystem */
        audit_inode(pathname, nd.path.dentry);
    } 

    /*
     * We have the parent and last component. First of all, check
     * that we are not asked to creat(2) an obvious directory - that
     * will not do.
     */
    error = -EISDIR;
    /* Reject a last component that is not a normal name, or that is
       followed by further characters */
    if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
    {
        goto exit_parent;
    } 

    error = -ENFILE;
    /* Obtain an unused struct file; NULL is reported as -ENFILE */
    filp = get_empty_filp();
    if (filp == NULL)
    {
        goto exit_parent;
    } 
    /* Fill in the open intent and switch nd from parent lookup to
       create/open of the last component */
    nd.intent.open.file = filp;
    nd.intent.open.flags = flag;
    nd.intent.open.create_mode = mode;
    dir = nd.path.dentry;
    nd.flags &= ~LOOKUP_PARENT;
    nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
    if (flag & O_EXCL)
    {
        nd.flags |= LOOKUP_EXCL;
    } 
    mutex_lock(&dir->d_inode->i_mutex);
    /* Look up the last component in the parent, under the parent's i_mutex */
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;

do_last:
    /* Entered with dir->d_inode->i_mutex held */
    error = PTR_ERR(path.dentry);
    if (IS_ERR(path.dentry)) 
    {
        mutex_unlock(&dir->d_inode->i_mutex);
        goto exit;
    }

    if (IS_ERR(nd.intent.open.file)) 
    {
        error = PTR_ERR(nd.intent.open.file);
        goto exit_mutex_unlock;
    }

    /* Negative dentry, just create the file */
    /* No inode behind the dentry: the file does not exist yet */
    if (!path.dentry->d_inode) 
    {
        /*
         * This write is needed to ensure that a
         * ro->rw transition does not occur between
         * the time when the file is created and when
         * a permanent write count is taken through
         * the 'struct file' in nameidata_to_filp().
        */
        /* Take write access on the mount before creating */
        error = mnt_want_write(nd.path.mnt);
        if (error)
        {
            goto exit_mutex_unlock;
        } 
        /* Create the file */
        error = __open_namei_create(&nd, &path, flag, mode);
        if (error) 
        {
            mnt_drop_write(nd.path.mnt);
            goto exit;
        }
        /* Turn the nameidata into the resulting struct file */
        filp = nameidata_to_filp(&nd, open_flag);
        if (IS_ERR(filp))
        {
            ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
        } 
        /* Give the temporary mount write access back */
        mnt_drop_write(nd.path.mnt);
        if (nd.root.mnt)
        {
            /* Drop the reference held on nd.root */
            path_put(&nd.root);
        } 
        return filp;
    }

    /*
     * It already exists.
     */
    mutex_unlock(&dir->d_inode->i_mutex);
    /* Hand the existing file's dentry to the audit subsystem */
    audit_inode(pathname, path.dentry);

    error = -EEXIST;
    /* O_EXCL on an existing file fails with -EEXIST */
    if (flag & O_EXCL)
    {
        goto exit_dput;
    } 

    if (__follow_mount(&path))
    {
        error = -ELOOP;
        if (flag & O_NOFOLLOW)
        {
            goto exit_dput;
        } 
    }

    error = -ENOENT;
    if (!path.dentry->d_inode)
    {
        goto exit_dput;
    } 
    /* The inode provides a follow_link operation: handle it in do_link */
    if (path.dentry->d_inode->i_op->follow_link)
    {
        goto do_link;
    } 
    /* Move the found path into nd */
    path_to_nameidata(&path, &nd);
    error = -EISDIR;
    /* A directory cannot be opened on this (O_CREAT) path */
    if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
    {
        goto exit;
    } 
ok:
    /*
     * Consider:
     * 1. may_open() truncates a file
     * 2. a rw->ro mount transition occurs
     * 3. nameidata_to_filp() fails due to
     *    the ro mount.
     * That would be inconsistent, and should
     * be avoided. Taking this mnt write here
     * ensures that (2) can not occur.
     */
    /* Ask whether this open will write to the filesystem */
    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
    if (will_write) 
    {
        /* If so, take write access on the mount first */
        error = mnt_want_write(nd.path.mnt);
        if (error)
        {
            goto exit;
        } 
    }
    // may_open() does the permission checks; per the note above it is
    // also where a truncation happens
    error = may_open(&nd.path, acc_mode, flag);
    if (error) 
    {
        if (will_write)
        {
            mnt_drop_write(nd.path.mnt);
        } 
        goto exit;
    }
    filp = nameidata_to_filp(&nd, open_flag);
    if (IS_ERR(filp))
    {
        ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
    }
        
    /*
     * It is now safe to drop the mnt write
     * because the filp has had a write taken
     * on its behalf.
     */
    if (will_write)
    {
        mnt_drop_write(nd.path.mnt);
    } 
    if (nd.root.mnt)
    {
        path_put(&nd.root);
    } 
    return filp;

    /* Error exits: each label falls through to the ones below it */
exit_mutex_unlock:
    mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
    path_put_conditional(&path, &nd);
exit:
    if (!IS_ERR(nd.intent.open.file))
    {
        release_open_intent(&nd);
    }
        
exit_parent:
    if (nd.root.mnt)
    {
        path_put(&nd.root);
    } 
    path_put(&nd.path);
    return ERR_PTR(error);

do_link:
// The last component may be a symlink: resolve its target by hand
    error = -ELOOP;
    if (flag & O_NOFOLLOW)
    {
        // Following links is not allowed: fail with -ELOOP
        goto exit_dput;
    } 
    /*
     * This is subtle. Instead of calling do_follow_link() we do the
     * thing by hands. The reason is that this way we have zero link_count
     * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
     * After that we have the parent and last component, i.e.
     * we are in the same situation as after the first path_walk().
     * Well, almost - if the last component is normal we get its copy
     * stored in nd->last.name and we will have to putname() it when we
     * are done. Procfs-like symlinks just set LAST_BIND.
     */
    /* What follows locates the dentry of the link target by hand */

    // Ask for the parent of the link target
    nd.flags |= LOOKUP_PARENT;
    // Security hook: may this link be followed?
    error = security_inode_follow_link(path.dentry, &nd);
    if (error)
    {
        goto exit_dput;
    } 
    // Follow the symlink
    error = __do_follow_link(&path, &nd);
    if (error) 
    {
        /* Does someone understand code flow here? Or it is only
         * me so stupid? Anathema to whoever designed this non-sense
         * with "intent.open".
         */
        release_open_intent(&nd);
        if (nd.root.mnt)
        {
            path_put(&nd.root);
        } 
        return ERR_PTR(error);
    }
    nd.flags &= ~LOOKUP_PARENT;
    // Inspect the type of the last component of the link target
    if (nd.last_type == LAST_BIND)
    {
        goto ok;
    } 
    error = -EISDIR;
    if (nd.last_type != LAST_NORM)
    {
        goto exit;
    } 
    if (nd.last.name[nd.last.len]) 
    {
        __putname(nd.last.name);
        goto exit;
    }
    error = -ELOOP;
    // Link loop guard: give up with -ELOOP once count has reached 32
    if (count++==32) 
    {
        __putname(nd.last.name);
        goto exit;
    }
    dir = nd.path.dentry;
    mutex_lock(&dir->d_inode->i_mutex);
    // Refresh path (dentry and mount) for the new last component
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;
    __putname(nd.last.name);
    goto do_last;
}

总结一下流程

1. open系统调用的入口由SYSCALL_DEFINE3(open, ...)宏定义，展开后即sys_open()函数
2. 在open系统调用中，调用do_sys_open函数完成主要功能
3. 在do_sys_open函数中，调用函数do_filp_open完成主要的打开功能
4. 在内核中要打开一个文件，首先应该找到这个文件，而查找文件的过程在vfs里面是由do_path_lookup或者path_lookup_open函数来完成的
    4.1 设置nd->root=根路径(绝对地址)或者当前工作目录(相对地址)
    4.2 这一步做完了后，内核会建立一些数据结构(dentry,inode)来初始化查找的起点
    if(!retval){ retval = path_walk(name,nd);}
    4.3 path_walk会遍历路径的每一节点分量，也就是用"/"分隔开的每一部分，最终找到name指向的文件 
    int path_walk(const char *name,struct nameidata *nd)
    {
        return link_path_walk(name,nd);
        //path_walk其实相当于直接调用link_path_walk来完成工作
    }
    4.4 link_path_walk的主要工作是有其内部函数__link_path_walk 来完成的
        result = __link_path_walk(name,nd)
    4.5 __link_path_walk，该函数把传进来的字符串name，也就是用户指定的路径，按路径分隔符分解成一系列小的component。比如用户说，我要找"/path/to/dest"这个文件，那么我们的文件系统就会按path、to、dest一个
一个来找，直到最后一个分量是文件或者查找完成。它找的时候，会先用path_init初始化过的根路径去找第一个分量，也就是path。然后用path的dentry->d_inode去找to，这样循环到最后一个。注意，内核会缓存找到的路径分量，
所以往往只有第一次访问一个路径的时候，才会去访问磁盘，后面的访问会直接从缓存里找，下面会看到，很多与页高速缓存打交道的代码。但不管怎样，第一遍查找总是会访问磁盘的
    static int __link_path_walk(const char *name,struct nameidata *nd){..}
至此，按照每一个component查找完成之后，就会找到相应的文件，然后相应的打开工作就基本完成了

Relevant Link:

http://oss.org.cn/kernel-book/
http://blog.csdn.net/f413933206/article/details/5701913

2. close() syscall

close()系统调用对应内核中的函数为: sys_close()

\linux-2.6.32.63\fs\open.c

/*
 * sys_close - kernel entry point of the close(2) system call.
 *
 * @fd: descriptor number to close
 *
 * Returns the result of filp_close(), with the restart codes mapped to
 * -EINTR, or -EBADF when fd is out of range or has no file attached.
 *
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct file * filp;
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    int retval;

    /* The descriptor table is inspected and modified under file_lock */
    spin_lock(&files->file_lock);
    /*
    Get the pointer to the struct fdtable
    \linux-2.6.32.63\include\linux\fdtable.h
    #define files_fdtable(files) (rcu_dereference((files)->fdt))
    */
    fdt = files_fdtable(files);
    if (fd >= fdt->max_fds)
    {
        goto out_unlock;
    } 
    // Fetch the struct file pointer stored in slot fd; NULL means the
    // descriptor is not open
    filp = fdt->fd[fd];
    if (!filp)
    {
        goto out_unlock;
    } 
    /*
    Clear slot fd in the descriptor array and its close-on-exec bit
    */
    rcu_assign_pointer(fdt->fd[fd], NULL);
    FD_CLR(fd, fdt->close_on_exec); 
    /*
    __put_unused_fd() recycles the descriptor number so that a later open
    can hand it out again
    static void __put_unused_fd(struct files_struct *files, unsigned int fd)
    {
        struct fdtable *fdt = files_fdtable(files);
        __FD_CLR(fd, fdt->open_fds);
        if (fd < files->next_fd)
        {
            files->next_fd = fd;
        } 
    }
    */
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);
    /* The file itself is closed after the lock has been dropped */
    retval = filp_close(filp, files);

    /* can't restart close syscall because file table entry was cleared */
    if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK))
    {
        retval = -EINTR;
    } 

    return retval;

out_unlock:
    spin_unlock(&files->file_lock);
    return -EBADF;
}
EXPORT_SYMBOL(sys_close);

对于sys_close()，我们需要重点跟进2个函数: rcu_assign_pointer(fdt->fd[fd], NULL);、retval = filp_close(filp, files);

\linux-2.6.32.63\include\linux\rcupdate.h

/**
 * rcu_assign_pointer - assign (publicize) a pointer to a newly
 * initialized structure that will be dereferenced by RCU read-side
 * critical sections.  Returns the value assigned.
 *
 * Inserts memory barriers on architectures that require them
 * (pretty much all of them other than x86), and also prevents
 * the compiler from reordering the code that initializes the
 * structure after the pointer assignment.  More importantly, this
 * call documents which pointers will be dereferenced by RCU read-side
 * code.
 *
 * The write barrier (smp_wmb()) is issued before the store unless @v is
 * a compile-time constant equal to NULL; this is the case for the
 * rcu_assign_pointer(fdt->fd[fd], NULL) call in sys_close().
 */

#define rcu_assign_pointer(p, v) \
    ({ \
        if (!__builtin_constant_p(v) || \
            ((v) != NULL)) \
            smp_wmb(); \
        (p) = (v); \
    })

我们知道，每个进程在kernel中都有一个task_struct与之对应，而通过task_struct可以间接地获得一个fd_array[]数组，表示当前进程已经打开的文件：数组的下标就是文件描述符的值，每一个元素是指向对应struct file的指针，只有通过这个fd_array[x]才能获取当前进程打开的文件的struct file*。而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在于将这个数组的指定元素置空，即断开了这个引用的关系，至于之后内核中的那个struct file对象是否释放，那是内存回收的事，至少现在进程想通过task_struct是无法再引用到之前打开过的文件了，这里面的关系图可以参阅:

http://www.cnblogs.com/LittleHann/p/3865490.html
//搜索: 用一张图表示task_struct、fs_struct、files_struct、fdtable、file的关系

我们继续分析retval = filp_close(filp, files);

\linux-2.6.32.63\fs\open.c

/*
 * filp_close - close one reference to an open file.
 *
 * @filp: the open file
 * @id:   the POSIX thread ID; the files pointer is used for this
 *
 * Returns the result of the file's ->flush() operation when one is
 * provided, otherwise 0.  A file whose count is already 0 is reported
 * with a kernel error message and also yields 0.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int flush_rc = 0;

    /* A count of zero means there is nothing left to close. */
    if (file_count(filp) == 0) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    /* ->flush() is optional; its result becomes the return value. */
    if (filp->f_op != NULL && filp->f_op->flush != NULL)
        flush_rc = filp->f_op->flush(filp, id);

    /* Per-owner cleanup, then hand the file to fput(). */
    dnotify_flush(filp, id);
    locks_remove_posix(filp, id);
    fput(filp);

    return flush_rc;
}

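filp_close()在调用文件的flush回调(如果提供了的话)、dnotify_flush()和locks_remove_posix()之后，通过fput()释放对struct file的这一次引用；当引用计数降为0时，表示打开文件的struct file对象所占的内存空间才会被真正释放。至此，当前进程中就再也没有之前打开过的这个文件的任何痕迹了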

Relevant Link:

http://blog.csdn.net/ce123_zhouwei/article/details/8459794

posted @ 2014-08-24 11:36 郑瀚阅读(1387) 评论(0) 收藏举报

刷新页面返回顶部

Han Zheng, Thinker and Doer

Welcome to contact me. Wechat：LittleHann

Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

公告