feisky

云计算、虚拟化与Linux技术笔记
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

Linux Kernel Development——虚拟文件系统

Posted on 2013-03-25 21:53  feisky  阅读(596)  评论(0)  编辑  收藏  举报

虚拟文件系统(VFS)为用户空间提供了文件系统相关的接口,用户程序可以通过标准的Unix文件系统调用对不同介质上的不同文件系统进行读写操作。

通用文件系统接口

VFS使得用户可以直接使用open()、read()和write()而无需考虑具体的文件系统和实际物理介质。标准系统调用也可以在不同的介质和文件系统之间执行,VFS负责这种不同介质和不同文件系统之间的协调,并对上层提供一种通用的访问方法。

之所以这种通用接口对所有类型的文件系统都可以操作,是因为内核在它的底层文件系统之上建立了一个抽象层。这个抽象层提供了一个通用文件系统模型,支持各种文件系统。VFS定义了所有文件系统都支持的基本数据结构和接口,而实际文件系统都实现了这些基本接口。由于实际文件系统的代码在统一的接口和数据结构下隐藏了实现的细节,所以在VFS层和内核的其他部分来看,所有的文件系统都是相同的。

VFS中有四个主要的对象模型,分别是:

  • 超级块对象:代表一个已安装的文件系统;
  • 索引节点对象:代表一个文件;
  • 目录项对象:代表路径的一个组成部分;
  • 文件对象:代表进程已打开的文件,注意目录也是文件。

每种对象模型内核都定义了对应的操作对象,描述了内核针对该对象可以使用的方法。

超级块对象

每种文件系统都必须实现超级块,用于存储特定文件系统的信息,通常对应于存放在磁盘特定扇区中的文件系统超级块或文件系统控制块。对于非基于磁盘的文件系统,会在使用时动态(on-the-fly)创建超级块并保存在内存中。

超级块用struct super_block结构体表示:

/*
 * struct super_block - VFS superblock object.
 *
 * Holds the information describing one specific filesystem.  It usually
 * corresponds to the filesystem superblock (or filesystem control block)
 * stored in a dedicated sector on disk; filesystems that are not disk-based
 * create it on the fly and keep it in memory only.
 *
 * The leading numbers are the line numbers of the original LXR listing.
 */
1400struct super_block {
1401        struct list_head        s_list;         /* Keep this first; pointer into the list of superblocks */
1402        dev_t                   s_dev;          /* search index; _not_ kdev_t; device identifier */
1403        unsigned char           s_dirt;     /* modified (dirty) flag */
1404        unsigned char           s_blocksize_bits; /* block size, in bits */
1405        unsigned long           s_blocksize;  /* block size, in bytes */
1406        loff_t                  s_maxbytes;     /* Max file size */
1407        struct file_system_type *s_type;
1408        const struct super_operations   *s_op;  /* superblock methods */
1409        const struct dquot_operations   *dq_op;  /* disk quota methods */
1410        const struct quotactl_ops       *s_qcop; /* quota control methods */
1411        const struct export_operations *s_export_op; /* export methods */
1412        unsigned long           s_flags;
1413        unsigned long           s_magic;
1414        struct dentry           *s_root;
1415        struct rw_semaphore     s_umount;
1416        struct mutex            s_lock;
1417        int                     s_count;
1418        atomic_t                s_active;
1419#ifdef CONFIG_SECURITY
1420        void                    *s_security;
1421#endif
1422        const struct xattr_handler **s_xattr;
1423
1424        struct list_head        s_inodes;       /* all inodes */
1425        struct hlist_bl_head    s_anon;         /* anonymous dentries for (nfs) exporting */
        /* s_files is a per-CPU list pointer on SMP builds, a plain list head otherwise */
1426#ifdef CONFIG_SMP
1427        struct list_head __percpu *s_files;
1428#else
1429        struct list_head        s_files;
1430#endif
1431        /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
1432        struct list_head        s_dentry_lru;   /* unused dentry lru */
1433        int                     s_nr_dentry_unused;     /* # of dentry on lru */
1434
1435        /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
1436        spinlock_t              s_inode_lru_lock ____cacheline_aligned_in_smp;
1437        struct list_head        s_inode_lru;            /* unused inode lru */
1438        int                     s_nr_inodes_unused;     /* # of inodes on lru */
1439
1440        struct block_device     *s_bdev;
1441        struct backing_dev_info *s_bdi;
1442        struct mtd_info         *s_mtd;
1443        struct list_head        s_instances;
1444        struct quota_info       s_dquot;        /* Diskquota specific options */
1445
1446        int                     s_frozen;
1447        wait_queue_head_t       s_wait_unfrozen;
1448
1449        char s_id[32];                          /* Informational name */
1450        u8 s_uuid[16];                          /* UUID */
1451
1452        void                    *s_fs_info;     /* Filesystem private info */
1453        fmode_t                 s_mode;
1454
1455        /* Granularity of c/m/atime in ns.
1456           Cannot be worse than a second */
1457        u32                s_time_gran;
1458
1459        /*
1460         * The next field is for VFS *only*. No filesystems have any business
1461         * even looking at it. You had been warned.
1462         */
1463        struct mutex s_vfs_rename_mutex;        /* Kludge */
1464
1465        /*
1466         * Filesystem subtype.  If non-empty the filesystem type field
1467         * in /proc/mounts will be "type.subtype"
1468         */
1469        char *s_subtype;
1470
1471        /*
1472         * Saved mount options for lazy filesystems using
1473         * generic_show_options()
1474         */
1475        char __rcu *s_options;
1476        const struct dentry_operations *s_d_op; /* default d_op for dentries */
1477
1478        /*
1479         * Saved pool identifier for cleancache (-1 means none)
1480         */
1481        int cleancache_poolid;
1482
1483        struct shrinker s_shrink;       /* per-sb shrinker handle */
1484};

超级块对象中的s_op定义了超级块的操作函数表,用super_operations结构体表示,其中的每一项都定义了一种操作的函数指针:
/*
 * struct super_operations - operation table of a superblock.
 *
 * Reached through super_block.s_op.  Every member is a pointer to the
 * function that implements one superblock operation.  Per the accompanying
 * article text, a filesystem does not have to implement all of them (unused
 * pointers may be left NULL), and the functions are called in process
 * context and may block when necessary.
 *
 * The leading numbers are the line numbers of the original LXR listing.
 */
1658struct super_operations {
1659        struct inode *(*alloc_inode)(struct super_block *sb);
1660        void (*destroy_inode)(struct inode *);
1661
1662        void (*dirty_inode) (struct inode *, int flags);
1663        int (*write_inode) (struct inode *, struct writeback_control *wbc);
1664        int (*drop_inode) (struct inode *);
1665        void (*evict_inode) (struct inode *);
1666        void (*put_super) (struct super_block *);
1667        void (*write_super) (struct super_block *);
1668        int (*sync_fs)(struct super_block *sb, int wait);
1669        int (*freeze_fs) (struct super_block *);
1670        int (*unfreeze_fs) (struct super_block *);
1671        int (*statfs) (struct dentry *, struct kstatfs *);
1672        int (*remount_fs) (struct super_block *, int *, char *);
1673        void (*umount_begin) (struct super_block *);
1674
        /* The show_* hooks all write into a seq_file for a given vfsmount */
1675        int (*show_options)(struct seq_file *, struct vfsmount *);
1676        int (*show_devname)(struct seq_file *, struct vfsmount *);
1677        int (*show_path)(struct seq_file *, struct vfsmount *);
1678        int (*show_stats)(struct seq_file *, struct vfsmount *);
        /* Quota read/write hooks exist only when CONFIG_QUOTA is defined */
1679#ifdef CONFIG_QUOTA
1680        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
1681        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
1682#endif
1683        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
1684        int (*nr_cached_objects)(struct super_block *);
1685        void (*free_cached_objects)(struct super_block *, int);
1686};

具体的文件系统会定义一个自己的super_operations操作表,并让超级块对象的s_op指针指向它,从而向VFS提供该文件系统的操作函数实现。上述的操作表也不必全部实现,文件系统可以将不需要的函数指针设置为NULL。另外,上述函数均在进程上下文中调用,必要时均可阻塞。

索引节点对象
索引节点对象包含了内核在操作文件和目录时需要的全部信息,这些信息可以从磁盘索引节点直接读入。
索引节点使用inode结构体表示,一个索引节点代表了文件系统中的一个文件,它也可以是设备或者管道这样的特殊文件。
 /*
  * struct inode - VFS index node object.
  *
  * Per the accompanying article text: carries the information the kernel
  * needs to operate on a file or directory.  One inode stands for one file
  * in a filesystem, which may also be a special file such as a device or a
  * pipe.
  *
  * The leading numbers are the line numbers of the original LXR listing.
  */
 744/*
 745 * Keep mostly read-only and often accessed (especially for
 746 * the RCU path lookup and 'stat' data) fields at the beginning
 747 * of the 'struct inode'
 748 */
 749struct inode {
 750        umode_t                 i_mode;
 751        unsigned short          i_opflags;
 752        uid_t                   i_uid;
 753        gid_t                   i_gid;
 754        unsigned int            i_flags;
 755
 756#ifdef CONFIG_FS_POSIX_ACL
 757        struct posix_acl        *i_acl;
 758        struct posix_acl        *i_default_acl;
 759#endif
 760
 761        const struct inode_operations   *i_op;
 762        struct super_block      *i_sb;
 763        struct address_space    *i_mapping;
 764
 765#ifdef CONFIG_SECURITY
 766        void                    *i_security;
 767#endif
 768
 769        /* Stat data, not accessed from path walking */
 770        unsigned long           i_ino;
 771        /*
 772         * Filesystems may only read i_nlink directly.  They shall use the
 773         * following functions for modification:
 774         *
 775         *    (set|clear|inc|drop)_nlink
 776         *    inode_(inc|dec)_link_count
 777         */
        /* Same storage seen twice: a const view (i_nlink) and a writable one (__i_nlink) */
 778        union {
 779                const unsigned int i_nlink;
 780                unsigned int __i_nlink;
 781        };
 782        dev_t                   i_rdev;
 783        struct timespec         i_atime;
 784        struct timespec         i_mtime;
 785        struct timespec         i_ctime;
 786        spinlock_t              i_lock; /* i_blocks, i_bytes, maybe i_size */
 787        unsigned short          i_bytes;
 788        blkcnt_t                i_blocks;
 789        loff_t                  i_size;
 790
 791#ifdef __NEED_I_SIZE_ORDERED
 792        seqcount_t              i_size_seqcount;
 793#endif
 794
 795        /* Misc */
 796        unsigned long           i_state;
 797        struct mutex            i_mutex;
 798
 799        unsigned long           dirtied_when;   /* jiffies of first dirtying */
 800
 801        struct hlist_node       i_hash;
 802        struct list_head        i_wb_list;      /* backing dev IO list */
 803        struct list_head        i_lru;          /* inode LRU list */
 804        struct list_head        i_sb_list;
        /* i_dentry and i_rcu share storage */
 805        union {
 806                struct list_head        i_dentry;
 807                struct rcu_head         i_rcu;
 808        };
 809        atomic_t                i_count;
 810        unsigned int            i_blkbits;
 811        u64                     i_version;
 812        atomic_t                i_dio_count;
 813        atomic_t                i_writecount;
 814        const struct file_operations    *i_fop; /* former ->i_op->default_file_ops */
 815        struct file_lock        *i_flock;
 816        struct address_space    i_data;
 817#ifdef CONFIG_QUOTA
 818        struct dquot            *i_dquot[MAXQUOTAS];
 819#endif
 820        struct list_head        i_devices;
        /*
         * Pipe / block device / character device pointers share storage;
         * presumably which one is valid depends on the kind of special
         * file -- confirm against the users of these fields.
         */
 821        union {
 822                struct pipe_inode_info  *i_pipe;
 823                struct block_device     *i_bdev;
 824                struct cdev             *i_cdev;
 825        };
 826
 827        __u32                   i_generation;
 828
 829#ifdef CONFIG_FSNOTIFY
 830        __u32                   i_fsnotify_mask; /* all events this inode cares about */
 831        struct hlist_head       i_fsnotify_marks;
 832#endif
 833
 834#ifdef CONFIG_IMA
 835        atomic_t                i_readcount; /* struct files open RO */
 836#endif
 837        void                    *i_private; /* fs or device private pointer */
 838};
 839

其中,i_op定义了索引节点对象的所有操作方法:
/*
 * struct inode_operations - operation table of an inode.
 *
 * Reached through inode.i_op.  Every member is a pointer to the function
 * implementing one inode-level operation.
 *
 * The leading numbers are the line numbers of the original LXR listing.
 */
1613struct inode_operations {
1614        struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
1615        void * (*follow_link) (struct dentry *, struct nameidata *);
1616        int (*permission) (struct inode *, int);
1617        struct posix_acl * (*get_acl)(struct inode *, int);
1618
1619        int (*readlink) (struct dentry *, char __user *,int);
1620        void (*put_link) (struct dentry *, struct nameidata *, void *);
1621
1622        int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
1623        int (*link) (struct dentry *,struct inode *,struct dentry *);
1624        int (*unlink) (struct inode *,struct dentry *);
1625        int (*symlink) (struct inode *,struct dentry *,const char *);
1626        int (*mkdir) (struct inode *,struct dentry *,int);
1627        int (*rmdir) (struct inode *,struct dentry *);
1628        int (*mknod) (struct inode *,struct dentry *,int,dev_t);
1629        int (*rename) (struct inode *, struct dentry *,
1630                        struct inode *, struct dentry *);
1631        void (*truncate) (struct inode *);
1632        int (*setattr) (struct dentry *, struct iattr *);
1633        int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
        /* Extended-attribute hooks: set / get / list / remove */
1634        int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
1635        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
1636        ssize_t (*listxattr) (struct dentry *, char *, size_t);
1637        int (*removexattr) (struct dentry *, const char *);
1638        void (*truncate_range)(struct inode *, loff_t, loff_t);
1639        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
1640                      u64 len);
1641} ____cacheline_aligned;

目录项对象

VFS把目录当作文件看待,而路径的每个部分则用目录项来表示。在路径中,包括普通文件在内,每个部分都是目录项对象。目录项对象的主要目的是为了方便路径名的查找等操作。
目录项用struct dentry 表示,它没有对应的磁盘数据结构,由于它并不会保存到磁盘上,所以也没有脏标志。
 /*
  * struct dentry - VFS directory entry object.
  *
  * Per the accompanying article text: represents one component of a path
  * (regular files included) and exists mainly to make path-name lookup
  * convenient.  It has no on-disk counterpart and is never written to disk,
  * which is why it carries no dirty flag.
  *
  * The leading numbers are the line numbers of the original LXR listing.
  */
 116struct dentry {
 117        /* RCU lookup touched fields */
 118        unsigned int d_flags;           /* protected by d_lock */
 119        seqcount_t d_seq;               /* per dentry seqlock */
 120        struct hlist_bl_node d_hash;    /* lookup hash list */
 121        struct dentry *d_parent;        /* parent directory */
 122        struct qstr d_name;
 123        struct inode *d_inode;          /* Where the name belongs to - NULL is
 124                                         * negative */
 125        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */
 126
 127        /* Ref lookup also touches following */
 128        unsigned int d_count;           /* protected by d_lock */
 129        spinlock_t d_lock;              /* per dentry lock */
 130        const struct dentry_operations *d_op;
 131        struct super_block *d_sb;       /* The root of the dentry tree */
 132        unsigned long d_time;           /* used by d_revalidate */
 133        void *d_fsdata;                 /* fs-specific data */
 134
 135        struct list_head d_lru;         /* LRU list */
 136        /*
 137         * d_child and d_rcu can share memory
 138         */
 139        union {
 140                struct list_head d_child;       /* child of parent list */
 141                struct rcu_head d_rcu;
 142        } d_u;
 143        struct list_head d_subdirs;     /* our children */
 144        struct list_head d_alias;       /* inode alias list */
 145};

如果VFS遍历路径名中的所有元素并将它们逐个解析成目录项对象,这将是一件非常耗时的工作。因此,内核将目录项对象缓存在目录项缓存中。目录项缓存包括三部分:
  • 被使用的目录项链表:该链表通过索引节点对象的i_dentry项,把与该索引节点相关联的目录项连接起来;
  • 最近最少使用(LRU)的目录项双向链表;
  • 散列表,用来快速将指定路径解析为相关目录项对象。

内核通过dentry_operations定义了目录项操作函数列表:

 /*
  * struct dentry_operations - operation table of a dentry.
  *
  * Reached through dentry.d_op.  Every member is a pointer to the function
  * implementing one dentry-level operation.
  *
  * The leading numbers are the line numbers of the original LXR listing.
  */
 159struct dentry_operations {
 160        int (*d_revalidate)(struct dentry *, struct nameidata *);
 161        int (*d_hash)(const struct dentry *, const struct inode *,
 162                        struct qstr *);
 163        int (*d_compare)(const struct dentry *, const struct inode *,
 164                        const struct dentry *, const struct inode *,
 165                        unsigned int, const char *, const struct qstr *);
 166        int (*d_delete)(const struct dentry *);
 167        void (*d_release)(struct dentry *);
 168        void (*d_prune)(struct dentry *);
 169        void (*d_iput)(struct dentry *, struct inode *);
 170        char *(*d_dname)(struct dentry *, char *, int);
 171        struct vfsmount *(*d_automount)(struct path *);
 172        int (*d_manage)(struct dentry *, bool);
 173} ____cacheline_aligned;

文件对象

文件对象代表已打开的文件,同一个文件可能对应多个文件对象(多个进程打开该文件),而一个文件对应的索引节点和目录项对象是唯一的。
文件对象通过结构体struct file表示,文件对象没有对应的磁盘数据,它在文件打开的时候创建,文件关闭的时候销毁。文件对象中也不记录脏标志(由inode记录)。
 /*
  * struct file - VFS file object.
  *
  * Per the accompanying article text: represents an opened file.  One file
  * may have several file objects (several processes opening it).  The
  * object has no on-disk counterpart; it is created when the file is opened
  * and destroyed when it is closed.  No dirty flag is kept here -- that is
  * recorded by the inode.
  *
  * The leading numbers are the line numbers of the original LXR listing.
  */
 964struct file {
 965        /*
 966         * fu_list becomes invalid after file_free is called and queued via
 967         * fu_rcuhead for RCU freeing
 968         */
 969        union {
 970                struct list_head        fu_list;
 971                struct rcu_head         fu_rcuhead;
 972        } f_u;
 973        struct path             f_path;
        /* Shorthand names for the dentry and mnt members of f_path */
 974#define f_dentry        f_path.dentry
 975#define f_vfsmnt        f_path.mnt
 976        const struct file_operations    *f_op;
 977
 978        /*
 979         * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
 980         * Must not be taken from IRQ context.
 981         */
 982        spinlock_t              f_lock;
 983#ifdef CONFIG_SMP
 984        int                     f_sb_list_cpu;
 985#endif
 986        atomic_long_t           f_count;
 987        unsigned int            f_flags;
 988        fmode_t                 f_mode;
 989        loff_t                  f_pos;
 990        struct fown_struct      f_owner;
 991        const struct cred       *f_cred;
 992        struct file_ra_state    f_ra;
 993
 994        u64                     f_version;
 995#ifdef CONFIG_SECURITY
 996        void                    *f_security;
 997#endif
 998        /* needed for tty driver, and maybe others */
 999        void                    *private_data;
1000
1001#ifdef CONFIG_EPOLL
1002        /* Used by fs/eventpoll.c to link all the hooks to this file */
1003        struct list_head        f_ep_links;
1004#endif /* #ifdef CONFIG_EPOLL */
1005        struct address_space    *f_mapping;
1006#ifdef CONFIG_DEBUG_WRITECOUNT
1007        unsigned long f_mnt_write_state;
1008#endif
1009};

下面的结构定义了文件对象的操作函数列表:
/*
 * struct file_operations - operation table of a file object.
 *
 * Reached through file.f_op (and inode.i_fop).  Apart from the owner
 * module pointer, every member is a pointer to the function implementing
 * one file-level operation.
 *
 * The leading numbers are the line numbers of the original LXR listing.
 */
1583struct file_operations {
1584        struct module *owner;
1585        loff_t (*llseek) (struct file *, loff_t, int);
1586        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
1587        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
1588        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1589        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
1590        int (*readdir) (struct file *, void *, filldir_t);
1591        unsigned int (*poll) (struct file *, struct poll_table_struct *);
1592        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1593        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1594        int (*mmap) (struct file *, struct vm_area_struct *);
1595        int (*open) (struct inode *, struct file *);
1596        int (*flush) (struct file *, fl_owner_t id);
1597        int (*release) (struct inode *, struct file *);
1598        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
1599        int (*aio_fsync) (struct kiocb *, int datasync);
1600        int (*fasync) (int, struct file *, int);
1601        int (*lock) (struct file *, int, struct file_lock *);
1602        ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
1603        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1604        int (*check_flags)(int);
1605        int (*flock) (struct file *, int, struct file_lock *);
1606        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
1607        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1608        int (*setlease)(struct file *, long, struct file_lock **);
1609        long (*fallocate)(struct file *file, int mode, loff_t offset,
1610                          loff_t len);
1611};

参考:
《Linux内核设计与实现》
http://lxr.linux.no
无觅相关文章插件,快速提升流量