当我们在C程序中用到某些库函数进行文件读取操作时,后续的整个过程对用户都是透明的。为了了解文件系统在其中起到了什么作用,又是如何和内核的其他部分进行协作的,我们可以对read()系统调用进行追踪,下面的代码均来自Linux 2.6.11.10版本的内核。
首先,我们写下如下的测试程序,test.c,其中1.txt里只有一句Hello,World。
#include <stdio.h>
#include <stdlib.h>
/*
 * Open 1.txt, read the first whitespace-delimited word from it and echo
 * it to stdout. Demonstration program for tracing the read() path.
 */
int main() {
    char word[20];
    FILE *fp;

    /* "a+": open for reading and appending, create if missing. */
    if ((fp = fopen("1.txt", "a+")) == NULL) {
        fprintf(stderr, "ERROR!");  /* diagnostics belong on stderr, not stdout */
        exit(EXIT_FAILURE);
    }

    /* Bound the conversion to 19 chars + NUL so a long token cannot
     * overflow `word`, and check the return value — otherwise `word`
     * would be printed uninitialized when the file is empty. */
    if (fscanf(fp, "%19s", word) == 1)
        printf("%s\n", word);

    fclose(fp);  /* release the stream */
    return 0;
}
然后进行编译,并通过strace 工具查看函数运行时用到了哪些系统调用函数,并将结果输出到hello.txt中。
~/test$ gcc test.c -o hello
~/test$ strace -o hello.txt ./hello
查看hello.txt中的主要内容如下
……
openat(AT_FDCWD, "x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT
……
openat(AT_FDCWD, "1.txt", O_RDWR|O_CREAT|O_APPEND, 0666) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=13, ...}) = 0
read(3, "Hello,World!\n", 4096) = 13
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
write(1, "Hello,World!\n", 13) = 13
lseek(3, -1, SEEK_CUR) = 12
exit_group(0) = ?
可以看到程序首先尝试打开了libc.so.6,我们用到的库函数就封装在这个C库里,而后程序通过openat、read、write、lseek等系统调用完成了文件的打开、读取和输出。
我们知道,系统调用有两种实现方式,一种是老旧的int $0x80方式,还有一种是sysenter。具体细节不展开,但过程总是先将系统调用号存入eax寄存器,然后触发系统调用,这部分实现已经完全封装进库函数了。陷入内核后,内核会查系统调用表,比如read的系统调用号是3,查表就能找到对应的服务例程。
比如i386处理器的系统调用号局部如下所示
/linux-2.6.11.10/include/asm-i386/unistd.h
#define __NR_restart_syscall 0
#define __NR_exit 1
#define __NR_fork 2
#define __NR_read 3
#define __NR_write 4
#define __NR_open 5
#define __NR_close 6
#define __NR_waitpid 7
#define __NR_creat 8
#define __NR_link 9
#define __NR_unlink 10
#define __NR_execve 11
#define __NR_chdir 12
#define __NR_time 13
#define __NR_mknod 14
#define __NR_chmod 15
#define __NR_lchown 16
#define __NR_break 17
由上我们看到,read的系统调用号为3。在这个文件的后面我们还能看到比较老旧的系统调用封装宏(按系统调用所需的参数个数区分为_syscall0到_syscall6),在较新的内核里这部分封装已经移到C库中实现,不再放在内核头文件里;这里内核版本较老,所以还能看到。
/linux-2.6.11.10/include/asm-i386/unistd.h
/* Translate the raw value returned by the kernel into the C-library
 * convention: values in the range [-(128+1), -1] are negated errno
 * codes, so store -res in errno and return -1; anything else is the
 * real result and is returned unchanged. */
#define __syscall_return(type, res) \
do { \
if ((unsigned long)(res) >= (unsigned long)(-(128 + 1))) { \
errno = -(res); \
res = -1; \
} \
return (type) (res); \
} while (0)
/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
/* Expand to the user-space stub for a zero-argument system call `name`:
 * the constraint "0" ties the input to the same register as the "=a"
 * output, so __NR_name is loaded into eax, int $0x80 traps into the
 * kernel, and the result comes back in eax (__res). Sibling macros
 * _syscall1.._syscall6 exist for calls that take more arguments. */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
__syscall_return(type,__res); \
}
而之后,read会调用相应的服务例程sys_read,此函数定义如下。
/linux-2.6.11.10/fs/read_write.c
/*
 * Service routine for the read() system call: resolve `fd` to a struct
 * file, read `count` bytes into the user buffer `buf` through the VFS,
 * then store the advanced offset back into the file object.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed); //look up the file object for this descriptor in the process's open-file table
if (file) {
loff_t pos = file_pos_read(file); //current file offset
ret = vfs_read(file, buf, count, &pos); //buf: user-space buffer, count: number of bytes to read
file_pos_write(file, pos); //write the updated offset back into the file object
fput_light(file, fput_needed); //drop the reference taken by fget_light
}
return ret;
}
EXPORT_SYMBOL_GPL(sys_read);
该函数首先通过fget_light(light表示轻量级的)通过文件描述符,来返回一个文件地址,类型为虚拟文件系统层的struct file,然后获取文件偏移地址,并调用vfs_read即虚拟文件系统的读操作,从这里我们可以看到,无论底层是什么文件系统,由于有VFS这个中间层存在,对文件进行操作都可以把事情交给VFS来处理,这是抽象的好处。
我们可以在sys_read所在的文件里找到vfs_read。
/linux-2.6.11.10/fs/read_write.c
/*
 * Generic VFS read: validate the open mode, the filesystem's file
 * operations and the user buffer, then dispatch to the filesystem's
 * own ->read method (or emulate it via do_sync_read on ->aio_read).
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ)) //was the file opened with read access?
return -EBADF;
if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) //does the filesystem define a read operation at all?
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) //coarse check that the user buffer is a valid writable range
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count); //check for mandatory locks over the region being read
if (!ret) {
ret = security_file_permission (file, MAY_READ); //security-module check for read permission
if (!ret) {
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos); //call the filesystem's own read method if it has one
else
ret = do_sync_read(file, buf, count, pos); //otherwise run aio_read synchronously
if (ret > 0) {
dnotify_parent(file->f_dentry, DN_ACCESS); //notify the parent directory that the file was accessed
current->rchar += ret;
}
current->syscr++; //per-process I/O statistics
}
}
return ret;
}
EXPORT_SYMBOL(vfs_read);
我们可以看到,vfs_read函数只是检查了一些状态,就使用回调函数 file->f_op->read,使用相应文件系统的read函数继续进行操作,这个file_operations应该是open file的时候就已经填好的,我们可以/linux-2.6.11.10/fs/ext2/file.c里找到ext2所有的文件操作,如下,其实在新内核里,read和write之类的操作已经改了。
/linux-2.6.11.10/fs/ext2/file.c
/*
 * File operations for regular files on ext2. Filled into file->f_op at
 * open time; note that read/write are not ext2-specific at all — ext2
 * relies entirely on the generic page-cache routines from mm/filemap.c.
 */
struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
.read = generic_file_read,
.write = generic_file_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.ioctl = ext2_ioctl,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = ext2_release_file,
.fsync = ext2_sync_file,
.readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile,
};
可以看到,ext2的read操作并没有额外定义,而是使用了一个通用文件读函数,在/linux-2.6.11.10/mm/filemap.c文件里可以找到这个函数,因为读写是基于页操作的。
/linux-2.6.11.10/mm/filemap.c
/*
 * Default ->read implementation for page-cache based filesystems: wrap
 * the user buffer in a single iovec, issue the read through the common
 * async path, and wait for completion if the request was queued.
 */
ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
struct iovec local_iov = { .iov_base = buf, .iov_len = count }; //describe the user buffer and read length as one iovec
struct kiocb kiocb; //kernel I/O control block: descriptor for sync and async I/O
ssize_t ret;
init_sync_kiocb(&kiocb, filp); //initialize the descriptor for a synchronous operation
ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); //common routine used by all filesystems
if (-EIOCBQUEUED == ret) //the request was queued asynchronously...
ret = wait_on_sync_kiocb(&kiocb); //...so block until it completes
return ret;
}
EXPORT_SYMBOL(generic_file_read);
这个函数继续调用了一个通用例程,即__generic_file_aio_read,字面理解就是异步I/O读,它不是立即读取,而是会先在一个链表里排队,如果在排队就需要继续等。
/linux-2.6.11.10/mm/filemap.c
/*
 * Core of the generic read path. Three stages: validate the iovec
 * array, then either go straight to disk (O_DIRECT) or read each
 * segment through the page cache via do_generic_file_read.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *filp = iocb->ki_filp; //file object the read in progress refers to
ssize_t retval;
unsigned long seg;
size_t count;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
count += iv->iov_len;
if (unlikely((ssize_t)(count|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) //is the user buffer described by this iovec valid?
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg; //truncate the request at the first bad segment
count -= iv->iov_len; /* This segment is no good */
break;
}
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) { //direct I/O: bypass the page cache entirely
loff_t pos = *ppos, size;
struct address_space *mapping;
struct inode *inode;
mapping = filp->f_mapping;
inode = mapping->host;
retval = 0;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
if (retval >= 0 && !is_sync_kiocb(iocb))
retval = -EIOCBQUEUED; //async request has been queued, caller must wait
if (retval > 0)
*ppos = pos + retval;
}
file_accessed(filp);
goto out;
}
retval = 0; //not direct I/O: read through the page cache
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc; //read-operation descriptor for this segment
desc.written = 0;
desc.arg.buf = iov[seg].iov_base; //user buffer
desc.count = iov[seg].iov_len; //bytes to read into it
if (desc.count == 0)
continue;
desc.error = 0;
do_generic_file_read(filp,ppos,&desc,file_read_actor); //do the page-cache read for this segment
retval += desc.written;
if (!retval) {
retval = desc.error; //nothing read at all: report the error
break;
}
}
}
out:
return retval;
}
EXPORT_SYMBOL(__generic_file_aio_read);
我们可以将上面这个函数粗略划分为三部分:参数检查、直接I/O读取和页高速缓存读取。如果设置了O_DIRECT标志,则调用generic_file_direct_IO()直接读取;否则使用页高速缓存,调用do_generic_file_read。我们主要关注页高速缓存读取。
/linux-2.6.11.10/include/linux/fs.h
/*
 * Thin wrapper that turns a file read into a page-cache read on the
 * file's address_space (filp->f_mapping), carrying along the file's
 * readahead state (filp->f_ra) and the copy-out actor.
 */
static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
read_descriptor_t * desc,
read_actor_t actor)
{
do_generic_mapping_read(filp->f_mapping,
&filp->f_ra,
filp,
ppos,
desc,
actor);
}
do_generic_file_read会继续调用do_generic_mapping_read,这个调用表示对文件的读操作转换为对页高速缓存的读操作。
之所以要在I/O过程中加入页高速缓存这么一个缓冲层,是为了提高读取的效率,我们希望能尽量减少对磁盘的读取,而将读取放到内存中进行,所以引入页高速缓存这么一个中间层。
上面的参数中有一个filp->f_mapping,这个是一个地址空间变量,其定义如下。
/linux-2.6.11.10/include/linux/fs.h
/*
 * An address_space ties an owner (`host`: inode or block device) to
 * the set of cached pages belonging to it; `page_tree` is the radix
 * tree in which those pages are looked up by file offset, and `a_ops`
 * holds the filesystem-specific page I/O methods (readpage etc.).
 */
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and spinlock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
spinlock_t i_mmap_lock; /* protect tree, count, list */
unsigned int truncate_count; /* Cover race condition with truncate */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
} __attribute__((aligned(sizeof(long))));
通过host和page_tree两个属性,一个address_space结构体可以将一个文件和属于它的缓存页联系起来。page_tree是struct radix_tree_root类型的,就是一棵树的根,它指向一棵基树(radix tree),相应的页都挂在叶子节点上,这样按偏移查找页就很简单了。
在do_generic_mapping_read里,检查完基础数据后,会建立一个循环,这个循环每次读一页内容,直到读完所有内容。
首先是find_page,它会通过关联有页的基树找到相应的页,如果没找到,就跳到no_cached_page重新分配一个页插入到基树里去,如果为脏页则需要更新,如果既能找到,又不需要更新,那么直接page_ok将数据拷贝到用户态即可。
/linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
find_page:
page = find_get_page(mapping, index); //首先在页高速缓存里寻找页描述符
if (unlikely(page == NULL)) {
handle_ra_miss(mapping, &ra, index);
goto no_cached_page;
}
if (!PageUptodate(page)) //检查是否为脏页
goto page_not_up_to_date;
找到页以后开始读页,主要的重点语句是这一句
linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
readpage:
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
lock_page(page);
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_inode_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
/*
* i_size must be checked after we have done ->readpage.
*
* Checking i_size after the readpage allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
page_cache_release(page);
goto out;
}
}
nr = nr - offset;
goto page_ok;
这里又出现了一个回调函数,又调用了相关文件系统的相关函数,我们可以找到ext2的a_ops定义如下:
/linux-2.6.11.10/fs/ext2/inode.c
/*
 * ext2's address_space operations: the per-filesystem methods the page
 * cache calls to move pages between memory and disk. ->readpage is the
 * hook used by do_generic_mapping_read below.
 */
struct address_space_operations ext2_aops = {
.readpage = ext2_readpage,
.readpages = ext2_readpages,
.writepage = ext2_writepage,
.sync_page = block_sync_page,
.prepare_write = ext2_prepare_write,
.commit_write = generic_commit_write,
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
};
再找到ext2_readpage开始我们的读页操作。
/linux-2.6.11.10/fs/ext2/inode.c
/* ext2's ->readpage: delegate to the generic multi-page read helper,
 * passing ext2_get_block so it can map file blocks to disk blocks. */
static int ext2_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, ext2_get_block);
}
这里它又继续调用了一个通用例程mpage_readpage,传入了页地址以及ext2的数据块寻址函数ext2_get_block。
/linux-2.6.11.10/fs/mpage.c
/*
 * Read one page from disk: build a bio describing the disk sectors
 * that back the page (using the filesystem's get_block callback), then
 * submit it to the block layer.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
bio = do_mpage_readpage(bio, page, 1,
&last_block_in_bio, get_block);
if (bio) //do_mpage_readpage may return a still-unsubmitted bio
mpage_bio_submit(READ, bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
这里就两步操作,申请一个struct bio对象,然后提交这个任务。bio是通用块层用来管理传输数据的,他把一个磁盘存储区和一块内存区域联系起来。
然后提交这个任务,这里面其实还有一个调度过程,所有的bio请求都在一个队列里,它可以重排读写数据块的请求,在重复访问文件同一个部分或多进程访问同一数据,可以大大提高读取效率。
最终,这件读操作会交给磁盘的设备驱动程序来进行真正的数据操作。
读完以后,再回到do_generic_mapping_read,跳到page_ok,它会通过file_read_actor(内部使用__copy_to_user())将数据拷贝到用户态缓冲区。
linux-2.6.11.10/mm/filemap.c——do_generic_mapping_read
page_ok:
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* When (part of) the same page is read multiple times
* in succession, only mark it as accessed the first time.
*/
if (prev_index != index)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
if (ret == nr && desc->count)
continue;
goto out;
linux-2.6.11.10/mm/filemap.c
/*
 * Copy `size` bytes from a page-cache page (starting at `offset`) into
 * the user buffer described by `desc`: try a fast atomic kmap first,
 * fall back to a sleeping kmap, then update the descriptor's counters.
 * Returns the number of bytes consumed from the page.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
{
char *kaddr;
unsigned long left, count = desc->count;
if (size > count)
size = count; //never copy more than the caller asked for
/*
* Faults on the destination of a read are common, so do it before
* taking the kmap.
*/
if (!fault_in_pages_writeable(desc->arg.buf, size)) {
kaddr = kmap_atomic(page, KM_USER0); //fast path: temporary atomic mapping of the page
left = __copy_to_user_inatomic(desc->arg.buf,
kaddr + offset, size);
kunmap_atomic(kaddr, KM_USER0);
if (left == 0)
goto success;
}
/* Do it the slow way */
kaddr = kmap(page);
left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
kunmap(page);
if (left) { //some bytes could not be copied: record -EFAULT
size -= left;
desc->error = -EFAULT;
}
success:
desc->count = count - size;
desc->written += size;
desc->arg.buf += size;
return size;
}
然后更新一些计数,再一步步往上返回到最开始的read()系统调用,调用就结束了。