深入理解系统调用

实验内容

  • 找一个系统调用,其系统调用号与学号最后2位相同
  • 通过汇编指令触发该系统调用
  • 通过gdb跟踪该系统调用的内核处理过程
  • 重点阅读分析系统调用入口的保存现场、恢复现场和系统调用返回,以及重点关注系统调用过程中内核堆栈状态的变化

准备环境

利用上次实验下载的linux内核,重新编译,再制作根文件系统,并准备好触发系统调用的程序。

首先配置内核编译选项

make defconfig # Default configuration is based on 'x86_64_defconfig'
make menuconfig  
# 打开debug相关选项
Kernel hacking  ---> 
    Compile-time checks and compiler options  ---> 
       [*] Compile the kernel with debug info 
       [*]   Provide GDB scripts for kernel debugging
 [*] Kernel debugging 
# 关闭KASLR,否则会导致打断点失败
Processor type and features ----> 
   [] Randomize the address of the kernel image (KASLR)

编译内核

make -j$(nproc) # nproc gives the number of CPU cores/threads available
# 测试一下内核能不能正常加载运行,因为没有文件系统,最终会kernel panic 
qemu-system-x86_64 -kernel arch/x86/boot/bzImage  #  此时应该不能正常运行

制作根文件系统

#下载
axel -n 20 https://busybox.net/downloads/busybox-1.31.1.tar.bz2
tar -jxvf busybox-1.31.1.tar.bz2
cd busybox-1.31.1
#制作根文件系统
make menuconfig 
#记得要编译成静态链接,不用动态链接库。
Settings  --->
    [*] Build static binary (no shared libs) 
#然后编译安装,默认会安装到源码目录下的 _install 目录中。 
make -j$(nproc) && make install
# Build the in-memory root filesystem image.
# The previous steps left the shell inside busybox-1.31.1/, so go back to the
# parent directory first: rootfs/ must sit next to busybox-1.31.1/ for the
# relative path in the cp command below to resolve.
cd ..
mkdir rootfs
cd rootfs
cp ../busybox-1.31.1/_install/* ./ -rf
mkdir dev proc sys home
sudo cp -a /dev/{null,console,tty,tty1,tty2,tty3,tty4} dev/

准备init脚本文件放在根文件系统根目录下(rootfs/init),添加如下内容到init文件。

复制代码
#!/bin/sh
# Minimal init for the initramfs: mount the proc and sysfs pseudo
# filesystems, print a banner, then start an interactive shell in /home.
# Each command must be on its own line; otherwise the second "mount"/"echo"
# is passed as extra arguments to the first one.
mount -t proc none /proc
mount -t sysfs none /sys
echo "Wellcome TestOS!"
echo "--------------------"
cd home
/bin/sh

#给init脚本添加可执行权限
chmod +x init

我的学号后两位是17,在x86-64上17号系统调用是pread64,对应C库中的pread函数。pread函数用于从打开文件的指定位置开始读取数据,函数原型如下:

#include <unistd.h>
ssize_t pread(int filedes, void *buf, size_t nbytes, off_t offset);

返回值:若读取成功则返回实际读到的字节数,若已到文件结尾则返回0,若出错则返回-1。

参数:

1、filedes文件标识符;

2、*buf存放读出数据的缓冲区;

3、nbytes要读取的字节数;

4、offset读取的起始位置(相对于文件开头的字节偏移量),pread不会改变文件指针。

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Trigger system call number 17 (0x11, pread64 on x86-64) with a raw
 * `syscall` instruction instead of the libc pread() wrapper, then print the
 * value the kernel returned.
 *
 * Returns 0 once the system call has been issued, or 1 if the test file
 * could not be opened.
 */
int main()
{
    int fd;
    long count;   /* raw kernel return value: bytes read, or a negative errno */
    char buf[20];

    fd = open("/home/world", O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    printf("use pread\n");

    /* Equivalent libc call: pread(fd, buf, 10, 0); */
    __asm__ __volatile__(
            "mov $0x0, %%r10\n\t"   /* arg 4 (offset): the syscall ABI uses r10, not rcx */
            "mov $0x0a, %%rdx\n\t"  /* arg 3: number of bytes to read */
            "mov %2, %%rsi\n\t"     /* arg 2: address of the destination buffer */
            "mov %1, %%edi\n\t"     /* arg 1: file descriptor (32-bit move zero-extends) */

            "mov $0x11, %%eax\n\t"  /* system call number */
            "syscall\n\t"           /* enter the kernel */
            "mov %%rax, %0\n\t"     /* store the result in count */
            : "=r"(count)           /* output */
            : "r"(fd), "r"(buf)     /* inputs */
            /*
             * Every register written above is listed so the compiler keeps
             * the operands out of them; `syscall` itself overwrites rcx and
             * r11, and the kernel writes into buf ("memory").
             */
            : "rax", "rcx", "rdx", "rsi", "rdi", "r10", "r11", "memory", "cc"
            );

    /* At most 10 bytes were requested, so count is a valid index into buf. */
    if (count >= 0) {
        buf[count] = '\0';
    }
    printf("%ld", count);
    close(fd);
    return 0;
}

打包根文件系统

#打包成内存根文件系统镜像
 find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz


#测试挂载根文件系统,看内核启动完成后是否执行init脚本
qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz

 

到此,环境准备完毕。

GDB跟踪系统调用的内核处理过程

 

 

 

 

 

 

 

 

 

 

 

 可以看到现在停在了fs/read_write.c:646处,具体代码如下

/*
 * Definition of the pread64 system call with its four arguments
 * (fd, buf, count, pos). It does no work of its own: all four arguments are
 * forwarded to ksys_pread64() and that function's result is returned.
 */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
            size_t, count, loff_t, pos)
{
    return ksys_pread64(fd, buf, count, pos);
}

调用了ksys_pread64,继续向下看

/*
 * Read up to @count bytes from descriptor @fd into the user buffer @buf,
 * starting at offset @pos.
 *
 * Returns:
 *   -EINVAL         if @pos is negative;
 *   -EBADF          if fdget() yields no file for @fd;
 *   -ESPIPE         if the file's f_mode lacks FMODE_PREAD;
 *   otherwise       whatever vfs_read() returns.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
             loff_t pos)
{
    struct fd f;
    ssize_t ret = -EBADF;   /* result if the descriptor lookup fails */

    if (pos < 0)
        return -EINVAL;

    f = fdget(fd);
    if (f.file) {
        ret = -ESPIPE;      /* result if positional reads are not allowed */
        if (f.file->f_mode & FMODE_PREAD)
            /* vfs_read() gets the address of the local copy of pos */
            ret = vfs_read(f.file, buf, count, &pos);
        fdput(f);           /* pairs with the fdget() above */
    }

    return ret;
}

内核执行过程

 

 

 执行完ksys_pread64()函数后,意味着内核调用基本结束,接下来就是要转到用户态了,返回到之前系统调用的do_syscall_64()中,执行syscall_return_slowpath。

 

 

 

 do_syscall_64(),位于arch/x86/entry/common.c,部分代码如下

/*
 * C-level dispatcher for 64-bit system calls (excerpt; part of the body is
 * elided with "...").
 *
 * @nr:   system call number
 * @regs: saved register frame; the handler's return value is stored in
 *        regs->ax
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
    struct thread_info *ti;

    enter_from_user_mode();
    local_irq_enable();
    ti = current_thread_info();
    /* If any syscall-entry work flag is set, the trace hook may replace nr. */
    if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
        nr = syscall_trace_enter(regs);

    if (likely(nr < NR_syscalls)) {
        /* Pass nr through array_index_nospec() before using it as an index. */
        nr = array_index_nospec(nr, NR_syscalls);
        /* Call the handler found at sys_call_table[nr]; keep its result in ax. */
        regs->ax = sys_call_table[nr](regs);
   ...
     syscall_return_slowpath(regs);
}

可以看到,do_syscall_64()以系统调用号nr为下标,在系统调用表sys_call_table中找到对应的内核处理函数并调用它,返回值保存在regs->ax中;最后调用了syscall_return_slowpath(),这个函数是为切换到用户态做准备。接着返回到entry_SYSCALL_64()。entry_SYSCALL_64()是64位系统调用(syscall指令)的入口点,它完成了保存现场、调用对应的内核处理函数、恢复现场、系统调用返回等工作。代码如下:

/*
 * entry_SYSCALL_64: entry code for the 64-bit system call path.
 *
 * Visible steps:
 *   1. swapgs, stash the incoming %rsp in the tss.sp2 scratch slot, switch
 *      to the kernel CR3 and load %rsp from cpu_current_top_of_stack.
 *   2. Build a struct pt_regs on that stack (ss, sp, flags, cs, ip, orig_ax,
 *      then the remaining registers via PUSH_AND_CLEAR_REGS).
 *   3. Call do_syscall_64 with %rdi = %rax and %rsi = %rsp (the frame just
 *      built).
 *   4. Check whether the saved frame satisfies the SYSRET requirements; if
 *      any check fails, jump to swapgs_restore_regs_and_return_to_usermode.
 *   5. Otherwise pop the registers, switch to the trampoline stack and the
 *      user CR3, and leave through USERGS_SYSRET64.
 */
ENTRY(entry_SYSCALL_64)
    UNWIND_HINT_EMPTY
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */

    swapgs
    /* tss.sp2 is scratch space. */
    movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
    SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
    movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

    /* Construct struct pt_regs on stack */
    pushq    $__USER_DS                /* pt_regs->ss */
    pushq    PER_CPU_VAR(cpu_tss_rw + TSS_sp2)    /* pt_regs->sp */
    pushq    %r11                    /* pt_regs->flags */
    pushq    $__USER_CS                /* pt_regs->cs */
    pushq    %rcx                    /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
    pushq    %rax                    /* pt_regs->orig_ax */

    PUSH_AND_CLEAR_REGS rax=$-ENOSYS

    TRACE_IRQS_OFF

    /* IRQs are off. */
    /* First argument: the value in %rax; second: pointer to the frame on the stack. */
    movq    %rax, %rdi
    movq    %rsp, %rsi
    call    do_syscall_64        /* returns with IRQs disabled */

    TRACE_IRQS_IRETQ        /* we're about to change IF */

    /*
     * Try to use SYSRET instead of IRET if we're returning to
     * a completely clean 64-bit userspace context.  If we're not,
     * go to the slow exit path.
     */
    movq    RCX(%rsp), %rcx
    movq    RIP(%rsp), %r11

    cmpq    %rcx, %r11    /* SYSRET requires RCX == RIP */
    jne    swapgs_restore_regs_and_return_to_usermode

    /*
     * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
     * in kernel space.  This essentially lets the user take over
     * the kernel, since userspace controls RSP.
     *
     * If width of "canonical tail" ever becomes variable, this will need
     * to be updated to remain correct on both old and new CPUs.
     *
     * Change top bits to match most significant bit (47th or 56th bit
     * depending on paging mode) in the address.
     */
#ifdef CONFIG_X86_5LEVEL
    ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
        "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
    shl    $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
    sar    $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif

    /* If this changed %rcx, it was not canonical */
    cmpq    %rcx, %r11
    jne    swapgs_restore_regs_and_return_to_usermode

    cmpq    $__USER_CS, CS(%rsp)        /* CS must match SYSRET */
    jne    swapgs_restore_regs_and_return_to_usermode

    movq    R11(%rsp), %r11
    cmpq    %r11, EFLAGS(%rsp)        /* R11 == RFLAGS */
    jne    swapgs_restore_regs_and_return_to_usermode

    /*
     * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
     * restore RF properly. If the slowpath sets it for whatever reason, we
     * need to restore it correctly.
     *
     * SYSRET can restore TF, but unlike IRET, restoring TF results in a
     * trap from userspace immediately after SYSRET.  This would cause an
     * infinite loop whenever #DB happens with register state that satisfies
     * the opportunistic SYSRET conditions.  For example, single-stepping
     * this user code:
     *
     *           movq    $stuck_here, %rcx
     *           pushfq
     *           popq %r11
     *   stuck_here:
     *
     * would never get past 'stuck_here'.
     */
    testq    $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
    jnz    swapgs_restore_regs_and_return_to_usermode

    /* nothing to check for RSP */

    cmpq    $__USER_DS, SS(%rsp)        /* SS must match SYSRET */
    jne    swapgs_restore_regs_and_return_to_usermode

    /*
     * We win! This label is here just for ease of understanding
     * perf profiles. Nothing jumps here.
     */
syscall_return_via_sysret:
    /* rcx and r11 are already restored (see code above) */
    UNWIND_HINT_EMPTY
    POP_REGS pop_rdi=0 skip_r11rcx=1

    /*
     * Now all regs are restored except RSP and RDI.
     * Save old stack pointer and switch to trampoline stack.
     */
    movq    %rsp, %rdi
    movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp

    pushq    RSP-RDI(%rdi)    /* RSP */
    pushq    (%rdi)        /* RDI */

    /*
     * We are on the trampoline stack.  All regs except RDI are live.
     * We can do future final exit work right here.
     */
    STACKLEAK_ERASE_NOCLOBBER

    SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

    /* Pop the two values pushed onto the trampoline stack above. */
    popq    %rdi
    popq    %rsp
    USERGS_SYSRET64
END(entry_SYSCALL_64)

流程大概是:在entry_SYSCALL_64()中,先用swapgs切换到内核的GS基址,把用户态栈指针暂存后切换到内核栈,再通过一系列push在内核栈上构造pt_regs来保存现场;然后call do_syscall_64,do_syscall_64根据系统调用号从系统调用表中找到并调用我们打了断点的处理函数(位于fs/read_write.c),其中执行ksys_pread64();执行完后返回到do_syscall_64中,接着执行syscall_return_slowpath函数,为返回用户态做准备;执行完后再返回到entry_SYSCALL_64中,弹出保存的寄存器恢复现场,最后由USERGS_SYSRET64(swapgs后执行sysretq)返回用户态。

 

 

 

 

 

 

posted @ 2020-05-25 23:33  cyh2czj  阅读(350)  评论(0编辑  收藏  举报