《Linux内核分析》(五)Linux内核的系统调用(下)

                                                          原创作品转载请注明出处 + 《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000 

一、实验过程:

使用gdb跟踪分析一个系统调用内核函数(您上周选择的那一个系统调用),系统调用列表参见http://codelab.shiyanlou.com/xref/linux-3.18.6/arch/x86/syscalls/syscall_32.tbl ,推荐在实验楼Linux虚拟机环境下完成实验。

实验截图如下:

 (这里不知道哪里出了错误:代码本身应该没有问题,因为上次实验能够运行成功,并正确打印出当前进程的pid,不知道为何这次却不能正确打印了。)

 

二、实验分析:

从system_call开始到iret结束之间的整个过程的简要流程图如下:

 (为节省时间,就用手工画图了,看不太清,望见谅)

  system_call部分的汇编代码如下:

/*
 * system_call: the path shown in this listing saves the registers,
 * dispatches through sys_call_table using the number in %eax, stores the
 * result in the saved-eax slot, then tests the thread flags for pending
 * work before restoring registers and executing the interrupt return.
 */
ENTRY(system_call)
491    RING0_INT_FRAME            # can't unwind into user space anyway
492    ASM_CLAC
493    pushl_cfi %eax            # save orig_eax
494    SAVE_ALL       // save the register context
495    GET_THREAD_INFO(%ebp)
496                    # system call tracing in operation / emulation
497    testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
498    jnz syscall_trace_entry
499    cmpl $(NR_syscalls), %eax
500    jae syscall_badsys
501syscall_call:      /* index sys_call_table by the system call number to find the matching handler */
502    call *sys_call_table(,%eax,4)   /* call the system call handler; eax holds the system call number */
503syscall_after_call:
504    movl %eax,PT_EAX(%esp)        # store the return value
505syscall_exit:  /* test the thread flags for pending exit work: if any bit of _TIF_ALLWORK_MASK is set go to syscall_exit_work, otherwise fall through to restore_all */
506    LOCKDEP_SYS_EXIT
507    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
508                    # setting need_resched or sigpending
509                    # between sampling and the iret
510    TRACE_IRQS_OFF
511    movl TI_flags(%ebp), %ecx
512    testl $_TIF_ALLWORK_MASK, %ecx    # current->work
513    jne syscall_exit_work
514
515restore_all:                  // restore the saved context
516    TRACE_IRQS_IRET
517restore_all_notrace:
518#ifdef CONFIG_X86_ESPFIX32
519    movl PT_EFLAGS(%esp), %eax    # mix EFLAGS, SS and CS
520    # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
521    # are returning to the kernel.
522    # See comments in process.c:copy_thread() for details.
523    movb PT_OLDSS(%esp), %ah
524    movb PT_CS(%esp), %al
525    andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
526    cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
527    CFI_REMEMBER_STATE
528    je ldt_ss            # returning to user-space with LDT SS
529#endif
530restore_nocheck:
531    RESTORE_REGS 4            # skip orig_eax/error_code
532irq_return:
533    INTERRUPT_RETURN    /* interrupt return: the system call handling path ends here */
534.section .fixup,"ax"
535ENTRY(iret_exc)
536    pushl $0            # no error code
537    pushl $do_iret_error
538    jmp error_code
539.previous
540    _ASM_EXTABLE(irq_return,iret_exc)
541
542#ifdef CONFIG_X86_ESPFIX32
543    CFI_RESTORE_STATE
544ldt_ss:
545#ifdef CONFIG_PARAVIRT
546    /*
547     * The kernel can't run on a non-flat stack if paravirt mode
548     * is active.  Rather than try to fixup the high bits of
549     * ESP, bypass this code entirely.  This may break DOSemu
550     * and/or Wine support in a paravirt VM, although the option
551     * is still available to implement the setting of the high
552     * 16-bits in the INTERRUPT_RETURN paravirt-op.
553     */
554    cmpl $0, pv_info+PARAVIRT_enabled
555    jne restore_nocheck
556#endif
557
558/*
559 * Setup and switch to ESPFIX stack
560 *
561 * We're returning to userspace with a 16 bit stack. The CPU will not
562 * restore the high word of ESP for us on executing iret... This is an
563 * "official" bug of all the x86-compatible CPUs, which we can work
564 * around to make dosemu and wine happy. We do this by preloading the
565 * high word of ESP with the high word of the userspace ESP while
566 * compensating for the offset by changing to the ESPFIX segment with
567 * a base address that matches for the difference.
568 */
569#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
570    mov %esp, %edx            /* load kernel esp */
571    mov PT_OLDESP(%esp), %eax    /* load userspace esp */
572    mov %dx, %ax            /* eax: new kernel esp */
573    sub %eax, %edx            /* offset (low word is 0) */
574    shr $16, %edx
575    mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
576    mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
577    pushl_cfi $__ESPFIX_SS
578    pushl_cfi %eax            /* new kernel esp */
579    /* Disable interrupts, but do not irqtrace this section: we
580     * will soon execute iret and the tracer was already set to
581     * the irqstate after the iret */
582    DISABLE_INTERRUPTS(CLBR_EAX)
583    lss (%esp), %esp        /* switch to espfix segment */
584    CFI_ADJUST_CFA_OFFSET -8
585    jmp restore_nocheck
586#endif
587    CFI_ENDPROC
588ENDPROC(system_call)
589
590    # perform work that needs to be done immediately before resumption
591    ALIGN
592    RING0_PTREGS_FRAME        # can't unwind into user space anyway
593work_pending:/* check whether rescheduling is needed: if so run work_resched, otherwise go to work_notifysig */
594    testb $_TIF_NEED_RESCHED, %cl
595    jz work_notifysig
596work_resched:
597    call schedule
598    LOCKDEP_SYS_EXIT
599    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
600                    # setting need_resched or sigpending
601                    # between sampling and the iret
602    TRACE_IRQS_OFF
603    movl TI_flags(%ebp), %ecx
604    andl $_TIF_WORK_MASK, %ecx    # is there any work to be done other
605                    # than syscall tracing?
606    jz restore_all
607    testb $_TIF_NEED_RESCHED, %cl
608    jnz work_resched
609
610work_notifysig:                # deal with pending signals and  (signal handling path)
611                    # notify-resume requests
612#ifdef CONFIG_VM86
613    testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614    movl %esp, %eax
615    jne work_notifysig_v86        # returning to kernel-space or
616                    # vm86-space
6171:
618#else
619    movl %esp, %eax
620#endif
621    TRACE_IRQS_ON
622    ENABLE_INTERRUPTS(CLBR_NONE)
623    movb PT_CS(%esp), %bl
624    andb $SEGMENT_RPL_MASK, %bl
625    cmpb $USER_RPL, %bl
626    jb resume_kernel
627    xorl %edx, %edx
628    call do_notify_resume
629    jmp resume_userspace
630
631#ifdef CONFIG_VM86
632    ALIGN
633work_notifysig_v86:
634    pushl_cfi %ecx            # save ti_flags for do_notify_resume
635    call save_v86_state        # %eax contains pt_regs pointer
636    popl_cfi %ecx
637    movl %eax, %esp
638    jmp 1b
639#endif
640END(work_pending)
641
642    # perform syscall entry tracing
643    ALIGN
644syscall_trace_entry:
645    movl $-ENOSYS,PT_EAX(%esp)    # preset the saved eax slot to -ENOSYS
646    movl %esp, %eax
647    call syscall_trace_enter
648    /* What it returned is what we'll actually use.  */
649    cmpl $(NR_syscalls), %eax
650    jnae syscall_call    # returned number is below NR_syscalls: dispatch it
651    jmp syscall_exit    # otherwise skip the call and go to the exit path
652END(syscall_trace_entry)
653
654    # perform syscall exit tracing
655    ALIGN
656syscall_exit_work:
657    testl $_TIF_WORK_SYSCALL_EXIT, %ecx    # any exit-tracing flag set in the sampled flags?
658    jz work_pending    # none: handle the remaining work at work_pending
659    TRACE_IRQS_ON
660    ENABLE_INTERRUPTS(CLBR_ANY)    # could let syscall_trace_leave() call
661                    # schedule() instead
662    movl %esp, %eax    # %eax = current %esp (presumably the pt_regs pointer passed to the call below)
663    call syscall_trace_leave
664    jmp resume_userspace
665END(syscall_exit_work)
666    CFI_ENDPROC
667
668    RING0_INT_FRAME            # can't unwind into user space anyway

  注意系统调用返回iret之前的进程调度时机:从work_pending开始,检测_TIF_NEED_RESCHED标志以判断是否需要重新调度,若需要调度,则执行work_resched处的代码,即call schedule。

 三、实验总结:

  系统调用的处理过程,大致可以用我上面画的那张流程图来表示:ENTRY(system_call)是系统调用的入口,先用SAVE_ALL保存现场,然后在syscall_call处根据eax中传递的系统调用号调用对应的系统调用处理函数,接着在syscall_exit处检测是否有需要处理的工作(syscall_exit_work):如果没有,则由restore_all恢复现场,再执行INTERRUPT_RETURN,实现系统调用的中断返回;如果有,则跳转到syscall_exit_work部分执行,其中还可能经过work_resched来实现进程调度。  在分析系统调用的处理过程中,可以发现,它和一般的中断处理过程有很多相似之处,都有中断上下文的保存和恢复。所以,系统调用中的一些处理过程,推广到一般的中断处理过程中,也同样适用。

posted on 2015-04-05 16:51  xyon  阅读(135)  评论(0)    收藏  举报