Linux write系统调用流程

源码

# Minimal x86-64 Linux program: write "hello world\n" to stdout, then exit(0).
.global _start
.text
_start:
	mov $1,%rax    # syscall number 1 = write on x86-64
	mov $1,%rdi    # arg1: file descriptor 1 (standard output)
	mov $msg,%rsi  # arg2: address of the string to print
	mov $len,%rdx  # arg3: length of the string in bytes
	syscall        # perform write(1, msg, len)
	
	mov $60,%rax   # syscall number 60 = exit
	xor %rdi,%rdi  # arg1: exit status 0
	syscall        # perform exit(0)
# Switch to the data section. Note: ".data:" (with a colon) would only define
# a label named ".data" and leave msg inside .text.
.data
msg:
	.ascii "hello world\n"
	len = . - msg  # assemble-time constant: number of bytes from msg to here

编译执行

# Assemble write.s into the object file write.o
as write.s -o write.o
# Link write.o into the executable "write"
ld write.o -o write
# Run the resulting program
./write

image-20251208132717582

流程分析

我使用的 Linux 源码版本是 6.1.115,arch/x86/entry/entry_64.S 的注释中介绍了发生系统调用时各寄存器的状态。syscall 指令使 CPU 从用户态切换到内核态,并跳转到内核的系统调用入口,由内核执行对应的系统调用。

image-20251208133607987

  • rax 系统调用编号
  • rdi 系统调用第一个参数
  • rsi 系统调用第二个参数
  • rdx 系统调用第三个参数
  • r10 系统调用第四个参数
  • r8 系统调用第五个参数
  • r9 系统调用第六个参数

syscall_init

在内核系统初始化时cpu_init --> syscall_init(arch/x86/kernel/cpu/common.c)会指定系统调用入口地址。

/* Writes MSR_STAR and MSR_LSTAR (excerpt; "......" marks lines omitted by the article). */
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	/*
	 * Register the system call entry point: MSR_LSTAR is the register that
	 * holds the syscall entry address, so every system call enters the
	 * kernel at entry_SYSCALL_64.
	 */
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
	......
}

entry_SYSCALL_64

**arch/x86/entry/entry_64.S**

SYM_CODE_START(entry_SYSCALL_64) # system call entry point
	......
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
    ......
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
    pushq   %rax                                    /* pt_regs->orig_ax */
    PUSH_AND_CLEAR_REGS rax=$-ENOSYS
    /* IRQs are off. */
    /* %rdi carries the first argument of do_syscall_64(); presumably %rsp now points at the saved pt_regs -- confirm */
    movq    %rsp, %rdi
    /* Sign extend the lower 32bit as syscall numbers are treated as int */
    movslq  %eax, %rsi
    /* clobbers %rax, make sure it is after saving the syscall nr */
    /* NOTE(review): the comment above looks like it belongs to lines omitted from this excerpt, not to the call below -- confirm against the upstream file */
        
    call    do_syscall_64           /* returns with IRQs disabled */
    ......

do_syscall_64

当进程发起 write 系统调用时,会通过 syscall 指令进入 entry_64.S 中的 entry_SYSCALL_64,并继续调用 do_syscall_64

arch/x86/entry/common.c

/*
 * C-level system call dispatcher, called from entry_SYSCALL_64.
 *
 * @regs: saved register frame; the handler's result is stored in regs->ax
 * @nr:   system call number passed in by the assembly entry code
 */
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	/* nr is replaced by whatever the entry hook returns. */
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	/*
	 * Try the x64 dispatcher first, then the x32 one. If neither accepts
	 * nr and nr is not -1, run the "not implemented" handler.
	 */
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

do_syscall_x64

接着调用 do_syscall_x64(arch/x86/entry/common.c)

/*
 * Dispatches a native 64-bit system call.
 *
 * Returns true when nr is below NR_syscalls and the call was dispatched
 * (its result is stored in regs->ax); returns false otherwise so the
 * caller can try another dispatcher.
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/* Converting to unsigned makes a negative nr fail the single range check below. */
	unsigned int unr = nr;
	if (likely(unr < NR_syscalls)) {
		/* NOTE(review): presumably bounds the index under speculative execution -- see linux/nospec.h */
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = x64_sys_call(regs, unr);
		return true;
	}
	return false;
}

NR_syscalls

NR_syscalls 系统调用数量 由内核自动生成

arch/x86/include/generated/uapi/asm/unistd_64.h

/* Number of system calls, as generated for this kernel tree. */
#define __NR_syscalls 451

arch/x86/include/asm/unistd.h

/* Kernel-internal name for the generated system call count. */
#define NR_syscalls (__NR_syscalls)

x64_sys_call

然后调用x64_sys_call, arch/x86/entry/syscall_64.c

/* Pass 1: expand every table entry into a prototype for its __x64_ stub. */
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#undef __SYSCALL

/* Pass 2: expand the same entries into the initializers of sys_call_table. */
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL

/* Pass 3: expand the same entries into "case nr:" branches inside x64_sys_call(). */
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);


/*
 * Calls the stub for system call number nr with the saved registers and
 * returns its result; numbers without a case are handled by the default
 * branch, __x64_sys_ni_syscall().
 */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
};

展开后,对于write()最终会执行 __x64_sys_write(regs)

系统调用映射

arch/x86/entry/syscalls/syscall_64.tbl 中定义了系统调用编号、名称与入口函数之间的映射关系

# 64-bit system call numbers and entry vectors
# The format is:
# <number> <abi> <name> <entry point>
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
# The abi is "common", "64" or "x32" for this file.
0	common	read			sys_read
1	common	write			sys_write
2	common	open			sys_open
3	common	close			sys_close
4	common	stat			sys_newstat
5	common	fstat			sys_newfstat
6	common	lstat			sys_newlstat
   ......

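所以 write(编号 1)在表中登记的入口是 sys_write,而 x64_sys_call 实际调用的 __x64_sys_write 是内核为 sys_write 自动生成的 stub(即上面注释中的 "The __x64_sys_*() stubs are created on-the-fly"),它最终会执行 sys_write 的实现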

sys_write

fs/read_write.c中定义了write

/*
 * Definition of the write system call: forwards its three arguments
 * (fd, buf, count) unchanged to ksys_write() and returns that result.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}

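展开宏定义(以下是 include/linux/syscalls.h 中通用 SYSCALL_DEFINEx 的展开示意;x86_64 开启 CONFIG_ARCH_HAS_SYSCALL_WRAPPER 时,实际使用的是 arch/x86/include/asm/syscall_wrapper.h 中的版本,生成的入口为 __x64_sys_write(const struct pt_regs *regs),由它从 regs 中取出参数再调用 __se_sys_write)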

/* 1. Begin the diagnostic-suppression region (real compiler directives) */
__diag_push();
__diag_ignore(GCC, 8, "-Wattribute-alias", "Type aliasing is used to sanitize syscall arguments");

/* 2. Declare sys_write as an alias of __se_sys_write */
asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count)
    __attribute__((alias("__se_sys_write")));

/* 3. Allow error injection */
ALLOW_ERROR_INJECTION(sys_write, ERRNO);

/* 4. Declare the inline function __do_sys_write */
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count);

/* 5. Define __se_sys_write (parameters are received in long form and cast back) */
asmlinkage long __se_sys_write(unsigned long fd, long buf, long count)
{
    long ret = __do_sys_write((unsigned int)fd, (const char __user *)buf, (size_t)count);
    /* Argument check (no-op here; the __SC_TEST macro expands to nothing) */
    ;
    /* Syscall protection (the __PROTECT macro expands to nothing) */
    return ret;
}

/* 6. End the diagnostic-suppression region */
__diag_pop();

/* 7. Define the inline function __do_sys_write: it simply calls ksys_write() */
static inline long __do_sys_write(unsigned int fd, const char __user * buf, size_t count)
{
    return ksys_write(fd, buf, count);
}

流程图

image-20251208152540693

posted @ 2025-12-08 15:59  只A有缘人  阅读(2)  评论(0)    收藏  举报