kvm vcpu run

 

https://luohao-brian.gitbooks.io/interrupt-virtualization/content/kvm-run-processzhi-qemu-he-xin-liu-cheng.html

 

kvm_cpu_thread

void *kvm_cpu_thread(void *data) {
    struct kvm *kvm = (struct kvm *)data;
    int ret = 0;
    kvm_reset_vcpu(kvm->vcpus);

    while (1) {
        printf("KVM start run\n");
        ret = ioctl(kvm->vcpus->vcpu_fd, KVM_RUN, 0);
    
        if (ret < 0) {
            fprintf(stderr, "KVM_RUN failed\n");
            exit(1);
        }

        switch (kvm->vcpus->kvm_run->exit_reason) {
        case KVM_EXIT_UNKNOWN:
            printf("KVM_EXIT_UNKNOWN\n");
            break;
        case KVM_EXIT_DEBUG:
            printf("KVM_EXIT_DEBUG\n");
            break;
        case KVM_EXIT_IO:
            printf("KVM_EXIT_IO\n");
            printf("out port: %d, data: %d\n", 
                kvm->vcpus->kvm_run->io.port,  
                *(int *)((char *)(kvm->vcpus->kvm_run) + kvm->vcpus->kvm_run->io.data_offset)
                );
            sleep(1);
            break;
        case KVM_EXIT_MMIO:
            printf("KVM_EXIT_MMIO\n");
            break;
        case KVM_EXIT_INTR:
            printf("KVM_EXIT_INTR\n");
            break;
        case KVM_EXIT_SHUTDOWN:
            printf("KVM_EXIT_SHUTDOWN\n");
            goto exit_kvm;
        default:
            printf("KVM PANIC\n");
            goto exit_kvm;
        }
    }

exit_kvm:
    return 0;
}
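
kvm_cpu_thread() above calls kvm_reset_vcpu(), which is not shown in this excerpt. A minimal sketch of what it could look like, assuming the guest is a flat binary meant to start executing in real mode at guest physical address 0 (the struct vcpu fields are the ones used above):

void kvm_reset_vcpu(struct vcpu *vcpu) {
    struct kvm_sregs sregs;
    struct kvm_regs regs;

    /* start from the current special registers, then force the code
       segment to base 0 so execution begins at the start of guest RAM */
    if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) {
        perror("KVM_GET_SREGS");
        exit(1);
    }
    sregs.cs.selector = 0;
    sregs.cs.base = 0;
    if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0) {
        perror("KVM_SET_SREGS");
        exit(1);
    }

    memset(&regs, 0, sizeof(regs));
    regs.rip = 0;      /* first instruction fetched from guest address 0 */
    regs.rflags = 0x2; /* bit 1 of RFLAGS is reserved and must be 1 */
    if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &regs) < 0) {
        perror("KVM_SET_REGS");
        exit(1);
    }
}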


kvm->vcpus = kvm_init_vcpu(kvm, 0, kvm_cpu_thread);

struct vcpu *kvm_init_vcpu(struct kvm *kvm, int vcpu_id, void *(*fn)(void *)) {
    struct vcpu *vcpu = malloc(sizeof(struct vcpu));
    vcpu->vcpu_id = vcpu_id;
    vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, vcpu->vcpu_id);

    if (vcpu->vcpu_fd < 0) {
        perror("can not create vcpu");
        return NULL;
    }

    vcpu->kvm_run_mmap_size = ioctl(kvm->dev_fd, KVM_GET_VCPU_MMAP_SIZE, 0);

    if (vcpu->kvm_run_mmap_size < 0) {
        perror("can not get vcpu mmsize");
        return NULL;
    }

    printf("%d\n", vcpu->kvm_run_mmap_size);
    vcpu->kvm_run = mmap(NULL, vcpu->kvm_run_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->vcpu_fd, 0);

    if (vcpu->kvm_run == MAP_FAILED) {
        perror("can not mmap kvm_run");
        return NULL;
    }

    vcpu->vcpu_thread_func = fn;
    return vcpu;
}


void kvm_run_vm(struct kvm *kvm) {
    int i = 0;

    for (i = 0; i < kvm->vcpu_number; i++) {
        if (pthread_create(&(kvm->vcpus[i].vcpu_thread), NULL, kvm->vcpus[i].vcpu_thread_func, kvm) != 0) {
            perror("can not create kvm thread");
            exit(1);
        }
    }

    for (i = 0; i < kvm->vcpu_number; i++) {
        pthread_join(kvm->vcpus[i].vcpu_thread, NULL);
    }
}
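
For context, a hypothetical main() that ties these helpers together might look like the sketch below. The kvm->dev_fd and kvm->vm_fd fields match their use above; RAM_SIZE, ram_start and the loading of the guest binary are assumptions for illustration, not code from the original sample, and error checks are omitted for brevity:

int main(int argc, char **argv) {
    struct kvm *kvm = malloc(sizeof(struct kvm));

    kvm->dev_fd = open("/dev/kvm", O_RDWR);              /* descriptor #1: kvmfd */
    kvm->vm_fd  = ioctl(kvm->dev_fd, KVM_CREATE_VM, 0);  /* descriptor #2: vmfd  */

    /* register guest RAM with the VM; ram_start is assumed to be an
       anonymous mmap() with the guest binary copied to its base */
    struct kvm_userspace_memory_region mem = {
        .slot            = 0,
        .guest_phys_addr = 0,
        .memory_size     = RAM_SIZE,
        .userspace_addr  = (__u64)ram_start,
    };
    ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);

    kvm->vcpu_number = 1;
    kvm->vcpus = kvm_init_vcpu(kvm, 0, kvm_cpu_thread);  /* descriptor #3: vcpufd */
    kvm_run_vm(kvm);
    return 0;
}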


Qemu Core Flow

Stage 1: Argument Parsing

The QEMU version used here is qemu-kvm-1.2.0. The command used to start a virtual machine with Qemu is:

$sudo /usr/local/kvm/bin/qemu-system-x86_64 -hda vdisk_linux.img -m 1024

This starts the qemu-system-x86_64 program, whose entry point is

int main(int argc, char **argv, char **envp)   <------file: vl.c, line: 2345

The first stage of main() parses the arguments passed on the command line, covering the following areas:

QEMU_OPTION_M                      machine type and architecture
QEMU_OPTION_hda/mtdblock/pflash    storage media
QEMU_OPTION_numa                   NUMA topology
QEMU_OPTION_kernel                 kernel image
QEMU_OPTION_initrd                 initial ramdisk
QEMU_OPTION_append                 kernel boot parameters
QEMU_OPTION_net/netdev             networking
QEMU_OPTION_smp                    SMP (vcpu count)

Stage 2: VM Creation

Via configure_accelerator()->kvm_init() (file: kvm-all.c, line: 1281):
first /dev/kvm is opened, yielding the first of the three key descriptors, kvmfd; next the API version is verified with KVM_GET_API_VERSION; finally a VM object is created with KVM_CREATE_VM, which returns the second of the three descriptors, the VM descriptor vmfd.

s->fd = qemu_open("/dev/kvm", O_RDWR);        kvm_init()/line: 1309
ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);   kvm_init()/line: 1316
s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);     kvm_init()/line: 1339
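
Stripped of QEMU's wrappers, the same sequence is a few lines of plain libc. A standalone sketch of the identical steps (KVM_API_VERSION has been 12 ever since the KVM API was declared stable):

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

int main(void) {
    int kvmfd = open("/dev/kvm", O_RDWR);        /* descriptor #1: kvmfd */
    if (kvmfd < 0) { perror("open /dev/kvm"); exit(1); }

    int version = ioctl(kvmfd, KVM_GET_API_VERSION, 0);
    if (version != KVM_API_VERSION) {            /* expects 12 */
        fprintf(stderr, "unexpected KVM API version %d\n", version);
        exit(1);
    }

    int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);   /* descriptor #2: vmfd */
    if (vmfd < 0) { perror("KVM_CREATE_VM"); exit(1); }

    printf("kvmfd=%d vmfd=%d\n", kvmfd, vmfd);
    return 0;
}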

Stage 3: VM Initialization

After the command-line arguments have been parsed and the related subsystems initialized, the matching machine type is found and the third stage of initialization runs:

machine->init(ram_size, boot_devices, kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
file: vl.c, line: 3651

The arguments include values parsed from the command line: the RAM size, the kernel image filename, the kernel boot parameters, the initrd filename, the cpu model, and so on.
Using the system's default machine type, the init function is pc_init_pci(), which proceeds through a chain of calls:

pc_init_pci()   file: pc_piix.c, line: 294
    --->pc_init1()    file: pc_piix.c, line: 123
        --->pc_cpus_init()  file: pc.c, line: 941

The smp value configured on the command line takes effect here: qemu initializes one cpu per configured core, i.e. n execution bodies for n cores.

void pc_cpus_init(const char *cpu_model)
{
    int i;

    /* init CPUs */
    for (i = 0; i < smp_cpus; i++) {
        pc_new_cpu(cpu_model);
    }
}

CPU initialization continues:

pc_new_cpu()    file: hw/pc.c, line: 915
    --->cpu_x86_init()    file: target-i386/helper.c, line: 1150
        --->x86_cpu_realize()    file: target-i386/cpu.c, line: 1767
            --->qemu_init_vcpu()    file: cpus.c, line: 1039
                --->qemu_kvm_start_vcpu()    file: cpus.c, line: 1011

qemu_kvm_start_vcpu is a fairly important function; here we can see what the real execution body of the VM is.

Stage 4: VM RUN

static void qemu_kvm_start_vcpu(CPUArchState *env) <--------file: cpus.c, line: 1011
{
    CPUState *cpu = ENV_GET_CPU(env);
    ......
    qemu_cond_init(env->halt_cond);
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, env, QEMU_THREAD_JOINABLE);
    ......
}


void qemu_thread_create(QemuThread *thread,
                       void *(*start_routine)(void*),
                       void *arg, int mode)    <--------file: qemu-thread-posix.c, line: 118
{
    ......
    err = pthread_attr_init(&attr);
    ......
    err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    ......
    pthread_sigmask(SIG_SETMASK, &set, &oldset);
    ......
    pthread_create(&thread->thread, &attr, start_routine, arg);
    ......
    pthread_attr_destroy(&attr);
}

As we can see, the real execution body of the VM is a set of POSIX threads created by the QEMU process, and the thread function is qemu_kvm_cpu_thread_fn.
kvm_init_vcpu() creates the third of the three key descriptors, the vcpu descriptor vcpufd, via KVM_CREATE_VCPU.
The thread then enters a while (1) loop that repeatedly calls kvm_cpu_exec().

static void *qemu_kvm_cpu_thread_fn(void *arg)   file: cpus.c, line: 732
{
    ......
    r = kvm_init_vcpu(env);       <--------file: kvm-all.c, line: 213
    ......
    qemu_kvm_init_cpu_signals(env);
    /* signal CPU creation */
    env->created = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    while (1) {
        if (cpu_can_run(env)) {
            r = kvm_cpu_exec(env);      <--------file: kvm-all.c, line: 1550 
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }
    return NULL;
}

Inside kvm_cpu_exec() there is in turn a do { } while (ret == 0) loop. This loop starts the VM running via KVM_RUN, at which point control enters KVM's kernel-side processing; it then waits for the result, handles the exit according to the returned reason, and finally returns the result. Because the calling function above is itself looping, control re-enters this function again and again, and the VM's cpu runs inside this loop indefinitely.

int kvm_cpu_exec(CPUArchState *env)
{
    do {
        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); /* call into kvm, enter the kernel */

        /* exit from kvm, back in qemu */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            kvm_handle_io();
            ......
        case KVM_EXIT_MMIO:
            cpu_physical_memory_rw();
            ......
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            ret = EXCP_INTERRUPT;
            ......
        case KVM_EXIT_SHUTDOWN:
            ret = EXCP_INTERRUPT;
            ......
        case KVM_EXIT_UNKNOWN:
            ret = -1;
            ......
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            ......
        default:
            ret = kvm_arch_handle_exit(env, run);
            ......
        }
    } while (ret == 0);
    env->exit_request = 0;
    return ret;
}

Conclusion

To summarize, the core flow of kvm run inside Qemu is:

  1. Parse the arguments;
  2. Create the three key descriptors, kvmfd/vmfd/vcpufd, and perform the related initialization, establishing the conditions the VM needs in order to run;
  3. Start n POSIX threads, one per configured cpu, to run the VM bodies; tracing back, the vm's execution environment therefore begins in threads created by Qemu;
  4. Launch KVM via the KVM_RUN API, entering kernel space and waiting for the run to return;
  5. Loop back into the run stage.

 

KVM Core Flow

qemu starts kvm by calling a series of interfaces that kvm provides. qemu's entry point is main() in vl.c, which initializes kvm by calling kvm_init and machine->init. machine->init creates the vcpus, each simulated by a thread whose function is qemu_kvm_cpu_thread_fn; that thread eventually calls kvm_cpu_exec, which calls kvm_vcpu_ioctl to switch into kvm. When control next returns from kvm, execution resumes right after kvm_vcpu_ioctl, where exit_reason is examined and the exit handled accordingly.
int kvm_cpu_exec(CPUState *cpu) --> run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

When the argument is KVM_RUN, control enters KVM and executes __vcpu_run, eventually reaching vcpu_enter_guest, which calls kvm_x86_ops->run(vcpu). On Intel processors this is implemented by vmx_vcpu_run. vmx_vcpu_run sets up the register state and then issues VMLAUNCH or VMRESUME to enter the guest vm; once a vm exit occurs, execution continues from that point.

static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/* vmx_vcpu_run sets up the register state and then issues VMLAUNCH
	   or VMRESUME to enter the guest vm; on a vm exit, execution
	   resumes right after this asm block */
	asm(
		/* Enter guest mode */
		"jne .Llaunched \n\t"
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp .Lkvm_vmx_return \n\t"
		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
		".Lkvm_vmx_return: "
		......
	);
	vmx->launched = 1;
	/* when the guest vm performs IO that needs device access,
	   a vm exit is triggered and control returns here, to vmx_vcpu_run */
	vmx_complete_interrupts(vmx);
}

 

static int vcpu_run(struct kvm_vcpu *vcpu)
{
    int r;
    struct kvm *kvm = vcpu->kvm;

    vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

    for (;;) {
        if (kvm_vcpu_running(vcpu))
            r = vcpu_enter_guest(vcpu); // enter the guest
        else
            r = vcpu_block(kvm, vcpu);

        if (r <= 0)
            break;

        clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
        if (kvm_cpu_has_pending_timer(vcpu))
            kvm_inject_pending_timer_irqs(vcpu);

        if (dm_request_for_irq_injection(vcpu) &&
            kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
            r = 0;
            vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
            ++vcpu->stat.request_irq_exits;
            break;
        }

        kvm_check_async_pf_completion(vcpu);

        if (signal_pending(current)) {
            r = -EINTR;
            vcpu->run->exit_reason = KVM_EXIT_INTR;
            ++vcpu->stat.signal_exits;
            break;
        }
        if (need_resched()) {
            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
            cond_resched();
            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
        }
    }

    srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

    return r;
}


When the guest VM performs IO that needs device access, a VM exit is triggered and control returns to vmx_vcpu_run; vmx saves the VMCS state and records VM_EXIT_REASON before returning to its caller, vcpu_enter_guest:

vmx_complete_interrupts --> vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);

Back in vcpu_enter_guest, the end of the function calls r = kvm_x86_ops->handle_exit(vcpu), which corresponds to vmx_handle_exit.

static int vmx_handle_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason = vmx->exit_reason;
	u32 vectoring_info = vmx->idt_vectoring_info;
	...
	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu);
	else {
		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
		vcpu->run->hw.hardware_exit_reason = exit_reason;
	}
	return 0;
}

 

vmx_handle_exit invokes kvm_vmx_exit_handlers[exit_reason](vcpu), dispatching to a different function depending on exit_reason. The table is defined as follows:

static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]	          = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
	[EXIT_REASON_INVD]		              = handle_invd,
	[EXIT_REASON_INVLPG]		          = handle_invlpg,
	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
	[EXIT_REASON_VMREAD]                  = handle_vmread,
	[EXIT_REASON_VMRESUME]                = handle_vmresume,
	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
	[EXIT_REASON_VMOFF]                   = handle_vmoff,
	[EXIT_REASON_VMON]                    = handle_vmon,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
	[EXIT_REASON_EPT_VIOLATION]	          = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
};

 

If the vm exit was caused by IO, the handler invoked is handle_io.

static int handle_io(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int size, in, string;
	unsigned port;

	++vcpu->stat.io_exits;
	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);  /* fetch the exit qualification */
	string = (exit_qualification & 16) != 0;  /* string io (ins, outs)? */
	if (string) {
		if (emulate_instruction(vcpu, 0) == EMULATE_DO_MMIO)
			return 0;
		return 1;
	}
	size = (exit_qualification & 7) + 1;  /* access size */
	in = (exit_qualification & 8) != 0;   /* io direction: in or out */
	port = exit_qualification >> 16;      /* port number */

	skip_emulated_instruction(vcpu);
	return kvm_emulate_pio(vcpu, in, size, port);
}

 

Here we can see that the handling differs depending on the IO direction and on whether it is string IO.

int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
	unsigned long val;
	trace_kvm_pio(!in, port, size, 1);
	vcpu->run->exit_reason = KVM_EXIT_IO;
	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
	vcpu->run->io.size = vcpu->arch.pio.size = size;
	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
	vcpu->run->io.port = vcpu->arch.pio.port = port;
	vcpu->arch.pio.in = in;
	vcpu->arch.pio.string = 0;
	vcpu->arch.pio.down = 0;
	vcpu->arch.pio.rep = 0;
	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
	memcpy(vcpu->arch.pio_data, &val, 4);
	/* if the io can be completed inside the kmod, finish it here; no need to go back to qemu */
	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
		complete_pio(vcpu);
		return 1;
	}
	return 0;
}

 

After returning to qemu, kvm_cpu_exec continues executing:

int kvm_cpu_exec(CPUArchState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret, run_ret;
    …
    switch (run->exit_reason) {
        /* choose the handling according to the exit reason stored in kvm_run */
    case KVM_EXIT_IO:
        DPRINTF("handle_io\n");
        kvm_handle_io(run->io.port,
                      (uint8_t *)run + run->io.data_offset,
                      run->io.direction,
                      run->io.size,
                      run->io.count);
        /* here we can see that qemu works on the data stored in kvm_run */
        ret = 0;
        break;
    …
}

static void kvm_handle_io(uint16_t port, void *data, int direction, int size, uint32_t count)
{
    /* in this function we can see that, in the end, cpu_inb, cpu_outb
       and friends are called to interact with the concrete device */
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            ...
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            ...
            }
        }
        ptr += size;
    }
}

 

With this, kvm has completed the virtualization of one guest out instruction.
Once qemu has finished the IO operation, the loop in kvm_cpu_exec calls kvm_vcpu_ioctl to re-enter kvm.
Guest IO Code Path

 

[<ffffffffb66a4d89>] schedule+0x39/0x80
[<ffffffffc0606a06>] ? kvm_irq_delivery_to_apic+0x56/0x220 [kvm]
[<ffffffffb66a7447>] rwsem_down_read_failed+0xc7/0x120
[<ffffffffb63cb594>] call_rwsem_down_read_failed+0x14/0x30
[<ffffffffb66a6af7>] ? down_read+0x17/0x20
[<ffffffffc05d1480>] kvm_host_page_size+0x60/0xa0 [kvm]
[<ffffffffc05ea9bc>] mapping_level+0x5c/0x130 [kvm]
[<ffffffffc05f1b1b>] tdp_page_fault+0x9b/0x260 [kvm]
[<ffffffffc05eba21>] kvm_mmu_page_fault+0x31/0x120 [kvm]
[<ffffffffc0678db4>] handle_ept_violation+0xa4/0x170 [kvm_intel]
[<ffffffffc067fd07>] vmx_handle_exit+0x257/0x490 [kvm_intel]
[<ffffffffb60b2081>] ? __vtime_account_system+0x31/0x40
[<ffffffffc05e662f>] vcpu_enter_guest+0x6af/0xff0 [kvm]
[<ffffffffc06034ad>] ? kvm_apic_local_deliver+0x5d/0x60 [kvm]
[<ffffffffc05e8564>] kvm_arch_vcpu_ioctl_run+0xc4/0x3c0 [kvm]
[<ffffffffc05cf844>] kvm_vcpu_ioctl+0x324/0x5d0 [kvm]
[<ffffffffb611a4cc>] ? acct_account_cputime+0x1c/0x20
[<ffffffffb60b1f23>] ? account_user_time+0x73/0x80
[<ffffffffb61da203>] do_vfs_ioctl+0x83/0x4e0
[<ffffffffb600261f>] ? enter_from_user_mode+0x1f/0x50
[<ffffffffb6002711>] ? syscall_trace_enter_phase1+0xc1/0x110
[<ffffffffb61da6ac>] SyS_ioctl+0x4c/0x80
[<ffffffffb66a892e>] entry_SYSCALL_64_fastpath+0x12/0x7

 

Preparing for KVM RUN

When Qemu issues the KVM_RUN command via kvm_vcpu_ioctl(env, KVM_RUN, 0), the ioctl traps into the kernel and arrives at kvm_vcpu_ioctl():

kvm_vcpu_ioctl()     file: virt/kvm/kvm_main.c, line: 1958
    --->kvm_arch_vcpu_ioctl_run()    file: arch/x86/kvm/x86.c, line: 6305
        --->__vcpu_run()  file: arch/x86/kvm/x86.c, line: 6156

__vcpu_run() likewise contains a while () {} main loop:

static int __vcpu_run(struct kvm_vcpu *vcpu)
{
    ......
    r = 1;
    while (r > 0) {
        if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted)
            r = vcpu_enter_guest(vcpu);
        else {
            ......
        }
        if (r <= 0)     <-------- when r <= 0, break out of the loop and return to qemu
            break;
        ......
    }
    return r;
}


Once KVM enters the main loop through __vcpu_run(), it calls vcpu_enter_guest(); as the name suggests, this is the entry into guest mode.
While r stays greater than 0, the KVM kernel code keeps calling vcpu_enter_guest(), re-entering guest mode.
When r is less than or equal to 0, the loop exits and control unwinds step by step back to the original entry point kvm_vcpu_ioctl(), and from there all the way back to the userspace Qemu process (exit kvm, return to qemu). The details were covered in the previous article; the relevant code fragment is:

int kvm_cpu_exec(CPUArchState *env)
{
    do {
        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        switch (run->exit_reason) {     <----------Qemu handles the exit according to its reason, mostly IO-related operations
        case KVM_EXIT_IO:
            kvm_handle_io();      <------------- note: not the same function as the in-kernel handle_io
            ......
        case KVM_EXIT_MMIO:
            cpu_physical_memory_rw();
            ......
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            ret = EXCP_INTERRUPT;
            ......
        case KVM_EXIT_SHUTDOWN:
            ret = EXCP_INTERRUPT;
            ......
        case KVM_EXIT_UNKNOWN:
            ret = -1;
            ......
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            ......
        default:
            ret = kvm_arch_handle_exit(env, run);
            ......
        }
    } while (ret == 0);
    env->exit_request = 0;
    return ret;
}

Qemu handles the exit according to its reason, mostly IO-related operations; once the handling is done it calls kvm_vcpu_ioctl(env, KVM_RUN, 0) again to re-run KVM.
Pulling back to kernel space, we arrive at static int vcpu_enter_guest(struct kvm_vcpu *vcpu), which performs several important pieces of preparation:

static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  file: arch/x86/kvm/x86.c, line: 5944
{
    ......
    kvm_check_request();     <------- check for pending guest-exit requests
    ......
    kvm_mmu_reload(vcpu);    <------- initialize the guest MMU, preparing for memory virtualization
    ......
    preempt_disable();       <------- disable kernel preemption
    ......
    kvm_x86_ops->run(vcpu);  <------- architecture-specific run operation
    ......                   <------- reaching here means guest mode has been exited
    kvm_x86_ops->handle_external_intr(vcpu);  <------- host handles external interrupts
    ......
    preempt_enable();        <------- re-enable kernel preemption
    ......
    r = kvm_x86_ops->handle_exit(vcpu);  <------ handle the specific exit reason
    return r;
    ......
}

Guest Entry

kvm_x86_ops is a set of x86-architecture-specific operations, defined at file: arch/x86/kvm/vmx.c, line: 8693

static struct kvm_x86_ops vmx_x86_ops = {
    ......
    .run = vmx_vcpu_run,
    .handle_exit = vmx_handle_exit,
    ......
}

The core assembly in vmx_vcpu_run() switches the CPU from root mode to non-root mode. It does the following:

  1. Store host registers: save the host context into the VM's VMCS structure;
  2. Load guest registers: load the guest state;
  3. Enter guest mode: switch into the VM with the ASM_VMX_VMLAUNCH instruction, entering the other world, i.e. the guest OS;
  4. Save guest registers, load host registers: on a VM Exit, save the guest state and reload the host state.

After step 4 completes, the guest has dropped from non-root mode back to root mode, and the host resumes its execution.

Guest Exit Handling

A guest exit cannot simply be ignored: every exit has a cause, and to keep the guest running smoothly KVM must handle each exit according to its reason. The key function here is vmx_handle_exit():

static int vmx_handle_exit(struct kvm_vcpu *vcpu)    file: arch/x86/kvm/vmx.c, line: 6877
{
    ......
    if (exit_reason < kvm_vmx_max_exit_handlers
        && kvm_vmx_exit_handlers[exit_reason])
        return kvm_vmx_exit_handlers[exit_reason](vcpu);     <----- dispatch to the handler registered for this reason
    else {
        vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
        vcpu->run->hw.hardware_exit_reason = exit_reason;
    }
    return 0;      <-------- if the exit reason falls outside KVM's predefined handlers, return 0
}

The handlers corresponding to the many exit reasons are listed below:

static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
    [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,     <------ exceptions
    [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,  <------ external interrupts
    [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
    [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
    [EXIT_REASON_IO_INSTRUCTION]          = handle_io,      <------ io instructions
    [EXIT_REASON_CR_ACCESS]               = handle_cr,
    [EXIT_REASON_DR_ACCESS]               = handle_dr,
    [EXIT_REASON_CPUID]                   = handle_cpuid,
    [EXIT_REASON_MSR_READ]                = handle_rdmsr,
    [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
    [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
    [EXIT_REASON_HLT]                     = handle_halt,
    [EXIT_REASON_INVD]                    = handle_invd,
    [EXIT_REASON_INVLPG]                  = handle_invlpg,
    [EXIT_REASON_RDPMC]                   = handle_rdpmc,
    [EXIT_REASON_VMCALL]                  = handle_vmcall,     <----- VM operation instructions
    [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
    [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
    [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
    [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
    [EXIT_REASON_VMREAD]                  = handle_vmread,
    [EXIT_REASON_VMRESUME]                = handle_vmresume,
    [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
    [EXIT_REASON_VMOFF]                   = handle_vmoff,
    [EXIT_REASON_VMON]                    = handle_vmon,
    [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
    [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
    [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
    [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
    [EXIT_REASON_WBINVD]                  = handle_wbinvd,
    [EXIT_REASON_XSETBV]                  = handle_xsetbv,
    [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,     <---- task switch
    [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
    [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,   <---- EPT page fault
    [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
    [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
    [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
    [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
    [EXIT_REASON_INVEPT]                  = handle_invept,
};

When one of these handlers succeeds, it returns a value greater than 0; on failure it returns a value less than 0. Control then returns to the main loop in __vcpu_run():
when vcpu_enter_guest() > 0: the loop continues and prepares to enter guest mode again;
when vcpu_enter_guest() <= 0: the loop exits and control returns to userspace, where Qemu handles the exit reason.

handle_io

vmx_handle_exit() {
    /* kvm_vmx_exit_handlers[exit_reason](vcpu); */
    handle_io() {
        kvm_emulate_pio() {
            kernel_pio() {
                if (read) {
                    kvm_io_bus_read() {
                    }
                } else {
                    kvm_io_bus_write() {
                        ioeventfd_write();
                    }
                }
            }
        }
    }
}

 

vmx_handle_exit-->kvm_vmx_exit_handlers[exit_reason]-->handle_io-->kvm_fast_pio_out-->emulator_pio_out_emulated-->emulator_pio_in_out-->kernel_pio-->kvm_io_bus_write-->kvm_iodevice_write(dev->ops->write)-->ioeventfd_write-->eventfd_signal

-->wake_up_locked_poll-->__wake_up_locked_key-->__wake_up_common-->vhost_poll_wakeup-->vhost_poll_queue-->vhost_work_queue-->wake_up_process
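
eventfd_signal() at the end of this chain simply adds to the eventfd's 64-bit counter and wakes up any waiter (here, the vhost worker). As a hedged illustration of the semantics only, this is the userspace eventfd API, not the in-kernel path above:

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void) {
    int efd = eventfd(0, 0);            /* counter starts at 0 */

    uint64_t one = 1;
    write(efd, &one, sizeof(one));      /* the userspace analogue of
                                           eventfd_signal(): add to the
                                           counter and wake any waiters */

    uint64_t val;
    read(efd, &val, sizeof(val));       /* the waiter (vhost worker or QEMU
                                           main loop) consumes the count */
    printf("counter drained: %llu\n", (unsigned long long)val);
    close(efd);
    return 0;
}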

static int handle_io(struct kvm_vcpu *vcpu)
{
    unsigned long exit_qualification;
    int size, in, string;
    unsigned port;
    exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
    string = (exit_qualification & 16) != 0;
    in = (exit_qualification & 8) != 0;
    ++vcpu->stat.io_exits;
    if (string || in)
        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
    port = exit_qualification >> 16;
    size = (exit_qualification & 7) + 1;
    skip_emulated_instruction(vcpu);
    return kvm_fast_pio_out(vcpu, size, port);
}

 

kvm_handle_io


static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

 

#0  blk_aio_prwv (blk=0x555556a6fc60, offset=0x0, bytes=0x200, qiov=0x7ffff0059e70, co_entry=0x555555b58df1 <blk_aio_read_entry>, flags=0, cb=0x555555997813 <ide_buffered_readv_cb>, opaque=0x7ffff0059e50) at block/block-backend.c:995
#1  blk_aio_preadv (blk=0x555556a6fc60, offset=0x0, qiov=0x7ffff0059e70, flags=0, cb=0x555555997813 <ide_buffered_readv_cb>, opaque=0x7ffff0059e50) at block/block-backend.c:1100
#2  ide_buffered_readv (s=0x555557f66a68, sector_num=0x0, iov=0x555557f66d60, nb_sectors=0x1, cb=0x555555997b41 <ide_sector_read_cb>, opaque=0x555557f66a68) at hw/ide/core.c:637
#3  ide_sector_read (s=0x555557f66a68) at hw/ide/core.c:760
#4  cmd_read_pio (s=0x555557f66a68, cmd=0x20) at hw/ide/core.c:1452
#5  ide_exec_cmd (bus=0x555557f669f0, val=0x20) at hw/ide/core.c:2043
#6  ide_ioport_write (opaque=0x555557f669f0, addr=0x7, val=0x20) at hw/ide/core.c:1249
#7  portio_write (opaque=0x555558044e00, addr=0x7, data=0x20, size=0x1) at /home/jaycee/qemu-io_test/qemu-2.8.0/ioport.c:202
#8  memory_region_write_accessor (mr=0x555558044e00, addr=0x7, value=0x7ffff5f299b8, size=0x1, shift=0x0, mask=0xff, attrs=...) at /home/jaycee/qemu-io_test/qemu-2.8.0/memory.c:526
#9  access_with_adjusted_size (addr=0x7, value=0x7ffff5f299b8, size=0x1, access_size_min=0x1, access_size_max=0x4, access=0x5555557abd17 <memory_region_write_accessor>, mr=0x555558044e00, attrs=...) at /home/jaycee/qemu-io_test/qemu-2.8.0/memory.c:592
#10 memory_region_dispatch_write (mr=0x555558044e00, addr=0x7, data=0x20, size=0x1, attrs=...) at /home/jaycee/qemu-io_test/qemu-2.8.0/memory.c:1323
#11 address_space_write_continue (as=0x555556577d20 <address_space_io>, addr=0x1f7, attrs=..., buf=0x7ffff7fef000 " \237\006", len=0x1, addr1=0x7, l=0x1, mr=0x555558044e00) at /home/jaycee/qemu-io_test/qemu-2.8.0/exec.c:2608
#12 address_space_write (as=0x555556577d20 <address_space_io>, addr=0x1f7, attrs=..., buf=0x7ffff7fef000 " \237\006", len=0x1) at /home/jaycee/qemu-io_test/qemu-2.8.0/exec.c:2653
#13 address_space_rw (as=0x555556577d20 <address_space_io>, addr=0x1f7, attrs=..., buf=0x7ffff7fef000 " \237\006", len=0x1, is_write=0x1) at /home/jaycee/qemu-io_test/qemu-2.8.0/exec.c:2755
#14 kvm_handle_io (port=0x1f7, attrs=..., data=0x7ffff7fef000, direction=0x1, size=0x1, count=0x1) at /home/jaycee/qemu-io_test/qemu-2.8.0/kvm-all.c:1800
#15 kvm_cpu_exec (cpu=0x555556a802a0) at /home/jaycee/qemu-io_test/qemu-2.8.0/kvm-all.c:1958
#16 qemu_kvm_cpu_thread_fn (arg=0x555556a802a0) at /home/jaycee/qemu-io_test/qemu-2.8.0/cpus.c:998
#17 start_thread (arg=0x7ffff5f2a700) at pthread_create.c:333
#18 clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109


2. Interaction between kvm and qemu

Qemu creates the virtual machine and enters kvm: main() initializes kvm by calling kvm_init and machine->init. machine->init creates the vcpus, each simulated by a thread whose function is qemu_kvm_cpu_thread_fn; that thread eventually calls kvm_cpu_exec, which calls kvm_vcpu_ioctl to switch into kvm.

KVM runs and exits because of IO: inside kvm, the KVM_RUN argument eventually leads to vcpu_enter_guest; vmx_vcpu_run then sets up the register state and issues VMLAUNCH or VMRESUME to enter the guest vm. If the vm performs IO that needs to access a device, a vm exit is triggered back to vmx_vcpu_run; vmx saves the VMCS state and records VM_EXIT_REASON before returning to its caller, vcpu_enter_guest. At the end of vcpu_enter_guest, r = kvm_x86_ops->handle_exit(vcpu) is called, which corresponds to vmx_handle_exit; vmx_handle_exit invokes kvm_vmx_exit_handlers[exit_reason](vcpu), which dispatches by exit_reason. For IO, handle_io fills the data into vcpu->run, the call chain returns all the way up to kvm_vcpu_ioctl, and the ioctl returns into qemu's kvm_cpu_exec.

Handling after returning from kvm to qemu: in kvm_cpu_exec, Qemu looks at kvm_run's run->exit_reason; if it is KVM_EXIT_IO it goes into kvm_handle_io. Once qemu has finished the IO operation, the loop in kvm_cpu_exec calls kvm_vcpu_ioctl to re-enter kvm.

kvm_run is the structure used for communication between the vcpu and the userspace program (typically qemu); userspace obtains its size via the KVM_GET_VCPU_MMAP_SIZE ioctl and then maps it into its own address space.
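
A minimal sketch of that mapping and of reading one exit back out of it; kvmfd and vcpufd are assumed to come from the open()/KVM_CREATE_VCPU calls shown earlier:

/* KVM_GET_VCPU_MMAP_SIZE is issued on the /dev/kvm fd; the mapping
   itself is made on the vcpu fd */
int mmap_size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, 0);
struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                           MAP_SHARED, vcpufd, 0);

ioctl(vcpufd, KVM_RUN, 0);              /* returns on the next VM exit */

if (run->exit_reason == KVM_EXIT_IO &&
    run->io.direction == KVM_EXIT_IO_OUT) {
    /* the PIO data lives inside the same mapping, at io.data_offset */
    uint8_t *data = (uint8_t *)run + run->io.data_offset;
    printf("out port 0x%x, first byte 0x%x\n", run->io.port, data[0]);
}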

 

3. kvm's IO handling flow

static int handle_io(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;
	int size, in, string;
	unsigned port;

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);  /* fetch the exit qualification */
	string = (exit_qualification & 16) != 0;  /* string io (ins, outs)? */
	in = (exit_qualification & 8) != 0;       /* io direction: in or out */

	++vcpu->stat.io_exits;

	if (string || in)  /* input instructions and string io go through the emulator */
		return emulate_instruction(vcpu, 0) == EMULATE_DONE;

	port = exit_qualification >> 16;      /* port number */
	size = (exit_qualification & 7) + 1;  /* access size */
	skip_emulated_instruction(vcpu);      /* step past this instruction */

	return kvm_fast_pio_out(vcpu, size, port);  /* perform the out operation */
}

The guest executes an io instruction -> a vmexit occurs -> control returns to qemu -> qemu handles the io

1. Virtualizing an out instruction: for a single out instruction, KVM can hand the out data straight to qemu, and qemu completes the out operation.

Flow: KVM's handle_io -> kvm_fast_pio_out -> emulator_pio_out_emulated, after which vcpu->arch.pio.count = 0. In this path a non-string out operation completes in one step, so after qemu finishes and returns to kvm there is no need to enter the emulator again. In emulator_pio_out_emulated the IO data is memcpy'd into the buffer shared between kvm and qemu; emulator_pio_in_out then stores the corresponding data into kvm_run and control returns to qemu's kvm_cpu_exec, whose switch looks at run->exit_reason and, for KVM_EXIT_IO, enters kvm_handle_io to interact with the device.

2. Virtualizing string or in instructions: for an in instruction, qemu can only write the resulting data into kvm_run; kvm must place that data in the right location at the next vmentry. For this reason, handle_io does not call skip_emulated_instruction for in or string instructions, so after qemu completes the in (or one out iteration), a vmexit happens again at the same instruction and the emulator finishes the processing. For string instructions the emulator decodes the instruction, determining the io count and the source and destination operands.

Flow: handle_io -> emulate_instruction -> x86_emulate_instruction decodes the instruction; in the process em_in and em_out are reached (both end in emulator_pio_in_emulated, which first goes through emulator_pio_in_out just as in the PIO case above; a successful return means qemu has already produced the emulated data in val, so a plain memcpy finishes writing the data obtained from qemu to the right place, vcpu->arch.pio_data). For out, the next entry into KVM goes straight into the emulator; for in, vcpu->arch.complete_userspace_io = complete_emulated_pio is registered, so the io is completed the next time qemu enters kvm, which in practice means writing the data qemu produced to the right location. On that next entry into kvm, to complete the in instruction, kvm_arch_vcpu_ioctl_run calls the registered complete_emulated_pio, which calls emulate_instruction again to write the data into place (no decoding this time; it goes directly to em_in).
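
On the guest side, the instruction that starts this whole path can be a single out. A hedged example of guest code that would take the non-string out fast path described above; the port number 0x3f8 and the bare-metal guest_main entry are assumptions for illustration:

#include <stdint.h>

static inline void outb(uint16_t port, uint8_t value) {
    /* with VMX configured to exit on IO instructions, this single
       instruction causes EXIT_REASON_IO_INSTRUCTION ->
       handle_io -> kvm_fast_pio_out */
    asm volatile("outb %0, %1" : : "a"(value), "Nd"(port));
}

void guest_main(void) {
    const char *msg = "hello";
    for (const char *p = msg; *p; p++)
        outb(0x3f8, (uint8_t)*p);   /* one VM exit per byte */
    for (;;)
        asm volatile("hlt");        /* HLT also exits (handle_halt) */
}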

 

- In the guest, during virtio-blk initialization, or rather just before virtio-blk is probed:

virtio_dev_probe
 |-->add_status
      |-->dev->config->set_status[vp_set_status]
           |-->iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS)

 

This causes a VM exit into Qemu, where the handling is as follows.

The flow in Qemu that sets up the ioeventfd:

virtio_pci_config_write
 |-->virtio_ioport_write
      |-->virtio_pci_start_ioeventfd
          |-->virtio_pci_set_host_notifier_internal
              |-->virtio_queue_set_host_notifier_fd_handler
              |-->memory_region_add_eventfd
                  |-->memory_region_transaction_commit
                      |-->address_space_update_ioeventfds
                          |-->address_space_add_del_ioeventfds
                              |-->eventfd_add[kvm_mem_ioeventfd_add]
                                  |-->kvm_set_ioeventfd_mmio
                                      |-->kvm_vm_ioctl(...,KVM_IOEVENTFD,...)

 

This final step switches into the kvm kernel module, which sets up the ioeventfd via KVM_IOEVENTFD.
Setting up the ioeventfd in the kvm kernel module:

kvm_ioeventfd
|-->kvm_assign_ioeventfd

This flow establishes an ioeventfd for a region of guest address space: when the guest accesses that region, the ioeventfd fires (this is the fs eventfd mechanism) and Qemu is notified. Qemu's main loop, which was blocked, is woken once the ioevent arrives and can then do the corresponding virtio-blk processing.
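
The ioctl at the bottom of that call chain takes a struct kvm_ioeventfd describing the guest address to watch and the eventfd to signal. A minimal sketch of registering one by hand; the doorbell address and the 4-byte length are placeholders:

#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* ask KVM to signal `efd` whenever the guest writes the 4-byte MMIO
   doorbell at `addr`, instead of exiting all the way to userspace */
int register_ioeventfd(int vmfd, uint64_t addr) {
    int efd = eventfd(0, EFD_NONBLOCK);
    if (efd < 0)
        return -1;

    struct kvm_ioeventfd ioev;
    memset(&ioev, 0, sizeof(ioev));
    ioev.addr  = addr;   /* guest physical address being watched */
    ioev.len   = 4;      /* match 4-byte accesses */
    ioev.fd    = efd;
    ioev.flags = 0;      /* 0 = MMIO; KVM_IOEVENTFD_FLAG_PIO for port IO */

    if (ioctl(vmfd, KVM_IOEVENTFD, &ioev) < 0)
        return -1;
    return efd;          /* poll/read this fd to observe guest writes */
}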

 

How KVM uses the ioeventfd

When the guest writes to that memory region, it necessarily exits into the kvm kernel module first. How does the kvm kernel module know that an event is registered for this region, and what is the flow?
In the EPT-only case, a guest read or write of an MMIO region causes an exit that kvm handles in handle_ept_misconfig. The concrete flow is as follows:

handle_ept_misconfig
|-->x86_emulate_instruction
    |-->x86_emulate_insn
         |-->writeback
              |-->segmented_write
                  |-->write_emulated[emulator_write_emulated]
                       |-->emulator_read_write
                            |-->emulator_read_write_onepage
                                 |-->ops->read_write_mmio[write_mmio]
                                          |-->vcpu_mmio_write
                                               |-->kvm_io_bus_write
                                                   |-->__kvm_io_bus_write
                                                       |-->kvm_iodevice_write
                                                           |-->ops->write[ioeventfd_write]

 

In ioeventfd_write, the eventfd_signal function of the filesystem eventfd mechanism is called to raise the corresponding event.
That is the entire ioeventfd flow, from creation to triggering.

Conclusion

This concludes the analysis of the core call flow of the KVM kernel code. From the flows above, the main work of the KVM kernel code is:

  1. Preparation before entering the guest;
  2. Entering the guest;
  3. Handling the guest's exit according to its reason: whatever kvm can handle itself, it handles in the kernel; whatever it cannot, it returns to the userspace Qemu process for handling.

In short, KVM and Qemu work together to keep the guest running, handling all manner of exits so that the guest never needs to be aware of the virtual environment it runs in.
