Linux perf：周期性采样实现简析

1. 周期性采样实现

1. 周期性采样实现

本文以 Linux 4.14.x 内核代码为例，简要分析 perf 子系统周期性采样的实现。

1.1 打开事件对象

以 PERF_COUNT_SW_TASK_CLOCK 事件为例，从用户空间代码开始:

/*
 * Thin user-space wrapper around the perf_event_open system call (entered in
 * the kernel as sys_perf_event_open()). All five arguments are forwarded
 * unchanged and the value returned by syscall() is passed back to the caller.
 */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
    long result = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);

    return result;
}

struct perf_event_attr attr;

/*
 * Zero the whole structure before filling it in. memset() takes
 * (dest, value, length); with the last two swapped it would clear 0 bytes
 * and leave every field not assigned below uninitialised.
 */
memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_SOFTWARE;
attr.size = sizeof(attr);
attr.config = PERF_COUNT_SW_TASK_CLOCK;
/* Sampling period; the kernel passes it to ns_to_ktime(), i.e. nanoseconds. */
attr.sample_period = 100000;
/* Fields recorded in every sample. */
attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
		   PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD;
attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
		   PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID;
attr.wakeup_events = 1;
attr.disabled = 1;
attr.exclude_kernel = 1;
attr.exclude_hv = 1;
/* pid = 0, cpu = -1, group_fd = -1, flags = 0 */
fd = perf_event_open(&attr, 0, -1, -1, 0);

sys_perf_event_open() 使流程进入内核空间部分：

/* kernel/events/core.c */

SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	...
	struct perf_event *event, *sibling;
	struct perf_event_attr attr;
	struct perf_event_context *ctx, *uninitialized_var(gctx);
	struct file *event_file = NULL;
	...
	struct pmu *pmu;
	...

	...
	/* 拷贝 用户空间 attr 到 内核空间 */
	err = perf_copy_attr(attr_uptr, &attr);
	...

	/* (1) 创建初始化 perf_event 对象 */
	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
				 NULL, NULL, cgroup_fd);
	...

	pmu = event->pmu;
	...

	if (pmu->task_ctx_nr == perf_sw_context)
		event->event_caps |= PERF_EV_CAP_SOFTWARE;

	...

	/*
	 * Get the target context (task or percpu):
	 */
	/* (2) 获取 或 分配 perf_event_context */
	ctx = find_get_context(pmu, task, event);
	...

	/* (3) 创建 perf 事件文件对象, 绑定 [perf_event + file + 文件操作接口 perf_fops] */
	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
					f_flags);
	...

	if (!task) {
		/*
		 * Check if the @cpu we're creating an event for is online.
		 *
		 * We use the perf_cpu_context::ctx::mutex to serialize against
		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
		 */
		struct perf_cpu_context *cpuctx =
			container_of(ctx, struct perf_cpu_context, ctx);

		...
	}

	...

	/*
	 * Precalculate sample_data sizes; do while holding ctx::mutex such
	 * that we're serialized against further additions and before
	 * perf_install_in_context() which is the point the event is active and
	 * can use these values.
	 */
	/* 
	 * 设定 perf_event @event 的:
	 * . 读数据大小: @perf_event::read_size
	 * . perf.data 头部大小: @perf_event::header_size
	 */ 
	perf_event__header_size(event);
	/* 设定 perf_event @event 事件的 id header (perf_event::id_header_size) 大小 */
	perf_event__id_header_size(event);

	event->owner = current;

	/* (4) 调度 perf_event */
	perf_install_in_context(ctx, event, event->cpu);
	...

	/* 由当前进程发起的 perf_event, 添加到当前进程的 perf_event 列表 */
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);
	...

	/*
	 * Drop the reference on the group_event after placing the
	 * new event on the sibling_list. This ensures destruction
	 * of the group leader will find the pointer to itself in
	 * perf_group_detach().
	 */
	...
	fd_install(event_fd, event_file);
	return event_fd; /* 返回指代 perf_event 的句柄 */

	...
}

流程经系统调用 sys_perf_event_open() 进入内核空间后，创建 perf 事件对象的过程中，主要工作包括代码注释中 (1),(2),(3),(4) 这 4 处，下面来一一做简要分析。

创建 perf_event 对象，按 perf_event 的类型，绑定初始化 PMU

/*
 * Allocate and initialize a event structure
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
		 struct task_struct *task,
		 struct perf_event *group_leader,
		 struct perf_event *parent_event,
		 perf_overflow_handler_t overflow_handler,
		 void *context, int cgroup_fd)
{
	struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err = -EINVAL;

	...

	/* 创建 perf_event 对象, 初始为全 0 */
	event = kzalloc(sizeof(*event), GFP_KERNEL);
	...

	atomic_long_set(&event->refcount, 1);
	event->cpu		= cpu;
	event->attr		= *attr;
	...
	event->pmu		= NULL;
	event->oncpu		= -1;

	...

	event->state		= PERF_EVENT_STATE_INACTIVE; /* 初始为 INACTIVE 状态 */

	...

	/* 设置 event clock */
	event->clock = &local_clock;
	...

	/* 设置 event overflow handler */
	if (overflow_handler) {
		...
	} else if (is_write_backward(event)){
		...
	} else {
		event->overflow_handler = perf_event_output_forward;
		event->overflow_handler_context = NULL;
	}

	...

	pmu = NULL;

	/* event 是按 freq 或 period 进行采样, 这里设定 freq 或 period */
	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1; /* 设置 freq 的情况下, period 设为 1 */
	hwc->last_period = hwc->sample_period;

	local64_set(&hwc->period_left, hwc->sample_period);

	...

	/* 按 perf_event 的类型匹配、并初始化 pmu, 然后绑定 perf_event 和 pmu */
	pmu = perf_init_event(event);
	...

	return event;

	...
}

static struct pmu *perf_init_event(struct perf_event *event)
{
	struct pmu *pmu;
	int idx;
	int ret;

	idx = srcu_read_lock(&pmus_srcu);

	/* Try parent's PMU first: */
	if (event->parent && event->parent->pmu) {
		pmu = event->parent->pmu;
		ret = perf_try_init_event(pmu, event);
		if (!ret)
			goto unlock;
	}

	/* 按 perf_event::attr::type 匹配 pmu: 
	 * 从自动分配类型 ID (即 perf_pmu_register() 注册 pmu 时, 
	 * 指定 @type < 0) 的 pmu 类型中寻找.
	 */
	rcu_read_lock();
	pmu = idr_find(&pmu_idr, event->attr.type);
	rcu_read_unlock();
	if (pmu) {
		ret = perf_try_init_event(pmu, event); /* 按 perf_event::attr::config 子类型匹配事件接口 */
		if (ret)
			pmu = ERR_PTR(ret);
		goto unlock;
	}

	/*
	 * 没能从自动分配类型 ID 的中找到匹配的 pmu, 
	 * 则从系统中所有的 pmu 列表中进行匹配.
	 */
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ret = perf_try_init_event(pmu, event);
		if (!ret)
			goto unlock;

		if (ret != -ENOENT) {
			pmu = ERR_PTR(ret);
			goto unlock;
		}
	}
	pmu = ERR_PTR(-ENOENT);
unlock:
	srcu_read_unlock(&pmus_srcu, idx);

	return pmu; /* 返回匹配的 PMU，没匹配时返回错误码 -ENOENT */
}

通过 perf_try_init_event() 尝试匹配 perf_event 的 PMU，这些被匹配的 PMU 来自 perf_pmu_register() 注册的对象。看看 perf_try_init_event() 的细节：

static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
	struct perf_event_context *ctx = NULL;
	int ret;

	...

	event->pmu = pmu; /* 设定 perf_event 关联的 PMU 对象 */
	/*
	 * kernel/events/core.c, perf_task_clock: task_clock_event_init()
	 * 返回 ENOENT 表示初始化的 type 不匹配, 需尝试其它类型的 pmu.
	 */
	ret = pmu->event_init(event);

	...

	return ret;
}

在 perf_try_init_event() 中调用 PMU 的 event_init 接口：返回 0 表示匹配并初始化成功；返回 -ENOENT 表示事件类型与该 PMU 不匹配，需要继续尝试其它 PMU；返回其它错误码则表示初始化失败，perf_init_event() 会直接把该错误码返回（见上面 perf_init_event() 中对返回值的处理）。我们的场景会匹配到 perf_task_clock PMU 对象：

static struct pmu perf_task_clock = {
	.task_ctx_nr	= perf_sw_context,

	.capabilities	= PERF_PMU_CAP_NO_NMI,

	.event_init	= task_clock_event_init,
	.add		= task_clock_event_add,
	.del		= task_clock_event_del,
	.start		= task_clock_event_start,
	.stop		= task_clock_event_stop,
	.read		= task_clock_event_read,
};

perf_task_clock PMU 对象的 event_init 接口 task_clock_event_init() 中做如下工作：

static int task_clock_event_init(struct perf_event *event)
{
	if (event->attr.type != PERF_TYPE_SOFTWARE)
		return -ENOENT;

	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
		return -ENOENT;

	...

	perf_swevent_init_hrtimer(event);

	return 0;
}

static void perf_swevent_init_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!is_sampling_event(event)) /* 非 sample 类型 event, 无需 hrtimer 做 period sample */
		return;

	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swevent_hrtimer;

	...
}

在 task_clock_event_init() 中，按 perf_event_attr::type 和 perf_event_attr::config 匹配 PMU，然后初始化了一个周期性进行数据采样的 hrtimer，该定时器的回调接口为 perf_swevent_hrtimer()。perf_swevent_hrtimer() 的细节在下一小节进行分析。那这个 hrtimer 什么时候启动呢？这里暂时按下不表，后续会对此做出分析。这里继续看 perf_event 绑定匹配的 PMU 之后的流程。

创建 perf_event 运行的上下文对象 perf_event_context

sys_perf_event_open()
	//perf_event_alloc()
	ctx = find_get_context(pmu, task, event);

/*
 * Returns a matching context with refcount and pincount.
 */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task,
		struct perf_event *event)
{
	struct perf_event_context *ctx, *clone_ctx = NULL;
	struct perf_cpu_context *cpuctx;
	void *task_ctx_data = NULL;
	unsigned long flags;
	int ctxn, err;
	int cpu = event->cpu;

	/* 非 进程 perf event, 针对整个系统, 使用 per-cpu 的 context */
	if (!task) {
		...

		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
		ctx = &cpuctx->ctx;
		get_ctx(ctx);
		++ctx->pin_count;

		return ctx;
	}

	...
}

为 perf_event 创建文件对象并初始化

static const struct file_operations perf_fops = {
	.llseek			= no_llseek,
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_compat_ioctl,
	.mmap			= perf_mmap,
	.fasync			= perf_fasync,
};

sys_perf_event_open()
	//perf_event_alloc()
	//ctx = find_get_context(pmu, task, event);
	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
					f_flags);

调度 perf_event 启动执行

sys_perf_event_open()
	//perf_event_alloc()
	//ctx = find_get_context(pmu, task, event);
	//event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
	//				f_flags);
	perf_install_in_context(ctx, event, event->cpu);
		...
		ctx_sched_in()
			...
			group_sched_in()
				event_sched_in()

/*
 * Excerpt (elided code is marked with "..."): schedule @event in on the
 * current CPU. It records the CPU in event->oncpu, marks the event ACTIVE and
 * calls the PMU's add() callback with PERF_EF_START. Returns 0 on success, or
 * -EAGAIN when pmu->add() fails, in which case state and oncpu are reset.
 */
static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	u64 tstamp = perf_event_time(event);
	int ret = 0;

	...

	WRITE_ONCE(event->oncpu, smp_processor_id()); /* record the CPU this event is on */
	/*
	 * Order event::oncpu write to happen before the ACTIVE state
	 * is visible.
	 */
	smp_wmb();
	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* mark the event ACTIVE */

	...

	/*
	 * For perf_task_clock (kernel/events/core.c) the add() callback is
	 * task_clock_event_add(), which starts the event's hrtimer.
	 */
	if (event->pmu->add(event, PERF_EF_START)) {
		/* add() failed: undo the state/oncpu updates made above */
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		ret = -EAGAIN;
		goto out;
	}

	...

out:
	perf_pmu_enable(event->pmu); /* enable the PMU */

	return ret;
}

我们测试例子的上下文，触发 task_clock_event_add() 回调，期间会启动采样 hrtimer：

static int task_clock_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		task_clock_event_start(event, flags);
	perf_event_update_userpage(event);

	return 0;
}

static void task_clock_event_start(struct perf_event *event, int flags)
{
	local64_set(&event->hw.prev_count, event->ctx->time);
	perf_swevent_start_hrtimer(event);
}

static void perf_swevent_start_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 period;

	/*
	 * 非采样类型事件: 即 perf_event::attr.sample_period == 0,
	 * 不启动 hrtimer.
	 */
	if (!is_sampling_event(event))
		return;

	/* 启动/重启 event sample hrtimer */
	period = local64_read(&hwc->period_left);
	if (period) {
		if (period < 0)
			period = 10000;

		local64_set(&hwc->period_left, 0);
	} else {
		period = max_t(u64, 10000, hwc->sample_period);
	}
	/* perf_swevent_hrtimer() */
	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
		      HRTIMER_MODE_REL_PINNED);
}

到此，对于 perf_event 的创建过程已经完成，sys_perf_event_open() 会返回一个文件句柄指代该 perf_event 对象。另外，这里已经回答了前面提到的 hrtimer 什么时候启动的问题：在 perf_install_in_context() 调用链中启动了 hrtimer。

1.2 创建事件数据 ring buffer

perf_event 事件采样数据，使用 ring_buffer 来存储，接下来简要分析其建立过程。先看用户空间部分：

#define BUFFER_SIZE (1 << 18)  // 256KB
#define PAGE_SIZE 4096

size_t mmap_size = PAGE_SIZE + BUFFER_SIZE;
char *buffer = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

为什么 mmap_size 设置为 PAGE_SIZE + BUFFER_SIZE 这么个尺寸？首先，perf 的采样数据头部有一个管理数据 perf_event_mmap_page，使用一个 page，在第一个 page 之后紧跟的才是采样数据，后面再说其细节。另外，perf 要求数据缓冲区大小对齐到 page，并且数据页数必须是 2 的幂（即 mmap 的总大小为 1 + 2^n 个 page）；这里 BUFFER_SIZE 为 256KB，按 4KB 的页大小计算即 64 个 page，满足该要求。

继续看内核空间的细节：

sys_mmap()
	...
	perf_mmap()

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct perf_event *event = file->private_data;
	unsigned long user_locked, user_lock_limit;
	struct user_struct *user = current_user();
	unsigned long locked, lock_limit;
	struct ring_buffer *rb = NULL;
	unsigned long vma_size;
	unsigned long nr_pages;
	long user_extra = 0, extra = 0;
	int ret = 0, flags = 0;

	vma_size = vma->vm_end - vma->vm_start; /* mmap() 空间大小 */

	if (vma->vm_pgoff == 0) {
		nr_pages = (vma_size / PAGE_SIZE) - 1; /* 计算页面数: 减 1 个页面是 ring_buffer::user_page */
	} else {
		...
	}

	...

	user_extra = nr_pages + 1;

	...

	if (!rb) {
		/* 创建事件 @event 的 mmap 的 ring_buffer */
		rb = rb_alloc(nr_pages,
			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
			      event->cpu, flags);
		...

		/* 设定 @event 的 ring_buffer 为 @rb */
		ring_buffer_attach(event, rb);

		perf_event_init_userpage(event);
		perf_event_update_userpage(event);
	} else {
		...
	}

	...

	/*
	 * Since pinned accounting is per vm we cannot allow fork() to copy our
	 * vma.
	 */
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &perf_mmap_vmops;

	...
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	void *all_buf;

	size = sizeof(struct ring_buffer);
	size += sizeof(void *); /* ring_buffer::data_pages[0] 指针空间 */

	/* 创建 ring_buffer 对象, 初始化为 0 */
	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	INIT_WORK(&rb->work, rb_free_work);

	/* 分配 nr_pages + 1 个 page */
	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	rb->user_page = all_buf;
	rb->data_pages[0] = all_buf + PAGE_SIZE;
	if (nr_pages) {
		rb->nr_pages = 1;
		rb->page_order = ilog2(nr_pages);
	}

	ring_buffer_init(rb, watermark, flags);

	return rb;

	...
}

static void ring_buffer_attach(struct perf_event *event,
			       struct ring_buffer *rb)
{
	struct ring_buffer *old_rb = NULL;
	unsigned long flags;

	...

	if (rb) {
		if (event->rcu_pending) {
			cond_synchronize_rcu(event->rcu_batches);
			event->rcu_pending = 0;
		}

		spin_lock_irqsave(&rb->event_lock, flags);
		list_add_rcu(&event->rb_entry, &rb->event_list); /* 添加到 ring_buffer 的 perf_event 事件列表 */
		spin_unlock_irqrestore(&rb->event_lock, flags);
	}

	...

	rcu_assign_pointer(event->rb, rb); /* 设定 perf_event @event 的 ring_buffer 为 @rb */

	...
}

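从上面的分析知道，perf_event 采样数据使用 ring_buffer 存储，其空间构成如下：第 1 个 page 是 ring_buffer::user_page（即用户空间看到的 perf_event_mmap_page），其后紧跟的 nr_pages 个 page 用于存放采样数据，起始地址保存在 ring_buffer::data_pages[0] 中。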

1.3 更新事件数据

在我们示例的场景中，启动 hrtimer 来采集数据，具体是在 perf_swevent_hrtimer() 回调中，看看细节：

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_sample_data data;
	struct pt_regs *regs;
	struct perf_event *event;
	u64 period;

	event = container_of(hrtimer, struct perf_event, hw.hrtimer);

	if (event->state != PERF_EVENT_STATE_ACTIVE) /* 事件未激活, 不再重启 hrtimer */
		return HRTIMER_NORESTART;

	/*
	 * (1) PMU 数据读取
	 * perf_task_clock: task_clock_event_read()
	 */
	event->pmu->read(event);

	perf_sample_data_init(&data, 0, event->hw.last_period);
	regs = get_irq_regs();

	if (regs/*中断上下文*/ && !perf_exclude_event(event, regs)/*不是 exclude event*/) {
		if (!(event->attr.exclude_idle && is_idle_task(current)))
			/* (2) 更新采样数据到 perf_event 的 ring_buffer */
			if (__perf_event_overflow(event, 1, &data, regs))
				ret = HRTIMER_NORESTART;
	}

	/* 重启 hrtimer, 进行下一采样 */
	period = max_t(u64, 10000, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}

首先在上面代码 (1) 处是做 PMU 数据的读取，具体到我们的场景是更新任务消耗的时间：

/*
 * perf_task_clock's .read callback: derive an up-to-date time value from the
 * event's context and hand it to task_clock_event_update(), which adds the
 * difference from hw.prev_count to event->count.
 */
static void task_clock_event_read(struct perf_event *event)
{
	u64 now = perf_clock(); /* current perf clock reading */
	u64 delta = now - event->ctx->timestamp; /* time elapsed since ctx->timestamp */
	u64 time = event->ctx->time + delta; /* ctx->time advanced by that delta */

	task_clock_event_update(event, time);
}

static void task_clock_event_update(struct perf_event *event, u64 now)
{
	u64 prev;
	s64 delta;

	prev = local64_xchg(&event->hw.prev_count, now);
	delta = now - prev;
	local64_add(delta, &event->count);
}

然后在上面代码 (2) 处将 (1) 处读取的 PMU 数据，更新写入到 perf_event 的 ring_buffer 中：

__perf_event_overflow()
	perf_event_output_forward()
		__perf_event_output(event, data, regs, perf_output_begin_forward);

static void __always_inline
__perf_event_output(struct perf_event *event,
		    struct perf_sample_data *data,
		    struct pt_regs *regs,
		    int (*output_begin)(struct perf_output_handle *,
					struct perf_event *,
					unsigned int))
{
	struct perf_output_handle handle;
	struct perf_event_header header;

	/* protect the callchain buffers */
	rcu_read_lock();

	/* 准备好采样数据 */
	perf_prepare_sample(&header, data, event, regs);

	/*
	 * 做采样数据 ring_buffer 写入的准备工作:
	 * - 包括移动写入位置指针 `ring_buffer::head`，
	 * - 选定当前写入内存页面, 以及当前写入内存页面写入偏移位置的设定
	 */
	if (output_begin(&handle, event, header.size)) /* perf_output_begin_forward() */
		goto exit;

	/* 将 event 采样数据写入到 event 的 ring buffer @event->rb */
	perf_output_sample(&handle, &header, data, event);

	/* 将 ring_buffer 当前可读数据位置 ring_buffer::head 同步到 用户空间，然后唤醒数据读取等待进程 */
	perf_output_end(&handle);

exit:
	rcu_read_unlock();
}

首先，perf_prepare_sample() 准备好采样数据 perf_sample_data：

struct perf_sample_data {
	/*
	 * Fields set by perf_sample_data_init(), group so as to
	 * minimize the cachelines touched.
	 */
	u64				addr;
	struct perf_raw_record		*raw;
	struct perf_branch_stack	*br_stack;
	u64				period;
	u64				weight;
	u64				txn;
	union  perf_mem_data_src	data_src;

	/*
	 * The other fields, optionally {set,used} by
	 * perf_{prepare,output}_sample().
	 */
	/*
	 * 采样的所有数据类型掩码:
	 * PERF_SAMPLE_IDENTIFIER, PERF_SAMPLE_IP, ...
	 * PERF_SAMPLE_PERIOD, PERF_SAMPLE_READ, ...
	 * 如:
	 * PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | 
	 * PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD | ...
	 */
	u64				type;
	u64				ip;
	struct {
		u32	pid;
		u32	tid;
	}				tid_entry;
	u64				time;
	u64				id;
	u64				stream_id;
	struct {
		u32	cpu;
		u32	reserved;
	}				cpu_entry;
	struct perf_callchain_entry	*callchain;

	/*
	 * regs_user may point to task_pt_regs or to regs_user_copy, depending
	 * on arch details.
	 */
	struct perf_regs		regs_user;
	struct pt_regs			regs_user_copy;

	struct perf_regs		regs_intr;
	u64				stack_user_size;

	u64				phys_addr;
} ____cacheline_aligned;

void perf_prepare_sample(struct perf_event_header *header,
			 struct perf_sample_data *data,
			 struct perf_event *event,
			 struct pt_regs *regs)
{
	u64 sample_type = event->attr.sample_type;

	header->type = PERF_RECORD_SAMPLE; /* 标记为采样数据类型 */
	header->size = sizeof(*header) + event->header_size; /* 一个 sample 的数据总长度，包括 perf_event_header 和 sample 数据 */

	...

	/* 填充 ID 类型采样数据 */
	__perf_event_header__init_id(header, data, event);

	/* PC 指针采样数据 */
	if (sample_type & PERF_SAMPLE_IP)
		data->ip = perf_instruction_pointer(regs);

	/* 其它采样数据填充 */
	...
}

static void __perf_event_header__init_id(struct perf_event_header *header,
					 struct perf_sample_data *data,
					 struct perf_event *event)
{
	u64 sample_type = event->attr.sample_type;

	data->type = sample_type;
	header->size += event->id_header_size;

	if (sample_type & PERF_SAMPLE_TID) {
		/* namespace issues */
		data->tid_entry.pid = perf_event_pid(event, current);
		data->tid_entry.tid = perf_event_tid(event, current);
	}

	if (sample_type & PERF_SAMPLE_TIME)
		data->time = perf_event_clock(event);

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
		data->id = primary_event_id(event);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		data->stream_id = event->id;

	if (sample_type & PERF_SAMPLE_CPU) {
		data->cpu_entry.cpu	 = raw_smp_processor_id();
		data->cpu_entry.reserved = 0;
	}
}

然后，perf_output_begin_forward() 做采样数据 ring_buffer 写入的准备工作：包括移动写入位置指针 ring_buffer::head，以及写入内存页面位置的选定。

perf_output_begin_forward()
	__perf_output_begin(handle, event, size, false)

static int __always_inline
__perf_output_begin(struct perf_output_handle *handle,
		    struct perf_event *event, unsigned int size,
		    bool backward)
{
	struct ring_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost, page_shift;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	...

	rb = rcu_dereference(event->rb);
	...

	handle->rb    = rb;
	handle->event = event;
	...

	/* 移动 rb->head 指针 */
	do {
		tail = READ_ONCE(rb->user_page->data_tail);
		offset = head = local_read(&rb->head);

		...

		if (!backward)
			head += size;
		else
			...
	}  while (local_cmpxchg(&rb->head, offset, head) != offset); /* 更新 ring_buffer 写入位置 rb->head */

	...

	page_shift = PAGE_SHIFT + page_order(rb);

	handle->page = (offset >> page_shift) & (rb->nr_pages - 1); /* 选定数据写入 page */
	offset &= (1UL << page_shift) - 1;
	handle->addr = rb->data_pages[handle->page] + offset; /* 确定相对于选定写入 page 偏移地址 */
	handle->size = (1UL << page_shift) - offset;

	...

	return 0;

	...
}

接着，perf_output_sample() 将 perf_prepare_sample() 准备好的采样数据写入到 ring_buffer：

void perf_output_sample(struct perf_output_handle *handle,
			struct perf_event_header *header,
			struct perf_sample_data *data,
			struct perf_event *event)
{
	u64 sample_type = data->type;

	perf_output_put(handle, *header); /* 写入数据 perf_event_header */

	
	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		perf_output_put(handle, data->id);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(handle, data->ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(handle, data->tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(handle, data->time);

	...

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(handle, data->cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(handle, data->period);

	...
}

/* 数据拷贝: handle->addr <= x */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

最后，perf_output_end() 将 ring_buffer 当前可读数据位置 ring_buffer::head 同步到用户空间，并唤醒数据读取等待进程：

void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}

static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;
	unsigned long head;

again:
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */

	if (!local_dec_and_test(&rb->nest))
		goto out;

	/*
	 * Since the mmap() consumer (userspace) can run on a different CPU:
	 *
	 *   kernel				user
	 *
	 *   if (LOAD ->data_tail) {		LOAD ->data_head
	 *			(A)		smp_rmb()	(C)
	 *	STORE $data			LOAD $data
	 *	smp_wmb()	(B)		smp_mb()	(D)
	 *	STORE ->data_head		STORE ->data_tail
	 *   }
	 *
	 * Where A pairs with D, and B pairs with C.
	 *
	 * In our case (A) is a control dependency that separates the load of
	 * the ->data_tail and the stores of $data. In case ->data_tail
	 * indicates there is no room in the buffer to store $data we do not.
	 *
	 * D needs to be a full barrier since it separates the data READ
	 * from the tail WRITE.
	 *
	 * For B a WMB is sufficient since it separates two WRITEs, and for C
	 * an RMB is sufficient since it separates two READs.
	 *
	 * See perf_output_begin().
	 */
	smp_wmb(); /* B, matches C */
	rb->user_page->data_head = head; /* 更新 用户空间 可读数据位置 结尾 (head = rb->head)*/

	/*
	 * Now check if we missed an update -- rely on previous implied
	 * compiler barriers to force a re-read.
	 */
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}

	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle); /* 唤醒数据读取等待进程 */

out:
	preempt_enable();
}

1.4 读取事件数据

#define BUFFER_SIZE (1 << 18)  // 256KB
#define PAGE_SIZE 4096

size_t mmap_size = PAGE_SIZE + BUFFER_SIZE;
char *buffer = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

struct perf_event_mmap_page *header = buffer;
char *data_buffer = (char *)buffer + PAGE_SIZE;
unsigned long data_head = ACCESS_ONCE(header->data_head);
unsigned long data_tail = header->data_tail;

if (data_head != data_tail) { // 有新的数据可读
	while (data_tail < data_head) {
		size_t offset = data_tail % BUFFER_SIZE; // 计算当前记录位置（处理环形缓冲区回绕）
		struct perf_event_header *event_header = 
                    (struct perf_event_header *)(data_buffer + offset);

		printf("事件头信息:\n");
		printf("  类型: %u\n", event_header->type);
		printf("  杂项: %u\n", event_header->misc);
		printf("  大小: %u bytes\n", event_header->size);

		switch (event_header->type) { // 处理不同类型的记录：这里只关注采样类型 (PERF_RECORD_SAMPLE)
		case PERF_RECORD_SAMPLE: {
			// 解析样本数据
			char *sample_data = (char *)(event_header + 1);

			// 根据 sample_type 解析数据
			u64 *sample_ptr = (u64 *)sample_data;
			int idx = 0;

			if (pe.sample_type & PERF_SAMPLE_IP)
				printf("指令指针: 0x%llx\n", sample_ptr[idx++]);

			if (pe.sample_type & PERF_SAMPLE_TID) {
				u32 pid = sample_ptr[idx] & 0xFFFFFFFF;
				u32 tid = sample_ptr[idx] >> 32;
				printf("PID/TID: %u/%u\n", pid, tid);
				idx++;
			}

			if (pe.sample_type & PERF_SAMPLE_TIME)
				printf("时间戳: %llu\n", sample_ptr[idx++]);

			if (pe.sample_type & PERF_SAMPLE_CPU) {
				u32 cpu = sample_ptr[idx] & 0xFFFFFFFF;
				u32 res = sample_ptr[idx] >> 32;
				printf("  CPU: %u (reserved: %u)\n", cpu, res);
				idx++;
			}

			if (pe.sample_type & PERF_SAMPLE_PERIOD)
				printf("采样周期: %llu\n", sample_ptr[idx++]);

			 break;
		 }
		 ...
		}

		// 移动到下一个记录
		data_tail += event_header->size;
	}

	// 更新数据尾部指针
	header->data_tail = data_tail;
}

1.5 采样数据格式小结

mmap 映射的 ring_buffer 空间的第 1 个 page，存放 perf_event_mmap_page，后续 page 存放采样数据。
每组采样数据总是以一个 8 字节的 perf_event_header 开头，标记采样数据的类型和 size；其后各采样数据项是否出现由 sample_type 中置位的类型决定，并按 perf_output_sample() 中固定的顺序（IDENTIFIER、IP、TID、TIME、…、CPU、PERIOD 等）依次排列，本例中每个数据项的大小均为 8 字节。

posted @ 2026-01-22 16:37 JiMoKuangXiangQu 阅读(0) 评论(0) 收藏举报

刷新页面返回顶部

JiMoKuangXiangQu