kfence源码分析【转】

转自：https://www.cnblogs.com/pengdonglin137/p/16342898.html

参考

作者

pengdonglin137@163.com

内核版本

linux-5.14

实现分析

Kfence (Kernel Electric Fence) 是 Linux 内核引入的一种低开销的内存错误检测机制，因为是低开销的所以它可以在运行的生产环境中开启，同样由于是低开销所以它的功能相比较 KASAN 会偏弱。

Kfence是一种基于采样的低开销的内存安全错误检测技术。可以检测UAF、非法释放、OOB三种内存错误，目前支持x86和ARM64，它在slab和slub内存分配器中添加了hook函数。
Kfence的设计理念：如果有足够长的总的运行时间，kfence可以在非生产环境的测试程序无法充分测试的代码路径上检测到bug。可以通过大范围部署kfence来快速达到足够长的总运行时间。
Kfence管理的每个object都分别存放在一个单独的内存页的左边或者右边，跟这个内存页紧邻的左右两侧的内存页被称为保护页，这些保护页的内存属性被设置成保护状态（PTE页表项的P位），如果访问这些保护页，就会导致缺页异常，而kfence在缺页异常中会解析和报告发生的错误。
从kfence内存池中分配object是基于一个采样间隔，这个间隔可以通过内核启动参数kfence.sample_interval来修改。当经过了一个采样间隔的时间，下一次从slab或slub中分配的object将会来自kfence内存池。然后需要再经过一个采样间隔，slab或者slub才能从kfence内存池中分配一个object。
由于采用了static key机制，可以省去判断逻辑，所以不管是否开启kfence，从slub或者slab的快速路径分配内存时的性能都不会受到影响。
Kfence内存池的大小是固定的，如果Kfence内存池被用光了，那么就不能再从kfence内存池分配内存了。默认的内核配置是kfence内存池大小为2MB，可以分配255个object，每个object对应一个内存页。

初始化

kfence内存池框图：

其中data区域是用来分配的，fence区域是用来检测内存越界的。metadata数组的元素跟data区域一一对应，用于描述data区域的信息。

	start_kernel
	-> mm_init
	-> kfence_alloc_pool
	// 将memblock分配器中的空闲页面释放给伙伴分配器，之前被memblock分配出去还没有释放的内存也就不会出现在伙伴系统里，虽然如此，这部分内存还是有
	// 与之对应的page结构体
	-> mem_init
	-> kfence_init

kfence_alloc_pool [mm\kfence\core.c]

	/*
	 * Reserve the KFENCE pool from memblock.
	 *
	 * Does nothing when the sample interval is 0; the interval is configured
	 * through CONFIG_KFENCE_SAMPLE_INTERVAL or the kfence.sample_interval boot
	 * parameter. On allocation failure __kfence_pool stays NULL and an error is
	 * logged.
	 */
	void __init kfence_alloc_pool(void)
	{
		if (!kfence_sample_interval)
			return;

		/*
		 * Pool size is (CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE, aligned
		 * to PAGE_SIZE. CONFIG_KFENCE_NUM_OBJECTS ranges from 1 to 65535.
		 */
		__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);

		/* Do not fail silently: later setup dereferences the pool address. */
		if (!__kfence_pool)
			pr_err("failed to allocate pool\n");
	}

此时伙伴分配器不能使用，所以给kfence的内存在伙伴系统之外，不属于伙伴系统管理，所以也就不用担心被伙伴系统分配出去。

kfence_init

	/*
	 * Bring KFENCE up: initialize the pool, mark KFENCE enabled and start the
	 * delayed work that periodically opens the allocation gate.
	 */
	void __init kfence_init(void)
	{
		/* A sample interval of 0 keeps KFENCE disabled. */
		if (!kfence_sample_interval)
			return;

		/* Never enable KFENCE on top of a pool that could not be set up. */
		if (!kfence_init_pool()) {
			pr_err("%s failed\n", __func__);
			return;
		}

		/* From this point on KFENCE is considered operational. */
		WRITE_ONCE(kfence_enabled, true);

		/*
		 * Queue the work that periodically opens the KFENCE pool for
		 * allocation. A delay of 0 runs it immediately; the handler is
		 * toggle_allocation_gate().
		 */
		queue_delayed_work(system_unbound_wq, &kfence_timer, 0);

		pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
			CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
			(void *)(__kfence_pool + KFENCE_POOL_SIZE));
	}

kfence_init_pool [kfence_init -> kfence_init_pool]

	/*
	 * Prepare the pool reserved by kfence_alloc_pool(): flag the object pages as
	 * slab pages, protect the guard pages and put every metadata entry on the
	 * freelist.
	 *
	 * Returns true on success, false when no pool was allocated.
	 */
	static bool __init kfence_init_pool(void)
	{
		unsigned long addr = (unsigned long)__kfence_pool;
		struct page *pages;
		int i;

		/* Nothing to set up if the early pool allocation failed. */
		if (!__kfence_pool)
			return false;

		/* Architecture-specific pool preparation (implementation not shown here). */
		arch_kfence_init_pool();

		/* struct page backing the first page of the pool. */
		pages = virt_to_page(addr);

		for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
			/* Skip page 0 and every odd page; objects live on even pages >= 2. */
			if (!i || (i % 2))
				continue;
			/*
			 * Set the slab flag on the struct page of every object page.
			 * Per the original annotation:
			 * 1. kmem_cache_free() checks that the page behind the address
			 *    carries the slab flag and cannot free it otherwise;
			 * 2. for kfree(), the flag makes the free go through
			 *    slab_free -> __slab_free -> kfence_free.
			 */
			__SetPageSlab(&pages[i]);
		}

		/*
		 * Protect the first two pages so that any CPU access to them faults
		 * (per the original annotation, by clearing the PTE present bit).
		 */
		for (i = 0; i < 2; i++) {
			kfence_protect(addr);
			addr += PAGE_SIZE;
		}

		/*
		 * kfence_metadata[] has CONFIG_KFENCE_NUM_OBJECTS entries of type
		 * struct kfence_metadata; each entry manages exactly one object.
		 */
		for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
			struct kfence_metadata *meta = &kfence_metadata[i];

			/* Initialize metadata. */
			INIT_LIST_HEAD(&meta->list);
			raw_spin_lock_init(&meta->lock);
			meta->state = KFENCE_OBJECT_UNUSED;	/* objects start out UNUSED */
			meta->addr = addr;			/* start of the object's page */
			list_add_tail(&meta->list, &kfence_freelist);

			/* Protect the page right after the object page to catch overflows. */
			kfence_protect(addr + PAGE_SIZE);

			addr += 2 * PAGE_SIZE;
		}

		/*
		 * The pool was recorded by kmemleak when memblock_alloc() handed it
		 * out; drop that record so later allocations from the pool do not
		 * conflict with it.
		 */
		kmemleak_free(__kfence_pool);

		return true;
	}

折叠

周期性开启kfence内存池

在kfence_init中还添加了一个kfence_timer的延迟任务，用于周期性开启kfence内存分配，实现如下：

toggle_allocation_gate

	/*
	* Set up delayed work, which will enable and disable the static key. We need to
	* use a work queue (rather than a simple timer), since enabling and disabling a
	* static key cannot be done from an interrupt.
	*
	* Note: Toggling a static branch currently causes IPIs, and here we'll end up
	* with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
	* more aggressive sampling intervals), we could get away with a variant that
	* avoids IPIs, at the cost of not immediately capturing allocations if the
	* instructions remain cached.
	*/
	static struct delayed_work kfence_timer;
	/* Work handler: opens the allocation gate, then re-arms itself after one sample interval. */
	static void toggle_allocation_gate(struct work_struct *work)
	{
	/* Stop here (and do not re-arm) once KFENCE has been disabled. */
	if (!READ_ONCE(kfence_enabled))
	return;

	// Reset kfence_allocation_gate to 0. The gate is the "pool is open" flag: 0 means open,
	// non-zero means closed, so at most one allocation is served from the pool per period.
	atomic_set(&kfence_allocation_gate, 0);
	// A static key is used as an optimization, since testing kfence_allocation_gate
	// for 0 on every allocation is comparatively expensive.
	#ifdef CONFIG_KFENCE_STATIC_KEYS
	/* Enable the static key and wait for an allocation from the KFENCE pool. */
	static_branch_enable(&kfence_allocation_key);

	if (sysctl_hung_task_timeout_secs) { // hung-task warning threshold (typically 120 s per the original annotation)
	/*
	* If allocations are infrequent the wait could become very long. Limit it to half
	* of the hung-task timeout so the hung-task detector does not warn about this
	* worker (the original annotation describes this as avoiding a too-long D state).
	*
	* The wait ends when either:
	* 1. someone allocated from KFENCE, which sets kfence_allocation_gate to 1 and
	*    wakes up the waiters on allocation_wait; or
	* 2. the timeout expires.
	*/
	wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
	sysctl_hung_task_timeout_secs * HZ / 2);
	} else {
	/* A hung-task timeout of 0 means "no limit", so it is safe to wait until someone
	* allocates from KFENCE, sets kfence_allocation_gate to 1 and wakes up the
	* waiters on allocation_wait.
	*/
	wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
	}

	/* Disable the static key again so allocations no longer enter __kfence_alloc. */
	static_branch_disable(&kfence_allocation_key);
	#endif
	// Wait kfence_sample_interval milliseconds, then open the KFENCE pool again.
	queue_delayed_work(system_unbound_wq, &kfence_timer,
	msecs_to_jiffies(kfence_sample_interval));
	}
	static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);

折叠

分配内存

框图：

入口1：

	kmalloc
	-> kmem_cache_alloc_trace
	-> slab_alloc
	-> return
	-> __kmalloc
	-> slab_alloc
	-> return

入口2

	kmem_cache_alloc
	-> slab_alloc

上面两个路径最后都会调用到slab_alloc：

	slab_alloc
	-> slab_alloc_node
	-> kfence_alloc
	-> 如果kfence_alloc返回NULL的话，走常规的slub分配

kfence_alloc

	/*
	 * Allocation hook used by the slab allocators.
	 *
	 * Returns an object from the KFENCE pool when the allocation gate is open and
	 * __kfence_alloc() succeeds, otherwise NULL so that the caller falls back to
	 * the regular allocation path.
	 */
	static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
	{
	#ifdef CONFIG_KFENCE_STATIC_KEYS
		/* With CONFIG_KFENCE_STATIC_KEYS the check is a patched static branch. */
		if (static_branch_unlikely(&kfence_allocation_key))
	#else
		/* Plain check of the gate counter; slower than the static key variant. */
		if (unlikely(!atomic_read(&kfence_allocation_gate)))
	#endif
			return __kfence_alloc(s, size, flags);
		return NULL;
	}

__kfence_alloc

	/*
	 * Slow path of kfence_alloc(): decide whether this allocation may be served
	 * from the KFENCE pool and, if so, allocate a guarded object.
	 *
	 * Returns the object address, or NULL when the request is not eligible, the
	 * gate is already taken, KFENCE is disabled or the pool is exhausted.
	 */
	void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
	{
		/* The pool only serves objects that fit into a single page. */
		if (size > PAGE_SIZE)
			return NULL;

		/*
		 * Skip allocations from non-default zones and from DMA caches: pages
		 * of the KFENCE pool cannot be guaranteed to have the requested
		 * properties.
		 */
		if ((flags & GFP_ZONEMASK) ||
		    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
			return NULL;

		/*
		 * Let exactly one allocator through per opening of the gate. Once it
		 * has passed, the gate is non-zero and every other allocator gets NULL
		 * (and uses the regular slab path) until the gate is reset.
		 */
		if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
			return NULL;
	#ifdef CONFIG_KFENCE_STATIC_KEYS
		/* If a task is blocked on allocation_wait, arrange for it to be woken. */
		if (waitqueue_active(&allocation_wait)) {
			/*
			 * Calling wake_up() here may deadlock when allocations happen
			 * from within timer code. Use an irq_work to defer it.
			 */
			irq_work_queue(&wake_up_kfence_timer_work);
		}
	#endif
		/* KFENCE may have been disabled in the meantime. */
		if (!READ_ONCE(kfence_enabled))
			return NULL;

		/* Allocate the object from the KFENCE pool. */
		return kfence_guarded_alloc(s, size, flags);
	}

kfence_guarded_alloc [kfence_alloc -> __kfence_alloc -> kfence_guarded_alloc]

	/*
	 * Take a free object page from the KFENCE pool and place an object of @size
	 * bytes belonging to @cache on it.
	 *
	 * Returns the object address, or NULL when the freelist is empty, in which
	 * case the caller falls back to the regular slab allocator.
	 */
	static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
	{
		struct kfence_metadata *meta = NULL;
		struct page *page;
		void *addr;

		/* Pop the first free metadata entry, if the pool still has one. */
		if (!list_empty(&kfence_freelist)) {
			meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
			list_del_init(&meta->list);
		}

		/* No free entry: the KFENCE pool is exhausted. */
		if (!meta)
			return NULL;

		/* Start address of the object page described by meta. */
		meta->addr = metadata_to_pageaddr(meta);
		/*
		 * Only a page in state FREED needs to be unprotected before reuse.
		 * Pages start out as KFENCE_OBJECT_UNUSED in kfence_init_pool(), which
		 * does not protect the object pages themselves, so UNUSED pages are
		 * already accessible. A page is protected when its object is freed (to
		 * catch use-after-free), so a FREED page must be made accessible again
		 * here.
		 */
		if (meta->state == KFENCE_OBJECT_FREED)
			kfence_unprotect(meta->addr);

		/*
		 * Note: for allocations made before RNG initialization, will always
		 * return zero. We still benefit from enabling KFENCE as early as
		 * possible, even when the RNG is not yet available, as this will allow
		 * KFENCE to detect bugs due to earlier allocations. The only downside
		 * is that the out-of-bounds accesses detected are deterministic for
		 * such allocations.
		 *
		 * In other words: a zero result leaves the object at the start of the
		 * page, a non-zero result moves it to the right end of the page.
		 */
		if (prandom_u32_max(2)) {
			/* Allocate on the "right" side, re-calculate address. */
			meta->addr += PAGE_SIZE - size;
			meta->addr = ALIGN_DOWN(meta->addr, cache->align);
		}

		/* Final object address. */
		addr = (void *)meta->addr;

		/*
		 * Per the original annotation, metadata_update_state():
		 * 1. records the current call stack in meta's alloc_track;
		 * 2. records the current task's pid in meta;
		 * 3. sets the state to KFENCE_OBJECT_ALLOCATED.
		 */
		metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
		/* Remember the owning cache. */
		WRITE_ONCE(meta->cache, cache);
		/* Remember the object size. */
		meta->size = size;
		/*
		 * Fill the part of the page not used by the object with an
		 * address-dependent canary pattern; it is checked on free to detect
		 * out-of-bounds writes within the page.
		 */
		for_each_canary(meta, set_canary_byte);

		/* struct page of the object page. */
		page = virt_to_page(meta->addr);
		/* Record the cache in the page; it is needed again when freeing. */
		page->slab_cache = cache;
		/* A KFENCE page holds a single object. */
		if (IS_ENABLED(CONFIG_SLUB))
			page->objects = 1;
		/* For SLAB, s_mem records the address of the first object. */
		if (IS_ENABLED(CONFIG_SLAB))
			page->s_mem = addr;

		/* Memory initialization. */

		/*
		 * We check slab_want_init_on_alloc() ourselves, rather than letting
		 * SL*B do the initialization, as otherwise we might overwrite KFENCE's
		 * redzone.
		 */
		if (unlikely(slab_want_init_on_alloc(gfp, cache)))
			memzero_explicit(addr, size);	/* zero only the object itself */
		if (cache->ctor)
			cache->ctor(addr);

		/* Number of objects currently allocated from the pool; decremented on free. */
		atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
		/* Total number of allocations from the pool; only ever grows. */
		atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);

		return addr;
	}

折叠

释放内存

路径1：

	kfree
	-> slab_free
	-> slab_free_hook
	-> do_slab_free
	-> __slab_free
	-> kfence_free

路径2

	kmem_cache_free
	-> slab_free

释放内存时，最终会调用到kfence_free

kfence_free

	/*
	 * Free @addr through KFENCE if it lies inside the KFENCE pool.
	 *
	 * Returns true when @addr was a KFENCE address and has been passed to
	 * __kfence_free(); returns false, without doing anything, otherwise.
	 */
	static __always_inline __must_check bool kfence_free(void *addr)
	{
		if (is_kfence_address(addr)) {
			__kfence_free(addr);
			return true;
		}

		return false;
	}

__kfence_free

	void __kfence_free(void *addr)
	{
	/*
	根据object的地址可以获取对应的meta。根据addr跟kfence内存池起始地址的偏移可以计算出一个索引，然后从kfence_metadata数组
	中就可以得到索引对应的meta
	*/
	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

	/*
	* 如果meta对应的kmem_cache有SLAB_TYPESAFE_BY_RCU，那么不能立刻释放，需要异步处理，当过了一个宽限期再释放
	在rcu_guarded_free会直接调用kfence_guarded_free
	*/
	if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
	call_rcu(&meta->rcu_head, rcu_guarded_free);
	else
	kfence_guarded_free(addr, meta, false);
	}

kfence_guarded_free [kfence_free -> __kfence_free -> kfence_guarded_free]

	/*
	 * Free the KFENCE object at @addr described by @meta.
	 *
	 * @zombie is true when the owning cache is being destroyed: the object is
	 * marked freed and protected, but is not returned to the freelist.
	 *
	 * meta->lock is held while the state is checked, the canaries are verified
	 * and the state is updated; kfence_report_error() asserts that the lock is
	 * held whenever it is given a metadata pointer.
	 */
	static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&meta->lock, flags);

		/*
		 * The object must be allocated and @addr must be the address returned
		 * by the allocation; anything else is a double free or a free of a
		 * wrong address.
		 */
		if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
			/* Invalid or double-free, bail out. */
			atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);	/* one more detected bug */
			kfence_report_error((unsigned long)addr, false, NULL, meta,
					    KFENCE_ERROR_INVALID_FREE);
			raw_spin_unlock_irqrestore(&meta->lock, flags);
			return;
		}

		/*
		 * If the page fault handler detected an OOB access for this object,
		 * unprotected_page holds the faulting address.
		 */
		if (meta->unprotected_page) {
			/* Clear the page that contains the faulting address ... */
			memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
			/* ... and protect it again, since the fault handler unprotected it. */
			kfence_protect(meta->unprotected_page);
			meta->unprotected_page = 0;
		}

		/*
		 * Verify the canary pattern in the unused part of the object page to
		 * detect in-page out-of-bounds writes. The left side is checked first,
		 * then the right side; only the first mismatch on each side is
		 * reported.
		 */
		for_each_canary(meta, check_canary_byte);

		/*
		 * Clear memory if init-on-free is set. While we protect the page, the
		 * data is still there, and after a use-after-free is detected, we
		 * unprotect the page, so the data is still accessible.
		 */
		if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
			memzero_explicit(addr, meta->size);

		/*
		 * Per the original annotation, metadata_update_state():
		 * 1. records the current call stack in meta's free_track;
		 * 2. records the current task's pid in meta;
		 * 3. sets the state to KFENCE_OBJECT_FREED.
		 */
		metadata_update_state(meta, KFENCE_OBJECT_FREED);

		raw_spin_unlock_irqrestore(&meta->lock, flags);

		/* Protect the page to detect use-after-free accesses. */
		kfence_protect((unsigned long)addr);

		if (!zombie) {
			/* Put the metadata back at the tail of the freelist for reuse. */
			list_add_tail(&meta->list, &kfence_freelist);

			/* One object fewer is currently allocated from the pool. */
			atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
			/* Total number of frees; only ever grows. */
			atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
		} else {
			/*
			 * Objects still allocated when their kmem_cache is destroyed are
			 * counted as zombies: they are freed but can never be allocated
			 * again.
			 */
			atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
		}
	}

折叠

检查pattern区

for_each_canary [kfence_free -> __kfence_free -> kfence_guarded_free -> for_each_canary]

	/*
	 * Apply @fn to every canary byte of the page that holds @meta's object: first
	 * the bytes left of the object, then the bytes right of it. On each side the
	 * walk stops at the first byte for which @fn returns false.
	 *
	 * __always_inline this to ensure we won't do an indirect call to fn.
	 */
	static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
	{
		const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
		unsigned long addr;

		/* Canary region left of the object: [pageaddr, meta->addr). */
		for (addr = pageaddr; addr < meta->addr; addr++) {
			if (!fn((u8 *)addr))
				break;
		}

		/* Canary region right of the object: [meta->addr + meta->size, end of page). */
		for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
			if (!fn((u8 *)addr))
				break;
		}
	}

check_canary_byte [kfence_free -> __kfence_free -> kfence_guarded_free -> for_each_canary -> check_canary_byte ]

	/*
	 * Check canary byte at @addr.
	 *
	 * Returns true when the byte still holds the expected pattern. Otherwise the
	 * bug counter is incremented, a KFENCE_ERROR_CORRUPTION report is produced
	 * and false is returned.
	 */
	static inline bool check_canary_byte(u8 *addr)
	{
		const unsigned long where = (unsigned long)addr;
		const bool intact = *addr == KFENCE_CANARY_PATTERN(addr);

		if (likely(intact))
			return true;

		/*
		 * The unused part of the object page no longer holds the pattern, i.e.
		 * an out-of-bounds write happened within the page. Such an access does
		 * not fault, so it is only found here.
		 */
		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
		kfence_report_error(where, false, NULL, addr_to_metadata(where),
				    KFENCE_ERROR_CORRUPTION);
		return false;
	}

kmem_cache销毁

	kmem_cache_destroy
	-> shutdown_cache
	-> kfence_shutdown_cache

kfence_shutdown_cache

	void kfence_shutdown_cache(struct kmem_cache *s)
	{
	unsigned long flags;
	struct kfence_metadata *meta;
	int i;

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
	bool in_use;

	meta = &kfence_metadata[i];

	/* 跳过不跟指定kmem_cache匹配的meta以及状态不是已分配的meta
	*/
	if (READ_ONCE(meta->cache) != s \|\|
	READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
	continue;

	raw_spin_lock_irqsave(&meta->lock, flags);
	in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
	raw_spin_unlock_irqrestore(&meta->lock, flags);

	if (in_use) {
	/*
	* This cache still has allocations, and we should not
	* release them back into the freelist so they can still
	* safely be used and retain the kernel's default
	* behaviour of keeping the allocations alive (leak the
	* cache); however, they effectively become "zombie
	* allocations" as the KFENCE objects are the only ones
	* still in use and the owning cache is being destroyed.
	*
	* We mark them freed, so that any subsequent use shows
	* more useful error messages that will include stack
	* traces of the user of the object, the original
	* allocation, and caller to shutdown_cache().
	*/
	kfence_guarded_free((void )meta->addr, meta, /zombie=*/true);
	// 将zombie设置为true，被释放的meta并不会加入到kfence_freelist中，也就不会分分配出去
	// 处于zombie的object也属于free，但是不能再被分配
	}
	}

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
	meta = &kfence_metadata[i];

	/* See above. */
	if (READ_ONCE(meta->cache) != s \|\| READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
	continue;

	raw_spin_lock_irqsave(&meta->lock, flags);
	// 将meta的cache字段清除，这样通过/sys/kernel/debug/kfence/objects知道哪些object是zombie的
	if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
	meta->cache = NULL;
	raw_spin_unlock_irqrestore(&meta->lock, flags);
	}
	}

折叠

缺页异常

当发生内存越界访问导致被protect的页被访问，此时会发生缺页。
当发生了use after free，即object被释放后在没有申请的情况下，又访问这个object，也会发生缺页。因为在释放时，空闲object所在的内存页已经被保护了。

路径：

	handle_page_fault
	-> do_kern_addr_fault
	-> bad_area_nosemaphore
	-> __bad_area_nosemaphore
	-> kernelmode_fixup_or_oops
	-> page_fault_oops
	-> kfence_handle_page_fault

kfence_handle_page_fault

	/*
	 * Page fault hook for faults inside the KFENCE pool.
	 *
	 * @addr:     faulting address
	 * @is_write: whether the access was a write
	 * @regs:     CPU register context at the time of the fault
	 *
	 * Returns false when @addr is not a KFENCE address. Otherwise reports the
	 * error (unless KFENCE is disabled) and returns the result of unprotecting
	 * the faulting page so that the access can proceed.
	 */
	bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
	{
		/* Index of the faulting page within the KFENCE pool. */
		const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
		struct kfence_metadata *to_report = NULL;
		enum kfence_error_type error_type;
		unsigned long flags;

		/* Not our fault if the address is outside the KFENCE pool. */
		if (!is_kfence_address((void *)addr))
			return false;

		/*
		 * KFENCE may have been turned off at runtime (per the original
		 * annotation, by writing 0 to
		 * /sys/module/kfence/parameters/sample_interval).
		 */
		if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
			return kfence_unprotect(addr); /* ... unprotect and proceed. */

		/* One more detected memory error. */
		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

		if (page_index % 2) {
			/*
			 * A fault on an odd page is an out-of-bounds access: odd pages
			 * are the guard pages protected at initialization.
			 */

			/* This is a redzone, report a buffer overflow. */
			struct kfence_metadata *meta;
			int distance = 0;

			/* Metadata of the object page left of the guard page. */
			meta = addr_to_metadata(addr - PAGE_SIZE);
			if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
				to_report = meta;
				/*
				 * Data race ok; distance calculation approximate.
				 * Distance from the end of the left object to the
				 * faulting address.
				 */
				distance = addr - data_race(meta->addr + meta->size);
			}

			/* Metadata of the object page right of the guard page. */
			meta = addr_to_metadata(addr + PAGE_SIZE);
			if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
				/*
				 * Data race ok; distance calculation approximate.
				 * If the left page is not allocated, the right object
				 * is the culprit. If both are allocated, blame the
				 * object that is closer to the faulting address.
				 */
				if (!to_report || distance > data_race(meta->addr) - addr)
					to_report = meta;
			}

			/*
			 * Neither neighbour is allocated: KFENCE cannot tell whether
			 * this is a UAF or an OOB access.
			 */
			if (!to_report)
				goto out;

			raw_spin_lock_irqsave(&to_report->lock, flags);
			/* Remember the faulting address; the free path re-protects it. */
			to_report->unprotected_page = addr;
			error_type = KFENCE_ERROR_OOB;

			/*
			 * If the object was freed before we took the look we can still
			 * report this as an OOB -- the report will simply show the
			 * stacktrace of the free as well.
			 */
		} else {
			/*
			 * A fault on an even (object) page is a use-after-free: object
			 * pages are only protected once their object has been freed.
			 */
			to_report = addr_to_metadata(addr);
			if (!to_report)
				goto out;

			raw_spin_lock_irqsave(&to_report->lock, flags);
			error_type = KFENCE_ERROR_UAF;
			/*
			 * We may race with __kfence_alloc(), and it is possible that a
			 * freed object may be reallocated. We simply report this as a
			 * use-after-free, with the stack trace showing the place where
			 * the object was re-allocated.
			 */
		}

	out:
		if (to_report) {
			/* Report the OOB or UAF access determined above. */
			kfence_report_error(addr, is_write, regs, to_report, error_type);
			raw_spin_unlock_irqrestore(&to_report->lock, flags);
		} else {
			/* This may be a UAF or OOB access, but we can't be sure. */
			kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
		}

		/*
		 * KFENCE does not want to bring the system down: unprotect the page
		 * that contains the faulting address so execution can continue.
		 */
		return kfence_unprotect(addr); /* Unprotect and let access proceed. */
	}

折叠

错误报告

当检测到内存错误访问时，会调用kfence_report_error输出错误log。

错误种类分为如下几种：

缺页异常中检测到的访问了protect页的oob：KFENCE_ERROR_OOB
释放内存时检测到的访问了object所在的内存区的空闲区域的OOB：KFENCE_ERROR_CORRUPTION
缺页异常中检测到的访问了被释放的object所在的内存页的UAF：KFENCE_ERROR_UAF
释放内存时kfence检测到的重复释放，或者申请和释放的地址不一致：KFENCE_ERROR_INVALID_FREE
缺页异常中检测到的kfence无法确定的内存访问错误，比如发生OOB时但是protect页左右的内存页都没有分配出去：KFENCE_ERROR_INVALID

kfence_report_error

	/*
	Print a KFENCE error report.

	address: the address that triggered the memory error
	is_write: whether the access was a write
	regs: CPU context at the time of the page fault; NULL when not called from the fault handler
	meta: metadata associated with the faulting address; for an OOB access to a protected page this is
	      the metadata of the object whose access went out of bounds
	type: the kind of memory error
	*/

	void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
	const struct kfence_metadata *meta, enum kfence_error_type type)
	{
	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
	const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
	int num_stack_entries;
	int skipnr = 0;

	/*
	With regs set, the report comes from a page fault. The stack trace derived from regs describes
	the exact moment of the fault, so nothing has to be skipped and skipnr stays 0.

	With regs NULL, the report comes from the free path. The current stack trace then inevitably
	contains kfence, slab and kmem_cache functions that do not help to analyse the bug; what matters
	is who called them, i.e. where the free was issued. skipnr is therefore set so that the printed
	trace starts at that caller.
	*/
	if (regs) {
	/* Save the stack trace at the time of the exception into stack_entries (depth KFENCE_STACK_DEPTH; 64 per the original annotation). */
	num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0);
	} else {
	/* No pt_regs: save the current stack trace, dropping its first entry (stack_trace_save itself per the original annotation). */
	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
	/* Determine how many leading entries to skip so that the report starts at the code that caused
	the error rather than at kfence/slab/kfree/kmem_cache/kmalloc internals.
	*/
	skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
	}

	/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
	if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
	return;

	if (meta)
	lockdep_assert_held(&meta->lock);
	/*
	* Because we may generate reports in printk-unfriendly parts of the
	* kernel, such as scheduler code, the use of printk() could deadlock.
	* Until such time that all printing code here is safe in all parts of
	* the kernel, accept the risk, and just get our message out (given the
	* system might already behave unpredictably due to the memory error).
	* As such, also disable lockdep to hide warnings, and avoid disabling
	* lockdep for the rest of the kernel.
	*/
	lockdep_off();

	pr_err("==================================================================\n");
	/* Print report header. */
	switch (type) {
	case KFENCE_ERROR_OOB: { // OOB caused by accessing a protected (guard) page

	// A faulting address below the object's address means the guard page to the left of the
	// object page was accessed; otherwise it was the guard page to the right.
	const bool left_of_object = address < meta->addr;

	pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
	(void *)stack_entries[skipnr]);

	// Print access type, faulting address, byte distance between faulting address and object,
	// whether the access was left or right of the object, and the object index.
	pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
	get_access_type(is_write), (void *)address,
	left_of_object ? meta->addr - address : address - meta->addr,
	left_of_object ? "left" : "right", object_index);
	break;
	}
	case KFENCE_ERROR_UAF: // the object was accessed after being freed, without being re-allocated
	pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
	(void *)stack_entries[skipnr]);
	pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
	get_access_type(is_write), (void *)address, object_index);
	break;
	case KFENCE_ERROR_CORRUPTION: // canary pattern in the unused part of the object page was overwritten (also an OOB)
	pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
	pr_err("Corrupted memory at 0x%p ", (void *)address); // address of the first mismatching canary byte
	print_diff_canary(address, 16, meta); // match status of up to 16 bytes starting at that address
	pr_cont(" (in kfence-#%td):\n", object_index); // object index
	break;
	case KFENCE_ERROR_INVALID: // invalid access detected in the page fault handler
	pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
	(void *)stack_entries[skipnr]);
	pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write),
	(void *)address);
	break;
	case KFENCE_ERROR_INVALID_FREE: // double free, or free of an address other than the allocated one
	pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
	pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
	object_index);
	break;
	}

	/* Print the stack trace of the error; skipnr skips mm-internal frames that do not help the analysis. */
	stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);

	if (meta) {
	pr_err("\n");
	/*
	Per the original annotation, kfence_print_object() prints:
	1. the state of meta, the object's address range, its kmem_cache and the task pid;
	2. the stack trace of the allocation;
	3. if the object is freed, also the stack trace of the free and the pid of the freeing task.
	*/
	kfence_print_object(NULL, meta);
	}

	/* Print report footer. */
	pr_err("\n");
	if (no_hash_pointers && regs) // no_hash_pointers can be set to 1 with the boot parameter of the same name
	show_regs(regs); // CPU registers and call stack at the time of the page fault
	else
	dump_stack_print_info(KERN_ERR); // brief debug information
	trace_error_report_end(ERROR_DETECTOR_KFENCE, address);
	pr_err("==================================================================\n");

	lockdep_on();

	if (panic_on_warn) // setting /proc/sys/kernel/panic_on_warn to 1 makes the system panic here
	panic("panic_on_warn set ...\n");

	/* We encountered a memory safety error, taint the kernel!
	Per the original annotation, booting with 'panic_on_taint=0x20' makes the kernel panic when the
	TAINT_BAD_PAGE taint is added.
	*/
	add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
	}

折叠

get_stack_skipnr [kfence_report_error -> get_stack_skipnr ]

从调用栈里将mm的内部函数跳过。

	/*
	 * Get the number of stack entries to skip to get out of MM internals. @type is
	 * optional, and if set to NULL, assumes an allocation or free stack.
	 *
	 * Returns the index of the first entry after the outermost matching
	 * allocator entry point (kfree*, kmem_cache_free*, __kmalloc*,
	 * kmem_cache_alloc*). If no entry point is found, returns the index after the
	 * last kfence_*, __kfence_* or __slab_free entry, and 0 when that would skip
	 * the whole trace.
	 */
	static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
				    const enum kfence_error_type *type)
	{
		char buf[64];
		int skipnr, fallback = 0;

		if (type) {
			/* Depending on error type, find different stack entries. */
			switch (*type) {
			case KFENCE_ERROR_UAF:
			case KFENCE_ERROR_OOB:
			case KFENCE_ERROR_INVALID:
				/*
				 * kfence_handle_page_fault() may be called with pt_regs
				 * set to NULL; in that case we'll simply show the full
				 * stack trace.
				 */
				return 0;
			case KFENCE_ERROR_CORRUPTION:
			case KFENCE_ERROR_INVALID_FREE:
				break;
			}
		}

		for (skipnr = 0; skipnr < num_entries; skipnr++) {
			int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);

			if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
			    str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
			    !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
				/*
				 * In case of tail calls from any of the below
				 * to any of the above.
				 */
				fallback = skipnr + 1;
			}

			/* Also the *_bulk() variants by only checking prefixes. */
			if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
			    str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
			    str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
			    str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
				goto found;
		}
		if (fallback < num_entries)
			return fallback;
	found:
		skipnr++;
		return skipnr < num_entries ? skipnr : 0;
	}

折叠

print_diff_canary [kfence_report_error -> print_diff_canary]

	/*
	 * Show bytes at @addr that are different from the expected canary values, up to
	 * @max_bytes.
	 *
	 * address:       address of the mismatching canary byte; comparing it with
	 *                meta->addr tells whether it lies in the canary region left
	 *                or right of the object
	 * bytes_to_show: maximum number of bytes to print
	 * meta:          metadata of the page that contains the canary region
	 */
	static void print_diff_canary(unsigned long address, size_t bytes_to_show,
				      const struct kfence_metadata *meta)
	{
		const unsigned long show_until_addr = address + bytes_to_show;
		const u8 *cur, *end;

		/*
		 * Clamp the end so the output never leaves the canary region: the left
		 * region ends right before meta->addr, the right region ends at the
		 * next page boundary.
		 */
		end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
							: min(show_until_addr, PAGE_ALIGN(address)));

		pr_cont("[");
		for (cur = (const u8 *)address; cur < end; cur++) {
			if (*cur == KFENCE_CANARY_PATTERN(cur))
				pr_cont(" .");	/* byte matches the pattern */
			else if (no_hash_pointers)	/* set through the no_hash_pointers boot parameter */
				pr_cont(" 0x%02x", *cur);
			else /* Do not leak kernel memory in non-debug builds. */
				pr_cont(" !");	/* byte differs from the pattern */
		}
		pr_cont(" ]");
	}

内存异常log分析

OOB错误

读左侧保护区导致的OOB: KFENCE_ERROR_OOB

示例：

	size = kmalloc_cache_alignment(size);
	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
	expect.addr = buf - 1;
	READ_ONCE(*expect.addr);
	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
	test_free(buf);

log:

	==================================================================
	BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xad/0x1f2 [kfence_test]

	# 触发异常时的内核栈
	Out-of-bounds read at 0x000000008e1b5d12 (1B left of kfence-#109):
	test_out_of_bounds_read+0xad/0x1f2 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#109 [0x00000000753194ac-0x000000000d237ced, size=32, cache=kmalloc-32] allocated by task 35779:
	test_alloc+0xe9/0x36f [kfence_test]
	test_out_of_bounds_read+0x86/0x1f2 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35779 Comm: kunit_try_catch Kdump: loaded Not tainted 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

读右侧保护区导致的OOB: KFENCE_ERROR_OOB

示例：

	size = kmalloc_cache_alignment(size);
	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
	expect.addr = buf + size;
	READ_ONCE(*expect.addr);
	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
	test_free(buf);

log：

	==================================================================
	BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0x14a/0x1f2 [kfence_test]

	# 触发异常的调用栈
	Out-of-bounds read at 0x0000000002d76451 (32B right of kfence-#111):
	test_out_of_bounds_read+0x14a/0x1f2 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#111 [0x00000000432dce97-0x000000008d6138c3, size=32, cache=kmalloc-32] allocated by task 35779:
	test_alloc+0xe9/0x36f [kfence_test]
	test_out_of_bounds_read+0x140/0x1f2 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35779 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

写左侧保护区导致的OOB: KFENCE_ERROR_OOB

示例：

	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
	expect.addr = buf - 1;
	WRITE_ONCE(*expect.addr, 42);

log:

	==================================================================
	BUG: KFENCE: out-of-bounds write in test_out_of_bounds_write+0x7a/0x116 [kfence_test]

	# 触发异常的调用栈
	Out-of-bounds write at 0x000000003f50719f (1B left of kfence-#134):
	test_out_of_bounds_write+0x7a/0x116 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#134 [0x0000000080436418-0x0000000052b079df, size=32, cache=kmalloc-32] allocated by task 35781:
	test_alloc+0xe9/0x36f [kfence_test]
	test_out_of_bounds_write+0x65/0x116 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35781 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

UAF

KFENCE_ERROR_UAF

示例：

	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
	test_free(expect.addr);
	READ_ONCE(*expect.addr);

log:

	==================================================================
	BUG: KFENCE: use-after-free read in test_use_after_free_read+0x89/0x10b [kfence_test]

	# 触发UAF时的调用栈
	Use-after-free read at 0x0000000067fb284c (in kfence-#152):
	test_use_after_free_read+0x89/0x10b [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#152 [0x0000000067fb284c-0x00000000cd45daeb, size=32, cache=kmalloc-32] allocated by task 35783:
	test_alloc+0xe9/0x36f [kfence_test]
	test_use_after_free_read+0x63/0x10b [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 释放object的调用栈
	freed by task 35783:
	test_use_after_free_read+0x85/0x10b [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 7 PID: 35783 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

pattern区不一致

右侧pattern区不一致：KFENCE_ERROR_CORRUPTION

示例：

	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
	expect.addr = buf + size;
	WRITE_ONCE(*expect.addr, 42);
	test_free(buf);

log:

	==================================================================
	BUG: KFENCE: memory corruption in test_corruption+0x9c/0x1cb [kfence_test]

	# 输出pattern不一致的地址及其右侧一共16个地址（不超出右侧pattern区）的匹配结果，'!'表示不一致，'.'表示一致。
	Corrupted memory at 0x000000003b880c36 [ ! . . . . . . . . . . . . . . . ] (in kfence-#139):
	test_corruption+0x9c/0x1cb [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#139 [0x0000000084320c94-0x00000000ebf5c6c5, size=32, cache=kmalloc-32] allocated by task 35789:
	test_alloc+0xe9/0x36f [kfence_test]
	test_corruption+0x72/0x1cb [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35789 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

左侧pattern区不一致：KFENCE_ERROR_CORRUPTION

示例：

	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
	expect.addr = buf - 1;
	WRITE_ONCE(*expect.addr, 42);
	test_free(buf);

log:

	==================================================================
	BUG: KFENCE: memory corruption in test_corruption+0x14e/0x1cb [kfence_test]

	# 输出pattern不一致的地址及其右侧一共16个地址（不超出左侧pattern区）的匹配结果，'!'表示不一致，'.'表示一致。
	Corrupted memory at 0x00000000d7861e9d [ ! ] (in kfence-#155):
	test_corruption+0x14e/0x1cb [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	kfence-#155 [0x000000009acdf655-0x00000000008cbfb7, size=32, cache=kmalloc-32] allocated by task 35789:
	test_alloc+0xe9/0x36f [kfence_test]
	test_corruption+0x124/0x1cb [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35789 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

无效的释放

重复释放：KFENCE_ERROR_INVALID_FREE

示例：

	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
	test_free(expect.addr);
	test_free(expect.addr); /* Double-free. */

log:

	==================================================================
	BUG: KFENCE: invalid free in test_double_free+0x9a/0x124 [kfence_test]

	# 触发重复释放的调用栈
	Invalid free of 0x000000007fb6a8f8 (in kfence-#136):
	test_double_free+0x9a/0x124 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 分配object的调用栈
	kfence-#136 [0x000000007fb6a8f8-0x00000000d967e9cd, size=32, cache=test] allocated by task 35786:
	test_alloc+0xdf/0x36f [kfence_test]
	test_double_free+0x63/0x124 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	# 释放object的调用栈
	freed by task 35786:
	test_double_free+0x7b/0x124 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35786 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

申请和释放的地址不一致：KFENCE_ERROR_INVALID_FREE

示例：

	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
	expect.addr = buf + 1; /* Free on invalid address. */
	test_free(expect.addr); /* Invalid address free. */
	test_free(buf); /* No error. */

log:

	==================================================================
	BUG: KFENCE: invalid free in test_invalid_addr_free+0x8b/0x12b [kfence_test]

	Invalid free of 0x0000000000b3e82d (in kfence-#124):
	test_invalid_addr_free+0x8b/0x12b [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	kfence-#124 [0x000000002aecf77f-0x0000000046ff045a, size=32, cache=kmalloc-32] allocated by task 35787:
	test_alloc+0xe9/0x36f [kfence_test]
	test_invalid_addr_free+0x65/0x12b [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35787 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

其他无法识别的内存错误

如触发缺页的OOB区域左侧和右侧的内存页都没有分配出去：KFENCE_ERROR_INVALID

示例：

READ_ONCE(__kfence_pool[10]);

log:

	==================================================================
	BUG: KFENCE: invalid read in test_invalid_access+0x48/0xd0 [kfence_test]

	Invalid read at 0x0000000023713263:
	test_invalid_access+0x48/0xd0 [kfence_test]
	kunit_try_run_case+0x51/0x80
	kunit_generic_run_threadfn_adapter+0x16/0x30
	kthread+0x11a/0x140
	ret_from_fork+0x22/0x30

	CPU: 5 PID: 35936 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
	==================================================================

debugfs调试节点

在/sys/kernel/debug/kfence下面有两个用于查看kfence状态的节点：objects和stats

stats节点

	# cat stats
	enabled: 1
	currently allocated: 47
	total allocations: 2416
	total frees: 2369
	zombie allocations: 0
	total bugs: 21

含义

名字	含义
enabled	kfence功能是否处于开启状态。可以通过内核启动参数开启，启动后可以通过模块参数关闭
currently allocated	kfence内存池中有多少个object被分配出去了
total allocations	在kfence内存池中发生过object分配的总次数，单调递增
total frees	在kfence内存池中发生过object释放的总次数，单调递增
zombie allocations	当某个kmem_cache被销毁时，在kfence中与之对应的尚未释放的object个数
total bugs	kfence检测到的内存错误的次数

实现

	/*
	 * seq_file show callback for the "stats" attribute.
	 *
	 * Prints the current value of kfence_enabled, then one "<name>: <value>"
	 * line for each of the KFENCE_COUNTER_COUNT entries of counters[], using
	 * the label stored at the same index in counter_names[].
	 *
	 * @seq: seq_file the text is written to.
	 * @v:   not used by this callback.
	 *
	 * Return: always 0.
	 */
	static int stats_show(struct seq_file *seq, void *v)
	{
		int i;

		seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
		for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
			seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));

		return 0;
	}
	DEFINE_SHOW_ATTRIBUTE(stats);

其中用到的统计数据定义如下：

	/* Statistics counters for debugfs. */
	enum kfence_counter_id {
	/* Number of objects currently handed out from the KFENCE pool. */
	KFENCE_COUNTER_ALLOCATED,
	/* Total number of allocations served from the KFENCE pool. */
	KFENCE_COUNTER_ALLOCS,
	/* Total number of frees of KFENCE pool objects. */
	KFENCE_COUNTER_FREES,
	/* Objects still unfreed when their kmem_cache was destroyed. */
	KFENCE_COUNTER_ZOMBIES,
	/* Number of memory errors detected by KFENCE. */
	KFENCE_COUNTER_BUGS,
	/* Not a counter: number of entries above, used to size counters[]. */
	KFENCE_COUNTER_COUNT,
	};
	/* One atomic value per counter id. */
	static atomic_long_t counters[KFENCE_COUNTER_COUNT];
	/* Label printed for each counter, indexed by counter id. */
	static const char *const counter_names[] = {
	[KFENCE_COUNTER_ALLOCATED] = "currently allocated",
	[KFENCE_COUNTER_ALLOCS] = "total allocations",
	[KFENCE_COUNTER_FREES] = "total frees",
	[KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
	[KFENCE_COUNTER_BUGS] = "total bugs",
	};

objects节点

输出kfence中每个meta的信息，当前状态以及调用栈。

	# cat objects
	kfence-#0 [0xffff89c43b202000-0xffff89c43b202067, size=104, cache=kmalloc-128] allocated by task 8:
	set_kthread_struct+0x30/0x40
	kthread+0x2e/0x140
	ret_from_fork+0x22/0x30
	---------------------------------
	kfence-#1 [0xffff89c43b204000-0xffff89c43b20400f, size=16, cache=kmalloc-16] allocated by task 1:
	__smpboot_create_thread.part.9+0x3c/0x120
	smpboot_create_threads+0x67/0x90
	cpuhp_invoke_callback+0x105/0x400
	cpuhp_invoke_callback_range+0x40/0x80
	_cpu_up+0xd8/0x1e0
	cpu_up+0x85/0x90
	bringup_nonboot_cpus+0x4f/0x60
	smp_init+0x26/0x74
	kernel_init_freeable+0x10e/0x246
	kernel_init+0x16/0x120
	ret_from_fork+0x22/0x30
	---------------------------------
	...
	kfence-#40 [0xffff89c43b252dc0-0xffff89c43b252fff, size=576, cache=inode_cache] allocated by task 531:
	alloc_inode+0x87/0xa0
	new_inode_pseudo+0xb/0x50
	create_pipe_files+0x32/0x200
	__do_pipe_flags+0x2c/0xd0
	do_pipe2+0x2d/0xb0
	__x64_sys_pipe+0x10/0x20
	do_syscall_64+0x3a/0x80
	entry_SYSCALL_64_after_hwframe+0x44/0xae

	freed by task 531:
	destroy_inode+0x3b/0x70
	__dentry_kill+0xc5/0x150
	__fput+0xd9/0x230
	task_work_run+0x74/0xb0
	exit_to_user_mode_prepare+0x191/0x1a0
	syscall_exit_to_user_mode+0x19/0x30
	do_syscall_64+0x46/0x80
	entry_SYSCALL_64_after_hwframe+0x44/0xae
	...
	---------------------------------
	kfence-#254 unused
	---------------------------------

含义

对于被分配出去且尚未释放的object，只显示分配栈。
对于当前处于free状态的object，既显示分配栈，也显示释放栈。处于zombie的object也属于free。
对于从来没有被分配出去过的object，显示unused
对于zombie的object，虽然是free的，但是已经不能再被分配了，因为对应的kmem_cache已经被销毁，所以cache会显示为<destroyed>

实现

	/*
	 * seq_file show callback that prints one KFENCE object.
	 *
	 * @seq: seq_file the text is written to.
	 * @v:   iterator value; it is cast to long and used as a 1-based index
	 *       into kfence_metadata[] (presumably so that index 0 is not
	 *       confused with a NULL iterator -- confirm against the start/next
	 *       callbacks, which are not shown here).
	 *
	 * The object's metadata lock is held, with interrupts disabled, while
	 * kfence_print_object() runs. A separator line is emitted afterwards.
	 *
	 * Return: always 0.
	 */
	static int show_object(struct seq_file *seq, void *v)
	{
		struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
		unsigned long flags;

		raw_spin_lock_irqsave(&meta->lock, flags);
		kfence_print_object(seq, meta);
		raw_spin_unlock_irqrestore(&meta->lock, flags);
		seq_puts(seq, "---------------------------------\n");

		return 0;
	}

kfence_print_object

	void kfence_print_object(struct seq_file seq, const struct kfence_metadata meta)
	{
	const int size = abs(meta->size);
	const unsigned long start = meta->addr;
	const struct kmem_cache *const cache = meta->cache;

	lockdep_assert_held(&meta->lock);

	if (meta->state == KFENCE_OBJECT_UNUSED) { // 尚未使用的meta
	seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
	return;
	}

	seq_con_printf(seq,
	"kfence-#%td [0x%p-0x%p"
	", size=%d, cache=%s] allocated by task %d:\n",
	meta - kfence_metadata, (void )start, (void )(start + size - 1), size,
	(cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
	kfence_print_stack(seq, meta, true); // 输出meta对应的object被分配出去时的调用栈

	if (meta->state == KFENCE_OBJECT_FREED) { // 如果meta对应的object被释放了
	seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
	kfence_print_stack(seq, meta, false); // 输出meta对应的object被释放时的调用栈
	}
	}

测试框架

kfence提供了测试用例，在mm/kfence/kfence_test.c中。

	/*
	 * Init function of the KFENCE test: hooks the console tracepoint, then
	 * initialises the KUnit suites and returns that call's result.
	 */
	static int __init kfence_test_init(void)
	{
	/*
	* Walk the kernel's tracepoints; register_tracepoints() attaches a probe
	* to the one named "console".
	*/
	for_each_kernel_tracepoint(register_tracepoints, NULL);

	/* Run the test cases. */
	return __kunit_test_suites_init(kfence_test_suites);
	}

register_tracepoints

	static void register_tracepoints(struct tracepoint tp, void ignore)
	{
	check_trace_callback_type_console(probe_console);
	if (!strcmp(tp->name, "console"))
	WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
	}

当kfence_report_error输出错误log时，"console"这个tracepoint会触发，然后会回调到probe_console，在probe_console中会过滤kfence_report_error中输出的错误log，并记录到observed，用于跟期望的错误类型比对，比对通过表示测试成功。

probe_console

过滤kfence_report_error中输出的错误log，并记录到observed，用于跟期望的错误类型比对，比对通过表示测试成功。

	/*
	 * Probe for console output: obtains observed lines of interest.
	 *
	 * @ignore: not used.
	 * @buf:    console text; not NUL-terminated, valid for @len bytes.
	 * @len:    number of bytes in @buf.
	 *
	 * Under observed.lock, a line containing both "BUG: KFENCE: " and "test_"
	 * is stored as observed.lines[0] and restarts the capture (nlines = 1).
	 * When exactly one line has been captured, a following line containing
	 * "at 0x" or "of 0x" is stored as the second line. All other output is
	 * dropped.
	 */
	static void probe_console(void *ignore, const char *buf, size_t len)
	{
		unsigned long flags;
		int nlines;

		spin_lock_irqsave(&observed.lock, flags);
		nlines = observed.nlines;

		if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
			/*
			 * KFENCE report and related to the test.
			 *
			 * The provided @buf is not NUL-terminated; copy no more than
			 * @len bytes and let strscpy() add the missing NUL-terminator.
			 */
			strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
			nlines = 1;
		} else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
			strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
		}

		WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
		spin_unlock_irqrestore(&observed.lock, flags);
	}

kfence_test_suites

记录了测试case的具体内容：

	/*
	 * Expands to two kunit_case initialisers that run the same function: one
	 * named after the function and one with "-memcache" appended to the name.
	 * Presumably the test setup picks the allocation path from that suffix --
	 * confirm in test_init()/setup_test_cache(), which are not shown here.
	 */
	#define KFENCE_KUNIT_CASE(test_name) \
	{ .run_case = test_name, .name = #test_name }, \
	{ .run_case = test_name, .name = #test_name "-memcache" }

	/*
	 * Table of test cases. Entries built with KFENCE_KUNIT_CASE appear twice
	 * (see the macro above); KUNIT_CASE entries appear once. The empty
	 * initialiser terminates the table.
	 */
	static struct kunit_case kfence_test_cases[] = {
	KFENCE_KUNIT_CASE(test_out_of_bounds_read),
	KFENCE_KUNIT_CASE(test_out_of_bounds_write),
	KFENCE_KUNIT_CASE(test_use_after_free_read),
	KFENCE_KUNIT_CASE(test_double_free),
	KFENCE_KUNIT_CASE(test_invalid_addr_free),
	KFENCE_KUNIT_CASE(test_corruption),
	KFENCE_KUNIT_CASE(test_free_bulk),
	KFENCE_KUNIT_CASE(test_init_on_free),
	KUNIT_CASE(test_kmalloc_aligned_oob_read),
	KUNIT_CASE(test_kmalloc_aligned_oob_write),
	KUNIT_CASE(test_shrink_memcache),
	KUNIT_CASE(test_memcache_ctor),
	KUNIT_CASE(test_invalid_access),
	KUNIT_CASE(test_gfpzero),
	KUNIT_CASE(test_memcache_typesafe_by_rcu),
	KUNIT_CASE(test_krealloc),
	KUNIT_CASE(test_memcache_alloc_bulk),
	{},
	};

	/*
	 * The "kfence" KUnit suite: runs kfence_test_cases, with test_init and
	 * test_exit installed as the suite's init and exit hooks.
	 */
	static struct kunit_suite kfence_test_suite = {
	.name = "kfence",
	.test_cases = kfence_test_cases,
	.init = test_init,
	.exit = test_exit,
	};
	/* NULL-terminated list of suites handed to __kunit_test_suites_init(). */
	static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };

以test_out_of_bounds_read为例：

	/*
	 * Test case: read one byte just outside an allocation on each side and
	 * expect a matching out-of-bounds read report each time.
	 */
	static void test_out_of_bounds_read(struct kunit *test)
	{
	size_t size = 32;
	struct expect_report expect = { // the report we expect to observe
	.type = KFENCE_ERROR_OOB, // expected error type
	.fn = test_out_of_bounds_read, // function expected to be blamed for the error
	.is_write = false, // expected access direction: a read
	};
	char *buf;

	setup_test_cache(test, size, 0, NULL);

	/*
	* If we don't have our own cache, adjust based on alignment, so that we
	* actually access guard pages on either side.
	*/
	if (!test_cache)
	size = kmalloc_cache_alignment(size);

	/* Test both sides. */

	// ALLOCATE_LEFT: the object is presumably placed at the start of its page, so the byte before it lies in the left guard page
	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
	expect.addr = buf - 1; // address where the OOB is expected: one byte before the object
	READ_ONCE(*expect.addr); // trigger the out-of-bounds read
	KUNIT_EXPECT_TRUE(test, report_matches(&expect)); // report_matches() compares the observed report with the expected one
	test_free(buf);

	// ALLOCATE_RIGHT: the object is presumably placed at the end of its page, so the byte after it lies in the right guard page
	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
	expect.addr = buf + size; // address where the fault is expected: first byte past the object
	READ_ONCE(*expect.addr); // trigger the out-of-bounds read
	KUNIT_EXPECT_TRUE(test, report_matches(&expect)); // check the result
	test_free(buf);
	}

report_matches

	/*
	 * Check whether the captured console report matches what @r describes.
	 *
	 * Builds two expected strings -- the "BUG: KFENCE: ..." title and the
	 * access-information line -- and tests whether each occurs as a substring
	 * of the corresponding entry of observed.lines[].
	 *
	 * @r: expected error type, blamed function, access direction and address.
	 *
	 * Return: true if both lines match; false if report_available() says no
	 * report is available (before or after taking observed.lock) or if either
	 * line does not match.
	 */
	static bool report_matches(const struct expect_report *r)
	{
	bool ret = false;
	unsigned long flags;
	/* Same array type as observed.lines, so both have identical line sizes. */
	typeof(observed.lines) expect;
	const char *end;
	char *cur;

	/* Doubled-checked locking. */
	if (!report_available())
	return false;

	/* Generate expected report contents. */

	/* Title */
	cur = expect[0];
	/* end is the address of the buffer's last byte; end - cur bounds each scnprintf(). */
	end = &expect[0][sizeof(expect[0]) - 1];
	switch (r->type) {
	case KFENCE_ERROR_OOB:
	cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
	get_access_type(r));
	break;
	case KFENCE_ERROR_UAF:
	cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
	get_access_type(r));
	break;
	case KFENCE_ERROR_CORRUPTION:
	cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
	break;
	case KFENCE_ERROR_INVALID:
	cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
	get_access_type(r));
	break;
	case KFENCE_ERROR_INVALID_FREE:
	cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
	break;
	}

	scnprintf(cur, end - cur, " in %pS", r->fn);
	/* The exact offset won't match, remove it; also strip module name. */
	cur = strchr(expect[0], '+');
	if (cur)
	*cur = '\0';

	/* Access information */
	cur = expect[1];
	end = &expect[1][sizeof(expect[1]) - 1];

	switch (r->type) {
	case KFENCE_ERROR_OOB:
	cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
	break;
	case KFENCE_ERROR_UAF:
	cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
	break;
	case KFENCE_ERROR_CORRUPTION:
	cur += scnprintf(cur, end - cur, "Corrupted memory at");
	break;
	case KFENCE_ERROR_INVALID:
	cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
	break;
	case KFENCE_ERROR_INVALID_FREE:
	cur += scnprintf(cur, end - cur, "Invalid free of");
	break;
	}

	/* Append the expected address in the same " 0x%p" form the report line uses. */
	cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr);

	/* Re-check under the lock before reading observed.lines[]. */
	spin_lock_irqsave(&observed.lock, flags);
	if (!report_available())
	goto out; /* A new report is being captured. */

	/* Finally match expected output to what we actually observed. */
	ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
	out:
	spin_unlock_irqrestore(&observed.lock, flags);
	return ret;
	}

折叠

完。

posted @ 2022-07-28 11:10 Sky&Zhang 阅读(561) 评论(0) 收藏举报

刷新页面返回顶部

sky

我所做的事情都是源于自己对梦想的追求--分享技术、共同创造新世界---欢迎交流：zhangbinghua2012@163.com skyzhangbinghua@gmai.com

kfence源码分析【转】

参考

作者

内核版本

实现分析

初始化

周期性开启kfence内存池

分配内存

释放内存

检查pattern区

kmem_cache销毁

缺页异常

错误报告

内存异常log分析

OOB错误

UAF

pattern区不一致

无效的释放

其他无法识别的内存错误

debugfs调试节点

stats节点

含义

实现

objects节点

含义

实现

测试框架

公告