内核源码解读-KSM

导读

本文适合有基本Linux内存管理概念的新手阅读,且本文旨在从工作流程和设计思想上介绍KSM,在涉及到源代码的地方,进行了部分删减,如果想详细了解KSM,推荐阅读源代码及源代码中的注释。

作者也是初次接触Linux内核源码,所以文章中难免出现纰漏,欢迎在评论中纠正。

一、KSM概述

KSM的全称是 Kernel Samepage Merging,主要应用在虚拟化环境中,它允许内核通过合并内容相同的内存页面来节省内存,从而可以增加虚拟机的并发数量。

KSM核心设计思想是基于写时复制机制COW,也就是将内容相同的页面合并成一个只读页面,从而释放出空闲物理页面。

二、核心数据结构

KSM的核心数据结构有三个:

  • struct rmap_item
  • struct mm_slot
  • struct ksm_scan

struct rmap_item

从虚拟地址到一个 mm 的反向映射

同时,对于 stable tree,它负责组织其结构;对于 unstable tree,保存其校验和

/*
 * One reverse-mapping item per tracked virtual address: records which
 * mm/address a candidate page belongs to, plus its position in either
 * the stable tree (merged KSM pages) or the unstable tree (candidates).
 */
struct rmap_item {
    struct rmap_item *rmap_list;        // links all rmap_items into one list (see ksm_scan.rmap_list)
    union {
        struct anon_vma *anon_vma;      // while in the stable tree: the anon_vma for rmap walks
#ifdef CONFIG_NUMA
        int nid;                        // when node of unstable tree
#endif
    };
    struct mm_struct *mm;				// the owning process's struct mm_struct
    unsigned int oldchecksum;  			// checksum of the page at the previous scan (detects busy pages)
    unsigned long address;      		// the user virtual address this rmap_item tracks
    union {
        struct rb_node node;    		// rb-tree node while linked into the unstable tree
        struct {   /* valid only while in the stable tree */
            struct stable_node *head;	// stable-tree node this item is attached to
            struct hlist_node hlist;	// link in that stable node's hlist of sharers
        };
    };
};

struct mm_slot

描述添加到KSM系统中将来要被扫描的进程mm_struct数据结构

/* Tracks one mm_struct registered with KSM and queued for scanning. */
struct mm_slot {
	struct hlist_node link;			// link into the mm_slots hash table (mm -> slot lookup)
	struct list_head mm_list;		// link into the scan list headed at ksm_mm_head
	struct rmap_item *rmap_list;	// head of this mm's rmap_item list
	struct mm_struct *mm;			// the process's mm_struct
};

struct ksm_scan

当前的扫描状态

struct ksm_scan {
	struct mm_slot *mm_slot;		// 当前正在扫描的mm_slot
	unsigned long address;			// 将要扫描的地址(page的地址)
	struct rmap_item **rmap_list;	// 将要扫描的rmap_item的指针
	unsigned long seqnr;			// 全部扫描完成后会计数一次,用于删除unstable节点
};

三、工作流程

上层的应用通过 madvise() 给某内存区域增加 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,造成对系统调用的访问,该系统调用由 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 定义。

SYSCALL_DEFINE3

在这里会进行一个预处理,如找到该内存区域的所有VMA,并调用 madvise_vma 进行进一步处理。

/*
 * madvise(2) entry point: validates the [start, start+len_in) range,
 * then walks every VMA overlapping it and applies `behavior` to each
 * via madvise_vma().  (Validation / mmap_sem acquisition elided below.)
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;	/* set in the elided prologue: whether behavior needs mmap_sem for write */
	size_t len;
	struct blk_plug plug;

	...

	
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {				// iterate over every VMA overlapping the range
		
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			/* gap before this VMA: remember it, but keep going */
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		// clamp this chunk to the end of the VMA (or of the request)
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		// apply the advice to [start, tmp) of this VMA
		error = madvise_vma(vma, &prev, start, tmp, behavior);	
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;	// normal exit: whole range processed
		if (prev)
			vma = prev->vm_next;
		else	
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}

madvise_behavior

如果在某个VMA上发现 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,则触发 ksm_madvise

/*
 * Apply one advice value to a single VMA: compute the new vm_flags and
 * dispatch to the subsystem handler (KSM, THP, ...).  On success the
 * VMA's flags are updated; on failure the VMA is left unchanged.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	
    ...
                
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		/* KSM advice: let ksm_madvise() adjust new_flags / register the mm */
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}

	...

success:
	vma->vm_flags = new_flags;
out:
	return error;
}

ksm_madvise

在这一步会找到 vma 所属进程(mm),并判断标记决定是否对页面进行共享

如果需要共享,调用 __ksm_enter()并传递当前 vma 所属的 mm 地址。

/*
 * Handle MADV_(UN)MERGEABLE for one VMA: register the owning mm with
 * KSM (first time only) and set VM_MERGEABLE, or unmerge any KSM pages
 * in the range and clear the flag.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		...

		/* first mergeable VMA in this mm: register the mm with KSM */
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);	// the real entry into the KSM machinery
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		/* break COW sharing for any already-merged pages in the range */
		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}

__ksm_enter

将进程的 mm 加入到 ksm_mm_head 链表中

/*
 * Register an mm with KSM: allocate an mm_slot for it, hash it, queue
 * it on the ksm_mm_head scan list, and wake ksmd if it was idle.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();		// one mm_slot represents this process's mm_struct
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);	// empty list means ksmd has nothing to scan (asleep)

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);		// hash mm -> mm_slot and record mm in the slot
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);	// queue just behind the current scan cursor
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);		// mark this mm as registered with KSM
	mmgrab(mm);		// pin the mm_struct while KSM holds a reference

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);	// wake the ksmd kernel thread

	return 0;
}

ksm_init

创建内核线程ksmd

/*
 * KSM subsystem init: set up slab caches, start the ksmd kernel
 * thread, and expose the sysfs control files under /sys/kernel/mm/ksm.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();	// create the ksm_rmap_item / ksm_stable_node / ksm_mm_slot caches
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");	// spawn ksmd running ksm_scan_thread()
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);		// create the ksm nodes under /sys/kernel/mm
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}

ksm_scan_thread

只需要知道这里进入ksm_do_scan()进行实际的操作就可以了

/*
 * Main loop of the ksmd kernel thread: scan a batch of pages, sleep
 * for ksm_thread_sleep_millisecs, repeat; block entirely while KSM
 * is not enabled.
 */
static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);	/* run at slightly lower priority than normal tasks */

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);		// the real work; arg = pages per batch
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			/* enabled: nap between batches */
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			/* disabled: sleep until woken (e.g. by __ksm_enter or sysfs) */
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

ksm_do_scan

/*
 * Scan one batch: pick up to scan_npages candidate pages and try to
 * merge each one, stopping early if the scan runs out of pages or the
 * system is freezing.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {	// up to scan_npages pages per batch
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);		// next anonymous page + its reverse-map item
		if (!rmap_item)
			return;		/* nothing left to scan this pass */
		cmp_and_merge_page(page, rmap_item);			// check both trees and try to merge this page
		put_page(page);	/* drop the reference taken by follow_page(FOLL_GET) */
	}
}

cmp_and_merge_page

首先查看这个 page 是不是可以和 stable tree 中的 page 合并,不可以的话,检查下它的检验值和之前是否有变化,如果变化则认为是写频繁的页面,跳过它;否则,决定是将它插入到 unstable tree 中,或与其中节点合并加入到 stable tree 中。

/*
 * Core merge decision for one candidate page:
 *  1. try to merge it with an existing stable-tree page;
 *  2. otherwise, skip it if its checksum changed (frequently written);
 *  3. otherwise, search the unstable tree: on a match, merge the pair
 *     into a new stable node; on a miss, the page is inserted into the
 *     unstable tree by unstable_tree_search_insert().
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/* already a KSM page: if it sits on the wrong NUMA node's tree,
		 * move it to the migrate_nodes list for later re-insertion */
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;		/* already correctly attached: nothing to do */
	}

	kpage = stable_tree_search(page);						// look for an identical page in the stable tree
	if (kpage == page && rmap_item->head == stable_node) {	// the search found this very page
		put_page(kpage);
		return;
	}

	/* Detach rmap_item from whatever tree/node it was on last pass:
	 * the page contents may have changed since then, so its previous
	 * placement is stale and it must be re-placed from scratch. */
	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {	// kpage has identical contents but is a different page
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);	// try to map kpage in place of page
		if (!err) {
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));	// merged: hang rmap_item on the stable node's hlist
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

    
    /******************************** unstable tree handling below ***********************/
	// recompute the checksum; if it differs from last pass the page is
	// written frequently and is a poor merge candidate — skip it
	checksum = calc_checksum(page);				
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
		err = try_to_merge_one_page(vma, page,
					    ZERO_PAGE(rmap_item->address));
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
    
    // search the unstable tree for a page with identical contents
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
        // found one: try to merge page and tree_page into a single kpage
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * Merged: insert kpage into the stable tree and attach
			 * both rmap_items to the new stable node.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If inserting into the stable tree failed, use
			 * break_cow() to trigger a page fault that un-shares
			 * the freshly made KSM page again.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}

try_to_merge_one_page

将两个 page 进行合并

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;		/* only anonymous pages are merge candidates */

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  
	 */
	if (!trylock_page(page))	// don't wait for the lock here;
		goto out;				// just skip this page and move on to others

	if (PageTransCompound(page)) {	// returns nonzero if page is a transparent huge page
		if (split_huge_page(page))  // split it into normal pages; returns 0 on success
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);		// rewrite the pte to map kpage (read-only) instead of page
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		/* VMA is mlocked: transfer the mlock from page to kpage */
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

附:

scan_get_next_rmap_item()

遍历ksm_mm_head链表(即进程的mm),然后再遍历进程地址空间的每个VMA,然后通过get_next_rmap_item()返回rmap_item

/*
 * Advance the global scan cursor and return the rmap_item for the next
 * anonymous page to examine (with *page set and a reference held), or
 * NULL when a full pass over every registered mm has completed.
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))	// no mm registered: nothing to scan
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {				// first run of ksmd: initialize the cursor
		...		/* (elided; also contains the next_mm: label jumped to below) */
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;		/* process is exiting: skip its address space */
	else
		vma = find_vma(mm, ksm_scan.address);		// resume at the VMA covering the cursor address

	for (; vma; vma = vma->vm_next) {				// walk the rest of this mm's VMAs
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;		/* only VMAs marked by MADV_MERGEABLE */
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;	/* no anon pages possible: skip whole VMA */

		while (ksm_scan.address < vma->vm_end) {	// walk the pages of this VMA
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);	// virtual address -> struct page (takes a ref)
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {					// only anonymous pages are candidates
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);		// reuse the rmap_item for this address from mm_slot->rmap_list, or allocate one
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);	/* allocation failed: drop the page ref */
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {			// nothing usable found in any VMA of this mm
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);	// free leftover rmap_items past the cursor: their addresses are no longer mergeable

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);		// advance the cursor to the next mm_slot
	if (ksm_scan.address == 0) {			// mm fully scanned or exiting: drop it from KSM —
						// unhash and unlink its slot, free it, clear MMF_VM_MERGEABLE
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		up_read(&mm->mmap_sem);
		/*
		 * up_read(&mm->mmap_sem) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;			// continue with the next mm_slot (label is in the elided init block above)

	ksm_scan.seqnr++;			// one full pass over all mm_slots completed
	return NULL;
}

相关推荐

发表评论

电子邮件地址不会被公开。 必填项已用*标注

微信扫一扫

微信扫一扫

微信扫一扫,分享到朋友圈

内核源码解读-KSM
返回顶部

显示

忘记密码?

显示

显示

获取验证码

Close