导读

本文适合有基本Linux内存管理概念的新手阅读,且本文旨在从工作流程和设计思想上介绍KSM,在涉及到源代码的地方,进行了部分删减,如果想详细了解KSM,推荐阅读源代码及源代码中的注释。

作者也是初次接触Linux内核源码,所以文章中难免出现纰漏,欢迎在评论中纠正。

一、KSM概述

KSM的全称是 Kernel Samepage Merging,主要应用在虚拟化环境中,它允许内核通过合并内存页面来节省内存,从而可以增加虚拟机的并发数量。

KSM核心设计思想是基于写时复制机制COW,也就是将内容相同的页面合并成一个只读页面,从而释放出空闲物理页面。

二、核心数据结构

KSM的核心数据结构有三个:

  • struct rmap_item
  • struct mm_slot
  • struct ksm_scan

rmap_item

从虚拟地址到一个 mm 的反向映射

同时,对于 stable tree,它负责组织其结构;对于 unstable tree,保存其校验和

struct rmap_item {
    struct rmap_item *rmap_list;        // links all rmap_items of one mm into a list, headed at ksm_scan.rmap_list
    union {
        struct anon_vma *anon_vma;      // when in the stable tree: the anon_vma of the mapping
#ifdef CONFIG_NUMA
        int nid;                        // when node of unstable tree
#endif
    };
    struct mm_struct *mm;               // the owning process's mm_struct
    unsigned int oldchecksum;           // previous checksum of the page at this address
    unsigned long address;              // user virtual address this rmap_item tracks
    union {
        struct rb_node node;            // rb-tree node while linked into the unstable tree
        struct {   /* valid only while in the stable tree */
            struct stable_node *head;   // the stable_node this rmap_item hangs off
            struct hlist_node hlist;    // entry in that stable_node's hlist
        };
    };
};

struct mm_slot

描述添加到KSM系统中将来要被扫描的进程mm_struct数据结构

struct mm_slot {
    struct hlist_node link;         // entry in the mm_slots hash table
    struct list_head mm_list;       // entry in the mm_slot list headed at ksm_mm_head
    struct rmap_item *rmap_list;    // head of this mm's rmap_item list
    struct mm_struct *mm;           // the process's mm_struct
};

struct ksm_scan

当前的扫描状态

struct ksm_scan {
    struct mm_slot *mm_slot;        // the mm_slot currently being scanned
    unsigned long address;          // next user address to scan (a page address)
    struct rmap_item **rmap_list;   // link to the next rmap_item to scan
    unsigned long seqnr;            // bumped once per full pass; used to age out unstable-tree nodes
};

三、工作流程

上层的应用通过 madvise() 给某内存区域增加 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,造成对系统调用的访问,该系统调用由 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 定义。

SYSCALL_DEFINE3

在这里会进行一个预处理,如找到该内存区域的所有VMA,并调用 madvise_vma 进行进一步处理。

/*
 * madvise(2) entry point (excerpt): walks every VMA overlapping the
 * requested range and applies the advice to each via madvise_vma().
 * The argument-validation/locking prologue is elided at "...".
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
    unsigned long end, tmp;
    struct vm_area_struct *vma, *prev;
    int unmapped_error = 0;
    int error = -EINVAL;
    int write;      /* NOTE(review): set in the elided prologue; read below to pick the mmap_sem unlock */
    size_t len;
    struct blk_plug plug;

    ...

    vma = find_vma_prev(current->mm, start, &prev);
    if (vma && start > vma->vm_start)
        prev = vma;

    blk_start_plug(&plug);
    for (;;) {              // iterate over every VMA in the range

        error = -ENOMEM;
        if (!vma)
            goto out;

        /* Here start < (end|vma->vm_end). */
        if (start < vma->vm_start) {
            unmapped_error = -ENOMEM;
            start = vma->vm_start;
            if (start >= end)
                goto out;
        }

        // clamp this chunk to the end of the current VMA
        tmp = vma->vm_end;
        if (end < tmp)
            tmp = end;

        // hand the current VMA's sub-range [start, tmp) to madvise_vma, passing the advice along
        error = madvise_vma(vma, &prev, start, tmp, behavior);  
        if (error)
            goto out;
        start = tmp;
        if (prev && start < prev->vm_end)
            start = prev->vm_end;
        error = unmapped_error;
        if (start >= end)
            goto out;   // the normal termination exit
        if (prev)
            vma = prev->vm_next;
        else    
            vma = find_vma(current->mm, start);
    }
out:
    blk_finish_plug(&plug);
    if (write)
        up_write(&current->mm->mmap_sem);
    else
        up_read(&current->mm->mmap_sem);

    return error;
}

madvise_behavior

如果在某个VMA上发现 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,则触发 ksm_madvise

/*
 * Per-VMA madvise handler (excerpt): computes new vm_flags for this VMA
 * according to the requested behavior. MADV_MERGEABLE/MADV_UNMERGEABLE
 * are forwarded to ksm_madvise(), the entry point into KSM.
 */
static long madvise_behavior(struct vm_area_struct *vma,
             struct vm_area_struct **prev,
             unsigned long start, unsigned long end, int behavior)
{
    struct mm_struct *mm = vma->vm_mm;
    int error = 0;
    pgoff_t pgoff;
    unsigned long new_flags = vma->vm_flags;

    switch (behavior) {

    ...

    case MADV_MERGEABLE:
    case MADV_UNMERGEABLE:
        error = ksm_madvise(vma, start, end, behavior, &new_flags);
        if (error) {
            /*
             * madvise() returns EAGAIN if kernel resources, such as
             * slab, are temporarily unavailable.
             */
            if (error == -ENOMEM)
                error = -EAGAIN;
            goto out;
        }
        break;
    case MADV_HUGEPAGE:
    case MADV_NOHUGEPAGE:
        error = hugepage_madvise(vma, &new_flags, behavior);
        if (error) {
            /*
             * madvise() returns EAGAIN if kernel resources, such as
             * slab, are temporarily unavailable.
             */
            if (error == -ENOMEM)
                error = -EAGAIN;
            goto out;
        }
        break;
    }

    ...

success:
    vma->vm_flags = new_flags;   /* commit the new flags (the goto to "success" is in the elided code) */
out:
    return error;
}

ksm_madvise

在这一步会找到 vma 所属进程(mm),并判断标记决定是否对页面进行共享

如果需要共享,调用 __ksm_enter()并传递当前 vma 所属的 mm 地址。

/*
 * ksm_madvise - apply MADV_(UN)MERGEABLE advice to one VMA (excerpt).
 * On MADV_MERGEABLE, registers the owning mm with KSM (once per mm via
 * __ksm_enter) and sets VM_MERGEABLE in *vm_flags; on MADV_UNMERGEABLE,
 * breaks existing sharing in [start, end) and clears the flag.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, int advice, unsigned long *vm_flags)
{
    struct mm_struct *mm = vma->vm_mm;
    int err;

    switch (advice) {
    case MADV_MERGEABLE:
        ...

        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
            err = __ksm_enter(mm);  // the real entry into KSM: register this mm once
            if (err)
                return err;
        }

        *vm_flags |= VM_MERGEABLE;
        break;

    case MADV_UNMERGEABLE:
        if (!(*vm_flags & VM_MERGEABLE))
            return 0;       /* just ignore the advice */

        if (vma->anon_vma) {
            err = unmerge_ksm_pages(vma, start, end);   // undo sharing for pages already merged here
            if (err)
                return err;
        }

        *vm_flags &= ~VM_MERGEABLE;
        break;
    }

    return 0;
}

__ksm_enter

将进程的 mm 加入到 ksm_mm_head 链表中

/*
 * __ksm_enter - register a process's mm with KSM by queuing an mm_slot
 * on the ksm_mm_head list, waking ksmd if the list was previously empty.
 */
int __ksm_enter(struct mm_struct *mm)
{
    struct mm_slot *mm_slot;
    int needs_wakeup;

    mm_slot = alloc_mm_slot();      // allocate an mm_slot to represent this process's mm_struct
    if (!mm_slot)
        return -ENOMEM;

    /* Check ksm_run too?  Would need tighter locking */
    needs_wakeup = list_empty(&ksm_mm_head.mm_list);    // empty list means no mm_slot is currently being scanned

    spin_lock(&ksm_mmlist_lock);
    insert_to_mm_slots_hash(mm, mm_slot);       // record mm in mm_slot->mm and add it to the hash table
    /*
     * When KSM_RUN_MERGE (or KSM_RUN_STOP),
     * insert just behind the scanning cursor, to let the area settle
     * down a little; when fork is followed by immediate exec, we don't
     * want ksmd to waste time setting up and tearing down an rmap_list.
     *
     * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
     * scanning cursor, otherwise KSM pages in newly forked mms will be
     * missed: then we might as well insert at the end of the list.
     */
    if (ksm_run & KSM_RUN_UNMERGE)
        list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
    else
        list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); // queue mm_slot just behind the current scan cursor
    spin_unlock(&ksm_mmlist_lock);

    set_bit(MMF_VM_MERGEABLE, &mm->flags);       // mark this process as registered with KSM
    mmgrab(mm);

    if (needs_wakeup)
        wake_up_interruptible(&ksm_thread_wait);    // wake the ksmd kernel thread

    return 0;
}

ksm_init

创建内核线程ksmd

/*
 * ksm_init - module init for KSM: create the slab caches, start the
 * ksmd kernel thread, and register the sysfs interface under
 * /sys/kernel/mm/ksm.
 *
 * Fix: the thread name and pr_err messages were transcribed without
 * their quotes ("ksmd", "ksm: ..."), which is not valid C; the string
 * literals are restored to match the kernel source.
 */
static int __init ksm_init(void)
{
    struct task_struct *ksm_thread;
    int err;

    /* The correct value depends on page size and endianness */
    zero_checksum = calc_checksum(ZERO_PAGE(0));
    /* Default to false for backwards compatibility */
    ksm_use_zero_pages = false;

    err = ksm_slab_init();  /* create the ksm_rmap_item, ksm_stable_node and ksm_mm_slot caches */
    if (err)
        goto out;

    ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");    /* start ksmd; its thread function is ksm_scan_thread */
    if (IS_ERR(ksm_thread)) {
        pr_err("ksm: creating kthread failed\n");
        err = PTR_ERR(ksm_thread);
        goto out_free;
    }

#ifdef CONFIG_SYSFS
    err = sysfs_create_group(mm_kobj, &ksm_attr_group);     /* create the ksm nodes under /sys/kernel/mm */
    if (err) {
        pr_err("ksm: register sysfs failed\n");
        kthread_stop(ksm_thread);
        goto out_free;
    }
#else
    ksm_run = KSM_RUN_MERGE;    /* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
    /* There is no significance to this priority 100 */
    hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
    return 0;

out_free:
    ksm_slab_free();
out:
    return err;
}

ksm_scan_thread

只需要知道这里进入ksm_do_scan()进行实际的操作就可以了

/*
 * ksm_scan_thread - main loop of the ksmd kernel thread: repeatedly scan
 * a batch of pages, then sleep between batches (or block until there is
 * work to do).
 */
static int ksm_scan_thread(void *nothing)
{
    set_freezable();
    set_user_nice(current, 5);

    while (!kthread_should_stop()) {
        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksmd_should_run())
            ksm_do_scan(ksm_thread_pages_to_scan);      // the real worker; argument is the number of pages per batch
        mutex_unlock(&ksm_thread_mutex);

        try_to_freeze();

        if (ksmd_should_run()) {
            schedule_timeout_interruptible(
                msecs_to_jiffies(ksm_thread_sleep_millisecs));  // nap between batches
        } else {
            wait_event_freezable(ksm_thread_wait,
                ksmd_should_run() || kthread_should_stop());    // idle until woken (e.g. by __ksm_enter)
        }
    }
    return 0;
}

ksm_do_scan

/*
 * ksm_do_scan - scan a batch of scan_npages pages: fetch the next
 * candidate anonymous page and try to merge it via the two trees.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
    struct rmap_item *rmap_item;
    struct page *uninitialized_var(page);

    while (scan_npages-- && likely(!freezing(current))) {   // process scan_npages pages in this batch
        cond_resched();
        rmap_item = scan_get_next_rmap_item(&page);     // pick the next suitable anonymous page and its reverse-mapping item
        if (!rmap_item)
            return;
        cmp_and_merge_page(page, rmap_item);            // consult the stable/unstable trees to see whether this page can be merged
        put_page(page);
    }
}

cmp_and_merge_page

首先查看这个 page 是不是可以和 stable tree 中的 page 合并,不可以的话,检查下它的检验值和之前是否有变化,如果变化则认为是写频繁的页面,跳过它;否则,决定是将它插入到 unstable tree 中,或与其中节点合并加入到 stable tree 中。

/*
 * cmp_and_merge_page - first see whether page can be merged with one in
 * the stable tree; if not, checksum it to filter out frequently-written
 * pages, then try the unstable tree, promoting a successful two-page
 * merge into the stable tree.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
    struct rmap_item *tree_rmap_item;
    struct page *tree_page = NULL;
    struct stable_node *stable_node;
    struct page *kpage;
    unsigned int checksum;
    int err;

    stable_node = page_stable_node(page);
    if (stable_node) {
        /* already a KSM page: re-home its stable_node if it now sits on the wrong NUMA node */
        if (stable_node->head != &migrate_nodes &&
            get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
            rb_erase(&stable_node->node,
                 root_stable_tree + NUMA(stable_node->nid));
            stable_node->head = &migrate_nodes;
            list_add(&stable_node->list, stable_node->head);
        }
        if (stable_node->head != &migrate_nodes &&
            rmap_item->head == stable_node)
            return;     /* correctly placed already: nothing to do */
    }

    kpage = stable_tree_search(page);                       // search the stable tree
    if (kpage == page && rmap_item->head == stable_node) {   // the match is the page itself
        put_page(kpage);
        return;
    }

    /*
     * Detach this rmap_item from any stale stable/unstable tree state
     * kept from an earlier scan cycle, before deciding where the page
     * belongs now.
     */
    remove_rmap_item_from_tree(rmap_item);

    if (kpage) {    // kpage has identical contents but is a different page
        err = try_to_merge_with_ksm_page(rmap_item, page, kpage);   // try to merge page into kpage
        if (!err) {
            lock_page(kpage);
            stable_tree_append(rmap_item, page_stable_node(kpage)); // merged: hang rmap_item on the stable_node's hlist
            unlock_page(kpage);
        }
        put_page(kpage);
        return;
    }

    /******************************** unstable tree handling below ***********************/
    // recompute the checksum; if it changed, the page is written frequently and unsuitable for KSM
    checksum = calc_checksum(page);             
    if (rmap_item->oldchecksum != checksum) {
        rmap_item->oldchecksum = checksum;
        return;
    }

    /*
     * Same checksum as an empty page. We attempt to merge it with the
     * appropriate zero page if the user enabled this via sysfs.
     */
    if (ksm_use_zero_pages && (checksum == zero_checksum)) {
        struct vm_area_struct *vma;

        vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
        err = try_to_merge_one_page(vma, page,
                        ZERO_PAGE(rmap_item->address));
        /*
         * In case of failure, the page was not really empty, so we
         * need to continue. Otherwise we're done.
         */
        if (!err)
            return;
    }

    // search the unstable tree for a page with identical contents
    tree_rmap_item =
        unstable_tree_search_insert(rmap_item, page, &tree_page);
    if (tree_rmap_item) {
        // try to merge page and tree_page into a single kpage
        kpage = try_to_merge_two_pages(rmap_item, page,
                        tree_rmap_item, tree_page);
        put_page(tree_page);
        if (kpage) {
            /*
             * Merge succeeded: insert kpage into the stable tree and
             * attach both rmap_items to its stable_node.
             */
            lock_page(kpage);
            stable_node = stable_tree_insert(kpage);
            if (stable_node) {
                stable_tree_append(tree_rmap_item, stable_node);
                stable_tree_append(rmap_item, stable_node);
            }
            unlock_page(kpage);

            /*
             * If inserting into the stable tree failed, call
             * break_cow() to trigger a page fault that un-shares
             * this KSM page again.
             */
            if (!stable_node) {
                break_cow(tree_rmap_item);
                break_cow(rmap_item);
            }
        }
    }
}

try_to_merge_one_page

将两个 page 进行合并

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
                 struct page *page, struct page *kpage)
{
    pte_t orig_pte = __pte(0);
    int err = -EFAULT;

    if (page == kpage)          /* ksm page forked */
        return 0;

    if (!PageAnon(page))
        goto out;

    /*
     * We need the page lock to read a stable PageSwapCache in
     * write_protect_page().  
     */
    if (!trylock_page(page))    // don't block waiting for the lock here;
        goto out;               // skip this page and go deal with other pages

    if (PageTransCompound(page)) {  // transparent huge page? (returns nonzero if so)
        if (split_huge_page(page))  // split it into normal pages; nonzero means the split failed
            goto out_unlock;
    }

    /*
     * If this anonymous page is mapped only here, its pte may need
     * to be write-protected.  If it's mapped elsewhere, all of its
     * ptes are necessarily already write-protected.  But in either
     * case, we need to lock and check page_count is not raised.
     */
    if (write_protect_page(vma, page, &orig_pte) == 0) {
        if (!kpage) {
            /*
             * While we hold page lock, upgrade page from
             * PageAnon+anon_vma to PageKsm+NULL stable_node:
             * stable_tree_insert() will update stable_node.
             */
            set_page_stable_node(page, NULL);
            mark_page_accessed(page);
            /*
             * Page reclaim just frees a clean page with no dirty
             * ptes: make sure that the ksm page would be swapped.
             */
            if (!PageDirty(page))
                SetPageDirty(page);
            err = 0;
        } else if (pages_identical(page, kpage))
            err = replace_page(vma, page, kpage, orig_pte);     // contents match: point the pte at kpage instead of page
    }

    if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
        munlock_vma_page(page);
        if (!PageMlocked(kpage)) {
            unlock_page(page);
            lock_page(kpage);
            mlock_vma_page(kpage);
            page = kpage;       /* for final unlock */
        }
    }

out_unlock:
    unlock_page(page);
out:
    return err;
}

附:

scan_get_next_rmap_item()

遍历ksm_mm_head链表(即进程的mm),然后再遍历进程地址空间的每个VMA,然后通过get_next_rmap_item()返回rmap_item

/*
 * scan_get_next_rmap_item - walk the ksm_mm_head list of registered mms
 * and, within each mm, every VM_MERGEABLE VMA, returning the rmap_item
 * for the next anonymous page to examine (with *page pinned via FOLL_GET).
 * (Excerpt: the first-run initialization — including the next_mm label
 * targeted near the bottom — is elided at "...".)
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
    struct mm_struct *mm;
    struct mm_slot *slot;
    struct vm_area_struct *vma;
    struct rmap_item *rmap_item;
    int nid;

    if (list_empty(&ksm_mm_head.mm_list))   // no mm_slot registered: bail out
        return NULL;

    slot = ksm_scan.mm_slot;
    if (slot == &ksm_mm_head) {             // first run of ksmd: initialize the scan cursor
        ...
    }

    mm = slot->mm;
    down_read(&mm->mmap_sem);
    if (ksm_test_exit(mm))
        vma = NULL;
    else
        vma = find_vma(mm, ksm_scan.address);       // locate the VMA containing the scan address in this mm

    for (; vma; vma = vma->vm_next) {                // walk every VMA of the current mm
        if (!(vma->vm_flags & VM_MERGEABLE))
            continue;
        if (ksm_scan.address < vma->vm_start)
            ksm_scan.address = vma->vm_start;
        if (!vma->anon_vma)
            ksm_scan.address = vma->vm_end;         // no anonymous pages here: skip the whole VMA

        while (ksm_scan.address < vma->vm_end) {  // walk every page of the VMA
            if (ksm_test_exit(mm))
                break;
            *page = follow_page(vma, ksm_scan.address, FOLL_GET);   // translate the virtual address to its physical page
            if (IS_ERR_OR_NULL(*page)) {
                ksm_scan.address += PAGE_SIZE;
                cond_resched();
                continue;
            }
            if (PageAnon(*page)) {                  // only anonymous pages are candidates
                flush_anon_page(vma, *page, ksm_scan.address);
                flush_dcache_page(*page);
                rmap_item = get_next_rmap_item(slot,
                    ksm_scan.rmap_list, ksm_scan.address);      // look up the rmap_item for this address on mm_slot->rmap_list, allocating one if absent
                if (rmap_item) {
                    ksm_scan.rmap_list =
                            &rmap_item->rmap_list;
                    ksm_scan.address += PAGE_SIZE;
                } else
                    put_page(*page);
                up_read(&mm->mmap_sem);
                return rmap_item;
            }
            put_page(*page);
            ksm_scan.address += PAGE_SIZE;
            cond_resched();
        }
    }

    if (ksm_test_exit(mm)) {            // the for loop found no suitable anonymous page in any VMA of this mm
        ksm_scan.address = 0;
        ksm_scan.rmap_list = &slot->rmap_list;
    }
    /*
     * Nuke all the rmap_items that are above this current rmap:
     * because there were no VM_MERGEABLE vmas with such addresses.
     */
    remove_trailing_rmap_items(slot, ksm_scan.rmap_list);   // their addresses will never be scanned again, so free them

    spin_lock(&ksm_mmlist_lock);
    ksm_scan.mm_slot = list_entry(slot->mm_list.next,
                        struct mm_slot, mm_list);       // advance the cursor to the next mm_slot
    if (ksm_scan.address == 0) {            // the process is going away: unhash and free its mm_slot, clear MMF_VM_MERGEABLE
        /*
         * We've completed a full scan of all vmas, holding mmap_sem
         * throughout, and found no VM_MERGEABLE: so do the same as
         * __ksm_exit does to remove this mm from all our lists now.
         * This applies either when cleaning up after __ksm_exit
         * (but beware: we can reach here even before __ksm_exit),
         * or when all VM_MERGEABLE areas have been unmapped (and
         * mmap_sem then protects against race with MADV_MERGEABLE).
         */
        hash_del(&slot->link);
        list_del(&slot->mm_list);
        spin_unlock(&ksm_mmlist_lock);

        free_mm_slot(slot);
        clear_bit(MMF_VM_MERGEABLE, &mm->flags);
        up_read(&mm->mmap_sem);
        mmdrop(mm);
    } else {
        up_read(&mm->mmap_sem);
        /*
         * up_read(&mm->mmap_sem) first because after
         * spin_unlock(&ksm_mmlist_lock) run, the mm may
         * already have been freed under us by __ksm_exit()
         * because the mm_slot is still hashed and
         * ksm_scan.mm_slot doesn't point to it anymore.
         */
        spin_unlock(&ksm_mmlist_lock);
    }

    /* Repeat until we've completed scanning the whole list */
    slot = ksm_scan.mm_slot;
    if (slot != &ksm_mm_head)
        goto next_mm;           // continue with the next mm_slot (next_mm label is in the elided init block above)

    ksm_scan.seqnr++;           // one full pass over all mm_slots completed: bump the sequence number
    return NULL;
}