 #include <linux/bitops.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
 #include <linux/sched.h>
-#include <linux/vmstat.h>
 
 struct stable_node;
+struct mem_cgroup;
 
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 }
 
-static inline void page_add_ksm_rmap(struct page *page)
+/*
+ * When do_swap_page() first faults in from swap what used to be a KSM page,
+ * no problem, it will be assigned to this vma's anon_vma; but thereafter,
+ * it might be faulted into a different anon_vma (or perhaps to a different
+ * offset in the same anon_vma).  do_swap_page() cannot do all the locking
+ * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
+ * a copy, and leave remerging the pages to a later pass of ksmd.
+ *
+ * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
+ * but what if the vma was unmerged while the page was swapped out?
+ */
+struct page *ksm_does_need_to_copy(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address);
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address)
 {
-       if (atomic_inc_and_test(&page->_mapcount))
-               __inc_zone_page_state(page, NR_ANON_PAGES);
+       struct anon_vma *anon_vma = page_anon_vma(page);
+
+       if (!anon_vma ||
+           (anon_vma == vma->anon_vma &&
+            page->index == linear_page_index(vma, address)))
+               return page;
+
+       return ksm_does_need_to_copy(page, vma, address);
 }
+
+int page_referenced_ksm(struct page *page,
+                       struct mem_cgroup *memcg, unsigned long *vm_flags);
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
+
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
        return 0;
 }
 
-/* No stub required for page_add_ksm_rmap(page) */
+static inline struct page *ksm_might_need_to_copy(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address)
+{
+       return page;
+}
+
+static inline int page_referenced_ksm(struct page *page,
+                       struct mem_cgroup *memcg, unsigned long *vm_flags)
+{
+       return 0;
+}
+
+static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+       return 0;
+}
 #endif /* !CONFIG_KSM */
 
-#endif
+#endif /* __LINUX_KSM_H */
 
  */
 int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *cnt, unsigned long *vm_flags);
+int page_referenced_one(struct page *, struct vm_area_struct *,
+       unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
+
 enum ttu_flags {
        TTU_UNMAP = 0,                  /* unmap mode */
        TTU_MIGRATION = 1,              /* migration mode */
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
 int try_to_unmap(struct page *, enum ttu_flags flags);
+int try_to_unmap_one(struct page *, struct vm_area_struct *,
+                       unsigned long address, enum ttu_flags flags);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
 
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
 
+/*
+ * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm();
+ * later we shall rework things a little to get the right vma to them.
+ */
+static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
+static struct vm_area_struct ksm_fallback_vma;
+
 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
                sizeof(struct __struct), __alignof__(struct __struct),\
                (__flags), NULL)
 {
        if (rmap_item->address & STABLE_FLAG) {
                struct stable_node *stable_node;
+               struct page *page;
 
                stable_node = rmap_item->head;
+               page = stable_node->page;
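+               /*
+                * Hold the ksm page lock across the hlist update:
+                * page_referenced_ksm() and try_to_unmap_ksm() walk
+                * this hlist under the page lock.
+                */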
+               lock_page(page);
+
                hlist_del(&rmap_item->hlist);
-               if (stable_node->hlist.first)
+               if (stable_node->hlist.first) {
+                       unlock_page(page);
                        ksm_pages_sharing--;
-               else {
-                       set_page_stable_node(stable_node->page, NULL);
-                       put_page(stable_node->page);
+               } else {
+                       set_page_stable_node(page, NULL);
+                       unlock_page(page);
+                       put_page(page);
 
                        rb_erase(&stable_node->node, &root_stable_tree);
                        free_stable_node(stable_node);
        }
 
        get_page(kpage);
-       page_add_ksm_rmap(kpage);
+       page_add_anon_rmap(kpage, vma, addr);
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush(vma, addr, ptep);
            pages_identical(page, kpage))
                err = replace_page(vma, page, kpage, orig_pte);
 
-       if ((vma->vm_flags & VM_LOCKED) && !err)
+       if ((vma->vm_flags & VM_LOCKED) && !err) {
                munlock_vma_page(page);
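+               /*
+                * The old page has just been replaced by the ksm page in
+                * this VM_LOCKED vma: carry the mlock over to the ksm page
+                * so that it too stays off the evictable LRU lists.
+                */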
+               if (!PageMlocked(kpage)) {
+                       unlock_page(page);
+                       lru_add_drain();
+                       lock_page(kpage);
+                       mlock_vma_page(kpage);
+                       page = kpage;           /* for final unlock */
+               }
+       }
 
        unlock_page(page);
 out:
 
        copy_user_highpage(kpage, page, rmap_item->address, vma);
 
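+       /*
+        * Set the new ksm page up like an ordinary anonymous page: dirty,
+        * uptodate and swap backed, and put it on the active anon lru, so
+        * that vmscan can find it and write it out to swap.
+        */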
+       SetPageDirty(kpage);
+       __SetPageUptodate(kpage);
+       SetPageSwapBacked(kpage);
        set_page_stable_node(kpage, NULL);      /* mark it PageKsm */
+       lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
 
        err = try_to_merge_one_page(vma, page, kpage);
 up:
                         * The page was successfully merged:
                         * add its rmap_item to the stable tree.
                         */
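+                       /* the hlist is only ever changed under the ksm page lock */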
+                       lock_page(kpage);
                        stable_tree_append(rmap_item, stable_node);
+                       unlock_page(kpage);
                }
                put_page(kpage);
                return;
                if (kpage) {
                        remove_rmap_item_from_tree(tree_rmap_item);
 
+                       lock_page(kpage);
                        stable_node = stable_tree_insert(kpage);
                        if (stable_node) {
                                stable_tree_append(tree_rmap_item, stable_node);
                                stable_tree_append(rmap_item, stable_node);
                        }
+                       unlock_page(kpage);
                        put_page(kpage);
 
                        /*
                        return;
                if (!PageKsm(page) || !in_stable_tree(rmap_item))
                        cmp_and_merge_page(page, rmap_item);
-               else if (page_mapcount(page) == 1) {
-                       /*
-                        * Replace now-unshared ksm page by ordinary page.
-                        */
-                       break_cow(rmap_item);
-                       remove_rmap_item_from_tree(rmap_item);
-                       rmap_item->oldchecksum = calc_checksum(page);
-               }
                put_page(page);
        }
 }
                if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
                                 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
                                 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-                                VM_MIXEDMAP  | VM_SAO))
+                                VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
                        return 0;               /* just ignore the advice */
 
                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
        }
 }
 
+struct page *ksm_does_need_to_copy(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address)
+{
+       struct page *new_page;
+
+       unlock_page(page);      /* any racers will COW it, not modify it */
+
+       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+       if (new_page) {
+               copy_user_highpage(new_page, page, address, vma);
+
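+               /*
+                * Dress the copy as a freshly faulted anonymous page:
+                * do_swap_page() expects to get back an uptodate, locked
+                * page which it can map in place of the old ksm page.
+                */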
+               SetPageDirty(new_page);
+               __SetPageUptodate(new_page);
+               SetPageSwapBacked(new_page);
+               __set_page_locked(new_page);
+
+               if (page_evictable(new_page, vma))
+                       lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+               else
+                       add_page_to_unevictable_list(new_page);
+       }
+
+       page_cache_release(page);
+       return new_page;
+}
+
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+                       unsigned long *vm_flags)
+{
+       struct stable_node *stable_node;
+       struct rmap_item *rmap_item;
+       struct hlist_node *hlist;
+       unsigned int mapcount = page_mapcount(page);
+       int referenced = 0;
+       struct vm_area_struct *vma;
+
+       VM_BUG_ON(!PageKsm(page));
+       VM_BUG_ON(!PageLocked(page));
+
+       stable_node = page_stable_node(page);
+       if (!stable_node)
+               return 0;
+
+       /*
+        * Temporary hack: really we need anon_vma in rmap_item, to
+        * provide the correct vma, and to find recently forked instances.
+        * Use zalloc so that any fields we do not set below start out zeroed.
+        */
+       vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+       if (!vma) {
+               spin_lock(&ksm_fallback_vma_lock);
+               vma = &ksm_fallback_vma;
+       }
+
+       hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+               if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
+                       continue;
+
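+               /*
+                * Point the dummy vma at a single page of this mm, at the
+                * rmap_item's address, for page_referenced_one() to use.
+                */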
+               vma->vm_mm = rmap_item->mm;
+               vma->vm_start = rmap_item->address;
+               vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+               referenced += page_referenced_one(page, vma,
+                               rmap_item->address, &mapcount, vm_flags);
+               if (!mapcount)
+                       goto out;
+       }
+out:
+       if (vma == &ksm_fallback_vma)
+               spin_unlock(&ksm_fallback_vma_lock);
+       else
+               kmem_cache_free(vm_area_cachep, vma);
+       return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+       struct stable_node *stable_node;
+       struct hlist_node *hlist;
+       struct rmap_item *rmap_item;
+       int ret = SWAP_AGAIN;
+       struct vm_area_struct *vma;
+
+       VM_BUG_ON(!PageKsm(page));
+       VM_BUG_ON(!PageLocked(page));
+
+       stable_node = page_stable_node(page);
+       if (!stable_node)
+               return SWAP_FAIL;
+
+       /*
+        * Temporary hack: really we need anon_vma in rmap_item, to
+        * provide the correct vma, and to find recently forked instances.
+        * Use zalloc so that any fields we do not set below start out zeroed.
+        */
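+       /*
+        * Only plain unmap is attempted: the dummy vma below cannot stand
+        * in for the real one in the munlock or migration cases.
+        */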
+       if (TTU_ACTION(flags) != TTU_UNMAP)
+               return SWAP_FAIL;
+
+       vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+       if (!vma) {
+               spin_lock(&ksm_fallback_vma_lock);
+               vma = &ksm_fallback_vma;
+       }
+
+       hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+               vma->vm_mm = rmap_item->mm;
+               vma->vm_start = rmap_item->address;
+               vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+               ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
+               if (ret != SWAP_AGAIN || !page_mapped(page))
+                       goto out;
+       }
+out:
+       if (vma == &ksm_fallback_vma)
+               spin_unlock(&ksm_fallback_vma_lock);
+       else
+               kmem_cache_free(vm_area_cachep, vma);
+       return ret;
+}
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.
 
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
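+       /*
+        * What used to be a ksm page may have to be copied now, if it can
+        * no longer rejoin its old anon_vma: see the comment above
+        * ksm_might_need_to_copy() in ksm.h.
+        */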
+       page = ksm_might_need_to_copy(page, vma, address);
+       if (!page) {
+               ret = VM_FAULT_OOM;
+               goto out;
+       }
+
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
                goto out_page;
 
 #include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
  */
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
-                              unsigned long address, unsigned int *mapcount,
-                              unsigned long *vm_flags)
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+                       unsigned long address, unsigned int *mapcount,
+                       unsigned long *vm_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte;
                    unsigned long *vm_flags)
 {
        int referenced = 0;
+       int we_locked = 0;
 
        if (TestClearPageReferenced(page))
                referenced++;
 
        *vm_flags = 0;
        if (page_mapped(page) && page_rmapping(page)) {
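+               /*
+                * The ksm and file cases below need the page lock: if we
+                * cannot get it here, be safe and report the page referenced.
+                */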
-               if (PageAnon(page))
+               if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+                       we_locked = trylock_page(page);
+                       if (!we_locked) {
+                               referenced++;
+                               goto out;
+                       }
+               }
+               if (unlikely(PageKsm(page)))
+                       referenced += page_referenced_ksm(page, mem_cont,
+                                                               vm_flags);
+               else if (PageAnon(page))
                        referenced += page_referenced_anon(page, mem_cont,
                                                                vm_flags);
-               else if (is_locked)
+               else if (page->mapping)
                        referenced += page_referenced_file(page, mem_cont,
                                                                vm_flags);
-               else if (!trylock_page(page))
-                       referenced++;
-               else {
-                       if (page->mapping)
-                               referenced += page_referenced_file(page,
-                                                       mem_cont, vm_flags);
+               if (we_locked)
                        unlock_page(page);
-               }
        }
-
+out:
        if (page_test_and_clear_young(page))
                referenced++;
 
        BUG_ON(!anon_vma);
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;
-
        page->index = linear_page_index(vma, address);
-
-       /*
-        * nr_mapped state can be updated without turning off
-        * interrupts because it is not modified via interrupt.
-        */
-       __inc_zone_page_state(page, NR_ANON_PAGES);
 }
 
 /**
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
  *
- * The caller needs to hold the pte lock and the page must be locked.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize checking of page->mapping and
+ * page->index against their being set up here.
  */
 void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
+       int first = atomic_inc_and_test(&page->_mapcount);
+       if (first)
+               __inc_zone_page_state(page, NR_ANON_PAGES);
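+       /*
+        * A PageKsm page keeps its stable node in page->mapping, so do not
+        * let the anon_vma code below overwrite or check it.
+        */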
+       if (unlikely(PageKsm(page)))
+               return;
+
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-       if (atomic_inc_and_test(&page->_mapcount))
+       if (first)
                __page_set_anon_rmap(page, vma, address);
        else
                __page_check_anon_rmap(page, vma, address);
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+       __inc_zone_page_state(page, NR_ANON_PAGES);
        __page_set_anon_rmap(page, vma, address);
        if (page_evictable(page, vma))
                lru_cache_add_lru(page, LRU_ACTIVE_ANON);
  * Subfunctions of try_to_unmap: try_to_unmap_one called
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                           unsigned long address, enum ttu_flags flags)
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+                    unsigned long address, enum ttu_flags flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte;
 
        BUG_ON(!PageLocked(page));
 
-       if (PageAnon(page))
+       if (unlikely(PageKsm(page)))
+               ret = try_to_unmap_ksm(page, flags);
+       else if (PageAnon(page))
                ret = try_to_unmap_anon(page, flags);
        else
                ret = try_to_unmap_file(page, flags);
  *
  * SWAP_AGAIN  - no vma is holding page mlocked, or,
  * SWAP_AGAIN  - page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL   - page cannot be located at present
  * SWAP_MLOCK  - page is now mlocked.
  */
 int try_to_munlock(struct page *page)
 {
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
-       if (PageAnon(page))
+       if (unlikely(PageKsm(page)))
+               return try_to_unmap_ksm(page, TTU_MUNLOCK);
+       else if (PageAnon(page))
                return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
                return try_to_unmap_file(page, TTU_MUNLOCK);
 }
-
 
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
        int count;
 
        VM_BUG_ON(!PageLocked(page));
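+       /*
+        * A ksm page may still be mapped by other processes, so it can
+        * never be reused for an exclusive write: force copy-on-write.
+        */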
+       if (unlikely(PageKsm(page)))
+               return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
                        SetPageDirty(page);
                }
        }
-       return count == 1;
+       return count <= 1;
 }
 
 /*
                 * read from disk into another page.  Splitting into two
                 * pages would be incorrect if swap supported "shared
                 * private" pages, but they are handled by tmpfs files.
+                *
+                * Given how unuse_vma() targets one particular offset
+                * in an anon_vma, once the anon_vma has been determined,
+                * this splitting happens to be just what is needed to
+                * handle where KSM pages have been swapped out: re-reading
+                * is unnecessarily slow, but we can fix that later on.
                 */
                if (swap_count(*swap_map) &&
                     PageDirty(page) && PageSwapCache(page)) {