* the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT    5
+#ifndef CONFIG_MIGRATION
 #define MAX_SWAPFILES          (1 << MAX_SWAPFILES_SHIFT)
+#else
+/* Use the last two swap types for page migration entries */
+#define MAX_SWAPFILES          ((1 << MAX_SWAPFILES_SHIFT)-2)
+#define SWP_MIGRATION_READ     MAX_SWAPFILES
+#define SWP_MIGRATION_WRITE    (MAX_SWAPFILES + 1)
+#endif
 
 /*
  * Magic header for a swap area. The first part of the union is
 
        BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
        return __swp_entry_to_pte(arch_entry);
 }
+
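+/*
+ * Migration entries are swap entries that use the reserved swap types
+ * SWP_MIGRATION_READ and SWP_MIGRATION_WRITE and store a pfn in the
+ * offset field instead of a swap offset.  They keep a pte in a defined
+ * state while its page is being migrated; the page must stay locked for
+ * as long as such entries exist.
+ */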
+#ifdef CONFIG_MIGRATION
+static inline swp_entry_t make_migration_entry(struct page *page, int write)
+{
+       BUG_ON(!PageLocked(page));
+       return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
+                       page_to_pfn(page));
+}
+
+static inline int is_migration_entry(swp_entry_t entry)
+{
+       return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
+                       swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+       return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline struct page *migration_entry_to_page(swp_entry_t entry)
+{
+       struct page *p = pfn_to_page(swp_offset(entry));
+       /*
+        * Any use of migration entries may only occur while the
+        * corresponding page is locked.
+        */
+       BUG_ON(!PageLocked(p));
+       return p;
+}
+
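+/* Turn a write migration entry into a read-only migration entry. */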
+static inline void make_migration_entry_read(swp_entry_t *entry)
+{
+       *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+}
+
+extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                                       unsigned long address);
+#else
+
+#define make_migration_entry(page, write) swp_entry(0, 0)
+#define is_migration_entry(swp) 0
+#define migration_entry_to_page(swp) NULL
+static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                                        unsigned long address) { }
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+       return 0;
+}
+
+#endif
+
 
        /* pte contains position in swap or file, so copy. */
        if (unlikely(!pte_present(pte))) {
                if (!pte_file(pte)) {
-                       swap_duplicate(pte_to_swp_entry(pte));
+                       swp_entry_t entry = pte_to_swp_entry(pte);
+
+                       swap_duplicate(entry);
                        /* make sure dst_mm is on swapoff's mmlist. */
                        if (unlikely(list_empty(&dst_mm->mmlist))) {
                                spin_lock(&mmlist_lock);
                                                 &src_mm->mmlist);
                                spin_unlock(&mmlist_lock);
                        }
+                       if (is_write_migration_entry(entry) &&
+                                       is_cow_mapping(vm_flags)) {
+                               /*
+                                * COW mappings require the pages in both the
+                                * parent and the child to be mapped read-only.
+                                */
+                               make_migration_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               set_pte_at(src_mm, addr, src_pte, pte);
+                       }
                }
                goto out_set_pte;
        }
                goto out;
 
        entry = pte_to_swp_entry(orig_pte);
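+       /*
+        * A migration entry is not a real swap entry; wait for the
+        * migration to finish and then let the fault be retried.
+        */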
+       if (is_migration_entry(entry)) {
+               migration_entry_wait(mm, pmd, address);
+               goto out;
+       }
        page = lookup_swap_cache(entry);
        if (!page) {
                swapin_readahead(entry, address, vma);
 
 #include <linux/migrate.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/swapops.h>
 
 #include "internal.h"
 
        return count;
 }
 
+/*
+ * A swap-style pte: not empty, not present and not a file pte.  Both
+ * real swap entries and migration entries are encoded this way.
+ */
+static inline int is_swap_pte(pte_t pte)
+{
+       return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+               struct page *old, struct page *new)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       swp_entry_t entry;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+
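+       /*
+        * Walk the page tables; if the address is not mapped there can be
+        * no migration pte to restore.
+        */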
+       pgd = pgd_offset(mm, addr);
+       if (!pgd_present(*pgd))
+                return;
+
+       pud = pud_offset(pgd, addr);
+       if (!pud_present(*pud))
+                return;
+
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd))
+               return;
+
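+       /*
+        * Peek at the pte without the lock first and take the pte lock
+        * only if it looks like a swap-type pte.
+        */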
+       ptep = pte_offset_map(pmd, addr);
+
+       if (!is_swap_pte(*ptep)) {
+               pte_unmap(ptep);
+               return;
+       }
+
+       ptl = pte_lockptr(mm, pmd);
+       spin_lock(ptl);
+       pte = *ptep;
+       if (!is_swap_pte(pte))
+               goto out;
+
+       entry = pte_to_swp_entry(pte);
+
+       if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+               goto out;
+
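+       /*
+        * Re-establish the mapping: account the anonymous page, take a
+        * reference, make the pte writable again only if the migration
+        * entry allowed writes, and restore the reverse mapping.
+        */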
+       inc_mm_counter(mm, anon_rss);
+       get_page(new);
+       pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+       if (is_write_migration_entry(entry))
+               pte = pte_mkwrite(pte);
+       set_pte_at(mm, addr, ptep, pte);
+       page_add_anon_rmap(new, vma, addr);
+out:
+       pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Get rid of all migration entries and replace them with
+ * references to the indicated page.
+ *
+ * The caller must hold mmap_sem on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+       struct anon_vma *anon_vma;
+       struct vm_area_struct *vma;
+       unsigned long mapping;
+
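+       /*
+        * Only anonymous pages keep their anon_vma, tagged with
+        * PAGE_MAPPING_ANON, in page->mapping.
+        */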
+       mapping = (unsigned long)new->mapping;
+
+       if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+               return;
+
+       /*
+        * We hold mmap_sem, so there is no need to call page_lock_anon_vma().
+        */
+       anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+       spin_lock(&anon_vma->lock);
+
+       list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+               unsigned long addr = page_address_in_vma(new, vma);
+
+               /*
+                * page_address_in_vma() returns -EFAULT when the vma
+                * does not map the page; skip those vmas.
+                */
+               if (addr != -EFAULT)
+                       remove_migration_pte(vma, addr, old, new);
+       }
+
+       spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Something accessed the pte of a page that is under migration.  We
+ * need to find the page and wait until the migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                               unsigned long address)
+{
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+       swp_entry_t entry;
+       struct page *page;
+
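+       /*
+        * Recheck the pte under the pte lock; it may have changed since
+        * do_swap_page() looked at it.
+        */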
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       pte = *ptep;
+       if (!is_swap_pte(pte))
+               goto out;
+
+       entry = pte_to_swp_entry(pte);
+       if (!is_migration_entry(entry))
+               goto out;
+
+       page = migration_entry_to_page(entry);
+
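+       /*
+        * Take a reference so the page cannot be freed while we drop the
+        * pte lock and sleep until migration unlocks the page.
+        */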
+       get_page(page);
+       pte_unmap_unlock(ptep, ptl);
+       wait_on_page_locked(page);
+       put_page(page);
+       return;
+out:
+       pte_unmap_unlock(ptep, ptl);
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
 
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot)
 {
-       pte_t *pte;
+       pte_t *pte, oldpte;
        spinlock_t *ptl;
 
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        do {
-               if (pte_present(*pte)) {
+               oldpte = *pte;
+               if (pte_present(oldpte)) {
                        pte_t ptent;
 
                        /* Avoid an SMP race with hardware updated dirty/clean
                        ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
                        set_pte_at(mm, addr, pte, ptent);
                        lazy_mmu_prot_update(ptent);
+#ifdef CONFIG_MIGRATION
+               } else if (!pte_file(oldpte)) {
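+                       /*
+                        * A swap-type pte: only write migration entries
+                        * need their protection adjusted here.
+                        */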
+                       swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+                       if (is_write_migration_entry(entry)) {
+                               /*
+                                * A protection check is difficult here, so
+                                * just be safe and drop the write permission.
+                                */
+                               make_migration_entry_read(&entry);
+                               set_pte_at(mm, addr, pte,
+                                       swp_entry_to_pte(entry));
+                       }
+#endif
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
 }
 
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
-                       list_add(&vma->anon_vma_node, &anon_vma->head);
+                       list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                        allocated = NULL;
                }
                spin_unlock(&mm->page_table_lock);
        struct anon_vma *anon_vma = vma->anon_vma;
 
        if (anon_vma) {
-               list_add(&vma->anon_vma_node, &anon_vma->head);
+               list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                validate_anon_vma(vma);
        }
 }
 
        if (anon_vma) {
                spin_lock(&anon_vma->lock);
-               list_add(&vma->anon_vma_node, &anon_vma->head);
+               list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                validate_anon_vma(vma);
                spin_unlock(&anon_vma->lock);
        }
 
        if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
-               /*
-                * Store the swap location in the pte.
-                * See handle_pte_fault() ...
-                */
-               BUG_ON(!PageSwapCache(page));
-               swap_duplicate(entry);
-               if (list_empty(&mm->mmlist)) {
-                       spin_lock(&mmlist_lock);
-                       if (list_empty(&mm->mmlist))
-                               list_add(&mm->mmlist, &init_mm.mmlist);
-                       spin_unlock(&mmlist_lock);
+
+               if (PageSwapCache(page)) {
+                       /*
+                        * Store the swap location in the pte.
+                        * See handle_pte_fault() ...
+                        */
+                       swap_duplicate(entry);
+                       if (list_empty(&mm->mmlist)) {
+                               spin_lock(&mmlist_lock);
+                               if (list_empty(&mm->mmlist))
+                                       list_add(&mm->mmlist, &init_mm.mmlist);
+                               spin_unlock(&mmlist_lock);
+                       }
+               } else {
+                       /*
+                        * Store the pfn of the page in a special migration
+                        * pte. do_swap_page() will wait until the migration
+                        * pte is removed and then restart fault handling.
+                        */
+                       BUG_ON(!migration);
+                       entry = make_migration_entry(page, pte_write(pteval));
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
 
        struct swap_info_struct * p;
        struct page *page = NULL;
 
+       /* Migration entries are not real swap entries; nothing to free. */
+       if (is_migration_entry(entry))
+               return;
+
        p = swap_info_get(entry);
        if (p) {
                if (swap_entry_free(p, swp_offset(entry)) == 1) {
                if (!(p->flags & SWP_USED))
                        break;
        error = -EPERM;
-       /*
-        * Test if adding another swap device is possible. There are
-        * two limiting factors: 1) the number of bits for the swap
-        * type swp_entry_t definition and 2) the number of bits for
-        * the swap type in the swap ptes as defined by the different
-        * architectures. To honor both limitations a swap entry
-        * with swap offset 0 and swap type ~0UL is created, encoded
-        * to a swap pte, decoded to a swp_entry_t again and finally
-        * the swap type part is extracted. This will mask all bits
-        * from the initial ~0UL that can't be encoded in either the
-        * swp_entry_t or the architecture definition of a swap pte.
-        */
-       if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+       if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
                goto out;
        }
        unsigned long offset, type;
        int result = 0;
 
+       /*
+        * Migration entries are not counted in the swap map;
+        * just report success.
+        */
+       if (is_migration_entry(entry))
+               return 1;
+
        type = swp_type(entry);
        if (type >= nr_swapfiles)
                goto bad_file;