mm: ZONE_DEVICE for "device memory"

author Dan Williams <dan.j.williams@intel.com>

Sun, 9 Aug 2015 19:29:06 +0000 (15:29 -0400)

committer Dan Duval <dan.duval@oracle.com>

Wed, 7 Dec 2016 17:19:28 +0000 (12:19 -0500)
author Dan Williams <dan.j.williams@intel.com>
Sun, 9 Aug 2015 19:29:06 +0000 (15:29 -0400)
committer Dan Duval <dan.duval@oracle.com>
Wed, 7 Dec 2016 17:19:28 +0000 (12:19 -0500)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c

index a9b65cf7b34a74b120f842a8344a2ce763ca963a..1c864a9dc8a6934cc0aeffa32099977d10506c93 100644 (file)
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -652,7 +652,7 @@ mem_init (void)
  }
  
  #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         pg_data_t *pgdat;
         struct zone *zone;
@@ -663,7 +663,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
         pgdat = NODE_DATA(nid);
  
         zone = pgdat->node_zones +
-               zone_for_memory(nid, start, size, ZONE_NORMAL);
+               zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
         ret = __add_pages(nid, zone, start_pfn, nr_pages);
  
         if (ret)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c

index 45fda71feb27465fcd4079744d0a19cd9f168ea9..43968d473f8d157db07321ef86514e4476b605d2 100644 (file)
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
  }
  #endif
  
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         struct pglist_data *pgdata;
         struct zone *zone;
@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
  
         /* this should work for most non-highmem platforms */
         zone = pgdata->node_zones +
-               zone_for_memory(nid, start, size, 0);
+               zone_for_memory(nid, start, size, 0, for_device);
  
         return __add_pages(nid, zone, start_pfn, nr_pages);
  }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c

index 80875c43a4a4b16c11e8449d5cbf1f4db1b66d75..087798144a14838c87fe92a72f8cd07eeffab34f 100644 (file)
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
  #endif
  
  #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
         unsigned long start_pfn = PFN_DOWN(start);
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c

index 2790b6a64157f79663fe5232afe9f857e6d81cb7..c1490096b8637f61eac9c7ee9a3703ce6680dae2 100644 (file)
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
  #endif
  
  #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         pg_data_t *pgdat;
         unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
  
         /* We only have ZONE_NORMAL, so this is easy.. */
         ret = __add_pages(nid, pgdat->node_zones +
-                       zone_for_memory(nid, start, size, ZONE_NORMAL),
+                       zone_for_memory(nid, start, size, ZONE_NORMAL,
+                       for_device),
                         start_pfn, nr_pages);
         if (unlikely(ret))
                 printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c

index 5bd252e3fdc506a6aa393419252a94261768d400..d4e1fc41d06db21a475b1bdbd6508df5b7f5ab8a 100644 (file)
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -863,7 +863,7 @@ void __init mem_init(void)
   * memory to the highmem for now.
   */
  #ifndef CONFIG_NEED_MULTIPLE_NODES
-int arch_add_memory(u64 start, u64 size)
+int arch_add_memory(u64 start, u64 size, bool for_device)
  {
         struct pglist_data *pgdata = &contig_page_data;
         struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index c23ab1ee3a9a9d15e4c043d27452ecf99deee9b4..3c993d902b6411e6fe3f876dd7b32252c65c4550 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -823,11 +823,11 @@ void __init mem_init(void)
  }
  
  #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         struct pglist_data *pgdata = NODE_DATA(nid);
         struct zone *zone = pgdata->node_zones +
-               zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+               zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
  
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index f9977a7a94448479efb628cc90c2c11c1ae166e8..df48430c279b8688996b9f0074c08b1ce139af06 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
   * Memory is added always to NORMAL zone. This means you will never get
   * additional DMA/DMA32 memory.
   */
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
  {
         struct pglist_data *pgdat = NODE_DATA(nid);
         struct zone *zone = pgdat->node_zones +
-               zone_for_memory(nid, start, size, ZONE_NORMAL);
+               zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int ret;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h

index 4caa65ca3177c99868a77165fd73d212b34fb1f9..62ff5bd7d276a17c4ced47de6713ca869b25833d 100644 (file)
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -267,9 +267,10 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
  extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
                 void *arg, int (*func)(struct memory_block *, void *));
  extern int add_memory(int nid, u64 start, u64 size);
-extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
+extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+               bool for_device);
  extern int add_memory_resource(int nid, struct resource *resource);
-extern int arch_add_memory(int nid, u64 start, u64 size);
+extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
  extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
  extern bool is_memblock_offlined(struct memory_block *mem);
  extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 54d74f6eb233521d6cb84b2720a15c3cb2e6b734..ba700669d586793313bbb423272f656c0eac36bb 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -319,7 +319,11 @@ enum zone_type {
         ZONE_HIGHMEM,
  #endif
         ZONE_MOVABLE,
+#ifdef CONFIG_ZONE_DEVICE
+       ZONE_DEVICE,
+#endif
         __MAX_NR_ZONES
+
  };
  
  #ifndef __GENERATING_BOUNDS_H
@@ -786,6 +790,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
         return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
  }
  
+static inline int zone_id(const struct zone *zone)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       return zone - pgdat->node_zones;
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+       return zone_id(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool is_dev_zone(const struct zone *zone)
+{
+       return false;
+}
+#endif
+
  #include <linux/memory_hotplug.h>
  
  extern struct mutex zonelists_mutex;
diff --git a/mm/Kconfig b/mm/Kconfig

index 390214da45463e0b134709af6f072ddcc9162f7d..33f30121a7b9e09e000910095dc4d26c02f4c723 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -635,3 +635,38 @@ config MAX_STACK_SIZE_MB
           changed to a smaller value in which case that is used.
  
           A sane initial value is 80 MB.
+
+# For architectures that support deferred memory initialisation
+config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+       bool
+
+config DEFERRED_STRUCT_PAGE_INIT
+       bool "Defer initialisation of struct pages to kswapd"
+       default n
+       depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+       depends on MEMORY_HOTPLUG
+       help
+         Ordinarily all struct pages are initialised during early boot in a
+         single thread. On very large machines this can take a considerable
+         amount of time. If this option is set, large machines will bring up
+         a subset of memmap at boot and then initialise the rest in parallel
+         when kswapd starts. This has a potential performance impact on
+         processes running early in the lifetime of the system until kswapd
+         finishes the initialisation.
+
+config ZONE_DEVICE
+       bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+       default !ZONE_DMA
+       depends on !ZONE_DMA
+       depends on MEMORY_HOTPLUG
+       depends on MEMORY_HOTREMOVE
+       depends on X86_64 #arch_add_memory() comprehends device memory
+
+       help
+         Device memory hotplug support allows for establishing pmem,
+         or other device driver discovered memory regions, in the
+         memmap. This allows pfn_to_page() lookups of otherwise
+         "device-physical" addresses which is needed for using a DAX
+         mapping in an O_DIRECT operation, among other things.
+
+         If FS_DAX is enabled, then say Y.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 74bfdcaeebab45c37ff0c2f454d5a8e3dcdfee90..31981dd7aa32be20a5a8516454742c4f625cf3b6 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -769,7 +769,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
  
         start = phys_start_pfn << PAGE_SHIFT;
         size = nr_pages * PAGE_SIZE;
-       ret = release_mem_region_adjustable(&iomem_resource, start, size);
+
+       /* in the ZONE_DEVICE case the device driver owns the memory region */
+       if (!is_dev_zone(zone))
+               ret = release_mem_region_adjustable(&iomem_resource, start, size);
         if (ret) {
                 resource_size_t endres = start + size - 1;
  
@@ -1206,8 +1209,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size)
         return 0;
  }
  
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
+int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+               bool for_device)
  {
+#ifdef CONFIG_ZONE_DEVICE
+       if (for_device)
+               return ZONE_DEVICE;
+#endif
         if (should_add_memory_movable(nid, start, size))
                 return ZONE_MOVABLE;
  
@@ -1246,7 +1254,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
         }
  
         /* call arch's memory hotadd */
-       ret = arch_add_memory(nid, start, size);
+       ret = arch_add_memory(nid, start, size, false);
  
         if (ret < 0)
                 goto error;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index ab080bfeb5cd6bfb95d3ead697da2a6e7a1ed7c1..f1de8e4de4b574e6cbbfc9d90604b04570b5e13a 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,6 +205,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
          "HighMem",
  #endif
          "Movable",
+#ifdef CONFIG_ZONE_DEVICE
+        "Device",
+#endif
  };
  
  int min_free_kbytes = 1024;
author	Dan Williams <dan.j.williams@intel.com>
	Sun, 9 Aug 2015 19:29:06 +0000 (15:29 -0400)
committer	Dan Duval <dan.duval@oracle.com>
	Wed, 7 Dec 2016 17:19:28 +0000 (12:19 -0500)
arch/ia64/mm/init.c		patch \| blob \| history
arch/powerpc/mm/mem.c		patch \| blob \| history
arch/s390/mm/init.c		patch \| blob \| history
arch/sh/mm/init.c		patch \| blob \| history
arch/tile/mm/init.c		patch \| blob \| history
arch/x86/mm/init_32.c		patch \| blob \| history
arch/x86/mm/init_64.c		patch \| blob \| history
include/linux/memory_hotplug.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
mm/Kconfig		patch \| blob \| history
mm/memory_hotplug.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history