mm/slub: add opt-in slub_percpu_array

author Vlastimil Babka <vbabka@suse.cz>

Wed, 15 Nov 2023 10:38:15 +0000 (11:38 +0100)

committer Liam R. Howlett <Liam.Howlett@oracle.com>

Tue, 21 Nov 2023 18:03:21 +0000 (13:03 -0500)
author Vlastimil Babka <vbabka@suse.cz>
Wed, 15 Nov 2023 10:38:15 +0000 (11:38 +0100)
committer Liam R. Howlett <Liam.Howlett@oracle.com>
Tue, 21 Nov 2023 18:03:21 +0000 (13:03 -0500)
diff --git a/include/linux/slab.h b/include/linux/slab.h

index d6d6ffeeb9a2af2f3ff497b48c08e2c19f8d0efb..fe0c0981be592cd9ec6f6bc6ab59f50b9fa3dec7 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -197,6 +197,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
  void kmem_cache_destroy(struct kmem_cache *s);
  int kmem_cache_shrink(struct kmem_cache *s);
  
+int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count);
+
  /*
   * Please use this macro to create slab caches. Simply specify the
   * name of the structure and maybe some flags that are listed above.
@@ -512,6 +514,8 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
  void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
  
+int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, gfp_t gfp);
+
  static __always_inline void kfree_bulk(size_t size, void **p)
  {
         kmem_cache_free_bulk(NULL, size, p);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h

index deb90cf4bffb05ecbc69e8654431f470f8faf6e0..2083aa84976625b34f6d21f954e1f95b5645b3ed 100644 (file)
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -13,8 +13,10 @@
  #include <linux/local_lock.h>
  
  enum stat_item {
+       ALLOC_PCA,              /* Allocation from percpu array cache */
         ALLOC_FASTPATH,         /* Allocation from cpu slab */
         ALLOC_SLOWPATH,         /* Allocation by getting a new cpu slab */
+       FREE_PCA,               /* Free to percpu array cache */
         FREE_FASTPATH,          /* Free to cpu slab */
         FREE_SLOWPATH,          /* Freeing not to cpu slab */
         FREE_FROZEN,            /* Freeing to frozen slab */
@@ -39,6 +41,8 @@ enum stat_item {
         CPU_PARTIAL_FREE,       /* Refill cpu partial on free */
         CPU_PARTIAL_NODE,       /* Refill cpu partial from node partial */
         CPU_PARTIAL_DRAIN,      /* Drain cpu partial to node partial */
+       PCA_REFILL,             /* Refilling empty percpu array cache */
+       PCA_FLUSH,              /* Flushing full percpu array cache */
         NR_SLUB_STAT_ITEMS
  };
  
@@ -66,6 +70,13 @@ struct kmem_cache_cpu {
  };
  #endif /* CONFIG_SLUB_TINY */
  
+/*
+ * Opt-in per-cpu array of cached object pointers for a kmem_cache
+ * (kmem_cache.cpu_array). One instance exists per possible cpu and is set up
+ * by kmem_cache_setup_percpu_array().
+ */
+struct slub_percpu_array {
+       spinlock_t lock;        /* protects used and objects[] */
+       unsigned int count;     /* capacity: number of slots in objects[] */
+       unsigned int used;      /* number of slots currently holding an object */
+       void * objects[];       /* used as a stack; the top is objects[used - 1] */
+};
+
  #ifdef CONFIG_SLUB_CPU_PARTIAL
  #define slub_percpu_partial(c)         ((c)->partial)
  
@@ -99,6 +110,7 @@ struct kmem_cache {
  #ifndef CONFIG_SLUB_TINY
         struct kmem_cache_cpu __percpu *cpu_slab;
  #endif
+       struct slub_percpu_array __percpu *cpu_array;
         /* Used for retrieving partial slabs, etc. */
         slab_flags_t flags;
         unsigned long min_partial;
diff --git a/mm/Kconfig b/mm/Kconfig

index 89971a894b6058fd7b908c0e8ffa216cbdc887dd..aa53c51bb4a65d34d2e2f5411fa2905761cf46cb 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -237,6 +237,7 @@ choice
  config SLAB_DEPRECATED
         bool "SLAB (DEPRECATED)"
         depends on !PREEMPT_RT
+       depends on BROKEN
         help
           Deprecated and scheduled for removal in a few cycles. Replaced by
           SLUB.
diff --git a/mm/slub.c b/mm/slub.c

index 59912a376c6d5861c1cd8e8ed10286db9700acd5..b0879714279bd2f52581ec722f9664169ca18e58 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -188,6 +188,79 @@ do {                                       \
  #define USE_LOCKLESS_FAST_PATH()       (false)
  #endif
  
+/* copy/pasted  from mm/page_alloc.c */
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)     do { } while (0)
+#define pcp_trylock_finish(flag)       do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)     local_irq_save(flags)
+#define pcp_trylock_finish(flags)      local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()                preempt_disable()
+#define pcpu_task_unpin()      preempt_enable()
+#else
+#define pcpu_task_pin()                migrate_disable()
+#define pcpu_task_unpin()      migrate_enable()
+#endif
+
+/*
+ * Generic helper to look up and lock a per-cpu variable with an embedded
+ * spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)                              \
+({                                                                     \
+       type *_ret;                                                     \
+       pcpu_task_pin();                                                \
+       _ret = this_cpu_ptr(ptr);                                       \
+       spin_lock(&_ret->member);                                       \
+       _ret;                                                           \
+})
+
+/*
+ * Trylock variant of pcpu_spin_lock(): evaluates to the locked per-cpu
+ * pointer, or to NULL (with the task already unpinned again) when the
+ * spinlock could not be taken.
+ */
+#define pcpu_spin_trylock(type, member, ptr)                           \
+({                                                                     \
+       type *_ret;                                                     \
+       pcpu_task_pin();                                                \
+       _ret = this_cpu_ptr(ptr);                                       \
+       if (!spin_trylock(&_ret->member)) {                             \
+               pcpu_task_unpin();                                      \
+               _ret = NULL;                                            \
+       }                                                               \
+       _ret;                                                           \
+})
+
+/*
+ * Release a lock taken by pcpu_spin_lock()/pcpu_spin_trylock(): drop the
+ * spinlock first, then unpin the task. @ptr is the pointer those helpers
+ * returned.
+ */
+#define pcpu_spin_unlock(member, ptr)                                  \
+({                                                                     \
+       spin_unlock(&ptr->member);                                      \
+       pcpu_task_unpin();                                              \
+})
+
+/* struct slub_percpu_array specific helpers. */
+#define pca_spin_lock(ptr)                                             \
+       pcpu_spin_lock(struct slub_percpu_array, lock, ptr)
+
+#define pca_spin_trylock(ptr)                                          \
+       pcpu_spin_trylock(struct slub_percpu_array, lock, ptr)
+
+#define pca_spin_unlock(ptr)                                           \
+       pcpu_spin_unlock(lock, ptr)
+
  #ifndef CONFIG_SLUB_TINY
  #define __fastpath_inline __always_inline
  #else
@@ -3454,6 +3527,78 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
                         0, sizeof(void *));
  }
  
+static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp);
+
+/*
+ * Try to take one object from the current cpu's array of cached objects.
+ *
+ * Returns the object, or NULL when:
+ *  - the array lock cannot be taken with trylock, or
+ *  - the array is empty and it may not be refilled from this context
+ *    (@gfp does not allow blocking, or in_irq()), or
+ *  - refill_pca() fails.
+ *
+ * An empty array is refilled with half of its capacity, after which the
+ * attempt is repeated from the start.
+ */
+static __fastpath_inline
+void *alloc_from_pca(struct kmem_cache *s, gfp_t gfp)
+{
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+       unsigned int refill_count;
+       void *obj;
+
+again:
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+
+       if (unlikely(!pca)) {
+               pcp_trylock_finish(UP_flags);
+               return NULL;
+       }
+
+       if (likely(pca->used != 0)) {
+               /* Pop the object on top of the array. */
+               obj = pca->objects[--pca->used];
+
+               pca_spin_unlock(pca);
+               pcp_trylock_finish(UP_flags);
+
+               stat(s, ALLOC_PCA);
+
+               return obj;
+       }
+
+       /* Empty: read the capacity under the lock, then drop it to refill. */
+       refill_count = pca->count / 2;
+
+       pca_spin_unlock(pca);
+       pcp_trylock_finish(UP_flags);
+
+       if (!gfpflags_allow_blocking(gfp) || in_irq())
+               return NULL;
+
+       if (!refill_pca(s, refill_count, gfp))
+               return NULL;
+
+       goto again;
+}
+
+/*
+ * Take up to @size objects from the current cpu's array and store them in
+ * p[0..n-1], where n is the return value. n is smaller than @size when the
+ * array holds fewer objects, and 0 when the array lock cannot be taken with
+ * trylock. The array is never refilled from here.
+ */
+static __fastpath_inline
+int alloc_from_pca_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+       int idx;
+
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+
+       if (unlikely(!pca)) {
+               pcp_trylock_finish(UP_flags);
+               return 0;
+       }
+
+       /* Cannot hand out more than what is cached. */
+       if (size > pca->used)
+               size = pca->used;
+
+       /* Pop from the top of the array while filling p[] from its end. */
+       idx = size;
+       while (idx > 0) {
+               idx--;
+               pca->used--;
+               p[idx] = pca->objects[pca->used];
+       }
+
+       pca_spin_unlock(pca);
+       stat_add(s, ALLOC_PCA, size);
+       pcp_trylock_finish(UP_flags);
+
+       return size;
+}
+
  /*
   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
   * have the fastpath folded into their functions. So no function call
@@ -3479,7 +3624,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
         if (unlikely(object))
                 goto out;
  
-       object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+       if (s->cpu_array)
+               object = alloc_from_pca(s, gfpflags);
+
+       if (!object)
+               object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
  
         maybe_wipe_obj_freeptr(s, object);
         init = slab_want_init_on_alloc(gfpflags, s);
@@ -3726,6 +3875,81 @@ slab_empty:
         discard_slab(s, slab);
  }
  
+static bool flush_pca(struct kmem_cache *s, unsigned int count);
+
+/*
+ * Try to put @object into the current cpu's array of cached objects.
+ *
+ * Returns true if the object was stored. Returns false, leaving the object
+ * untouched for the caller to free by other means, when:
+ *  - the array lock cannot be taken with trylock, or
+ *  - the array is full and we are in_irq(), or
+ *  - flush_pca() fails.
+ *
+ * A full array is flushed by half of its capacity, after which the attempt
+ * is repeated from the start.
+ */
+static __fastpath_inline
+bool free_to_pca(struct kmem_cache *s, void *object)
+{
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+       unsigned int flush_count;
+
+again:
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+
+       if (!pca) {
+               pcp_trylock_finish(UP_flags);
+               return false;
+       }
+
+       if (pca->used != pca->count) {
+               /* There is room: push the object on top of the array. */
+               pca->objects[pca->used++] = object;
+
+               pca_spin_unlock(pca);
+               pcp_trylock_finish(UP_flags);
+
+               stat(s, FREE_PCA);
+
+               return true;
+       }
+
+       /* Full: read the capacity under the lock, then drop it to flush. */
+       flush_count = pca->count / 2;
+
+       pca_spin_unlock(pca);
+       pcp_trylock_finish(UP_flags);
+
+       if (in_irq())
+               return false;
+
+       if (flush_pca(s, flush_count))
+               goto again;
+
+       return false;
+}
+
+/*
+ * Put as many of the objects p[0..size-1] as fit into the current cpu's
+ * array, running slab_free_hook() on each of them first; an object for which
+ * the hook returns false is not stored.
+ *
+ * Returns the number of leading entries of p[] that were consumed, which is
+ * less than @size when the array does not have enough free slots and 0 when
+ * the array lock cannot be taken with trylock. The array is never flushed
+ * from here.
+ */
+static __fastpath_inline
+size_t free_to_pca_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+       unsigned int room;
+       bool init;
+
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+
+       if (unlikely(!pca)) {
+               pcp_trylock_finish(UP_flags);
+               return 0;
+       }
+
+       /* Limit the request to the number of free slots. */
+       room = pca->count - pca->used;
+       if (size > room)
+               size = room;
+
+       init = slab_want_init_on_free(s);
+
+       for (size_t n = 0; n < size; n++) {
+               if (unlikely(!slab_free_hook(s, p[n], init)))
+                       continue;
+
+               pca->objects[pca->used++] = p[n];
+       }
+
+       pca_spin_unlock(pca);
+       stat_add(s, FREE_PCA, size);
+       pcp_trylock_finish(UP_flags);
+
+       return size;
+}
+
  #ifndef CONFIG_SLUB_TINY
  /*
   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
@@ -3811,7 +4035,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
  {
         memcg_slab_free_hook(s, slab, &object, 1);
  
-       if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+       if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s))))
+               return;
+
+       /*
+        * free_to_pca() returns false when it did not store the object (array
+        * lock contended, array full while in_irq(), or flush_pca() failed).
+        * Fall back to the regular free path then, otherwise the object would
+        * be leaked.
+        */
+       if (!s->cpu_array || !free_to_pca(s, object))
                 do_slab_free(s, slab, object, object, 1, addr);
  }
  
@@ -3956,6 +4185,26 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
         if (!size)
                 return;
  
+       /*
+        * In case the objects might need memcg_slab_free_hook(), skip the array
+        * because the hook is not effective with single objects and benefits
+        * from groups of objects from a single slab that the detached freelist
+        * builds. But once we build the detached freelist, it's wasteful to
+        * throw it away and put the objects into the array.
+        *
+        * XXX: This test could be cache-specific if it was not possible to use
+        * __GFP_ACCOUNT with caches that are not SLAB_ACCOUNT
+        */
+       if (s && s->cpu_array && !memcg_kmem_online()) {
+               size_t pca_freed = free_to_pca_bulk(s, size, p);
+
+               if (pca_freed == size)
+                       return;
+
+               p += pca_freed;
+               size -= pca_freed;
+       }
+
         do {
                 struct detached_freelist df;
  
@@ -4073,7 +4322,8 @@ error:
  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                           void **p)
  {
-       int i;
+       int from_pca = 0;
+       int allocated = 0;
         struct obj_cgroup *objcg = NULL;
  
         if (!size)
@@ -4084,19 +4334,147 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
         if (unlikely(!s))
                 return 0;
  
-       i = __kmem_cache_alloc_bulk(s, flags, size, p);
+       /* Serve as much as possible from the percpu array first. */
+       if (s->cpu_array)
+               from_pca = alloc_from_pca_bulk(s, size, p);
+
+       if (from_pca < size) {
+               allocated = __kmem_cache_alloc_bulk(s, flags, size-from_pca,
+                                                   p+from_pca);
+               /*
+                * If nothing more could be allocated, the objects taken from
+                * the array are freed again. Reset from_pca as well, so that
+                * they are neither counted in the return value nor passed to
+                * slab_post_alloc_hook() below.
+                */
+               if (allocated == 0 && from_pca > 0) {
+                       __kmem_cache_free_bulk(s, from_pca, p);
+                       from_pca = 0;
+               }
+       }
+
+       allocated += from_pca;
  
         /*
          * memcg and kmem_cache debug support and memory initialization.
          * Done outside of the IRQ disabled fastpath loop.
          */
-       if (i != 0)
+       if (allocated != 0)
                 slab_post_alloc_hook(s, objcg, flags, size, p,
                         slab_want_init_on_alloc(flags, s), s->object_size);
-       return i;
+       return allocated;
  }
  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
  
+/*
+ * Add up to @count objects to the current cpu's array, allocating them with
+ * __kmem_cache_alloc_bulk() in batches of at most 32 (the size of the
+ * on-stack buffer).
+ *
+ * The array lock is not held while allocating, only while the objects are
+ * copied into the array. Objects that no longer fit at that point are freed
+ * with __kmem_cache_free_bulk() and the refill stops.
+ *
+ * Returns false if a bulk allocation returned no objects or if the array
+ * lock could not be taken with trylock; true otherwise.
+ */
+static bool refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp)
+{
+       void *objects[32];
+       unsigned int batch, allocated;
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+
+bulk_alloc:
+       batch = min(count, 32U);
+
+       allocated = __kmem_cache_alloc_bulk(s, gfp, batch, &objects[0]);
+       if (!allocated)
+               return false;
+
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+       if (!pca) {
+               pcp_trylock_finish(UP_flags);
+               /* Nowhere to put the objects: free them instead of leaking. */
+               __kmem_cache_free_bulk(s, allocated, &objects[0]);
+               return false;
+       }
+
+       /* Store only as many objects as there are free slots. */
+       batch = min(allocated, pca->count - pca->used);
+
+       for (unsigned int i = 0; i < batch; i++) {
+               pca->objects[pca->used++] = objects[i];
+       }
+
+       pca_spin_unlock(pca);
+       pcp_trylock_finish(UP_flags);
+
+       stat_add(s, PCA_REFILL, batch);
+
+       /*
+        * We could have migrated to a different cpu or somebody else freed to the
+        * pca while we were bulk allocating, and now we have too many objects
+        */
+       if (batch < allocated) {
+               __kmem_cache_free_bulk(s, allocated - batch, &objects[batch]);
+       } else {
+               count -= batch;
+               if (count > 0)
+                       goto bulk_alloc;
+       }
+
+       return true;
+}
+
+/*
+ * Remove up to @count objects from the current cpu's array and free them
+ * with __kmem_cache_free_bulk(), in batches of at most 32 (the size of the
+ * on-stack buffer).
+ *
+ * The array lock is only held while objects are copied out to the buffer;
+ * the freeing itself happens after the lock is dropped. The flush stops
+ * early once the array is empty.
+ *
+ * Returns false if the array lock could not be taken with trylock; true
+ * otherwise, including when there was nothing to flush.
+ */
+static bool flush_pca(struct kmem_cache *s, unsigned int count)
+{
+       void *objects[32];
+       unsigned int batch, remaining;
+       unsigned long __maybe_unused UP_flags;
+       struct slub_percpu_array *pca;
+
+next_batch:
+       /* Unsigned constant, so both min() operands have the same type. */
+       batch = min(count, 32U);
+
+       pcp_trylock_prepare(UP_flags);
+       pca = pca_spin_trylock(s->cpu_array);
+       if (!pca) {
+               pcp_trylock_finish(UP_flags);
+               return false;
+       }
+
+       /* Cannot flush more than what is cached. */
+       batch = min(batch, pca->used);
+
+       for (unsigned int i = 0; i < batch; i++) {
+               objects[i] = pca->objects[--pca->used];
+       }
+
+       remaining = pca->used;
+
+       pca_spin_unlock(pca);
+       pcp_trylock_finish(UP_flags);
+
+       __kmem_cache_free_bulk(s, batch, &objects[0]);
+
+       stat_add(s, PCA_FLUSH, batch);
+
+       if (batch < count && remaining > 0) {
+               count -= batch;
+               goto next_batch;
+       }
+
+       return true;
+}
+
+/*
+ * Try to make the percpu array hold at least @count objects, refilling it
+ * with refill_pca() if it currently holds fewer.
+ *
+ * Do not call from irq handler nor with irqs disabled.
+ *
+ * Returns 0 if the array already held at least @count objects or the refill
+ * succeeded, -EINVAL if the cache has no percpu array or @count exceeds the
+ * array's capacity, -ENOMEM if refill_pca() failed.
+ */
+int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count,
+                                   gfp_t gfp)
+{
+       struct slub_percpu_array *pca;
+       unsigned int cached;
+
+       lockdep_assert_no_hardirq();
+
+       if (!s->cpu_array)
+               return -EINVAL;
+
+       /*
+        * Unlocked peek at this cpu's array; racy but we don't care, the
+        * refill itself takes the lock.
+        */
+       pca = raw_cpu_ptr(s->cpu_array);
+       cached = READ_ONCE(pca->used);
+
+       if (cached >= count)
+               return 0;
+
+       if (count > pca->count)
+               return -EINVAL;
+
+       return refill_pca(s, count - cached, gfp) ? 0 : -ENOMEM;
+}
  
  /*
   * Object placement in a slab is made very easy because we always start at
@@ -5167,6 +5545,30 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
         return 0;
  }
  
+/*
+ * kmem_cache_setup_percpu_array - give a cache a percpu array of cached objects
+ * @s: the cache; must have been created with SLAB_NO_MERGE
+ * @count: capacity of each cpu's array, in objects
+ *
+ * Allocates one struct slub_percpu_array with @count slots per possible cpu
+ * and initializes each of them as empty.
+ *
+ * Returns 0 on success, -EINVAL if the cache lacks SLAB_NO_MERGE or already
+ * has a percpu array, -ENOMEM if the percpu allocation fails.
+ */
+int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count)
+{
+       struct slub_percpu_array __percpu *array;
+       int cpu;
+
+       if (WARN_ON_ONCE(!(s->flags & SLAB_NO_MERGE)))
+               return -EINVAL;
+
+       /* Replacing an existing array would leak it and whatever it holds. */
+       if (WARN_ON_ONCE(s->cpu_array))
+               return -EINVAL;
+
+       array = __alloc_percpu(struct_size(array, objects, count),
+                               sizeof(void *));
+
+       if (!array)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu) {
+               struct slub_percpu_array *pca = per_cpu_ptr(array, cpu);
+
+               spin_lock_init(&pca->lock);
+               pca->count = count;
+               pca->used = 0;
+       }
+
+       /*
+        * Assign the pointer only after every cpu's array is initialized;
+        * the alloc and free paths start using the array as soon as
+        * s->cpu_array is non-NULL.
+        */
+       s->cpu_array = array;
+
+       return 0;
+}
+
  #ifdef SLAB_SUPPORTS_SYSFS
  static int count_inuse(struct slab *slab)
  {
@@ -5944,8 +6346,10 @@ static ssize_t text##_store(struct kmem_cache *s,                \
  }                                                              \
  SLAB_ATTR(text);                                               \
  
+STAT_ATTR(ALLOC_PCA, alloc_cpu_cache);
  STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
  STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_PCA, free_cpu_cache);
  STAT_ATTR(FREE_FASTPATH, free_fastpath);
  STAT_ATTR(FREE_SLOWPATH, free_slowpath);
  STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -5970,6 +6374,8 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
  STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
  STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
  STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
+STAT_ATTR(PCA_REFILL, cpu_cache_refill);
+STAT_ATTR(PCA_FLUSH, cpu_cache_flush);
  #endif /* CONFIG_SLUB_STATS */
  
  #ifdef CONFIG_KFENCE
@@ -6031,8 +6437,10 @@ static struct attribute *slab_attrs[] = {
         &remote_node_defrag_ratio_attr.attr,
  #endif
  #ifdef CONFIG_SLUB_STATS
+       &alloc_cpu_cache_attr.attr,
         &alloc_fastpath_attr.attr,
         &alloc_slowpath_attr.attr,
+       &free_cpu_cache_attr.attr,
         &free_fastpath_attr.attr,
         &free_slowpath_attr.attr,
         &free_frozen_attr.attr,
@@ -6057,6 +6465,8 @@ static struct attribute *slab_attrs[] = {
         &cpu_partial_free_attr.attr,
         &cpu_partial_node_attr.attr,
         &cpu_partial_drain_attr.attr,
+       &cpu_cache_refill_attr.attr,
+       &cpu_cache_flush_attr.attr,
  #endif
  #ifdef CONFIG_FAILSLAB
         &failslab_attr.attr,
author	Vlastimil Babka <vbabka@suse.cz>
	Wed, 15 Nov 2023 10:38:15 +0000 (11:38 +0100)
committer	Liam R. Howlett <Liam.Howlett@oracle.com>
	Tue, 21 Nov 2023 18:03:21 +0000 (13:03 -0500)
include/linux/slab.h		patch \| blob \| history
include/linux/slub_def.h		patch \| blob \| history
mm/Kconfig		patch \| blob \| history
mm/slub.c		patch \| blob \| history