mm, slub: cheaper locking for percpu sheaves

author Vlastimil Babka <vbabka@suse.cz>

Wed, 28 Aug 2024 09:28:19 +0000 (11:28 +0200)

committer Vlastimil Babka <vbabka@suse.cz>

Fri, 17 Jan 2025 15:57:36 +0000 (16:57 +0100)
author Vlastimil Babka <vbabka@suse.cz>
Wed, 28 Aug 2024 09:28:19 +0000 (11:28 +0200)
committer Vlastimil Babka <vbabka@suse.cz>
Fri, 17 Jan 2025 15:57:36 +0000 (16:57 +0100)
diff --git a/mm/slub.c b/mm/slub.c

index 74dd7dc8be15fd4b44a3aef195247dd28ebb768f..b71e560e818e8a423111d6ded004fcf07d52ed86 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -448,14 +448,111 @@ struct slab_sheaf {
         void *objects[];
  };
  
+struct local_tryirq_lock {
+#ifndef CONFIG_PREEMPT_RT
+       int active;
+#else
+       local_lock_t llock;
+#endif
+};
+
  struct slub_percpu_sheaves {
-       local_lock_t lock;
+       struct local_tryirq_lock lock;
         struct slab_sheaf *main; /* never NULL when unlocked */
         struct slab_sheaf *spare; /* empty or full, may be NULL */
         struct slab_sheaf *rcu_free;
         struct node_barn *barn;
  };
  
+/*
+ * Generic helper to lookup a per-cpu variable with a lock that allows only
+ * trylock from irq handler context to avoid expensive irq disable or atomic
+ * operations and memory barriers - only compiler barriers are needed.
+ *
+ * On !PREEMPT_RT this is done by get_cpu_ptr(), which disables preemption, and
+ * checking that a variable is not already set to 1. If it is, it means we are
+ * in irq handler that has interrupted the locked operation, and must give up.
+ * Otherwise we set the variable to 1.
+ *
+ * On PREEMPT_RT we can simply use local_lock() as that does the right thing
+ * without actually disabling irqs. Thus the trylock can't actually fail.
+ *
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+       type *_ret;                                                     \
+       lockdep_assert(!irq_count());                                   \
+       _ret = get_cpu_ptr(ptr);                                        \
+       lockdep_assert(_ret->member.active == 0);                       \
+       WRITE_ONCE(_ret->member.active, 1);                             \
+       barrier();                                                      \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+({                                                                      \
+       type *_ret;                                                     \
+       _ret = get_cpu_ptr(ptr);                                        \
+       if (unlikely(READ_ONCE(_ret->member.active) == 1)) {            \
+               put_cpu_ptr(ptr);                                       \
+               _ret = NULL;                                            \
+       } else {                                                        \
+               WRITE_ONCE(_ret->member.active, 1);                     \
+               barrier();                                              \
+       }                                                               \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+       lockdep_assert(this_cpu_ptr(ptr)->member.active == 1);          \
+       barrier();                                                      \
+       WRITE_ONCE(this_cpu_ptr(ptr)->member.active, 0);                \
+       put_cpu_ptr(ptr);                                               \
+})
+
+#define local_tryirq_lock_init(lock)                                   \
+({                                                                     \
+       (lock)->active = 0;                                             \
+})
+
+#else
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+       type *_ret;                                                     \
+       local_lock(&ptr->member.llock);                                 \
+       _ret = this_cpu_ptr(ptr);                                       \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+       pcpu_local_tryirq_lock(type, member, ptr)
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+       local_unlock(&ptr->member.llock);                               \
+})
+
+#define local_tryirq_lock_init(lock)                                   \
+({                                                                     \
+       local_lock_init(&(lock)->llock);                                \
+})
+
+#endif
+
+/* struct slub_percpu_sheaves specific helpers. */
+#define cpu_sheaves_lock(ptr)                                           \
+       pcpu_local_tryirq_lock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_trylock(ptr)                                        \
+       pcpu_local_tryirq_trylock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_unlock(ptr)                                         \
+       pcpu_local_tryirq_unlock(lock, ptr)
+
  /*
   * The slab lists for all objects.
   */
@@ -2528,17 +2625,20 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
  
  static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
  
-static void sheaf_flush_main(struct kmem_cache *s)
+/* returns true if at least partially flushed */
+static bool sheaf_flush_main(struct kmem_cache *s)
  {
         struct slub_percpu_sheaves *pcs;
         unsigned int batch, remaining;
         void *objects[PCS_BATCH_MAX];
         struct slab_sheaf *sheaf;
-       unsigned long flags;
+       bool ret = false;
  
  next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return ret;
+
         sheaf = pcs->main;
  
         batch = min(PCS_BATCH_MAX, sheaf->size);
@@ -2548,14 +2648,18 @@ next_batch:
  
         remaining = sheaf->size;
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         __kmem_cache_free_bulk(s, batch, &objects[0]);
  
         stat_add(s, SHEAF_FLUSH_MAIN, batch);
  
+       ret = true;
+
         if (remaining)
                 goto next_batch;
+
+       return ret;
  }
  
  static void sheaf_flush(struct kmem_cache *s, struct slab_sheaf *sheaf)
@@ -2592,6 +2696,8 @@ static void rcu_free_sheaf_nobarn(struct rcu_head *head)
   * Caller needs to make sure migration is disabled in order to fully flush
   * single cpu's sheaves
   *
+ * must not be called from an irq
+ *
   * flushing operations are rare so let's keep it simple and flush to slabs
   * directly, skipping the barn
   */
@@ -2599,10 +2705,8 @@ static void pcs_flush_all(struct kmem_cache *s)
  {
         struct slub_percpu_sheaves *pcs;
         struct slab_sheaf *spare, *rcu_free;
-       unsigned long flags;
  
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_lock(s->cpu_sheaves);
  
         spare = pcs->spare;
         pcs->spare = NULL;
@@ -2610,7 +2714,7 @@ static void pcs_flush_all(struct kmem_cache *s)
         rcu_free = pcs->rcu_free;
         pcs->rcu_free = NULL;
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         if (spare) {
                 sheaf_flush(s, spare);
@@ -4557,11 +4661,11 @@ static __fastpath_inline
  void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
  {
         struct slub_percpu_sheaves *pcs;
-       unsigned long flags;
         void *object;
  
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return NULL;
  
         if (unlikely(pcs->main->size == 0)) {
  
@@ -4593,7 +4697,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
                         }
                 }
  
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
  
                 if (!can_alloc)
                         return NULL;
@@ -4615,8 +4719,11 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
                 if (!full)
                         return NULL;
  
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               /*
+                * we can reach here only when gfpflags_allow_blocking
+                * so this must not be an irq
+                */
+               pcs = cpu_sheaves_lock(s->cpu_sheaves);
  
                 /*
                  * If we are returning empty sheaf, we either got it from the
@@ -4649,7 +4756,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
  do_alloc:
         object = pcs->main->objects[--pcs->main->size];
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         stat(s, ALLOC_PCS);
  
@@ -4661,13 +4768,13 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
  {
         struct slub_percpu_sheaves *pcs;
         struct slab_sheaf *main;
-       unsigned long flags;
         unsigned int allocated = 0;
         unsigned int batch;
  
  next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return allocated;
  
         if (unlikely(pcs->main->size == 0)) {
  
@@ -4686,7 +4793,7 @@ next_batch:
                         goto do_alloc;
                 }
  
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
  
                 /*
                  * Once full sheaves in barn are depleted, let the bulk
@@ -4704,7 +4811,7 @@ do_alloc:
         main->size -= batch;
         memcpy(p, main->objects + main->size, batch * sizeof(void *));
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         stat_add(s, ALLOC_PCS, batch);
  
@@ -5124,14 +5231,14 @@ slab_empty:
   * The object is expected to have passed slab_free_hook() already.
   */
  static __fastpath_inline
-void free_to_pcs(struct kmem_cache *s, void *object)
+bool free_to_pcs(struct kmem_cache *s, void *object)
  {
         struct slub_percpu_sheaves *pcs;
-       unsigned long flags;
  
  restart:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return false;
  
         if (unlikely(pcs->main->size == s->sheaf_capacity)) {
  
@@ -5165,7 +5272,7 @@ restart:
                         struct slab_sheaf *to_flush = pcs->spare;
  
                         pcs->spare = NULL;
-                       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+                       cpu_sheaves_unlock(s->cpu_sheaves);
  
                         sheaf_flush(s, to_flush);
                         empty = to_flush;
@@ -5173,18 +5280,27 @@ restart:
                 }
  
  alloc_empty:
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
  
                 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
  
                 if (!empty) {
-                       sheaf_flush_main(s);
-                       goto restart;
+                       if (sheaf_flush_main(s))
+                               goto restart;
+                       else
+                               return false;
                 }
  
  got_empty:
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+               if (!pcs) {
+                       struct node_barn *barn;
+
+                       barn = get_node(s, numa_mem_id())->barn;
+
+                       barn_put_empty_sheaf(barn, empty, true);
+                       return false;
+               }
  
                 /*
                  * if we put any sheaf to barn here, it's because we raced or
@@ -5212,9 +5328,11 @@ got_empty:
  do_free:
         pcs->main->objects[pcs->main->size++] = object;
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         stat(s, FREE_PCS);
+
+       return true;
  }
  
  static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
@@ -5276,10 +5394,10 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
  {
         struct slub_percpu_sheaves *pcs;
         struct slab_sheaf *rcu_sheaf;
-       unsigned long flags;
  
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               goto fail;
  
         if (unlikely(!pcs->rcu_free)) {
  
@@ -5292,17 +5410,16 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
                         goto do_free;
                 }
  
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
  
                 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
  
-               if (!empty) {
-                       stat(s, FREE_RCU_SHEAF_FAIL);
-                       return false;
-               }
+               if (!empty)
+                       goto fail;
  
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+               if (!pcs)
+                       goto fail;
  
                 if (unlikely(pcs->rcu_free))
                         barn_put_empty_sheaf(pcs->barn, empty, true);
@@ -5317,19 +5434,22 @@ do_free:
         rcu_sheaf->objects[rcu_sheaf->size++] = obj;
  
         if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
                 stat(s, FREE_RCU_SHEAF);
                 return true;
         }
  
         pcs->rcu_free = NULL;
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
  
         stat(s, FREE_RCU_SHEAF);
-
         return true;
+
+fail:
+       stat(s, FREE_RCU_SHEAF_FAIL);
+       return false;
  }
  
  /*
@@ -5341,7 +5461,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
  {
         struct slub_percpu_sheaves *pcs;
         struct slab_sheaf *main;
-       unsigned long flags;
         unsigned int batch, i = 0;
         bool init;
  
@@ -5364,8 +5483,9 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
         }
  
  next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               goto fallback;
  
         if (unlikely(pcs->main->size == s->sheaf_capacity)) {
  
@@ -5395,13 +5515,13 @@ next_batch:
                 }
  
  no_empty:
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
  
                 /*
                  * if we depleted all empty sheaves in the barn or there are too
                  * many full sheaves, free the rest to slab pages
                  */
-
+fallback:
                 __kmem_cache_free_bulk(s, size, p);
                 return;
         }
@@ -5413,7 +5533,7 @@ do_free:
         memcpy(main->objects + main->size, p, batch * sizeof(void *));
         main->size += batch;
  
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
  
         stat_add(s, FREE_PCS, batch);
  
@@ -5513,9 +5633,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
         if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
                 return;
  
-       if (s->cpu_sheaves)
-               free_to_pcs(s, object);
-       else
+       if (!s->cpu_sheaves || !free_to_pcs(s, object))
                 do_slab_free(s, slab, object, object, 1, addr);
  }
  
@@ -6275,7 +6393,7 @@ static int init_percpu_sheaves(struct kmem_cache *s)
  
                 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  
-               local_lock_init(&pcs->lock);
+               local_tryirq_lock_init(&pcs->lock);
  
                 nid = cpu_to_mem(cpu);
author	Vlastimil Babka <vbabka@suse.cz>
	Wed, 28 Aug 2024 09:28:19 +0000 (11:28 +0200)
committer	Vlastimil Babka <vbabka@suse.cz>
	Fri, 17 Jan 2025 15:57:36 +0000 (16:57 +0100)