www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
mm, slub: cheaper locking for percpu sheaves
author Vlastimil Babka <vbabka@suse.cz>
Wed, 28 Aug 2024 09:28:19 +0000 (11:28 +0200)
committer Vlastimil Babka <vbabka@suse.cz>
Fri, 17 Jan 2025 15:57:36 +0000 (16:57 +0100)
Instead of local_lock_irqsave(), use just get_cpu_ptr() (which only
disables preemption) and then set an active flag. If potential callers
include irq handlers, the operation must use a trylock variant that bails
out if the flag is already set, because that means we have interrupted
another operation in progress.

Changing the flag doesn't need to be atomic as the irq is on the same
cpu. This should make using percpu sheaves cheaper, with the downside
that some unlucky operations in irq handlers have to fall back to
non-sheaf variants. That should be rare so there should be a net benefit.

On PREEMPT_RT we can simply use local_lock() as that does the right
thing without the need to disable irqs.

Thanks to Mateusz Guzik and Jann Horn for suggesting this kind of
locking scheme in online conversations. I initially attempted to fully
copy the page allocator's pcplist locking, but its reliance on
spin_trylock() made it much more costly.

Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
mm/slub.c

index 74dd7dc8be15fd4b44a3aef195247dd28ebb768f..b71e560e818e8a423111d6ded004fcf07d52ed86 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -448,14 +448,111 @@ struct slab_sheaf {
        void *objects[];
 };
 
+struct local_tryirq_lock {
+#ifndef CONFIG_PREEMPT_RT
+       int active;
+#else
+       local_lock_t llock;
+#endif
+};
+
 struct slub_percpu_sheaves {
-       local_lock_t lock;
+       struct local_tryirq_lock lock;
        struct slab_sheaf *main; /* never NULL when unlocked */
        struct slab_sheaf *spare; /* empty or full, may be NULL */
        struct slab_sheaf *rcu_free;
        struct node_barn *barn;
 };
 
+/*
+ * Generic helper to look up a per-cpu variable with a lock that allows only
+ * trylock from irq handler context to avoid expensive irq disable or atomic
+ * operations and memory barriers - only compiler barriers are needed.
+ *
+ * On !PREEMPT_RT this is done by get_cpu_ptr(), which disables preemption, and
+ * checking that a variable is not already set to 1. If it is, we are in an irq
+ * handler that has interrupted the locked operation, and must give up.
+ * Otherwise we set the variable to 1.
+ *
+ * On PREEMPT_RT we can simply use local_lock() as that does the right thing
+ * without actually disabling irqs. Thus the trylock can't actually fail.
+ *
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+       type *_ret;                                                     \
+       lockdep_assert(!irq_count());                                   \
+       _ret = get_cpu_ptr(ptr);                                        \
+       lockdep_assert(_ret->member.active == 0);                       \
+       WRITE_ONCE(_ret->member.active, 1);                             \
+       barrier();                                                      \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+({                                                                      \
+       type *_ret;                                                     \
+       _ret = get_cpu_ptr(ptr);                                        \
+       if (unlikely(READ_ONCE(_ret->member.active) == 1)) {            \
+               put_cpu_ptr(ptr);                                       \
+               _ret = NULL;                                            \
+       } else {                                                        \
+               WRITE_ONCE(_ret->member.active, 1);                     \
+               barrier();                                              \
+       }                                                               \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+       lockdep_assert(this_cpu_ptr(ptr)->member.active == 1);          \
+       barrier();                                                      \
+       WRITE_ONCE(this_cpu_ptr(ptr)->member.active, 0);                \
+       put_cpu_ptr(ptr);                                               \
+})
+
+#define local_tryirq_lock_init(lock)                                   \
+({                                                                     \
+       (lock)->active = 0;                                             \
+})
+
+#else
+
+#define pcpu_local_tryirq_lock(type, member, ptr)                       \
+({                                                                      \
+       type *_ret;                                                     \
+       local_lock(&ptr->member.llock);                                 \
+       _ret = this_cpu_ptr(ptr);                                       \
+       _ret;                                                           \
+})
+
+#define pcpu_local_tryirq_trylock(type, member, ptr)                    \
+       pcpu_local_tryirq_lock(type, member, ptr)
+
+#define pcpu_local_tryirq_unlock(member, ptr)                           \
+({                                                                      \
+       local_unlock(&ptr->member.llock);                               \
+})
+
+#define local_tryirq_lock_init(lock)                                   \
+({                                                                     \
+       local_lock_init(&(lock)->llock);                                \
+})
+
+#endif
+
+/* struct slub_percpu_sheaves specific helpers. */
+#define cpu_sheaves_lock(ptr)                                           \
+       pcpu_local_tryirq_lock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_trylock(ptr)                                        \
+       pcpu_local_tryirq_trylock(struct slub_percpu_sheaves, lock, ptr)
+
+#define cpu_sheaves_unlock(ptr)                                         \
+       pcpu_local_tryirq_unlock(lock, ptr)
+
 /*
  * The slab lists for all objects.
  */
@@ -2528,17 +2625,20 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
 
 static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 
-static void sheaf_flush_main(struct kmem_cache *s)
+/* returns true if at least partially flushed */
+static bool sheaf_flush_main(struct kmem_cache *s)
 {
        struct slub_percpu_sheaves *pcs;
        unsigned int batch, remaining;
        void *objects[PCS_BATCH_MAX];
        struct slab_sheaf *sheaf;
-       unsigned long flags;
+       bool ret = false;
 
 next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return ret;
+
        sheaf = pcs->main;
 
        batch = min(PCS_BATCH_MAX, sheaf->size);
@@ -2548,14 +2648,18 @@ next_batch:
 
        remaining = sheaf->size;
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        __kmem_cache_free_bulk(s, batch, &objects[0]);
 
        stat_add(s, SHEAF_FLUSH_MAIN, batch);
 
+       ret = true;
+
        if (remaining)
                goto next_batch;
+
+       return ret;
 }
 
 static void sheaf_flush(struct kmem_cache *s, struct slab_sheaf *sheaf)
@@ -2592,6 +2696,8 @@ static void rcu_free_sheaf_nobarn(struct rcu_head *head)
  * Caller needs to make sure migration is disabled in order to fully flush
  * single cpu's sheaves
  *
+ * must not be called from an irq
+ *
  * flushing operations are rare so let's keep it simple and flush to slabs
  * directly, skipping the barn
  */
@@ -2599,10 +2705,8 @@ static void pcs_flush_all(struct kmem_cache *s)
 {
        struct slub_percpu_sheaves *pcs;
        struct slab_sheaf *spare, *rcu_free;
-       unsigned long flags;
 
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_lock(s->cpu_sheaves);
 
        spare = pcs->spare;
        pcs->spare = NULL;
@@ -2610,7 +2714,7 @@ static void pcs_flush_all(struct kmem_cache *s)
        rcu_free = pcs->rcu_free;
        pcs->rcu_free = NULL;
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        if (spare) {
                sheaf_flush(s, spare);
@@ -4557,11 +4661,11 @@ static __fastpath_inline
 void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 {
        struct slub_percpu_sheaves *pcs;
-       unsigned long flags;
        void *object;
 
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return NULL;
 
        if (unlikely(pcs->main->size == 0)) {
 
@@ -4593,7 +4697,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
                        }
                }
 
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
 
                if (!can_alloc)
                        return NULL;
@@ -4615,8 +4719,11 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
                if (!full)
                        return NULL;
 
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               /*
+                * we can reach here only when gfpflags_allow_blocking
+                * so this must not be an irq
+                */
+               pcs = cpu_sheaves_lock(s->cpu_sheaves);
 
                /*
                 * If we are returning empty sheaf, we either got it from the
@@ -4649,7 +4756,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp)
 do_alloc:
        object = pcs->main->objects[--pcs->main->size];
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        stat(s, ALLOC_PCS);
 
@@ -4661,13 +4768,13 @@ unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 {
        struct slub_percpu_sheaves *pcs;
        struct slab_sheaf *main;
-       unsigned long flags;
        unsigned int allocated = 0;
        unsigned int batch;
 
 next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return allocated;
 
        if (unlikely(pcs->main->size == 0)) {
 
@@ -4686,7 +4793,7 @@ next_batch:
                        goto do_alloc;
                }
 
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
 
                /*
                 * Once full sheaves in barn are depleted, let the bulk
@@ -4704,7 +4811,7 @@ do_alloc:
        main->size -= batch;
        memcpy(p, main->objects + main->size, batch * sizeof(void *));
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        stat_add(s, ALLOC_PCS, batch);
 
@@ -5124,14 +5231,14 @@ slab_empty:
  * The object is expected to have passed slab_free_hook() already.
  */
 static __fastpath_inline
-void free_to_pcs(struct kmem_cache *s, void *object)
+bool free_to_pcs(struct kmem_cache *s, void *object)
 {
        struct slub_percpu_sheaves *pcs;
-       unsigned long flags;
 
 restart:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               return false;
 
        if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 
@@ -5165,7 +5272,7 @@ restart:
                        struct slab_sheaf *to_flush = pcs->spare;
 
                        pcs->spare = NULL;
-                       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+                       cpu_sheaves_unlock(s->cpu_sheaves);
 
                        sheaf_flush(s, to_flush);
                        empty = to_flush;
@@ -5173,18 +5280,27 @@ restart:
                }
 
 alloc_empty:
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
 
                empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 
                if (!empty) {
-                       sheaf_flush_main(s);
-                       goto restart;
+                       if (sheaf_flush_main(s))
+                               goto restart;
+                       else
+                               return false;
                }
 
 got_empty:
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+               if (!pcs) {
+                       struct node_barn *barn;
+
+                       barn = get_node(s, numa_mem_id())->barn;
+
+                       barn_put_empty_sheaf(barn, empty, true);
+                       return false;
+               }
 
                /*
                 * if we put any sheaf to barn here, it's because we raced or
@@ -5212,9 +5328,11 @@ got_empty:
 do_free:
        pcs->main->objects[pcs->main->size++] = object;
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        stat(s, FREE_PCS);
+
+       return true;
 }
 
 static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
@@ -5276,10 +5394,10 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 {
        struct slub_percpu_sheaves *pcs;
        struct slab_sheaf *rcu_sheaf;
-       unsigned long flags;
 
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               goto fail;
 
        if (unlikely(!pcs->rcu_free)) {
 
@@ -5292,17 +5410,16 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
                        goto do_free;
                }
 
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
 
                empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 
-               if (!empty) {
-                       stat(s, FREE_RCU_SHEAF_FAIL);
-                       return false;
-               }
+               if (!empty)
+                       goto fail;
 
-               local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-               pcs = this_cpu_ptr(s->cpu_sheaves);
+               pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+               if (!pcs)
+                       goto fail;
 
                if (unlikely(pcs->rcu_free))
                        barn_put_empty_sheaf(pcs->barn, empty, true);
@@ -5317,19 +5434,22 @@ do_free:
        rcu_sheaf->objects[rcu_sheaf->size++] = obj;
 
        if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
                stat(s, FREE_RCU_SHEAF);
                return true;
        }
 
        pcs->rcu_free = NULL;
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 
        stat(s, FREE_RCU_SHEAF);
-
        return true;
+
+fail:
+       stat(s, FREE_RCU_SHEAF_FAIL);
+       return false;
 }
 
 /*
@@ -5341,7 +5461,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 {
        struct slub_percpu_sheaves *pcs;
        struct slab_sheaf *main;
-       unsigned long flags;
        unsigned int batch, i = 0;
        bool init;
 
@@ -5364,8 +5483,9 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
        }
 
 next_batch:
-       local_lock_irqsave(&s->cpu_sheaves->lock, flags);
-       pcs = this_cpu_ptr(s->cpu_sheaves);
+       pcs = cpu_sheaves_trylock(s->cpu_sheaves);
+       if (!pcs)
+               goto fallback;
 
        if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 
@@ -5395,13 +5515,13 @@ next_batch:
                }
 
 no_empty:
-               local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+               cpu_sheaves_unlock(s->cpu_sheaves);
 
                /*
                 * if we depleted all empty sheaves in the barn or there are too
                 * many full sheaves, free the rest to slab pages
                 */
-
+fallback:
                __kmem_cache_free_bulk(s, size, p);
                return;
        }
@@ -5413,7 +5533,7 @@ do_free:
        memcpy(main->objects + main->size, p, batch * sizeof(void *));
        main->size += batch;
 
-       local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
+       cpu_sheaves_unlock(s->cpu_sheaves);
 
        stat_add(s, FREE_PCS, batch);
 
@@ -5513,9 +5633,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
        if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
                return;
 
-       if (s->cpu_sheaves)
-               free_to_pcs(s, object);
-       else
+       if (!s->cpu_sheaves || !free_to_pcs(s, object))
                do_slab_free(s, slab, object, object, 1, addr);
 }
 
@@ -6275,7 +6393,7 @@ static int init_percpu_sheaves(struct kmem_cache *s)
 
                pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-               local_lock_init(&pcs->lock);
+               local_tryirq_lock_init(&pcs->lock);
 
                nid = cpu_to_mem(cpu);