mm, slub: convert kmem_cpu_slab protection to local_lock

author Vlastimil Babka <vbabka@suse.cz>

Fri, 21 May 2021 23:59:38 +0000 (01:59 +0200)

committer Vlastimil Babka <vbabka@suse.cz>

Sat, 4 Sep 2021 08:22:01 +0000 (10:22 +0200)
author Vlastimil Babka <vbabka@suse.cz>
Fri, 21 May 2021 23:59:38 +0000 (01:59 +0200)
committer Vlastimil Babka <vbabka@suse.cz>
Sat, 4 Sep 2021 08:22:01 +0000 (10:22 +0200)
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h

index dcde82a4434caee2346115b861a3b9b73a2e5e47..85499f0586b06c55d2a494e5888bda1e2d651b9d 100644 (file)
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,6 +10,7 @@
  #include <linux/kfence.h>
  #include <linux/kobject.h>
  #include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
  
  enum stat_item {
         ALLOC_FASTPATH,         /* Allocation from cpu slab */
@@ -40,6 +41,10 @@ enum stat_item {
         CPU_PARTIAL_DRAIN,      /* Drain cpu partial to node partial */
         NR_SLUB_STAT_ITEMS };
  
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
  struct kmem_cache_cpu {
         void **freelist;        /* Pointer to next available object */
         unsigned long tid;      /* Globally unique transaction id */
@@ -47,6 +52,7 @@ struct kmem_cache_cpu {
  #ifdef CONFIG_SLUB_CPU_PARTIAL
         struct page *partial;   /* Partially allocated frozen slabs */
  #endif
+       local_lock_t lock;      /* Protects the fields above */
  #ifdef CONFIG_SLUB_STATS
         unsigned stat[NR_SLUB_STAT_ITEMS];
  #endif
diff --git a/mm/slub.c b/mm/slub.c

index 38d4cc51e880c5a19688560dcc5daa9bb47891b1..3d2025f7163b295c45c2b26a8c5260414b2fc238 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -46,13 +46,21 @@
  /*
   * Lock order:
   *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock
- *   3. slab_lock(page) (Only on some arches and for debugging)
+ *   2. node->list_lock (Spinlock)
+ *   3. kmem_cache->cpu_slab->lock (Local lock)
+ *   4. slab_lock(page) (Only on some arches or for debugging)
+ *   5. object_map_lock (Only for debugging)
   *
   *   slab_mutex
   *
   *   The role of the slab_mutex is to protect the list of all the slabs
   *   and to synchronize major metadata changes to slab cache structures.
+ *   Also synchronizes memory hotplug callbacks.
+ *
+ *   slab_lock
+ *
+ *   The slab_lock is a wrapper around the page lock, thus it is a bit
+ *   spinlock.
   *
   *   The slab_lock is only used for debugging and on arches that do not
   *   have the ability to do a cmpxchg_double. It only protects:
@@ -61,6 +69,8 @@
   *     C. page->objects        -> Number of objects in page
   *     D. page->frozen         -> frozen state
   *
+ *   Frozen slabs
+ *
   *   If a slab is frozen then it is exempt from list management. It is not
   *   on any list except per cpu partial list. The processor that froze the
   *   slab is the one who can perform list operations on the page. Other
@@ -68,6 +78,8 @@
   *   froze the slab is the only one that can retrieve the objects from the
   *   page's freelist.
   *
+ *   list_lock
+ *
   *   The list_lock protects the partial and full list on each node and
   *   the partial slab counter. If taken then no new slabs may be added or
   *   removed from the lists nor make the number of partial slabs be modified.
@@ -79,10 +91,36 @@
   *   slabs, operations can continue without any centralized lock. F.e.
   *   allocating a long series of objects that fill up slabs does not require
   *   the list lock.
- *   Interrupts are disabled during allocation and deallocation in order to
- *   make the slab allocator safe to use in the context of an irq. In addition
- *   interrupts are disabled to ensure that the processor does not change
- *   while handling per_cpu slabs, due to kernel preemption.
+ *
+ *   cpu_slab->lock local lock
+ *
+ *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ *   except the stat counters. This is a percpu structure manipulated only by
+ *   the local cpu, so the lock protects against being preempted or interrupted
+ *   by an irq. Fast path operations rely on lockless operations instead.
+ *   On PREEMPT_RT, the local lock does not actually disable irqs, and thus
+ *   does not prevent the lockless operations, so fastpath operations also
+ *   need to take the lock and are no longer lockless.
+ *
+ *   lockless fastpaths
+ *
+ *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ *   are fully lockless when satisfied from the percpu slab (and when
+ *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ *   They also don't disable preemption or migration or irqs. They rely on
+ *   the transaction id (tid) field to detect being preempted or moved to
+ *   another cpu.
+ *
+ *   irq, preemption, migration considerations
+ *
+ *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   around the slab_lock operation, in order to make the slab allocator safe
+ *   to use in the context of an irq.
+ *
+ *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ *   doesn't have to be revalidated in each section protected by the local lock.
   *
   * SLUB assigns one slab for allocation to each processor.
   * Allocations only occur from these slabs called cpu slabs.
@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure(const char *n,
  static void init_kmem_cache_cpus(struct kmem_cache *s)
  {
         int cpu;
+       struct kmem_cache_cpu *c;
  
-       for_each_possible_cpu(cpu)
-               per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+       for_each_possible_cpu(cpu) {
+               c = per_cpu_ptr(s->cpu_slab, cpu);
+               local_lock_init(&c->lock);
+               c->tid = init_tid(cpu);
+       }
  }
  
  /*
@@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kmem_cache *s)
         struct page *partial_page;
         unsigned long flags;
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         partial_page = this_cpu_read(s->cpu_slab->partial);
         this_cpu_write(s->cpu_slab->partial, NULL);
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
         if (partial_page)
                 __unfreeze_partials(s, partial_page);
@@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
         int pages = 0;
         int pobjects = 0;
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
  
         oldpage = this_cpu_read(s->cpu_slab->partial);
  
@@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
  
         this_cpu_write(s->cpu_slab->partial, page);
  
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
         if (page_to_unfreeze) {
                 __unfreeze_partials(s, page_to_unfreeze);
@@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
         struct page *page;
         void *freelist;
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
  
         page = c->page;
         freelist = c->freelist;
@@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
         c->freelist = NULL;
         c->tid = next_tid(c->tid);
  
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
         if (page) {
                 deactivate_slab(s, page, freelist);
@@ -2780,8 +2822,6 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
   * The page is still frozen if the return value is not NULL.
   *
   * If this function returns NULL then the page has been unfrozen.
- *
- * This function must be called with interrupt disabled.
   */
  static inline void *get_freelist(struct kmem_cache *s, struct page *page)
  {
@@ -2789,6 +2829,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
         unsigned long counters;
         void *freelist;
  
+       lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
         do {
                 freelist = page->freelist;
                 counters = page->counters;
@@ -2873,9 +2915,9 @@ redo:
                 goto deactivate_slab;
  
         /* must check again c->page in case we got preempted and it changed */
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (unlikely(page != c->page)) {
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 goto reread_page;
         }
         freelist = c->freelist;
@@ -2886,7 +2928,7 @@ redo:
  
         if (!freelist) {
                 c->page = NULL;
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 stat(s, DEACTIVATE_BYPASS);
                 goto new_slab;
         }
@@ -2895,7 +2937,7 @@ redo:
  
  load_freelist:
  
-       lockdep_assert_irqs_disabled();
+       lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
  
         /*
          * freelist is pointing to the list of objects to be used.
@@ -2905,39 +2947,39 @@ load_freelist:
         VM_BUG_ON(!c->page->frozen);
         c->freelist = get_freepointer(s, freelist);
         c->tid = next_tid(c->tid);
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
         return freelist;
  
  deactivate_slab:
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (page != c->page) {
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 goto reread_page;
         }
         freelist = c->freelist;
         c->page = NULL;
         c->freelist = NULL;
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
         deactivate_slab(s, page, freelist);
  
  new_slab:
  
         if (slub_percpu_partial(c)) {
-               local_irq_save(flags);
+               local_lock_irqsave(&s->cpu_slab->lock, flags);
                 if (unlikely(c->page)) {
-                       local_irq_restore(flags);
+                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                         goto reread_page;
                 }
                 if (unlikely(!slub_percpu_partial(c))) {
-                       local_irq_restore(flags);
+                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                         /* we were preempted and partial list got empty */
                         goto new_objects;
                 }
  
                 page = c->page = slub_percpu_partial(c);
                 slub_set_percpu_partial(c, page);
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 stat(s, CPU_PARTIAL_ALLOC);
                 goto redo;
         }
@@ -2990,7 +3032,7 @@ check_new_page:
  
  retry_load_page:
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (unlikely(c->page)) {
                 void *flush_freelist = c->freelist;
                 struct page *flush_page = c->page;
@@ -2999,7 +3041,7 @@ retry_load_page:
                 c->freelist = NULL;
                 c->tid = next_tid(c->tid);
  
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
                 deactivate_slab(s, flush_page, flush_freelist);
  
@@ -3118,7 +3160,15 @@ redo:
  
         object = c->freelist;
         page = c->page;
-       if (unlikely(!object || !page || !node_match(page, node))) {
+       /*
+        * We cannot use the lockless fastpath on PREEMPT_RT because if a
+        * slowpath has taken the local_lock_irqsave(), it is not protected
+        * against a fast path operation in an irq handler. So we need to take
+        * the slow path which uses local_lock. It is still relatively fast if
+        * there is a suitable cpu freelist.
+        */
+       if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+           unlikely(!object || !page || !node_match(page, node))) {
                 object = __slab_alloc(s, gfpflags, node, addr, c);
         } else {
                 void *next_object = get_freepointer_safe(s, object);
@@ -3378,6 +3428,7 @@ redo:
         barrier();
  
         if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
                 void **freelist = READ_ONCE(c->freelist);
  
                 set_freepointer(s, tail_obj, freelist);
@@ -3390,6 +3441,31 @@ redo:
                         note_cmpxchg_failure("slab_free", s, tid);
                         goto redo;
                 }
+#else /* CONFIG_PREEMPT_RT */
+               /*
+                * We cannot use the lockless fastpath on PREEMPT_RT because if
+                * a slowpath has taken the local_lock_irqsave(), it is not
+                * protected against a fast path operation in an irq handler. So
+                * we need to take the local_lock. We shouldn't simply defer to
+                * __slab_free() as that wouldn't use the cpu freelist at all.
+                */
+               void **freelist;
+
+               local_lock(&s->cpu_slab->lock);
+               c = this_cpu_ptr(s->cpu_slab);
+               if (unlikely(page != c->page)) {
+                       local_unlock(&s->cpu_slab->lock);
+                       goto redo;
+               }
+               tid = c->tid;
+               freelist = c->freelist;
+
+               set_freepointer(s, tail_obj, freelist);
+               c->freelist = head;
+               c->tid = next_tid(tid);
+
+               local_unlock(&s->cpu_slab->lock);
+#endif
                 stat(s, FREE_FASTPATH);
         } else
                 __slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
          * handlers invoking normal fastpath.
          */
         c = slub_get_cpu_ptr(s->cpu_slab);
-       local_irq_disable();
+       local_lock_irq(&s->cpu_slab->lock);
  
         for (i = 0; i < size; i++) {
                 void *object = kfence_alloc(s, s->object_size, flags);
@@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                          */
                         c->tid = next_tid(c->tid);
  
-                       local_irq_enable();
+                       local_unlock_irq(&s->cpu_slab->lock);
  
                         /*
                          * Invoking slow path likely have side-effect
@@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                         c = this_cpu_ptr(s->cpu_slab);
                         maybe_wipe_obj_freeptr(s, p[i]);
  
-                       local_irq_disable();
+                       local_lock_irq(&s->cpu_slab->lock);
  
                         continue; /* goto for-loop */
                 }
@@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                 maybe_wipe_obj_freeptr(s, p[i]);
         }
         c->tid = next_tid(c->tid);
-       local_irq_enable();
+       local_unlock_irq(&s->cpu_slab->lock);
         slub_put_cpu_ptr(s->cpu_slab);
  
         /*
author	Vlastimil Babka <vbabka@suse.cz>
	Fri, 21 May 2021 23:59:38 +0000 (01:59 +0200)
committer	Vlastimil Babka <vbabka@suse.cz>
	Sat, 4 Sep 2021 08:22:01 +0000 (10:22 +0200)
include/linux/slub_def.h		patch \| blob \| history
mm/slub.c		patch \| blob \| history