From d6fd0b53b3e79956a550e7e66fbc452edb29cb9f Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Wed, 15 Nov 2023 11:38:15 +0100
Subject: [PATCH] mm/slub: add opt-in percpu array cache of objects

kmem_cache_setup_percpu_array() will allocate a per-cpu array of the given
capacity for caching alloc/free objects of the cache. The cache has to have
been created with the SLAB_NO_MERGE flag.

Further allocations from the cache will allocate from the per-cpu array as
long as they have no NUMA node preference. Frees will also go to the array.

When the array is found empty during an allocation that is allowed to block,
half of the array is refilled from slabs by an internal bulk alloc operation.
When the array is found full during freeing, half of the array is flushed by
an internal bulk free operation.

The array does not distinguish NUMA locality of the cached objects. If an
allocation is requested with kmem_cache_alloc_node() with a specific node
(not NUMA_NO_NODE), the array is bypassed.

The bulk operations exposed to slab users also try to utilize the array when
possible, but they do not refill or flush it; if the array cannot serve the
request completely, the internal bulk alloc/free is used only to fulfil the
remaining part of the request. If kmemcg is enabled and active, bulk freeing
skips the array completely, as using it would be less efficient than not
using it.

kmem_cache_prefill_percpu_array() can be called to ensure the array on the
current cpu contains at least the given number of objects. If the array
needs to be refilled, it is refilled above the indicated count to prevent
many small prefills being followed by only a few actual allocations.
However, the prefill is only opportunistic as there is no cpu pinning or
disabled preemption between the prefill and the actual allocations.
Therefore, allocations cannot fully rely on the array for success even after
a prefill. But misses should be rare enough that e.g. GFP_ATOMIC allocations
should be acceptable to perform in a restricted context following a prefill.

Sysfs stat counters alloc_cpu_cache and free_cpu_cache count objects
allocated or freed using the percpu array; the counters cpu_cache_refill and
cpu_cache_flush count objects refilled into or flushed from the array. The
efficiency of reusing objects in the array can thus be determined by
comparing the alloc_cpu_cache/free_cpu_cache counters with the refill/flush
counters. The refill and flush operations also count towards the usual
alloc_fastpath/slowpath, free_fastpath/slowpath and other counters.

Access to the array is protected by local_lock_irqsave() operations.

When slub_debug is enabled for a cache with a percpu array, the objects in
the array are considered allocated from the slub_debug perspective, and the
alloc/free debugging hooks occur when moving the objects between the array
and slab pages. This means that e.g. a use-after-free that occurs for an
object cached in the array is undetected. Collected alloc/free stacktraces
might also be less useful. This limitation could be changed in the future.

On the other hand, KASAN, kmemcg and other hooks are executed on actual
allocations and frees by kmem_cache users even if those use the array, so
their debugging or accounting accuracy should be unaffected.
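To illustrate the intended usage, a minimal sketch follows. It is not part
of the diff below; "struct foo", the cache name and the chosen sizes are
made up for the example, and only the kmem_cache_*() calls are existing API
plus the two functions added by this patch:

#include <linux/slab.h>
#include <linux/init.h>

struct foo {
	unsigned long state;
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
	int err;

	/* The cache must be unmergeable to allow a percpu array. */
	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
				      SLAB_NO_MERGE, NULL);
	if (!foo_cache)
		return -ENOMEM;

	/* Attach a percpu array with a capacity of 32 objects per cpu. */
	err = kmem_cache_setup_percpu_array(foo_cache, 32);
	if (err) {
		kmem_cache_destroy(foo_cache);
		return err;
	}

	return 0;
}

static struct foo *foo_alloc_restricted(void)
{
	/*
	 * Opportunistically ensure at least 4 objects are cached on this
	 * cpu before entering a context that cannot block. The subsequent
	 * GFP_ATOMIC allocation will then usually, but not necessarily,
	 * be served from the array.
	 */
	if (kmem_cache_prefill_percpu_array(foo_cache, 4, GFP_KERNEL))
		return NULL;

	return kmem_cache_alloc(foo_cache, GFP_ATOMIC);
}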
Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 4 + mm/slab.h | 8 + mm/slub.c | 442 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 447 insertions(+), 7 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index b5f5ee8308d0..1163d3670999 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -197,6 +197,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count); + /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. @@ -502,6 +504,8 @@ void kmem_cache_free(struct kmem_cache *s, void *objp); void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, gfp_t gfp); + static __always_inline void kfree_bulk(size_t size, void **p) { kmem_cache_free_bulk(NULL, size, p); diff --git a/mm/slab.h b/mm/slab.h index 54deeb0428c6..145a6969eb2f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -245,6 +245,13 @@ struct kmem_cache_order_objects { unsigned int x; }; +struct slub_percpu_array { + local_lock_t lock; + unsigned int capacity; + unsigned int size; + void *objects[]; +}; + /* * Slab cache management. */ @@ -252,6 +259,7 @@ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif + struct slub_percpu_array __percpu *cpu_array; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; diff --git a/mm/slub.c b/mm/slub.c index 2ef88bbf56a3..ce7161716964 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -347,8 +347,10 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCA, /* Allocation from percpu array cache */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCA, /* Free to percpu array cache */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -373,6 +375,8 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + PCA_REFILL, /* Refilling empty percpu array cache */ + PCA_FLUSH, /* Flushing full percpu array cache */ NR_SLUB_STAT_ITEMS }; @@ -3054,6 +3058,8 @@ struct slub_flush_work { bool skip; }; +static void flush_pca(struct kmem_cache *s); + /* * Flush cpu slab. 
* @@ -3068,6 +3074,14 @@ static void flush_cpu_slab(struct work_struct *w) sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; + + if (s->cpu_array) { + struct slub_percpu_array *pca = this_cpu_ptr(s->cpu_array); + + if (pca->size) + flush_pca(s); + } + c = this_cpu_ptr(s->cpu_slab); if (c->slab) @@ -3083,6 +3097,18 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) return c->slab || slub_percpu_partial(c); } +static bool has_pca_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_array *pca; + + if (!s->cpu_array) + return false; + + pca = per_cpu_ptr(s->cpu_array, cpu); + + return (pca->size > 0); +} + static DEFINE_MUTEX(flush_lock); static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); @@ -3096,7 +3122,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pca_used(cpu, s)) { sfw->skip = true; continue; } @@ -3822,6 +3848,70 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } +/* + * Maximum number of objects allocated or freed during a refill or flush batch + * when the percpu array is empty or full, respectively. Translates directly to + * an on-stack array size. + */ +#define PCA_BATCH_MAX 32U + +static void *refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp, + bool want_obj); + +static __fastpath_inline +void *alloc_from_pca(struct kmem_cache *s, gfp_t gfp) +{ + struct slub_percpu_array *pca; + unsigned long flags; + void *object; + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + if (unlikely(pca->size == 0)) { + local_unlock_irqrestore(&s->cpu_array->lock, flags); + + if (!gfpflags_allow_blocking(gfp)) + return NULL; + + object = refill_pca(s, pca->capacity / 2, gfp, true); + + if (IS_ERR(object)) + return NULL; + + return object; + } + + object = pca->objects[--pca->size]; + + local_unlock_irqrestore(&s->cpu_array->lock, flags); + + stat(s, ALLOC_PCA); + + return object; +} + +static __fastpath_inline +int alloc_from_pca_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_array *pca; + unsigned long flags; + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + if (pca->size < size) + size = pca->size; + + pca->size -= size; + memcpy(p, pca->objects + pca->size, size * sizeof(void *)); + + local_unlock_irqrestore(&s->cpu_array->lock, flags); + stat_add(s, ALLOC_PCA, size); + + return size; +} + /* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call @@ -3847,7 +3937,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_array && (node == NUMA_NO_NODE)) + object = alloc_from_pca(s, gfpflags); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4211,6 +4305,71 @@ slab_empty: discard_slab(s, slab); } +static void __flush_pca(struct kmem_cache *s, struct slub_percpu_array *pca, + unsigned int count, unsigned long flags); + +/* + * Free an object to the percpu array. + * The object is expected to have passed slab_free_hook() already. 
+ */ +static __fastpath_inline +void free_to_pca(struct kmem_cache *s, void *object) +{ + struct slub_percpu_array *pca; + unsigned long flags; + + stat(s, FREE_PCA); + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + /* + * the array has the space for one extra pointer beyond capacity + * but we must then flush immediately without unlocking + */ + pca->objects[pca->size++] = object; + if (unlikely(pca->size > pca->capacity)) + __flush_pca(s, pca, pca->capacity / 2 + 1, flags); + else + local_unlock_irqrestore(&s->cpu_array->lock, flags); +} + +/* + * Bulk free objects to the percpu array. + * Unlike free_to_pca() this includes the calls to slab_free_hook() as that + * allows us to iterate the array of objects just once. + * There is no flushing. If the percpu array becomes full, anything over the + * capacity has to be freed to slabs directly. + * + * Returns how many objects were freed to the array. + */ +static __fastpath_inline +size_t free_to_pca_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_array *pca; + unsigned long flags; + bool init; + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + if (pca->capacity - pca->size < size) + size = pca->capacity - pca->size; + + init = slab_want_init_on_free(s); + + for (size_t i = 0; i < size; i++) { + if (likely(slab_free_hook(s, p[i], init))) + pca->objects[pca->size++] = p[i]; + } + + local_unlock_irqrestore(&s->cpu_array->lock, flags); + + stat_add(s, FREE_PCA, size); + + return size; +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4296,7 +4455,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, { memcg_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s)))) + return; + + if (s->cpu_array) + free_to_pca(s, object); + else do_slab_free(s, slab, object, object, 1, addr); } @@ -4516,6 +4680,26 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * In case the objects might need memcg_slab_free_hook(), skip the array + * because the hook is not effective with single objects and benefits + * from groups of objects from a single slab that the detached freelist + * builds. But once we build the detached freelist, it's wasteful to + * throw it away and put the objects into the array. 
+ * + * XXX: This test could be cache-specific if it was not possible to use + * __GFP_ACCOUNT with caches that are not SLAB_ACCOUNT + */ + if (s && s->cpu_array && !memcg_kmem_online()) { + size_t pca_freed = free_to_pca_bulk(s, size, p); + + if (pca_freed == size) + return; + + p += pca_freed; + size -= pca_freed; + } + do { struct detached_freelist df; @@ -4634,8 +4818,9 @@ error: int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; struct obj_cgroup *objcg = NULL; + int allocated = 0; + int from_pca = 0; if (!size) return 0; @@ -4645,23 +4830,194 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); + if (s->cpu_array) + from_pca = alloc_from_pca_bulk(s, size, p); + + if (from_pca < size) { + allocated = __kmem_cache_alloc_bulk(s, flags, size - from_pca, + p + from_pca); + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu array, we have bigger problems. + */ + if (unlikely(allocated == 0 && from_pca > 0)) { + __kmem_cache_free_bulk(s, from_pca, p); + from_pca = 0; + } + } + + allocated += from_pca; /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. */ - if (likely(i != 0)) { + if (likely(allocated != 0)) { slab_post_alloc_hook(s, objcg, flags, size, p, slab_want_init_on_alloc(flags, s), s->object_size); } else { memcg_slab_alloc_error_hook(s, size, objcg); } - return i; + return allocated; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); +/* + * Refill the per-cpu array from slabs. + * + * If want_obj is true, an object is returned in addition to filling the array + * and in that case it's returned even if the given count of objects could not + * be allocated, but a partial refill was successful. + * + * If want_obj is false, -ENOMEM is returned if the given count of objects could + * not be allocated, even if partial refill was successful. + */ +static void *refill_pca(struct kmem_cache *s, unsigned int count, gfp_t gfp, + bool want_obj) +{ + unsigned int batch, allocated; + struct slub_percpu_array *pca; + void *objects[PCA_BATCH_MAX]; + unsigned long flags; + void *obj = NULL; + + if (want_obj) + count++; + +next_batch: + batch = min(count, PCA_BATCH_MAX); + allocated = __kmem_cache_alloc_bulk(s, gfp, batch, &objects[0]); + if (!allocated) { + /* + * If we already have non-NULL obj, it means a previous batch + * succeeded and the caller does not care that the refill did + * not succeed completely. + */ + return obj ? obj : ERR_PTR(-ENOMEM); + } + + if (want_obj) { + obj = objects[--allocated]; + count--; + } + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + batch = min(allocated, pca->capacity - pca->size); + + memcpy(pca->objects + pca->size, objects, batch * sizeof(void *)); + pca->size += batch; + + local_unlock_irqrestore(&s->cpu_array->lock, flags); + + stat_add(s, PCA_REFILL, batch); + + /* + * We could have migrated to a different cpu or somebody else freed to + * pca while we were bulk allocating, and now we have too many objects. + */ + if (batch < allocated) { + __kmem_cache_free_bulk(s, allocated - batch, &objects[batch]); + } else if (batch < count) { + count -= batch; + want_obj = false; + goto next_batch; + } + + return obj; +} + +/* + * Called with pca->lock locked and corresponding flags, returns unlocked. 
+ */ +static void __flush_pca(struct kmem_cache *s, struct slub_percpu_array *pca, + unsigned int count, unsigned long flags) +{ + + unsigned int batch, remaining; + void *objects[PCA_BATCH_MAX]; + +next_batch: + batch = min(count, PCA_BATCH_MAX); + + batch = min(batch, pca->size); + + pca->size -= batch; + memcpy(objects, pca->objects + pca->size, batch * sizeof(void *)); + + remaining = pca->size; + + local_unlock_irqrestore(&s->cpu_array->lock, flags); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, PCA_FLUSH, batch); + + if (batch < count && remaining > 0) { + count -= batch; + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + goto next_batch; + } +} + +static void flush_pca(struct kmem_cache *s) +{ + struct slub_percpu_array *pca; + unsigned long flags; + + local_lock_irqsave(&s->cpu_array->lock, flags); + pca = this_cpu_ptr(s->cpu_array); + + __flush_pca(s, pca, pca->capacity, flags); +} + + +int kmem_cache_prefill_percpu_array(struct kmem_cache *s, unsigned int count, + gfp_t gfp) +{ + struct slub_percpu_array *pca; + unsigned int size; + void *refill; + + if (!gfpflags_allow_blocking(gfp)) + return -EINVAL; + + if (!s->cpu_array) + return -EINVAL; + + /* racy but we don't care */ + pca = raw_cpu_ptr(s->cpu_array); + + if (pca->capacity < count) + return -EINVAL; + + size = READ_ONCE(pca->size); + if (size >= count) + return 0; + + /* + * If the existing size is less than desired count, do not refill only + * up to pca->size == count, as this way we could end up doing many + * small refills if only few of the prefilled objects actually end up + * being allocated and we refill the same small amount on the next + * prefill. + * + * Instead set our target prefilled size to the requested amount plus + * half of the remaining capacity. This matches the refill in + * alloc_from_pca() also done up to half the capacity. + */ + count += (pca->capacity - count) / 2; + count -= size; + + refill = refill_pca(s, count, gfp, false); + if (IS_ERR(refill)) + return PTR_ERR(refill); + + return 0; +} /* * Object placement in a slab is made very easy because we always start at @@ -4897,6 +5253,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_array) + free_percpu(s->cpu_array); #ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); #endif @@ -5733,6 +6091,68 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } +/** + * kmem_cache_setup_percpu_array - Create a per-cpu array cache for the cache + * @s: The cache to add per-cpu array. Must be created with SLAB_NO_MERGE flag. + * @count: Size of the per-cpu array. + * + * After this call, allocations from the cache go through a percpu array. When + * it becomes empty, and gfp flags allow blocking, half is refilled with a bulk + * allocation. When it becomes full, half is flushed with a bulk free operation. + * + * The array cache does not distinguish NUMA nodes, so allocations via + * kmem_cache_alloc_node() with a node specified other than NUMA_NO_NODE will + * bypass the cache. + * + * Bulk allocation and free operations also try to use the array. + * + * kmem_cache_prefill_percpu_array() can be used to pre-fill the array cache + * before e.g. entering a restricted context. It is however not guaranteed that + * the caller will be able to subsequently consume the prefilled cache. 
Such + * failures should be however sufficiently rare so after the prefill, + * allocations using GFP_ATOMIC | __GFP_NOFAIL are acceptable for objects up to + * the prefilled amount. + * + * Limitations: when slub_debug is enabled for the cache, all relevant actions + * (i.e. poisoning, obtaining stacktraces) and checks happen when objects move + * between the array cache and slab pages, which may result in e.g. not + * detecting a use-after-free while the object is in the array cache, and the + * stacktraces may be less useful. + * + * Return: 0 if OK, -EINVAL on caches without SLAB_NO_MERGE or with the array + * already created, -ENOMEM when the per-cpu array creation fails. + */ +int kmem_cache_setup_percpu_array(struct kmem_cache *s, unsigned int count) +{ + int cpu; + + if (WARN_ON_ONCE(!(s->flags & SLAB_NO_MERGE))) + return -EINVAL; + + if (s->cpu_array) + return -EINVAL; + + /* + * the object array is the desired count + 1 so we can fit the object + * that triggers the need for flushing + */ + s->cpu_array = __alloc_percpu(struct_size(s->cpu_array, objects, + count + 1), sizeof(void *)); + + if (!s->cpu_array) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct slub_percpu_array *pca = per_cpu_ptr(s->cpu_array, cpu); + + local_lock_init(&pca->lock); + pca->capacity = count; + pca->size = 0; + } + + return 0; +} + #ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { @@ -6510,8 +6930,10 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCA, alloc_cpu_cache); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCA, free_cpu_cache); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -6536,6 +6958,8 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(PCA_REFILL, cpu_cache_refill); +STAT_ATTR(PCA_FLUSH, cpu_cache_flush); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -6597,8 +7021,10 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_cache_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_cache_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -6623,6 +7049,8 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &cpu_cache_refill_attr.attr, + &cpu_cache_flush_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- 2.49.0