www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
mm/slab: Move kvfree_rcu() into SLAB
author Uladzislau Rezki (Sony) <urezki@gmail.com>
Thu, 12 Dec 2024 18:02:08 +0000 (19:02 +0100)
committer Vlastimil Babka <vbabka@suse.cz>
Sat, 11 Jan 2025 19:39:43 +0000 (20:39 +0100)
Move kvfree_rcu() functionality to the slab_common.c file.

The reason to have kvfree_rcu() functionality as part of SLAB is that
there is a clear trend toward, and need for, closer integration. One
recent example is the creation of a barrier function for SLAB caches.

Another reason is to avoid having several implementations of RCU
machinery for reclaiming objects after a grace period. As a future
step, it can be integrated more easily with SLAB internals.
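
For context, a minimal usage sketch of the API being moved (not part
of this commit; "struct foo" and example_release() are illustrative
names only, everything else is the existing kvfree_rcu() interface):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* Hypothetical object with the rcu_head that the double-argument
     * form of kvfree_rcu() requires. */
    struct foo {
            int data;
            struct rcu_head rcu;
    };

    static void example_release(struct foo *fp, void *headless)
    {
            /* Double-argument form: fp is queued on a per-CPU batch
             * and kvfree()d after a grace period. */
            kvfree_rcu(fp, rcu);

            /* Head-less form: may sleep, since it can fall back to
             * synchronize_rcu() + kvfree() under memory pressure (see
             * kvfree_call_rcu() in the hunks below). */
            kvfree_rcu_mightsleep(headless);
    }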

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Hyeonggon Yoo <hyeonggon.yoo@sk.com>
Tested-by: Hyeonggon Yoo <hyeonggon.yoo@sk.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
include/linux/rcupdate.h
include/linux/slab.h
kernel/rcu/tree.c
mm/slab_common.c

include/linux/rcupdate.h
index acb0095b4dbe946aaf84a1cf8ee680a392a778c3..48e5c03df1dd83c246a61d0fcc8aa638adcd7654 100644
@@ -118,7 +118,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 
 /* Internal to kernel */
 void rcu_init(void);
-void __init kvfree_rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
 
include/linux/slab.h
index 10a971c2bde31813aa6d84f216f28f2d5025ef1b..09eedaecf1205672bb2e7c8cd57ae8fccebc2737 100644
@@ -1099,5 +1099,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s);
 size_t kmalloc_size_roundup(size_t size);
 
 void __init kmem_cache_init_late(void);
+void __init kvfree_rcu_init(void);
 
 #endif /* _LINUX_SLAB_H */
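
With the declaration now in slab.h instead of rcupdate.h, the one-time
boot initialization is reached through the slab header. A hedged sketch
of such a call site (hypothetical; the actual caller is outside the
four files of this diff and is not changed here):

    #include <linux/slab.h>

    /* Hypothetical boot-time sequence: kvfree_rcu_init() sets up the
     * per-CPU kfree_rcu_cpu structures and registers the
     * "slab-kvfree-rcu" shrinker (its body, moved verbatim from
     * kernel/rcu/tree.c, appears in the hunks below). */
    void __init example_boot_init(void)
    {
            kvfree_rcu_init();
    }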
kernel/rcu/tree.c
index b7ec998f360e7cee7ba5f1ab3cb64ca6034aa374..6af042cde9727c8d7f7e3d88f26ba222d0d9c535 100644
@@ -186,26 +186,6 @@ static int rcu_unlock_delay;
 module_param(rcu_unlock_delay, int, 0444);
 #endif
 
-/*
- * This rcu parameter is runtime-read-only. It reflects
- * a minimum allowed number of objects which can be cached
- * per-CPU. Object size is equal to one page. This value
- * can be changed at boot time.
- */
-static int rcu_min_cached_objs = 5;
-module_param(rcu_min_cached_objs, int, 0444);
-
-// A page shrinker can ask for pages to be freed to make them
-// available for other parts of the system. This usually happens
-// under low memory conditions, and in that case we should also
-// defer page-cache filling for a short time period.
-//
-// The default value is 5 seconds, which is long enough to reduce
-// interference with the shrinker while it asks other systems to
-// drain their caches.
-static int rcu_delay_page_cache_fill_msec = 5000;
-module_param(rcu_delay_page_cache_fill_msec, int, 0444);
-
 /* Retrieve RCU kthreads priority for rcutorture */
 int rcu_get_gp_kthreads_prio(void)
 {
@@ -3191,816 +3171,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (5 * HZ)
-#define KFREE_N_BATCHES 2
-#define FREE_N_CHANNELS 2
-
-/**
- * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
- * @list: List node. All blocks are linked between each other
- * @gp_snap: Snapshot of RCU state for objects placed to this bulk
- * @nr_records: Number of active pointers in the array
- * @records: Array of the kvfree_rcu() pointers
- */
-struct kvfree_rcu_bulk_data {
-       struct list_head list;
-       struct rcu_gp_oldstate gp_snap;
-       unsigned long nr_records;
-       void *records[] __counted_by(nr_records);
-};
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kvfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KVFREE_BULK_MAX_ENTR \
-       ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
-
-/**
- * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
- * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
- * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
- * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
- * @krcp: Pointer to @kfree_rcu_cpu structure
- */
-
-struct kfree_rcu_cpu_work {
-       struct rcu_work rcu_work;
-       struct rcu_head *head_free;
-       struct rcu_gp_oldstate head_free_gp_snap;
-       struct list_head bulk_head_free[FREE_N_CHANNELS];
-       struct kfree_rcu_cpu *krcp;
-};
-
-/**
- * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
- * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
- * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
- * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
- * @lock: Synchronize access to this structure
- * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @initialized: The @rcu_work fields have been initialized
- * @head_count: Number of objects in rcu_head singular list
- * @bulk_count: Number of objects in bulk-list
- * @bkvcache:
- *     A simple cache list that contains objects for reuse purpose.
- *     In order to save some per-cpu space the list is singular.
- *     Even though it is lockless an access has to be protected by the
- *     per-cpu lock.
- * @page_cache_work: A work to refill the cache when it is empty
- * @backoff_page_cache_fill: Delay cache refills
- * @work_in_progress: Indicates that page_cache_work is running
- * @hrtimer: A hrtimer for scheduling a page_cache_work
- * @nr_bkv_objs: number of allocated objects at @bkvcache.
- *
- * This is a per-CPU structure.  The reason that it is not included in
- * the rcu_data structure is to permit this code to be extracted from
- * the RCU files.  Such extraction could allow further optimization of
- * the interactions with the slab allocators.
- */
-struct kfree_rcu_cpu {
-       // Objects queued on a linked list
-       // through their rcu_head structures.
-       struct rcu_head *head;
-       unsigned long head_gp_snap;
-       atomic_t head_count;
-
-       // Objects queued on a bulk-list.
-       struct list_head bulk_head[FREE_N_CHANNELS];
-       atomic_t bulk_count[FREE_N_CHANNELS];
-
-       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
-       raw_spinlock_t lock;
-       struct delayed_work monitor_work;
-       bool initialized;
-
-       struct delayed_work page_cache_work;
-       atomic_t backoff_page_cache_fill;
-       atomic_t work_in_progress;
-       struct hrtimer hrtimer;
-
-       struct llist_head bkvcache;
-       int nr_bkv_objs;
-};
-
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
-       .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
-};
-
-static __always_inline void
-debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
-{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-       int i;
-
-       for (i = 0; i < bhead->nr_records; i++)
-               debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
-#endif
-}
-
-static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
-{
-       struct kfree_rcu_cpu *krcp;
-
-       local_irq_save(*flags); // For safely calling this_cpu_ptr().
-       krcp = this_cpu_ptr(&krc);
-       raw_spin_lock(&krcp->lock);
-
-       return krcp;
-}
-
-static inline void
-krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
-{
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
-static inline struct kvfree_rcu_bulk_data *
-get_cached_bnode(struct kfree_rcu_cpu *krcp)
-{
-       if (!krcp->nr_bkv_objs)
-               return NULL;
-
-       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
-       return (struct kvfree_rcu_bulk_data *)
-               llist_del_first(&krcp->bkvcache);
-}
-
-static inline bool
-put_cached_bnode(struct kfree_rcu_cpu *krcp,
-       struct kvfree_rcu_bulk_data *bnode)
-{
-       // Check the limit.
-       if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
-               return false;
-
-       llist_add((struct llist_node *) bnode, &krcp->bkvcache);
-       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
-       return true;
-}
-
-static int
-drain_page_cache(struct kfree_rcu_cpu *krcp)
-{
-       unsigned long flags;
-       struct llist_node *page_list, *pos, *n;
-       int freed = 0;
-
-       if (!rcu_min_cached_objs)
-               return 0;
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       page_list = llist_del_all(&krcp->bkvcache);
-       WRITE_ONCE(krcp->nr_bkv_objs, 0);
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-       llist_for_each_safe(pos, n, page_list) {
-               free_page((unsigned long)pos);
-               freed++;
-       }
-
-       return freed;
-}
-
-static void
-kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
-       struct kvfree_rcu_bulk_data *bnode, int idx)
-{
-       unsigned long flags;
-       int i;
-
-       if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
-               debug_rcu_bhead_unqueue(bnode);
-               rcu_lock_acquire(&rcu_callback_map);
-               if (idx == 0) { // kmalloc() / kfree().
-                       trace_rcu_invoke_kfree_bulk_callback(
-                               "slab", bnode->nr_records,
-                               bnode->records);
-
-                       kfree_bulk(bnode->nr_records, bnode->records);
-               } else { // vmalloc() / vfree().
-                       for (i = 0; i < bnode->nr_records; i++) {
-                               trace_rcu_invoke_kvfree_callback(
-                                       "slab", bnode->records[i], 0);
-
-                               vfree(bnode->records[i]);
-                       }
-               }
-               rcu_lock_release(&rcu_callback_map);
-       }
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       if (put_cached_bnode(krcp, bnode))
-               bnode = NULL;
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-       if (bnode)
-               free_page((unsigned long) bnode);
-
-       cond_resched_tasks_rcu_qs();
-}
-
-static void
-kvfree_rcu_list(struct rcu_head *head)
-{
-       struct rcu_head *next;
-
-       for (; head; head = next) {
-               void *ptr = (void *) head->func;
-               unsigned long offset = (void *) head - ptr;
-
-               next = head->next;
-               debug_rcu_head_unqueue((struct rcu_head *)ptr);
-               rcu_lock_acquire(&rcu_callback_map);
-               trace_rcu_invoke_kvfree_callback("slab", head, offset);
-
-               if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
-                       kvfree(ptr);
-
-               rcu_lock_release(&rcu_callback_map);
-               cond_resched_tasks_rcu_qs();
-       }
-}
-
-/*
- * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bulk_head_free or ->head_free.
- */
-static void kfree_rcu_work(struct work_struct *work)
-{
-       unsigned long flags;
-       struct kvfree_rcu_bulk_data *bnode, *n;
-       struct list_head bulk_head[FREE_N_CHANNELS];
-       struct rcu_head *head;
-       struct kfree_rcu_cpu *krcp;
-       struct kfree_rcu_cpu_work *krwp;
-       struct rcu_gp_oldstate head_gp_snap;
-       int i;
-
-       krwp = container_of(to_rcu_work(work),
-               struct kfree_rcu_cpu_work, rcu_work);
-       krcp = krwp->krcp;
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       // Channels 1 and 2.
-       for (i = 0; i < FREE_N_CHANNELS; i++)
-               list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
-
-       // Channel 3.
-       head = krwp->head_free;
-       krwp->head_free = NULL;
-       head_gp_snap = krwp->head_free_gp_snap;
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-       // Handle the first two channels.
-       for (i = 0; i < FREE_N_CHANNELS; i++) {
-               // Start from the tail page, so a GP is likely passed for it.
-               list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
-                       kvfree_rcu_bulk(krcp, bnode, i);
-       }
-
-       /*
-        * This is used when the "bulk" path can not be used for the
-        * double-argument of kvfree_rcu().  This happens when the
-        * page-cache is empty, which means that objects are instead
-        * queued on a linked list through their rcu_head structures.
-        * This list is named "Channel 3".
-        */
-       if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
-               kvfree_rcu_list(head);
-}
-
-static bool
-need_offload_krc(struct kfree_rcu_cpu *krcp)
-{
-       int i;
-
-       for (i = 0; i < FREE_N_CHANNELS; i++)
-               if (!list_empty(&krcp->bulk_head[i]))
-                       return true;
-
-       return !!READ_ONCE(krcp->head);
-}
-
-static bool
-need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
-{
-       int i;
-
-       for (i = 0; i < FREE_N_CHANNELS; i++)
-               if (!list_empty(&krwp->bulk_head_free[i]))
-                       return true;
-
-       return !!krwp->head_free;
-}
-
-static int krc_count(struct kfree_rcu_cpu *krcp)
-{
-       int sum = atomic_read(&krcp->head_count);
-       int i;
-
-       for (i = 0; i < FREE_N_CHANNELS; i++)
-               sum += atomic_read(&krcp->bulk_count[i]);
-
-       return sum;
-}
-
-static void
-__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
-{
-       long delay, delay_left;
-
-       delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
-       if (delayed_work_pending(&krcp->monitor_work)) {
-               delay_left = krcp->monitor_work.timer.expires - jiffies;
-               if (delay < delay_left)
-                       mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
-               return;
-       }
-       queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
-}
-
-static void
-schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       __schedule_delayed_monitor_work(krcp);
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
-static void
-kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
-{
-       struct list_head bulk_ready[FREE_N_CHANNELS];
-       struct kvfree_rcu_bulk_data *bnode, *n;
-       struct rcu_head *head_ready = NULL;
-       unsigned long flags;
-       int i;
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       for (i = 0; i < FREE_N_CHANNELS; i++) {
-               INIT_LIST_HEAD(&bulk_ready[i]);
-
-               list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
-                       if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
-                               break;
-
-                       atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
-                       list_move(&bnode->list, &bulk_ready[i]);
-               }
-       }
-
-       if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
-               head_ready = krcp->head;
-               atomic_set(&krcp->head_count, 0);
-               WRITE_ONCE(krcp->head, NULL);
-       }
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-       for (i = 0; i < FREE_N_CHANNELS; i++) {
-               list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
-                       kvfree_rcu_bulk(krcp, bnode, i);
-       }
-
-       if (head_ready)
-               kvfree_rcu_list(head_ready);
-}
-
-/*
- * Return: %true if a work is queued, %false otherwise.
- */
-static bool
-kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
-{
-       unsigned long flags;
-       bool queued = false;
-       int i, j;
-
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-
-       // Attempt to start a new batch.
-       for (i = 0; i < KFREE_N_BATCHES; i++) {
-               struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
-
-               // Try to detach bulk_head or head and attach it, only when
-               // all channels are free.  Any channel is not free means at krwp
-               // there is on-going rcu work to handle krwp's free business.
-               if (need_wait_for_krwp_work(krwp))
-                       continue;
-
-               // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
-               if (need_offload_krc(krcp)) {
-                       // Channel 1 corresponds to the SLAB-pointer bulk path.
-                       // Channel 2 corresponds to vmalloc-pointer bulk path.
-                       for (j = 0; j < FREE_N_CHANNELS; j++) {
-                               if (list_empty(&krwp->bulk_head_free[j])) {
-                                       atomic_set(&krcp->bulk_count[j], 0);
-                                       list_replace_init(&krcp->bulk_head[j],
-                                               &krwp->bulk_head_free[j]);
-                               }
-                       }
-
-                       // Channel 3 corresponds to both SLAB and vmalloc
-                       // objects queued on the linked list.
-                       if (!krwp->head_free) {
-                               krwp->head_free = krcp->head;
-                               get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
-                               atomic_set(&krcp->head_count, 0);
-                               WRITE_ONCE(krcp->head, NULL);
-                       }
-
-                       // One work is per one batch, so there are three
-                       // "free channels", the batch can handle. Break
-                       // the loop since it is done with this CPU thus
-                       // queuing an RCU work is _always_ success here.
-                       queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
-                       WARN_ON_ONCE(!queued);
-                       break;
-               }
-       }
-
-       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-       return queued;
-}
-
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
-{
-       struct kfree_rcu_cpu *krcp = container_of(work,
-               struct kfree_rcu_cpu, monitor_work.work);
-
-       // Drain ready for reclaim.
-       kvfree_rcu_drain_ready(krcp);
-
-       // Queue a batch for a rest.
-       kvfree_rcu_queue_batch(krcp);
-
-       // If there is nothing to detach, it means that our job is
-       // successfully done here. In case of having at least one
-       // of the channels that is still busy we should rearm the
-       // work to repeat an attempt. Because previous batches are
-       // still in progress.
-       if (need_offload_krc(krcp))
-               schedule_delayed_monitor_work(krcp);
-}
-
-static void fill_page_cache_func(struct work_struct *work)
-{
-       struct kvfree_rcu_bulk_data *bnode;
-       struct kfree_rcu_cpu *krcp =
-               container_of(work, struct kfree_rcu_cpu,
-                       page_cache_work.work);
-       unsigned long flags;
-       int nr_pages;
-       bool pushed;
-       int i;
-
-       nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
-               1 : rcu_min_cached_objs;
-
-       for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
-               bnode = (struct kvfree_rcu_bulk_data *)
-                       __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-
-               if (!bnode)
-                       break;
-
-               raw_spin_lock_irqsave(&krcp->lock, flags);
-               pushed = put_cached_bnode(krcp, bnode);
-               raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-               if (!pushed) {
-                       free_page((unsigned long) bnode);
-                       break;
-               }
-       }
-
-       atomic_set(&krcp->work_in_progress, 0);
-       atomic_set(&krcp->backoff_page_cache_fill, 0);
-}
-
-// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
-// state specified by flags.  If can_alloc is true, the caller must
-// be schedulable and not be holding any locks or mutexes that might be
-// acquired by the memory allocator or anything that it might invoke.
-// Returns true if ptr was successfully recorded, else the caller must
-// use a fallback.
-static inline bool
-add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
-       unsigned long *flags, void *ptr, bool can_alloc)
-{
-       struct kvfree_rcu_bulk_data *bnode;
-       int idx;
-
-       *krcp = krc_this_cpu_lock(flags);
-       if (unlikely(!(*krcp)->initialized))
-               return false;
-
-       idx = !!is_vmalloc_addr(ptr);
-       bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
-               struct kvfree_rcu_bulk_data, list);
-
-       /* Check if a new block is required. */
-       if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
-               bnode = get_cached_bnode(*krcp);
-               if (!bnode && can_alloc) {
-                       krc_this_cpu_unlock(*krcp, *flags);
-
-                       // __GFP_NORETRY - allows a light-weight direct reclaim
-                       // what is OK from minimizing of fallback hitting point of
-                       // view. Apart of that it forbids any OOM invoking what is
-                       // also beneficial since we are about to release memory soon.
-                       //
-                       // __GFP_NOMEMALLOC - prevents from consuming of all the
-                       // memory reserves. Please note we have a fallback path.
-                       //
-                       // __GFP_NOWARN - it is supposed that an allocation can
-                       // be failed under low memory or high memory pressure
-                       // scenarios.
-                       bnode = (struct kvfree_rcu_bulk_data *)
-                               __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-                       raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
-               }
-
-               if (!bnode)
-                       return false;
-
-               // Initialize the new block and attach it.
-               bnode->nr_records = 0;
-               list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
-       }
-
-       // Finally insert and update the GP for this page.
-       bnode->nr_records++;
-       bnode->records[bnode->nr_records - 1] = ptr;
-       get_state_synchronize_rcu_full(&bnode->gp_snap);
-       atomic_inc(&(*krcp)->bulk_count[idx]);
-
-       return true;
-}
-
-#if !defined(CONFIG_TINY_RCU)
-
-static enum hrtimer_restart
-schedule_page_work_fn(struct hrtimer *t)
-{
-       struct kfree_rcu_cpu *krcp =
-               container_of(t, struct kfree_rcu_cpu, hrtimer);
-
-       queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
-       return HRTIMER_NORESTART;
-}
-
-static void
-run_page_cache_worker(struct kfree_rcu_cpu *krcp)
-{
-       // If cache disabled, bail out.
-       if (!rcu_min_cached_objs)
-               return;
-
-       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-                       !atomic_xchg(&krcp->work_in_progress, 1)) {
-               if (atomic_read(&krcp->backoff_page_cache_fill)) {
-                       queue_delayed_work(system_unbound_wq,
-                               &krcp->page_cache_work,
-                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
-               } else {
-                       hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-                       krcp->hrtimer.function = schedule_page_work_fn;
-                       hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
-               }
-       }
-}
-
-void __init kfree_rcu_scheduler_running(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
-               if (need_offload_krc(krcp))
-                       schedule_delayed_monitor_work(krcp);
-       }
-}
-
-/*
- * Queue a request for lazy invocation of the appropriate free routine
- * after a grace period.  Please note that three paths are maintained,
- * two for the common case using arrays of pointers and a third one that
- * is used only when the main paths cannot be used, for example, due to
- * memory pressure.
- *
- * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
- * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
- * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
- */
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
-{
-       unsigned long flags;
-       struct kfree_rcu_cpu *krcp;
-       bool success;
-
-       /*
-        * Please note there is a limitation for the head-less
-        * variant, that is why there is a clear rule for such
-        * objects: it can be used from might_sleep() context
-        * only. For other places please embed an rcu_head to
-        * your data.
-        */
-       if (!head)
-               might_sleep();
-
-       // Queue the object but don't yet schedule the batch.
-       if (debug_rcu_head_queue(ptr)) {
-               // Probable double kfree_rcu(), just leak.
-               WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
-                         __func__, head);
-
-               // Mark as success and leave.
-               return;
-       }
-
-       kasan_record_aux_stack_noalloc(ptr);
-       success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
-       if (!success) {
-               run_page_cache_worker(krcp);
-
-               if (head == NULL)
-                       // Inline if kvfree_rcu(one_arg) call.
-                       goto unlock_return;
-
-               head->func = ptr;
-               head->next = krcp->head;
-               WRITE_ONCE(krcp->head, head);
-               atomic_inc(&krcp->head_count);
-
-               // Take a snapshot for this krcp.
-               krcp->head_gp_snap = get_state_synchronize_rcu();
-               success = true;
-       }
-
-       /*
-        * The kvfree_rcu() caller considers the pointer freed at this point
-        * and likely removes any references to it. Since the actual slab
-        * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
-        * this object (no scanning or false positives reporting).
-        */
-       kmemleak_ignore(ptr);
-
-       // Set timer to drain after KFREE_DRAIN_JIFFIES.
-       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
-               __schedule_delayed_monitor_work(krcp);
-
-unlock_return:
-       krc_this_cpu_unlock(krcp, flags);
-
-       /*
-        * Inline kvfree() after synchronize_rcu(). We can do
-        * it from might_sleep() context only, so the current
-        * CPU can pass the QS state.
-        */
-       if (!success) {
-               debug_rcu_head_unqueue((struct rcu_head *) ptr);
-               synchronize_rcu();
-               kvfree(ptr);
-       }
-}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
-
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that a single argument of kvfree_rcu() call has a slow path that
- * triggers synchronize_rcu() following by freeing a pointer. It is done
- * before the return from the function. Therefore for any single-argument
- * call that will result in a kfree() to a cache that is to be destroyed
- * during module exit, it is developer's responsibility to ensure that all
- * such calls have returned before the call to kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
-{
-       struct kfree_rcu_cpu_work *krwp;
-       struct kfree_rcu_cpu *krcp;
-       bool queued;
-       int i, cpu;
-
-       /*
-        * Firstly we detach objects and queue them over an RCU-batch
-        * for all CPUs. Finally queued works are flushed for each CPU.
-        *
-        * Please note. If there are outstanding batches for a particular
-        * CPU, those have to be finished first following by queuing a new.
-        */
-       for_each_possible_cpu(cpu) {
-               krcp = per_cpu_ptr(&krc, cpu);
-
-               /*
-                * Check if this CPU has any objects which have been queued for a
-                * new GP completion. If not(means nothing to detach), we are done
-                * with it. If any batch is pending/running for this "krcp", below
-                * per-cpu flush_rcu_work() waits its completion(see last step).
-                */
-               if (!need_offload_krc(krcp))
-                       continue;
-
-               while (1) {
-                       /*
-                        * If we are not able to queue a new RCU work it means:
-                        * - batches for this CPU are still in flight which should
-                        *   be flushed first and then repeat;
-                        * - no objects to detach, because of concurrency.
-                        */
-                       queued = kvfree_rcu_queue_batch(krcp);
-
-                       /*
-                        * Bail out, if there is no need to offload this "krcp"
-                        * anymore. As noted earlier it can run concurrently.
-                        */
-                       if (queued || !need_offload_krc(krcp))
-                               break;
-
-                       /* There are ongoing batches. */
-                       for (i = 0; i < KFREE_N_BATCHES; i++) {
-                               krwp = &(krcp->krw_arr[i]);
-                               flush_rcu_work(&krwp->rcu_work);
-                       }
-               }
-       }
-
-       /*
-        * Now we guarantee that all objects are flushed.
-        */
-       for_each_possible_cpu(cpu) {
-               krcp = per_cpu_ptr(&krc, cpu);
-
-               /*
-                * A monitor work can drain ready to reclaim objects
-                * directly. Wait its completion if running or pending.
-                */
-               cancel_delayed_work_sync(&krcp->monitor_work);
-
-               for (i = 0; i < KFREE_N_BATCHES; i++) {
-                       krwp = &(krcp->krw_arr[i]);
-                       flush_rcu_work(&krwp->rcu_work);
-               }
-       }
-}
-EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
-
-#endif /* #if !defined(CONFIG_TINY_RCU) */
-
-static unsigned long
-kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
-{
-       int cpu;
-       unsigned long count = 0;
-
-       /* Snapshot count of all CPUs */
-       for_each_possible_cpu(cpu) {
-               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
-               count += krc_count(krcp);
-               count += READ_ONCE(krcp->nr_bkv_objs);
-               atomic_set(&krcp->backoff_page_cache_fill, 1);
-       }
-
-       return count == 0 ? SHRINK_EMPTY : count;
-}
-
-static unsigned long
-kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
-{
-       int cpu, freed = 0;
-
-       for_each_possible_cpu(cpu) {
-               int count;
-               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
-               count = krc_count(krcp);
-               count += drain_page_cache(krcp);
-               kfree_rcu_monitor(&krcp->monitor_work.work);
-
-               sc->nr_to_scan -= count;
-               freed += count;
-
-               if (sc->nr_to_scan <= 0)
-                       break;
-       }
-
-       return freed == 0 ? SHRINK_STOP : freed;
-}
-
 /*
  * During early boot, any blocking grace-period wait automatically
  * implies a grace period.
@@ -5652,55 +4822,6 @@ static void __init rcu_dump_rcu_node_tree(void)
 
 struct workqueue_struct *rcu_gp_wq;
 
-void __init kvfree_rcu_init(void)
-{
-       int cpu;
-       int i, j;
-       struct shrinker *kfree_rcu_shrinker;
-
-       /* Clamp it to [0:100] seconds interval. */
-       if (rcu_delay_page_cache_fill_msec < 0 ||
-               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
-
-               rcu_delay_page_cache_fill_msec =
-                       clamp(rcu_delay_page_cache_fill_msec, 0,
-                               (int) (100 * MSEC_PER_SEC));
-
-               pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
-                       rcu_delay_page_cache_fill_msec);
-       }
-
-       for_each_possible_cpu(cpu) {
-               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
-               for (i = 0; i < KFREE_N_BATCHES; i++) {
-                       INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
-                       krcp->krw_arr[i].krcp = krcp;
-
-                       for (j = 0; j < FREE_N_CHANNELS; j++)
-                               INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
-               }
-
-               for (i = 0; i < FREE_N_CHANNELS; i++)
-                       INIT_LIST_HEAD(&krcp->bulk_head[i]);
-
-               INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-               INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
-               krcp->initialized = true;
-       }
-
-       kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
-       if (!kfree_rcu_shrinker) {
-               pr_err("Failed to allocate kfree_rcu() shrinker!\n");
-               return;
-       }
-
-       kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
-       kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
-
-       shrinker_register(kfree_rcu_shrinker);
-}
-
 void __init rcu_init(void)
 {
        int cpu = smp_processor_id();
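
One detail worth spelling out before the slab_common.c hunk: the sizing
comment on KVFREE_BULK_MAX_ENTR says the macro is derived so that
struct kvfree_rcu_bulk_data plus its records[] array fills exactly one
page. A hedged compile-time check of that invariant (assuming the
layout shown in this diff; on a typical 64-bit configuration with 4 KiB
pages, the 40-byte header leaves (4096 - 40) / 8 = 507 record slots):

    #include <linux/build_bug.h>

    /* Sanity sketch, not part of the commit: a block plus a full
     * records[] array must occupy at most one page. */
    static_assert(sizeof(struct kvfree_rcu_bulk_data) +
                  KVFREE_BULK_MAX_ENTR * sizeof(void *) <= PAGE_SIZE,
                  "kvfree_rcu_bulk_data must fit in a single page");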
mm/slab_common.c
index a29457bef626feadabdd674ab97aa822b3487e9b..69f2d19010dedaa3e5b303ab9803c8cdd40152fa 100644
@@ -28,7 +28,9 @@
 #include <asm/page.h>
 #include <linux/memcontrol.h>
 #include <linux/stackdepot.h>
+#include <trace/events/rcu.h>
 
+#include "../kernel/rcu/rcu.h"
 #include "internal.h"
 #include "slab.h"
 
@@ -1282,3 +1284,881 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kfree);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
 
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 5;
+module_param(rcu_min_cached_objs, int, 0444);
+
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
+#define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
+
+/**
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kvfree_rcu() pointers
+ */
+struct kvfree_rcu_bulk_data {
+       struct list_head list;
+       struct rcu_gp_oldstate gp_snap;
+       unsigned long nr_records;
+       void *records[] __counted_by(nr_records);
+};
+
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+       ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+       struct rcu_work rcu_work;
+       struct rcu_head *head_free;
+       struct rcu_gp_oldstate head_free_gp_snap;
+       struct list_head bulk_head_free[FREE_N_CHANNELS];
+       struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @initialized: The @rcu_work fields have been initialized
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
+ * @bkvcache:
+ *     A simple cache list that contains objects for reuse purpose.
+ *     In order to save some per-cpu space the list is singular.
+ *     Even though it is lockless an access has to be protected by the
+ *     per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: number of allocated objects at @bkvcache.
+ *
+ * This is a per-CPU structure.  The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files.  Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+       // Objects queued on a linked list
+       // through their rcu_head structures.
+       struct rcu_head *head;
+       unsigned long head_gp_snap;
+       atomic_t head_count;
+
+       // Objects queued on a bulk-list.
+       struct list_head bulk_head[FREE_N_CHANNELS];
+       atomic_t bulk_count[FREE_N_CHANNELS];
+
+       struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+       raw_spinlock_t lock;
+       struct delayed_work monitor_work;
+       bool initialized;
+
+       struct delayed_work page_cache_work;
+       atomic_t backoff_page_cache_fill;
+       atomic_t work_in_progress;
+       struct hrtimer hrtimer;
+
+       struct llist_head bkvcache;
+       int nr_bkv_objs;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
+
+static __always_inline void
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+       int i;
+
+       for (i = 0; i < bhead->nr_records; i++)
+               debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
+#endif
+}
+
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+       struct kfree_rcu_cpu *krcp;
+
+       local_irq_save(*flags); // For safely calling this_cpu_ptr().
+       krcp = this_cpu_ptr(&krc);
+       raw_spin_lock(&krcp->lock);
+
+       return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+       if (!krcp->nr_bkv_objs)
+               return NULL;
+
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+       return (struct kvfree_rcu_bulk_data *)
+               llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+       struct kvfree_rcu_bulk_data *bnode)
+{
+       // Check the limit.
+       if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+               return false;
+
+       llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+       return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+       unsigned long flags;
+       struct llist_node *page_list, *pos, *n;
+       int freed = 0;
+
+       if (!rcu_min_cached_objs)
+               return 0;
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       page_list = llist_del_all(&krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, 0);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+       llist_for_each_safe(pos, n, page_list) {
+               free_page((unsigned long)pos);
+               freed++;
+       }
+
+       return freed;
+}
+
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+       struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+       unsigned long flags;
+       int i;
+
+       if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+               debug_rcu_bhead_unqueue(bnode);
+               rcu_lock_acquire(&rcu_callback_map);
+               if (idx == 0) { // kmalloc() / kfree().
+                       trace_rcu_invoke_kfree_bulk_callback(
+                               "slab", bnode->nr_records,
+                               bnode->records);
+
+                       kfree_bulk(bnode->nr_records, bnode->records);
+               } else { // vmalloc() / vfree().
+                       for (i = 0; i < bnode->nr_records; i++) {
+                               trace_rcu_invoke_kvfree_callback(
+                                       "slab", bnode->records[i], 0);
+
+                               vfree(bnode->records[i]);
+                       }
+               }
+               rcu_lock_release(&rcu_callback_map);
+       }
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       if (put_cached_bnode(krcp, bnode))
+               bnode = NULL;
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+       if (bnode)
+               free_page((unsigned long) bnode);
+
+       cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+       struct rcu_head *next;
+
+       for (; head; head = next) {
+               void *ptr = (void *) head->func;
+               unsigned long offset = (void *) head - ptr;
+
+               next = head->next;
+               debug_rcu_head_unqueue((struct rcu_head *)ptr);
+               rcu_lock_acquire(&rcu_callback_map);
+               trace_rcu_invoke_kvfree_callback("slab", head, offset);
+
+               if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+                       kvfree(ptr);
+
+               rcu_lock_release(&rcu_callback_map);
+               cond_resched_tasks_rcu_qs();
+       }
+}
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+       unsigned long flags;
+       struct kvfree_rcu_bulk_data *bnode, *n;
+       struct list_head bulk_head[FREE_N_CHANNELS];
+       struct rcu_head *head;
+       struct kfree_rcu_cpu *krcp;
+       struct kfree_rcu_cpu_work *krwp;
+       struct rcu_gp_oldstate head_gp_snap;
+       int i;
+
+       krwp = container_of(to_rcu_work(work),
+               struct kfree_rcu_cpu_work, rcu_work);
+       krcp = krwp->krcp;
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       // Channels 1 and 2.
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+
+       // Channel 3.
+       head = krwp->head_free;
+       krwp->head_free = NULL;
+       head_gp_snap = krwp->head_free_gp_snap;
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+       // Handle the first two channels.
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               // Start from the tail page, so a GP is likely passed for it.
+               list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+                       kvfree_rcu_bulk(krcp, bnode, i);
+       }
+
+       /*
+        * This is used when the "bulk" path can not be used for the
+        * double-argument of kvfree_rcu().  This happens when the
+        * page-cache is empty, which means that objects are instead
+        * queued on a linked list through their rcu_head structures.
+        * This list is named "Channel 3".
+        */
+       if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+               kvfree_rcu_list(head);
+}
+
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (!list_empty(&krcp->bulk_head[i]))
+                       return true;
+
+       return !!READ_ONCE(krcp->head);
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (!list_empty(&krwp->bulk_head_free[i]))
+                       return true;
+
+       return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+       int sum = atomic_read(&krcp->head_count);
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               sum += atomic_read(&krcp->bulk_count[i]);
+
+       return sum;
+}
+
+static void
+__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+       long delay, delay_left;
+
+       delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+       if (delayed_work_pending(&krcp->monitor_work)) {
+               delay_left = krcp->monitor_work.timer.expires - jiffies;
+               if (delay < delay_left)
+                       mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+               return;
+       }
+       queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       __schedule_delayed_monitor_work(krcp);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+       struct list_head bulk_ready[FREE_N_CHANNELS];
+       struct kvfree_rcu_bulk_data *bnode, *n;
+       struct rcu_head *head_ready = NULL;
+       unsigned long flags;
+       int i;
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               INIT_LIST_HEAD(&bulk_ready[i]);
+
+               list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+                       if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+                               break;
+
+                       atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+                       list_move(&bnode->list, &bulk_ready[i]);
+               }
+       }
+
+       if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+               head_ready = krcp->head;
+               atomic_set(&krcp->head_count, 0);
+               WRITE_ONCE(krcp->head, NULL);
+       }
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+                       kvfree_rcu_bulk(krcp, bnode, i);
+       }
+
+       if (head_ready)
+               kvfree_rcu_list(head_ready);
+}
+
+/*
+ * Return: %true if a work is queued, %false otherwise.
+ */
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
+{
+       unsigned long flags;
+       bool queued = false;
+       int i, j;
+
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+
+       // Attempt to start a new batch.
+       for (i = 0; i < KFREE_N_BATCHES; i++) {
+               struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+
+               // Try to detach bulk_head or head and attach it, only when
+               // all channels are free.  Any channel is not free means at krwp
+               // there is on-going rcu work to handle krwp's free business.
+               if (need_wait_for_krwp_work(krwp))
+                       continue;
+
+               // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+               if (need_offload_krc(krcp)) {
+                       // Channel 1 corresponds to the SLAB-pointer bulk path.
+                       // Channel 2 corresponds to vmalloc-pointer bulk path.
+                       for (j = 0; j < FREE_N_CHANNELS; j++) {
+                               if (list_empty(&krwp->bulk_head_free[j])) {
+                                       atomic_set(&krcp->bulk_count[j], 0);
+                                       list_replace_init(&krcp->bulk_head[j],
+                                               &krwp->bulk_head_free[j]);
+                               }
+                       }
+
+                       // Channel 3 corresponds to both SLAB and vmalloc
+                       // objects queued on the linked list.
+                       if (!krwp->head_free) {
+                               krwp->head_free = krcp->head;
+                               get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+                               atomic_set(&krcp->head_count, 0);
+                               WRITE_ONCE(krcp->head, NULL);
+                       }
+
+                       // One work is per one batch, so there are three
+                       // "free channels", the batch can handle. Break
+                       // the loop since it is done with this CPU thus
+                       // queuing an RCU work is _always_ success here.
+                       queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
+                       WARN_ON_ONCE(!queued);
+                       break;
+               }
+       }
+
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+       struct kfree_rcu_cpu *krcp = container_of(work,
+               struct kfree_rcu_cpu, monitor_work.work);
+
+       // Drain ready for reclaim.
+       kvfree_rcu_drain_ready(krcp);
+
+       // Queue a batch for a rest.
+       kvfree_rcu_queue_batch(krcp);
+
+       // If there is nothing to detach, it means that our job is
+       // successfully done here. In case of having at least one
+       // of the channels that is still busy we should rearm the
+       // work to repeat an attempt. Because previous batches are
+       // still in progress.
+       if (need_offload_krc(krcp))
+               schedule_delayed_monitor_work(krcp);
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+       struct kvfree_rcu_bulk_data *bnode;
+       struct kfree_rcu_cpu *krcp =
+               container_of(work, struct kfree_rcu_cpu,
+                       page_cache_work.work);
+       unsigned long flags;
+       int nr_pages;
+       bool pushed;
+       int i;
+
+       nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+               1 : rcu_min_cached_objs;
+
+       for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+               bnode = (struct kvfree_rcu_bulk_data *)
+                       __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+               if (!bnode)
+                       break;
+
+               raw_spin_lock_irqsave(&krcp->lock, flags);
+               pushed = put_cached_bnode(krcp, bnode);
+               raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+               if (!pushed) {
+                       free_page((unsigned long) bnode);
+                       break;
+               }
+       }
+
+       atomic_set(&krcp->work_in_progress, 0);
+       atomic_set(&krcp->backoff_page_cache_fill, 0);
+}
+
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags.  If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
+static inline bool
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+       unsigned long *flags, void *ptr, bool can_alloc)
+{
+       struct kvfree_rcu_bulk_data *bnode;
+       int idx;
+
+       *krcp = krc_this_cpu_lock(flags);
+       if (unlikely(!(*krcp)->initialized))
+               return false;
+
+       idx = !!is_vmalloc_addr(ptr);
+       bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+               struct kvfree_rcu_bulk_data, list);
+
+       /* Check if a new block is required. */
+       if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+               bnode = get_cached_bnode(*krcp);
+               if (!bnode && can_alloc) {
+                       krc_this_cpu_unlock(*krcp, *flags);
+
+                       // __GFP_NORETRY - allows a light-weight direct reclaim
+                       // what is OK from minimizing of fallback hitting point of
+                       // view. Apart of that it forbids any OOM invoking what is
+                       // also beneficial since we are about to release memory soon.
+                       //
+                       // __GFP_NOMEMALLOC - prevents from consuming of all the
+                       // memory reserves. Please note we have a fallback path.
+                       //
+                       // __GFP_NOWARN - it is supposed that an allocation can
+                       // be failed under low memory or high memory pressure
+                       // scenarios.
+                       bnode = (struct kvfree_rcu_bulk_data *)
+                               __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+                       raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+               }
+
+               if (!bnode)
+                       return false;
+
+               // Initialize the new block and attach it.
+               bnode->nr_records = 0;
+               list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+       }
+
+       // Finally insert and update the GP for this page.
+       bnode->nr_records++;
+       bnode->records[bnode->nr_records - 1] = ptr;
+       get_state_synchronize_rcu_full(&bnode->gp_snap);
+       atomic_inc(&(*krcp)->bulk_count[idx]);
+
+       return true;
+}
+
+#if !defined(CONFIG_TINY_RCU)
+
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+       struct kfree_rcu_cpu *krcp =
+               container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+       queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+       return HRTIMER_NORESTART;
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+       // If cache disabled, bail out.
+       if (!rcu_min_cached_objs)
+               return;
+
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+                       !atomic_xchg(&krcp->work_in_progress, 1)) {
+               if (atomic_read(&krcp->backoff_page_cache_fill)) {
+                       queue_delayed_work(system_unbound_wq,
+                               &krcp->page_cache_work,
+                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+               } else {
+                       hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+                       krcp->hrtimer.function = schedule_page_work_fn;
+                       hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               }
+       }
+}
+
+void __init kfree_rcu_scheduler_running(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+               if (need_offload_krc(krcp))
+                       schedule_delayed_monitor_work(krcp);
+       }
+}
+
+/*
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period.  Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
+ *
+ * Each kvfree_call_rcu() request is added to a batch. The batch is drained
+ * every KFREE_DRAIN_JIFFIES. All objects in the batch are freed from
+ * workqueue context. Batching requests together reduces the number of grace
+ * periods needed during heavy kfree_rcu()/kvfree_rcu() load.
+ */
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+       unsigned long flags;
+       struct kfree_rcu_cpu *krcp;
+       bool success;
+
+       /*
+        * Please note there is a limitation for the head-less
+        * variant: such objects may only be queued from a
+        * context where sleeping is allowed, hence the
+        * might_sleep() check below. For all other places
+        * please embed an rcu_head in your data structure.
+        */
+       if (!head)
+               might_sleep();
+
+       // Queue the object but don't yet schedule the batch.
+       if (debug_rcu_head_queue(ptr)) {
+               // Probable double kfree_rcu(), just leak.
+               WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+                         __func__, head);
+
+               // Nothing else can be done; leak the object and leave.
+               return;
+       }
+
+       kasan_record_aux_stack_noalloc(ptr);
+       success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+       if (!success) {
+               run_page_cache_worker(krcp);
+
+               if (head == NULL)
+                       // Inline if kvfree_rcu(one_arg) call.
+                       goto unlock_return;
+
+               head->func = ptr;
+               head->next = krcp->head;
+               WRITE_ONCE(krcp->head, head);
+               atomic_inc(&krcp->head_count);
+
+               // Take a snapshot for this krcp.
+               krcp->head_gp_snap = get_state_synchronize_rcu();
+               success = true;
+       }
+
+       /*
+        * The kvfree_rcu() caller considers the pointer freed at this point
+        * and likely removes any references to it. Since the actual slab
+        * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+        * this object (no scanning or false positives reporting).
+        */
+       kmemleak_ignore(ptr);
+
+       // Set timer to drain after KFREE_DRAIN_JIFFIES.
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+               __schedule_delayed_monitor_work(krcp);
+
+unlock_return:
+       krc_this_cpu_unlock(krcp, flags);
+
+       /*
+        * Inline kvfree() after synchronize_rcu(). This is
+        * possible only from a might_sleep() context, so that
+        * the current CPU can pass through a quiescent state.
+        */
+       if (!success) {
+               debug_rcu_head_unqueue((struct rcu_head *) ptr);
+               synchronize_rcu();
+               kvfree(ptr);
+       }
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
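+
+/*
+ * Usage sketch (illustrative caller code; "struct foo", "fp" and "buf"
+ * are assumptions, not part of this file):
+ *
+ *     struct foo {
+ *             int data;
+ *             struct rcu_head rcu;
+ *     };
+ *
+ *     struct foo *fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ *     void *buf = kvmalloc(4096, GFP_KERNEL);
+ *
+ *     // Double-argument form: usable from atomic context. The embedded
+ *     // rcu_head serves as the fallback channel under memory pressure.
+ *     kvfree_rcu(fp, rcu);
+ *
+ *     // Single-argument (head-less) form: may fall back to
+ *     // synchronize_rcu() + kvfree(), hence sleepable context only.
+ *     kvfree_rcu_mightsleep(buf);
+ */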
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that the single-argument form of kvfree_rcu() has a slow path that
+ * triggers synchronize_rcu() followed by freeing the pointer, all before
+ * the function returns. Therefore, for any single-argument call that will
+ * result in a kfree() to a cache that is to be destroyed during module
+ * exit, it is the developer's responsibility to ensure that all such
+ * calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+       struct kfree_rcu_cpu_work *krwp;
+       struct kfree_rcu_cpu *krcp;
+       bool queued;
+       int i, cpu;
+
+       /*
+        * First we detach objects and queue them over an RCU batch
+        * for all CPUs. Finally the queued works are flushed for
+        * each CPU.
+        *
+        * Please note: if there are outstanding batches for a particular
+        * CPU, those have to be finished first before queuing a new one.
+        */
+       for_each_possible_cpu(cpu) {
+               krcp = per_cpu_ptr(&krc, cpu);
+
+               /*
+                * Check if this CPU has any objects which have been queued
+                * for a new GP completion. If not (nothing to detach), we
+                * are done with it. If any batch is pending/running for this
+                * "krcp", the per-CPU flush_rcu_work() below waits for its
+                * completion (see the last step).
+                */
+               if (!need_offload_krc(krcp))
+                       continue;
+
+               while (1) {
+                       /*
+                        * If we are not able to queue a new RCU work it means
+                        * either:
+                        * - batches for this CPU are still in flight; they
+                        *   should be flushed first and then we repeat;
+                        * - there are no objects to detach, due to concurrency.
+                        */
+                       queued = kvfree_rcu_queue_batch(krcp);
+
+                       /*
+                        * Bail out if there is no need to offload this "krcp"
+                        * anymore. As noted earlier, this can change concurrently.
+                        */
+                       if (queued || !need_offload_krc(krcp))
+                               break;
+
+                       /* There are ongoing batches. */
+                       for (i = 0; i < KFREE_N_BATCHES; i++) {
+                               krwp = &(krcp->krw_arr[i]);
+                               flush_rcu_work(&krwp->rcu_work);
+                       }
+               }
+       }
+
+       /*
+        * Now we guarantee that all objects are flushed.
+        */
+       for_each_possible_cpu(cpu) {
+               krcp = per_cpu_ptr(&krc, cpu);
+
+               /*
+                * The monitor work can drain ready-to-reclaim objects
+                * directly. Wait for its completion if it is running
+                * or pending.
+                */
+               cancel_delayed_work_sync(&krcp->monitor_work);
+
+               for (i = 0; i < KFREE_N_BATCHES; i++) {
+                       krwp = &(krcp->krw_arr[i]);
+                       flush_rcu_work(&krwp->rcu_work);
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
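+
+/*
+ * Usage sketch (assumed module-exit code; "foo_exit" and "foo_cache"
+ * are hypothetical):
+ *
+ *     static void __exit foo_exit(void)
+ *     {
+ *             // Make sure no in-flight kvfree_rcu() request can still
+ *             // return an object to the cache being destroyed.
+ *             kvfree_rcu_barrier();
+ *             kmem_cache_destroy(foo_cache);
+ *     }
+ */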
+
+#endif /* #if !defined(CONFIG_TINY_RCU) */
+
+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+       int cpu;
+       unsigned long count = 0;
+
+       /* Snapshot count of all CPUs */
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+               count += krc_count(krcp);
+               count += READ_ONCE(krcp->nr_bkv_objs);
+               atomic_set(&krcp->backoff_page_cache_fill, 1);
+       }
+
+       return count == 0 ? SHRINK_EMPTY : count;
+}
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+       int cpu, freed = 0;
+
+       for_each_possible_cpu(cpu) {
+               int count;
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+               count = krc_count(krcp);
+               count += drain_page_cache(krcp);
+               kfree_rcu_monitor(&krcp->monitor_work.work);
+
+               sc->nr_to_scan -= count;
+               freed += count;
+
+               if (sc->nr_to_scan <= 0)
+                       break;
+       }
+
+       return freed == 0 ? SHRINK_STOP : freed;
+}
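+
+/*
+ * Shrinker contract sketch: returning SHRINK_EMPTY from the count
+ * callback above tells the VM there is nothing to reclaim here at
+ * all, while SHRINK_STOP from the scan callback terminates this
+ * reclaim pass early.
+ */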
+
+void __init kvfree_rcu_init(void)
+{
+       int cpu;
+       int i, j;
+       struct shrinker *kfree_rcu_shrinker;
+
+       /* Clamp it to the [0:100] seconds interval. */
+       if (rcu_delay_page_cache_fill_msec < 0 ||
+               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+               rcu_delay_page_cache_fill_msec =
+                       clamp(rcu_delay_page_cache_fill_msec, 0,
+                               (int) (100 * MSEC_PER_SEC));
+
+               pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+                       rcu_delay_page_cache_fill_msec);
+       }
+
+       for_each_possible_cpu(cpu) {
+               struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+               for (i = 0; i < KFREE_N_BATCHES; i++) {
+                       INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+                       krcp->krw_arr[i].krcp = krcp;
+
+                       for (j = 0; j < FREE_N_CHANNELS; j++)
+                               INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
+               }
+
+               for (i = 0; i < FREE_N_CHANNELS; i++)
+                       INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
+               INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+               INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
+               krcp->initialized = true;
+       }
+
+       kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
+       if (!kfree_rcu_shrinker) {
+               pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+               return;
+       }
+
+       kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+       kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+       shrinker_register(kfree_rcu_shrinker);
+}
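+
+/*
+ * Boot-time tuning sketch, assuming the parameter name printed by the
+ * pr_info() above, e.g. on the kernel command line:
+ *
+ *     rcutree.rcu_delay_page_cache_fill_msec=2000
+ *
+ * Values outside of the [0:100] seconds interval are clamped here at
+ * init time.
+ */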