workqueue: Introduce struct wq_node_nr_active

author Tejun Heo <tj@kernel.org>

Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)

committer Tejun Heo <tj@kernel.org>

Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)
author Tejun Heo <tj@kernel.org>
Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index b5aba0e5a699045f522b1e461b4f8feb60a1b94c..8d465478adb922370cb264dfa9d3e0bad9af895c 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -284,6 +284,16 @@ struct wq_flusher {
  
  struct wq_device;
  
+/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ */
+struct wq_node_nr_active {
+       atomic_t                nr;             /* per-node nr_active count */
+};
+
  /*
   * The externally visible workqueue.  It relays the issued work items to
   * the appropriate worker_pool through its pool_workqueues.
@@ -330,6 +340,7 @@ struct workqueue_struct {
         /* hot fields used during command issue, aligned to cacheline */
         unsigned int            flags ____cacheline_aligned; /* WQ: WQ_* flags */
         struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+       struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
  };
  
  static struct kmem_cache *pwq_cache;
@@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
         return worker->last_func;
  }
  
+/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+                                                  int node)
+{
+       if (!(wq->flags & WQ_UNBOUND))
+               return NULL;
+
+       if (node == NUMA_NO_NODE)
+               node = nr_node_ids;
+
+       return wq->node_nr_active[node];
+}
+
  /**
   * get_pwq - get an extra reference on the specified pool_workqueue
   * @pwq: pool_workqueue to get
@@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
                               struct work_struct *work)
  {
         struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna;
  
         lockdep_assert_held(&pool->lock);
  
         if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
                 return false;
  
+       nna = wq_node_nr_active(pwq->wq, pool->node);
+       if (nna)
+               atomic_inc(&nna->nr);
+
         pwq->nr_active++;
         __pwq_activate_work(pwq, work);
         return true;
@@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
  {
         struct workqueue_struct *wq = pwq->wq;
         struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
         bool obtained;
  
         lockdep_assert_held(&pool->lock);
  
         obtained = pwq->nr_active < READ_ONCE(wq->max_active);
  
-       if (obtained)
+       if (obtained) {
                 pwq->nr_active++;
+               if (nna)
+                       atomic_inc(&nna->nr);
+       }
         return obtained;
  }
  
@@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
  static void pwq_dec_nr_active(struct pool_workqueue *pwq)
  {
         struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
  
         lockdep_assert_held(&pool->lock);
  
+       /*
+        * @pwq->nr_active should be decremented for both percpu and unbound
+        * workqueues.
+        */
         pwq->nr_active--;
+
+       /*
+        * For a percpu workqueue, it's simple. Just need to kick the first
+        * inactive work item on @pwq itself.
+        */
+       if (!nna) {
+               pwq_activate_first_inactive(pwq);
+               return;
+       }
+
+       atomic_dec(&nna->nr);
         pwq_activate_first_inactive(pwq);
  }
  
@@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
  }
  #endif
  
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+       int node;
+
+       for_each_node(node) {
+               kfree(nna_ar[node]);
+               nna_ar[node] = NULL;
+       }
+
+       kfree(nna_ar[nr_node_ids]);
+       nna_ar[nr_node_ids] = NULL;
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+       atomic_set(&nna->nr, 0);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+       struct wq_node_nr_active *nna;
+       int node;
+
+       for_each_node(node) {
+               nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+               if (!nna)
+                       goto err_free;
+               init_node_nr_active(nna);
+               nna_ar[node] = nna;
+       }
+
+       /* [nr_node_ids] is used as the fallback */
+       nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+       if (!nna)
+               goto err_free;
+       init_node_nr_active(nna);
+       nna_ar[nr_node_ids] = nna;
+
+       return 0;
+
+err_free:
+       free_node_nr_active(nna_ar);
+       return -ENOMEM;
+}
+
  static void rcu_free_wq(struct rcu_head *rcu)
  {
         struct workqueue_struct *wq =
                 container_of(rcu, struct workqueue_struct, rcu);
  
+       if (wq->flags & WQ_UNBOUND)
+               free_node_nr_active(wq->node_nr_active);
+
         wq_free_lockdep(wq);
         free_percpu(wq->cpu_pwq);
         free_workqueue_attrs(wq->unbound_attrs);
@@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
  {
         va_list args;
         struct workqueue_struct *wq;
-       int len;
+       size_t wq_size;
+       int name_len;
  
         /*
          * Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
                 flags |= WQ_UNBOUND;
  
         /* allocate wq and format name */
-       wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+       if (flags & WQ_UNBOUND)
+               wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+       else
+               wq_size = sizeof(*wq);
+
+       wq = kzalloc(wq_size, GFP_KERNEL);
         if (!wq)
                 return NULL;
  
@@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
         }
  
         va_start(args, max_active);
-       len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+       name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
         va_end(args);
  
-       if (len >= WQ_NAME_LEN)
-               pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name);
+       if (name_len >= WQ_NAME_LEN)
+               pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+                            wq->name);
  
         max_active = max_active ?: WQ_DFL_ACTIVE;
         max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
         wq_init_lockdep(wq);
         INIT_LIST_HEAD(&wq->list);
  
+       if (flags & WQ_UNBOUND) {
+               if (alloc_node_nr_active(wq->node_nr_active) < 0)
+                       goto err_unreg_lockdep;
+       }
+
         if (alloc_and_link_pwqs(wq) < 0)
-               goto err_unreg_lockdep;
+               goto err_free_node_nr_active;
  
         if (wq_online && init_rescuer(wq) < 0)
                 goto err_destroy;
@@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
  
         return wq;
  
+err_free_node_nr_active:
+       if (wq->flags & WQ_UNBOUND)
+               free_node_nr_active(wq->node_nr_active);
  err_unreg_lockdep:
         wq_unregister_lockdep(wq);
         wq_free_lockdep(wq);
author	Tejun Heo <tj@kernel.org>
	Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)
committer	Tejun Heo <tj@kernel.org>
	Mon, 29 Jan 2024 18:11:24 +0000 (08:11 -1000)