writeback, cgroup: release dying cgwbs by switching attached inodes

author Roman Gushchin <guro@fb.com>

Tue, 29 Jun 2021 02:36:03 +0000 (19:36 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 29 Jun 2021 17:53:48 +0000 (10:53 -0700)
author Roman Gushchin <guro@fb.com>
Tue, 29 Jun 2021 02:36:03 +0000 (19:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 17:53:48 +0000 (10:53 -0700)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 737ac27adb77c12157fab467569e7da26acfe699..62193106683d9ceb63178efbbb24a72a45d23429 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -225,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done)
                                         /* one round can affect upto 5 slots */
  #define WB_FRN_MAX_IN_FLIGHT   1024    /* don't queue too many concurrently */
  
+/*
+ * Maximum inodes per isw.  The value is chosen so that struct
+ * inode_switch_wbs_context and its inode array fit in a 1024-byte kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+                                / sizeof(struct inode *))
+
  static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
  static struct workqueue_struct *isw_wq;
  
@@ -503,6 +510,32 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
         atomic_dec(&isw_nr_in_flight);
  }
  
+static bool inode_prepare_wbs_switch(struct inode *inode,
+                                    struct bdi_writeback *new_wb)
+{
+       /*
+        * Paired with smp_mb() in cgroup_writeback_umount().
+        * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+        * grabbing an inode; otherwise isw_nr_in_flight can be observed as 0
+        * in cgroup_writeback_umount() and isw_wq will not be flushed.
+        */
+       smp_mb();
+
+       /* while holding I_WB_SWITCH, no one else can update the association */
+       spin_lock(&inode->i_lock);
+       if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+           inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+           inode_to_wb(inode) == new_wb) {
+               spin_unlock(&inode->i_lock);
+               return false;
+       }
+       inode->i_state |= I_WB_SWITCH;
+       __iget(inode);
+       spin_unlock(&inode->i_lock);
+
+       return true;
+}
+
  /**
   * inode_switch_wbs - change the wb association of an inode
   * @inode: target inode
@@ -540,17 +573,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
         if (!isw->new_wb)
                 goto out_free;
  
-       /* while holding I_WB_SWITCH, no one else can update the association */
-       spin_lock(&inode->i_lock);
-       if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
-           inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
-           inode_to_wb(inode) == isw->new_wb) {
-               spin_unlock(&inode->i_lock);
+       if (!inode_prepare_wbs_switch(inode, isw->new_wb))
                 goto out_free;
-       }
-       inode->i_state |= I_WB_SWITCH;
-       __iget(inode);
-       spin_unlock(&inode->i_lock);
  
         isw->inodes[0] = inode;
  
@@ -571,6 +595,73 @@ out_free:
         kfree(isw);
  }
  
+/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch inodes attached to @wb to the nearest living ancestor's wb in order
+ * to eventually release the dying @wb.  Returns %true if the per-call inode
+ * limit was hit and the function has to be called again.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+       struct cgroup_subsys_state *memcg_css;
+       struct inode_switch_wbs_context *isw;
+       struct inode *inode;
+       int nr;
+       bool restart = false;
+
+       isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+                     sizeof(struct inode *), GFP_KERNEL);
+       if (!isw)
+               return restart;
+
+       atomic_inc(&isw_nr_in_flight);
+
+       for (memcg_css = wb->memcg_css->parent; memcg_css;
+            memcg_css = memcg_css->parent) {
+               isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+               if (isw->new_wb)
+                       break;
+       }
+       if (unlikely(!isw->new_wb))
+               isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+
+       nr = 0;
+       spin_lock(&wb->list_lock);
+       list_for_each_entry(inode, &wb->b_attached, i_io_list) {
+               if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+                       continue;
+
+               isw->inodes[nr++] = inode;
+
+               if (nr >= WB_MAX_INODES_PER_ISW - 1) {
+                       restart = true;
+                       break;
+               }
+       }
+       spin_unlock(&wb->list_lock);
+
+       /* no inodes were prepared for switching? bail out */
+       if (nr == 0) {
+               atomic_dec(&isw_nr_in_flight);
+               wb_put(isw->new_wb);
+               kfree(isw);
+               return restart;
+       }
+
+       /*
+        * In addition to synchronizing among switchers, I_WB_SWITCH tells
+        * the RCU protected stat update paths to grab the i_page
+        * lock so that stat transfer can synchronize against them.
+        * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+        */
+       INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+       queue_rcu_work(isw_wq, &isw->work);
+
+       return restart;
+}
+
  /**
   * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
   * @wbc: writeback_control of interest
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h

index 63f52ad2ce7a955cb7c82b055f4f493b4d4f8d71..1d7edad9914fc5b8579d2ae6481287f8a0c67acf 100644 (file)
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -155,6 +155,7 @@ struct bdi_writeback {
         struct list_head memcg_node;    /* anchored at memcg->cgwb_list */
         struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
         struct list_head b_attached;    /* attached inodes, protected by list_lock */
+       struct list_head offline_node;  /* anchored at offline_cgwbs */
  
         union {
                 struct work_struct release_work;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

index 8e5c5bb16e2d9c08f28706ee72c2124d254413cf..95de51c1024803eb8278ad45e1761dc21694a676 100644 (file)
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -221,6 +221,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
  int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
                            enum wb_reason reason, struct wb_completion *done);
  void cgroup_writeback_umount(void);
+bool cleanup_offline_cgwb(struct bdi_writeback *wb);
  
  /**
   * inode_attach_wb - associate an inode with its wb
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index 54c5dc4b8c24eb99d7f7405f2fb50ca60800bfe9..271f2ca862c82ca1e95b4e22598cae7e17d8c45e 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -371,12 +371,16 @@ static void wb_exit(struct bdi_writeback *wb)
  #include <linux/memcontrol.h>
  
  /*
- * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
- * bdi->cgwb_tree is also RCU protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
+ * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
   */
  static DEFINE_SPINLOCK(cgwb_lock);
  static struct workqueue_struct *cgwb_release_wq;
  
+static LIST_HEAD(offline_cgwbs);
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
+
  static void cgwb_release_workfn(struct work_struct *work)
  {
         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -395,6 +399,11 @@ static void cgwb_release_workfn(struct work_struct *work)
  
         fprop_local_destroy_percpu(&wb->memcg_completions);
         percpu_ref_exit(&wb->refcnt);
+
+       spin_lock_irq(&cgwb_lock);
+       list_del(&wb->offline_node);
+       spin_unlock_irq(&cgwb_lock);
+
         wb_exit(wb);
         WARN_ON_ONCE(!list_empty(&wb->b_attached));
         kfree_rcu(wb, rcu);
@@ -414,6 +423,7 @@ static void cgwb_kill(struct bdi_writeback *wb)
         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
         list_del(&wb->memcg_node);
         list_del(&wb->blkcg_node);
+       list_add(&wb->offline_node, &offline_cgwbs);
         percpu_ref_kill(&wb->refcnt);
  }
  
@@ -635,6 +645,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
         mutex_unlock(&bdi->cgwb_release_mutex);
  }
  
+/*
+ * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
+ *
+ * Try to release dying cgwbs by switching attached inodes to the nearest
+ * living ancestor's writeback.  Processed wbs are moved to the end of the
+ * list to guarantee forward progress.
+ */
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
+{
+       struct bdi_writeback *wb;
+       LIST_HEAD(processed);
+
+       spin_lock_irq(&cgwb_lock);
+
+       while (!list_empty(&offline_cgwbs)) {
+               wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
+                                     offline_node);
+               list_move(&wb->offline_node, &processed);
+
+               /*
+                * If wb is dirty, cleaning up the writeback by switching
+                * attached inodes would effectively remove any bandwidth
+                * restrictions, which isn't the goal.  Instead, the cleanup
+                * is postponed until the next run, when all I/O will likely
+                * have completed.  If some inodes get re-dirtied in the
+                * meantime, they should eventually be switched to a new
+                * cgwb.
+                */
+               if (wb_has_dirty_io(wb))
+                       continue;
+
+               if (!wb_tryget(wb))
+                       continue;
+
+               spin_unlock_irq(&cgwb_lock);
+               while (cleanup_offline_cgwb(wb))
+                       cond_resched();
+               spin_lock_irq(&cgwb_lock);
+
+               wb_put(wb);
+       }
+
+       if (!list_empty(&processed))
+               list_splice_tail(&processed, &offline_cgwbs);
+
+       spin_unlock_irq(&cgwb_lock);
+}
+
  /**
   * wb_memcg_offline - kill all wb's associated with a memcg being offlined
   * @memcg: memcg being offlined
@@ -651,6 +709,8 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
                 cgwb_kill(wb);
         memcg_cgwb_list->next = NULL;   /* prevent new wb's */
         spin_unlock_irq(&cgwb_lock);
+
+       queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
  }
  
  /**
author	Roman Gushchin <guro@fb.com>
	Tue, 29 Jun 2021 02:36:03 +0000 (19:36 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 29 Jun 2021 17:53:48 +0000 (10:53 -0700)
fs/fs-writeback.c		patch \| blob \| history
include/linux/backing-dev-defs.h		patch \| blob \| history
include/linux/writeback.h		patch \| blob \| history
mm/backing-dev.c		patch \| blob \| history