struct xfs_bmbt_irec imap;
bool imap_valid;
unsigned int io_type;
+ struct xfs_ioend *iohead;
struct xfs_ioend *ioend;
sector_t last_block;
};
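For orientation, iohead and ioend describe a singly linked chain: iohead points at the first ioend built during the current writeback pass, ioend at the one buffers are currently being appended to, and each ioend points at the next through io_list (initialised to NULL in xfs_alloc_ioend below). A minimal sketch of walking that chain; walk_ioends() is a hypothetical helper used only to illustrate the linkage, not part of the patch:

/* Illustrative only - not part of the patch. */
static void walk_ioends(struct xfs_writepage_ctx *wpc)
{
	struct xfs_ioend *cur;

	/* each ioend covers one contiguous run of buffers of a single io_type */
	for (cur = wpc->iohead; cur; cur = cur->io_list)
		;
}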
*/
atomic_set(&ioend->io_remaining, 1);
ioend->io_error = 0;
- INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_list = NULL;
ioend->io_type = type;
ioend->io_inode = inode;
ioend->io_buffer_head = NULL;
STATIC void
xfs_start_page_writeback(
struct page *page,
- int clear_dirty)
+ int clear_dirty,
+ int buffers)
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
set_page_writeback_keepwrite(page);
unlock_page(page);
+
+ /* If no buffers on the page are to be written, finish it here */
+ if (!buffers)
+ end_page_writeback(page);
}
static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
}
/*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we go, then we can end up with a page that only has some of its
+ * buffers marked async write, and I/O completion on those can occur before we
+ * mark the other buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
*
* If @fail is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked pages for writeback
* and unlocked them. In this situation, we need to fail the ioend chain rather
* than submit it to IO. This typically only happens on a filesystem shutdown.
*/
-STATIC int
+STATIC void
xfs_submit_ioend(
struct writeback_control *wbc,
xfs_ioend_t *ioend,
- int status)
+ int fail)
{
+ xfs_ioend_t *head = ioend;
+ xfs_ioend_t *next;
struct buffer_head *bh;
struct bio *bio;
sector_t lastblock = 0;
- /* Reserve log space if we might write beyond the on-disk inode size. */
- if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
- status = xfs_setfilesize_trans_alloc(ioend);
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (status) {
- ioend->io_error = status;
- xfs_finish_ioend(ioend);
- return status;
- }
+ /* Pass 1 - start writeback */
+ do {
+ next = ioend->io_list;
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+ xfs_start_buffer_writeback(bh);
+ } while ((ioend = next) != NULL);
- bio = NULL;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+ /* Pass 2 - submit I/O */
+ ioend = head;
+ do {
+ next = ioend->io_list;
+ bio = NULL;
- if (!bio) {
-retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (fail) {
+ ioend->io_error = fail;
+ xfs_finish_ioend(ioend);
+ continue;
}
- if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
- lastblock = bh->b_blocknr;
- }
- if (bio)
- xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend);
- return 0;
+ if (!bio) {
+ retry:
+ bio = xfs_alloc_ioend_bio(bh);
+ } else if (bh->b_blocknr != lastblock + 1) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ lastblock = bh->b_blocknr;
+ }
+ if (bio)
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ xfs_finish_ioend(ioend);
+ } while ((ioend = next) != NULL);
}
/*
* Test to see if we've been building up a completion structure for
* earlier buffers -- if so, we try to append to this ioend if we
* can, otherwise we finish off any current ioend and start another.
- * Return the ioend we finished off so that the caller can submit it
- * once it has finished processing the dirty page.
+ * New ioends are chained onto the previous one via io_list for later submission.
*/
STATIC void
xfs_add_to_ioend(
struct inode *inode,
struct buffer_head *bh,
xfs_off_t offset,
- struct xfs_writepage_ctx *wpc,
- struct list_head *iolist)
+ struct xfs_writepage_ctx *wpc)
{
if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
bh->b_blocknr != wpc->last_block + 1) {
struct xfs_ioend *new;
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
-
new = xfs_alloc_ioend(inode, wpc->io_type);
new->io_offset = offset;
new->io_buffer_head = bh;
new->io_buffer_tail = bh;
+ if (wpc->ioend)
+ wpc->ioend->io_list = new;
wpc->ioend = new;
} else {
wpc->ioend->io_buffer_tail->b_private = bh;
bh->b_private = NULL;
wpc->ioend->io_size += bh->b_size;
wpc->last_block = bh->b_blocknr;
- xfs_start_buffer_writeback(bh);
}
STATIC void
return;
}
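Within a single ioend the buffers form a second chain: io_buffer_head and io_buffer_tail bound a list threaded through bh->b_private (which is why b_private is cleared when a buffer becomes the new tail), and xfs_submit_ioend walks it with exactly this pattern. A small sketch assuming the field names above; count_ioend_buffers() is a hypothetical helper, not part of the patch:

/* Illustrative only - not part of the patch. */
static int count_ioend_buffers(struct xfs_ioend *ioend)
{
	struct buffer_head *bh;
	int n = 0;

	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
		n++;
	return n;
}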
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding buffers to is cached on the writepage context, and if the new buffer
- * does not append to the cached ioend it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
static int
-xfs_writepage_map(
+xfs_writepage_submit(
struct xfs_writepage_ctx *wpc,
struct writeback_control *wbc,
+ int status)
+{
+ struct blk_plug plug;
+
+ /* Reserve log space if we might write beyond the on-disk inode size. */
+ if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN &&
+ xfs_ioend_is_append(wpc->ioend))
+ status = xfs_setfilesize_trans_alloc(wpc->ioend);
+
+ if (wpc->iohead) {
+ blk_start_plug(&plug);
+ xfs_submit_ioend(wbc, wpc->iohead, status);
+ blk_finish_plug(&plug);
+ }
+ return status;
+}
+
+static int
+xfs_writepage_map(
+ struct xfs_writepage_ctx *wpc,
struct inode *inode,
struct page *page,
loff_t offset,
__uint64_t end_offset)
{
- LIST_HEAD(submit_list);
- struct xfs_ioend *ioend, *next;
struct buffer_head *bh, *head;
ssize_t len = 1 << inode->i_blkbits;
int error = 0;
- int count = 0;
int uptodate = 1;
+ int count = 0;
bh = head = page_buffers(page);
offset = page_offset(page);
+
do {
if (offset >= end_offset)
break;
error = xfs_map_blocks(inode, offset, &wpc->imap,
wpc->io_type);
if (error)
- goto out;
+ goto out_error;
wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
offset);
}
lock_buffer(bh);
if (wpc->io_type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, &wpc->imap, offset);
- xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+ xfs_add_to_ioend(inode, bh, offset, wpc);
count++;
}
+ if (!wpc->iohead)
+ wpc->iohead = wpc->ioend;
+
} while (offset += len, ((bh = bh->b_this_page) != head));
if (uptodate && bh == head)
SetPageUptodate(page);
- ASSERT(wpc->ioend || list_empty(&submit_list));
+ xfs_start_page_writeback(page, 1, count);
+ ASSERT(wpc->iohead || !count);
+ return 0;
-out:
+out_error:
/*
- * On error, we have to fail the ioend here because we have locked
- * buffers in the ioend. If we don't do this, we'll deadlock
- * invalidating the page as that tries to lock the buffers on the page.
- * Also, because we may have set pages under writeback, we have to make
- * sure we run IO completion to mark the error state of the IO
- * appropriately, so we can't cancel the ioend directly here. That means
- * we have to mark this page as under writeback if we included any
- * buffers from it in the ioend chain so that completion treats it
- * correctly.
+ * On error, we have to fail the iohead here because we locked buffers
+ * in the ioend chain. If we don't do this, we'll deadlock invalidating
+ * the page as that tries to lock the buffers on the page. Also, because
+ * we may have set pages under writeback, we have to make sure we run
+ * IO completion to mark the error state of the IO appropriately, so we
+ * can't cancel the ioend directly here. That means we have to mark this
+ * page as under writeback if we included any buffers from it in the
+ * ioend chain so that completion treats it correctly.
*
- * If we didn't include the page in the ioend, the on error we can
- * simply discard and unlock it as there are no other users of the page
- * or it's buffers right now. The caller will still need to trigger
- * submission of outstanding ioends on the writepage context so they are
- * treated correctly on error.
+ * If we didn't include the page in the ioend, then we can simply
+ * discard and unlock it as there are no other users of the page or its
+ * buffers right now. The caller will still need to trigger submission
+ * of outstanding ioends on the writepage context so they are treated
+ * correctly on error.
*/
- if (count) {
- xfs_start_page_writeback(page, !error);
-
- /*
- * Preserve the original error if there was one, otherwise catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = xfs_submit_ioend(wbc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
- } else if (error) {
+ if (count) {
+ xfs_start_page_writeback(page, 0, count);
+ } else {
xfs_aops_discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
- } else {
- /*
- * We can end up here with no error and nothing to write if we
- * race with a partial page truncate on a sub-page block sized
- * filesystem. In that case we need to mark the page clean.
- */
- xfs_start_page_writeback(page, 1);
- end_page_writeback(page);
}
-
mapping_set_error(page->mapping, error);
return error;
}
end_offset = offset;
}
- return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
+ return xfs_writepage_map(wpc, inode, page, offset, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return xfs_writepage_submit(&wpc, wbc, ret);
}
STATIC int
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return xfs_writepage_submit(&wpc, wbc, ret);
}
/*