block, mm: move PSI accounting out of the block layer
author     Christoph Hellwig <hch@lst.de>
           Thu, 31 Mar 2022 11:52:49 +0000 (13:52 +0200)
committer  Christoph Hellwig <hch@lst.de>
           Thu, 31 Mar 2022 12:05:24 +0000 (14:05 +0200)
The bio layer has no business doing PSI accounting.  Move it to the
callers of ->readpage and ->readahead in the generic page cache
handling code instead (the resulting call-site pattern is sketched
below).  This means some changes in coverage:

 - the accounting critical sections are generally longer, but this
   more accurately reflects the time actually spent refaulting
 - this now automatically also covers file systems that are not
   block based, such as the various network file systems
 - this does not automatically cover bio submissions for readahead
   windows enlarged in an ongoing readpage or readahead operation
   where file systems want to optimistically read more data (e.g.
   btrfs compressed reads)
 - it does not cover some cases where file systems read data into
   two address spaces in the same readpage/readpages request, which
   can happen in erofs
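
The new call-site pattern is essentially the following bracketing (a
condensed sketch of the filemap_read_folio() change in mm/filemap.c
below, with error handling elided):

	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;

	if (unlikely(workingset))
		psi_memstall_enter(&pflags);
	error = mapping->a_ops->readpage(file, &folio->page);
	if (unlikely(workingset))
		psi_memstall_leave(&pflags);

For readahead, read_pages() applies the same bracketing around the
whole batch, with the workingset state OR-ed together over all folios
added to the batch, so a single workingset folio accounts the entire
submission.  Either way the stall time feeds the usual PSI interfaces
(/proc/pressure/memory and the cgroup memory.pressure files).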

Signed-off-by: Christoph Hellwig <hch@lst.de>
block/bio.c
block/blk-core.c
fs/direct-io.c
include/linux/blk_types.h
mm/filemap.c
mm/readahead.c

diff --git a/block/bio.c b/block/bio.c
index cdd7b2915c532c1590abafe0019594402b98cbc3..a9d2a8544cd8ccccec9639b38dcb0c6014433e1b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1034,9 +1034,6 @@ void __bio_add_page(struct bio *bio, struct page *page,
 
        bio->bi_iter.bi_size += len;
        bio->bi_vcnt++;
-
-       if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
-               bio_set_flag(bio, BIO_WORKINGSET);
 }
 EXPORT_SYMBOL_GPL(__bio_add_page);
 
@@ -1252,9 +1249,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
  * fit into the bio, or are requested in @iter, whatever is smaller. If
  * MM encounters an error pinning the requested pages, it stops. Error
  * is returned only if 0 pages could be pinned.
- *
- * It's intended for direct IO, so doesn't do PSI tracking, the caller is
- * responsible for setting BIO_WORKINGSET if necessary.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
@@ -1273,8 +1267,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
                        ret = __bio_iov_iter_get_pages(bio, iter);
        } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
-       /* don't account direct I/O as memory stall */
-       bio_clear_flag(bio, BIO_WORKINGSET);
        return bio->bi_vcnt ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
diff --git a/block/blk-core.c b/block/blk-core.c
index 937bb6b863317a96908ae382c61e90fc1782b1b7..2b909060395eed2749c58a00f4d1bc31b13c93dc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -37,7 +37,6 @@
 #include <linux/t10-pi.h>
 #include <linux/debugfs.h>
 #include <linux/bpf.h>
-#include <linux/psi.h>
 #include <linux/part_stat.h>
 #include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
@@ -908,22 +907,6 @@ void submit_bio(struct bio *bio)
                }
        }
 
-       /*
-        * If we're reading data that is part of the userspace workingset, count
-        * submission time as memory stall.  When the device is congested, or
-        * the submitting cgroup IO-throttled, submission can be a significant
-        * part of overall IO time.
-        */
-       if (unlikely(bio_op(bio) == REQ_OP_READ &&
-           bio_flagged(bio, BIO_WORKINGSET))) {
-               unsigned long pflags;
-
-               psi_memstall_enter(&pflags);
-               submit_bio_noacct(bio);
-               psi_memstall_leave(&pflags);
-               return;
-       }
-
        submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aef06e607b4054073f47b8cc4f8f274f903fd1a2..5cac8c8869c52af0d86496801a47b55361b0c04e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -419,8 +419,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
        unsigned long flags;
 
        bio->bi_private = dio;
-       /* don't account direct I/O as memory stall */
-       bio_clear_flag(bio, BIO_WORKINGSET);
 
        spin_lock_irqsave(&dio->bio_lock, flags);
        dio->refcount++;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dd0763a1c6740f74b6a5fe4fb6d35d70ed610d6c..ec7771e83e176d6a80345f0a32c70e4fccf3eafb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -314,7 +314,6 @@ enum {
        BIO_NO_PAGE_REF,        /* don't put release vec pages */
        BIO_CLONED,             /* doesn't own data */
        BIO_BOUNCED,            /* bio is a bounce bio */
-       BIO_WORKINGSET,         /* contains userspace workingset pages */
        BIO_QUIET,              /* Make BIO Quiet */
        BIO_CHAIN,              /* chained bio, ->bi_remaining in effect */
        BIO_REFFED,             /* bio has elevated ->bi_cnt */
diff --git a/mm/filemap.c b/mm/filemap.c
index 741c75d57977d1c69d0de1782bd9a8fcec0a88fa..5147727859989195f499e2dedbd32a0c752e313b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2416,6 +2416,8 @@ retry:
 static int filemap_read_folio(struct file *file, struct address_space *mapping,
                struct folio *folio)
 {
+       bool workingset = folio_test_workingset(folio);
+       unsigned long pflags;
        int error;
 
        /*
@@ -2424,8 +2426,13 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping,
         * fails.
         */
        folio_clear_error(folio);
+
        /* Start the actual read. The read will unlock the page. */
+       if (unlikely(workingset))
+               psi_memstall_enter(&pflags);
        error = mapping->a_ops->readpage(file, &folio->page);
+       if (unlikely(workingset))
+               psi_memstall_leave(&pflags);
        if (error)
                return error;
 
@@ -3491,7 +3498,9 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 static struct folio *do_read_cache_folio(struct address_space *mapping,
                pgoff_t index, filler_t filler, void *data, gfp_t gfp)
 {
+       unsigned long pflags;
        struct folio *folio;
+       bool workingset;
        int err;
 
 repeat:
@@ -3541,10 +3550,15 @@ repeat:
                }
        }
 
+       workingset = folio_test_workingset(folio);
+       if (unlikely(workingset))
+               psi_memstall_enter(&pflags);
        if (filler)
                err = filler(data, &folio->page);
        else
                err = mapping->a_ops->readpage(data, &folio->page);
+       if (unlikely(workingset))
+               psi_memstall_leave(&pflags);
        if (err < 0) {
                folio_put(folio);
                return ERR_PTR(err);
diff --git a/mm/readahead.c b/mm/readahead.c
index 06f668108bdbe734435718ae480290231ea2a215..1c330be254c7154b9e591f65c873d5fc1389f663 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/psi.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
@@ -142,17 +143,19 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
-static void read_pages(struct readahead_control *rac)
+static void read_pages(struct readahead_control *rac, bool workingset)
 {
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct page *page;
        struct blk_plug plug;
+       unsigned long pflags;
 
        if (!readahead_count(rac))
                return;
 
+       if (unlikely(workingset))
+               psi_memstall_enter(&pflags);
        blk_start_plug(&plug);
-
        if (aops->readahead) {
                aops->readahead(rac);
                /*
@@ -175,8 +178,9 @@ static void read_pages(struct readahead_control *rac)
                        put_page(page);
                }
        }
-
        blk_finish_plug(&plug);
+       if (unlikely(workingset))
+               psi_memstall_leave(&pflags);
 
        BUG_ON(readahead_count(rac));
 }
@@ -201,6 +205,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
+       bool workingset = false;
        unsigned long i;
 
        /*
@@ -231,9 +236,10 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
-                       read_pages(ractl);
+                       read_pages(ractl, workingset);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
+                       workingset = false;
                        continue;
                }
 
@@ -243,14 +249,16 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
                if (filemap_add_folio(mapping, folio, index + i,
                                        gfp_mask) < 0) {
                        folio_put(folio);
-                       read_pages(ractl);
+                       read_pages(ractl, workingset);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
+                       workingset = false;
                        continue;
                }
                if (i == nr_to_read - lookahead_size)
                        folio_set_readahead(folio);
                ractl->_nr_pages++;
+               workingset |= folio_test_workingset(folio);
        }
 
        /*
@@ -258,7 +266,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
-       read_pages(ractl);
+       read_pages(ractl, workingset);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);
 }
@@ -473,6 +481,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
        pgoff_t index = readahead_index(ractl);
        pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
        pgoff_t mark = index + ra->size - ra->async_size;
+       bool workingset = false;
        int err = 0;
        gfp_t gfp = readahead_gfp_mask(mapping);
 
@@ -518,6 +527,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
                        break;
                }
 
+               workingset |= folio_test_workingset(folio);
                ractl->_nr_pages += 1UL << order;
                index += 1UL << order;
        }
@@ -527,7 +537,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
                ra->async_size += index - limit - 1;
        }
 
-       read_pages(ractl);
+       read_pages(ractl, workingset);
 
        /*
         * If there were already pages in the page cache, then we may have