From b2603f891b43b531e5766cfe654b55a61317fe03 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 31 Mar 2022 13:52:49 +0200
Subject: [PATCH] block, mm: move PSI accounting out of the block layer

The bio layer has no business doing PSI accounting.  Move it to the
callers of ->readpage and ->readahead in the generic page cache
handling code instead.

This means some changes in coverage:

 - the accounting critical sections are generally longer, but this
   accurately reflects the time actually spent refaulting
 - this now automatically also covers file systems that are not block
   based, such as the various network file systems
 - this does not automatically cover bio submissions for readahead
   windows enlarged in an ongoing readpage or readahead operation,
   where file systems want to optimistically read more data
   (e.g. btrfs compressed reads)
 - it does not cover some cases where file systems read data into two
   address spaces in the same readpage/readpages request, which can
   happen in erofs

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c               |  8 --------
 block/blk-core.c          | 17 -----------------
 fs/direct-io.c            |  2 --
 include/linux/blk_types.h |  1 -
 mm/filemap.c              | 14 ++++++++++++++
 mm/readahead.c            | 24 +++++++++++++++++-------
 6 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index cdd7b2915c53..a9d2a8544cd8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1034,9 +1034,6 @@ void __bio_add_page(struct bio *bio, struct page *page,
 
 	bio->bi_iter.bi_size += len;
 	bio->bi_vcnt++;
-
-	if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
-		bio_set_flag(bio, BIO_WORKINGSET);
 }
 EXPORT_SYMBOL_GPL(__bio_add_page);
 
@@ -1252,9 +1249,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
  * fit into the bio, or are requested in @iter, whatever is smaller. If
  * MM encounters an error pinning the requested pages, it stops. Error
  * is returned only if 0 pages could be pinned.
- *
- * It's intended for direct IO, so doesn't do PSI tracking, the caller is
- * responsible for setting BIO_WORKINGSET if necessary.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
@@ -1273,8 +1267,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
-	/* don't account direct I/O as memory stall */
-	bio_clear_flag(bio, BIO_WORKINGSET);
 	return bio->bi_vcnt ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
diff --git a/block/blk-core.c b/block/blk-core.c
index 937bb6b86331..2b909060395e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -37,7 +37,6 @@
 #include <linux/t10-pi.h>
 #include <linux/debugfs.h>
 #include <linux/bpf.h>
-#include <linux/psi.h>
 #include <linux/part_stat.h>
 #include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
@@ -908,22 +907,6 @@ void submit_bio(struct bio *bio)
 		}
 	}
 
-	/*
-	 * If we're reading data that is part of the userspace workingset, count
-	 * submission time as memory stall. When the device is congested, or
-	 * the submitting cgroup IO-throttled, submission can be a significant
-	 * part of overall IO time.
-	 */
-	if (unlikely(bio_op(bio) == REQ_OP_READ &&
-		     bio_flagged(bio, BIO_WORKINGSET))) {
-		unsigned long pflags;
-
-		psi_memstall_enter(&pflags);
-		submit_bio_noacct(bio);
-		psi_memstall_leave(&pflags);
-		return;
-	}
-
 	submit_bio_noacct(bio);
 }
 EXPORT_SYMBOL(submit_bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aef06e607b40..5cac8c8869c5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -419,8 +419,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	unsigned long flags;
 
 	bio->bi_private = dio;
-	/* don't account direct I/O as memory stall */
-	bio_clear_flag(bio, BIO_WORKINGSET);
 
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	dio->refcount++;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dd0763a1c674..ec7771e83e17 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -314,7 +314,6 @@ enum {
 	BIO_NO_PAGE_REF,	/* don't put release vec pages */
 	BIO_CLONED,		/* doesn't own data */
 	BIO_BOUNCED,		/* bio is a bounce bio */
-	BIO_WORKINGSET,		/* contains userspace workingset pages */
 	BIO_QUIET,		/* Make BIO Quiet */
 	BIO_CHAIN,		/* chained bio, ->bi_remaining in effect */
 	BIO_REFFED,		/* bio has elevated ->bi_cnt */
diff --git a/mm/filemap.c b/mm/filemap.c
index 741c75d57977..514772785998 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2416,6 +2416,8 @@ retry:
 static int filemap_read_folio(struct file *file, struct address_space *mapping,
 		struct folio *folio)
 {
+	bool workingset = folio_test_workingset(folio);
+	unsigned long pflags;
 	int error;
 
 	/*
@@ -2424,8 +2426,13 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping,
 	 * fails.
 	 */
 	folio_clear_error(folio);
+
 	/* Start the actual read. The read will unlock the page. */
+	if (unlikely(workingset))
+		psi_memstall_enter(&pflags);
 	error = mapping->a_ops->readpage(file, &folio->page);
+	if (unlikely(workingset))
+		psi_memstall_leave(&pflags);
 	if (error)
 		return error;
 
@@ -3491,7 +3498,9 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 static struct folio *do_read_cache_folio(struct address_space *mapping,
 		pgoff_t index, filler_t filler, void *data, gfp_t gfp)
 {
+	unsigned long pflags;
 	struct folio *folio;
+	bool workingset;
 	int err;
 
 repeat:
@@ -3541,10 +3550,15 @@ repeat:
 		}
 	}
 
+	workingset = folio_test_workingset(folio);
+	if (unlikely(workingset))
+		psi_memstall_enter(&pflags);
 	if (filler)
 		err = filler(data, &folio->page);
 	else
 		err = mapping->a_ops->readpage(data, &folio->page);
+	if (unlikely(workingset))
+		psi_memstall_leave(&pflags);
 	if (err < 0) {
 		folio_put(folio);
 		return ERR_PTR(err);
diff --git a/mm/readahead.c b/mm/readahead.c
index 06f668108bdb..1c330be254c7 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,6 +121,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/psi.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
@@ -142,17 +143,19 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
-static void read_pages(struct readahead_control *rac)
+static void read_pages(struct readahead_control *rac, bool workingset)
 {
 	const struct address_space_operations *aops = rac->mapping->a_ops;
 	struct page *page;
 	struct blk_plug plug;
+	unsigned long pflags;
 
 	if (!readahead_count(rac))
 		return;
 
+	if (unlikely(workingset))
+		psi_memstall_enter(&pflags);
 	blk_start_plug(&plug);
-
 	if (aops->readahead) {
 		aops->readahead(rac);
 		/*
@@ -175,8 +178,9 @@ static void read_pages(struct readahead_control *rac)
 			put_page(page);
 		}
 	}
-
 	blk_finish_plug(&plug);
+	if (unlikely(workingset))
+		psi_memstall_leave(&pflags);
 
 	BUG_ON(readahead_count(rac));
 }
@@ -201,6 +205,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 	struct address_space *mapping = ractl->mapping;
 	unsigned long index = readahead_index(ractl);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
+	bool workingset = false;
 	unsigned long i;
 
 	/*
@@ -231,9 +236,10 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 			 * have a stable reference to this page, and it's
 			 * not worth getting one just for that.
 			 */
-			read_pages(ractl);
+			read_pages(ractl, workingset);
 			ractl->_index++;
 			i = ractl->_index + ractl->_nr_pages - index - 1;
+			workingset = false;
 			continue;
 		}
 
@@ -243,14 +249,16 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 		if (filemap_add_folio(mapping, folio, index + i,
 					gfp_mask) < 0) {
 			folio_put(folio);
-			read_pages(ractl);
+			read_pages(ractl, workingset);
 			ractl->_index++;
 			i = ractl->_index + ractl->_nr_pages - index - 1;
+			workingset = false;
 			continue;
 		}
 		if (i == nr_to_read - lookahead_size)
 			folio_set_readahead(folio);
 		ractl->_nr_pages++;
+		workingset |= folio_test_workingset(folio);
 	}
 
 	/*
@@ -258,7 +266,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 	 * uptodate then the caller will launch readpage again, and
 	 * will then handle the error.
 	 */
-	read_pages(ractl);
+	read_pages(ractl, workingset);
 	filemap_invalidate_unlock_shared(mapping);
 	memalloc_nofs_restore(nofs);
 }
@@ -473,6 +481,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	pgoff_t index = readahead_index(ractl);
 	pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
 	pgoff_t mark = index + ra->size - ra->async_size;
+	bool workingset = false;
 	int err = 0;
 	gfp_t gfp = readahead_gfp_mask(mapping);
 
@@ -518,6 +527,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 			break;
 		}
 
+		workingset |= folio_test_workingset(folio);
 		ractl->_nr_pages += 1UL << order;
 		index += 1UL << order;
 	}
@@ -527,7 +537,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 		ra->async_size += index - limit - 1;
 	}
 
-	read_pages(ractl);
+	read_pages(ractl, workingset);
 
 	/*
 	 * If there were already pages in the page cache, then we may have
-- 
2.50.1
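
A note on the third coverage gap listed in the commit message: once
BIO_WORKINGSET is gone, a file system that optimistically submits bios
beyond the window the page cache asked for (the btrfs compressed read
case) gets no PSI coverage for those reads from the generic code and
would have to annotate its own submission path. Below is a minimal
sketch of what that could look like; fs_submit_extra_read() is a
hypothetical helper invented purely for illustration, while
psi_memstall_enter()/psi_memstall_leave(), folio_test_workingset() and
submit_bio() are the real interfaces the patch itself uses:

	#include <linux/bio.h>
	#include <linux/pagemap.h>
	#include <linux/psi.h>

	/* hypothetical example, not part of this patch */
	static void fs_submit_extra_read(struct folio *folio, struct bio *bio)
	{
		unsigned long pflags;
		bool workingset = folio_test_workingset(folio);

		/*
		 * The bio layer no longer tracks workingset pages, so if
		 * this read refaults workingset data, count the submission
		 * as a memory stall here.
		 */
		if (unlikely(workingset))
			psi_memstall_enter(&pflags);
		submit_bio(bio);
		if (unlikely(workingset))
			psi_memstall_leave(&pflags);
	}

This mirrors the enter/submit/leave pattern the removed submit_bio()
code used, just moved to the one place that still knows which folios
the extra bio is reading.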