If you run a workload with:
- a cgroup that does tons of parallel data reading, with a working set
much larger than its memory limit
- a second cgroup that writes relatively fewer files, with overwrites,
with no memory limit
(see full code listing at the bottom for a reproducer)
Then what quickly occurs is:
- we have a large number of threads trying to read the csum tree
- we have a decent number of threads deleting csums running delayed refs
- we have a large number of threads in direct reclaim and thus high
memory pressure
The result of this is that we write back the csum tree repeatedly
mid-transaction, to get back the extent_buffer folios for reclaim. As a
result, we repeatedly COW the csum tree for the delayed refs that are
deleting csums. This means repeatedly write locking the higher levels of
the tree.
The end result is an unpleasant priority inversion. We have:
- a high degree of contention on the csum root node (and other upper
nodes) eb rwsem
- a memory starved cgroup doing tons of reclaim on CPU.
- many reader threads in the memory starved cgroup "holding" the sem
as readers, but not scheduling promptly, i.e., task __state == 0, but
not running on a CPU.
- btrfs_commit_transaction stuck trying to acquire the sem as a writer.
(running delayed_refs, deleting csums for unreferenced data extents)
This results in arbitrarily long transactions. This then results in
seriously degraded performance for any cgroup using the filesystem (the
victim cgroup in the script).
This isn't an academic problem: we see this exact scenario in production
at Meta, with one cgroup over its memory limit ruining btrfs performance
for the whole system, stalling critical system services that depend on
btrfs syncs.
The underlying scheduling "problem" with global rwsems is thorny,
apparently well known, and was discussed at LPC 2024, for example.
As a result, our main lever in the short term is to reduce contention
on our various rwsems, with an eye to reducing the frequency of write
locking so that the read lock fast acquisition path stays enabled.
Luckily, it seems likely that many reads are for old extents written
many transactions ago, and for those we *can* in fact search the
commit root. The commit_root_sem is only taken for writing once, near
the end of the transaction commit, no matter how much memory pressure
there is, so there is much less contention between readers and writers.
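For context, the commit root lookup pattern we lean on looks roughly
like this (an illustrative sketch only, assuming csum_root, key, ret and
fs_info are set up by the caller; the real hunk is in the diff below):
struct btrfs_path *path = btrfs_alloc_path();
/* Search the last committed root rather than the live tree... */
path->search_commit_root = 1;
/* ...so we don't need to lock live extent buffers. */
path->skip_locking = 1;
/* The commit root cannot be swapped while commit_root_sem is held. */
down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, csum_root, &key, path, 0, 0);
/* ... read csum items from the path ... */
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);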
This change detects when we are trying to read an old extent (according
to extent map generation) and then wires that through bio_ctrl to the
btrfs_bio, which unfortunately isn't allocated yet when we have this
information. When we go to look up the csums in lookup_bio_sums we can
check this condition on the btrfs_bio and do the commit root lookup
accordingly.
Note that a single bio_ctrl might collect a few extent_maps into a single
bio, so it is important to track a maximum generation across all the
extent_maps used for each bio to make an accurate decision on whether it
is valid to look in the commit root. If any extent_map is updated in the
current generation, we can't use the commit root.
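Condensed from the diff below, the per-bio decision is roughly (sketch
only; the actual helpers are bio_set_csum_search_commit_root() and the
new generation field in struct btrfs_bio_ctrl):
/* While filling the bio, for each extent_map em actually added to it: */
bio_ctrl->generation = max(bio_ctrl->generation, em->generation);
/* At submit time, for data read bios only: */
bbio->csum_search_commit_root =
	(bio_ctrl->generation &&
	 bio_ctrl->generation < btrfs_get_fs_generation(fs_info));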
To test and reproduce this issue, I used the following script and
accompanying C program (to avoid bottlenecks in constantly forking
thousands of dd processes):
====== big-read.c ======
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#define BUF_SZ (128 * (1 << 10UL))
int read_once(int fd, size_t sz) {
char buf[BUF_SZ];
size_t rd = 0;
int ret = 0;
while (rd < sz) {
ret = read(fd, buf, BUF_SZ);
if (ret < 0) {
if (errno == EINTR)
continue;
fprintf(stderr, "read failed: %d\n", errno);
return -errno;
} else if (ret == 0) {
break;
} else {
rd += ret;
}
}
return rd;
}
int read_loop(char *fname) {
int fd;
struct stat st;
size_t sz = 0;
while (1) {
fd = open(fname, O_RDONLY);
if (fd == -1) {
perror("open");
return 1;
}
if (!sz) {
if (!fstat(fd, &st)) {
sz = st.st_size;
} else {
perror("stat");
return 1;
}
}
read_once(fd, sz);
close(fd);
}
}
int main(int argc, char *argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
return 1;
}
return read_loop(argv[1]);
}
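The script below expects this program compiled as big-read in the same
directory as repro.sh, e.g. (compiler and flags are just an example):
gcc -O2 -Wall -o big-read big-read.c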
====== repro.sh ======
#!/usr/bin/env bash
SCRIPT=$(readlink -f "$0")
DIR=$(dirname "$SCRIPT")
dev=$1
mnt=$2
shift
shift
CG_ROOT=/sys/fs/cgroup
BAD_CG=$CG_ROOT/bad-nbr
GOOD_CG=$CG_ROOT/good-nbr
NR_BIGGOS=1
NR_LITTLE=10
NR_VICTIMS=32
NR_VILLAINS=512
START_SEC=$(date +%s)
_elapsed() {
echo "elapsed: $(($(date +%s) - $START_SEC))"
}
_stats() {
local sysfs=/sys/fs/btrfs/$(findmnt -no UUID $dev)
echo "================"
date
_elapsed
cat $sysfs/commit_stats
cat $BAD_CG/memory.pressure
}
_setup_cgs() {
echo "+memory +cpuset" > $CG_ROOT/cgroup.subtree_control
mkdir -p $GOOD_CG
mkdir -p $BAD_CG
echo max > $BAD_CG/memory.max
# memory.high much less than the working set will cause heavy reclaim
echo $((1 << 30)) > $BAD_CG/memory.high
# victims get a subset of villain CPUs
echo 0 > $GOOD_CG/cpuset.cpus
echo 0,1,2,3 > $BAD_CG/cpuset.cpus
}
_kill_cg() {
local cg=$1
local attempts=0
echo "kill cgroup $cg"
[ -f $cg/cgroup.procs ] || return
while true; do
attempts=$((attempts + 1))
echo 1 > $cg/cgroup.kill
sleep 1
procs=$(wc -l $cg/cgroup.procs | cut -d' ' -f1)
[ $procs -eq 0 ] && break
done
rmdir $cg
echo "killed cgroup $cg in $attempts attempts"
}
_biggo_vol() {
echo $mnt/biggo_vol.$1
}
_biggo_file() {
echo $(_biggo_vol $1)/biggo
}
_subvoled_biggos() {
total_sz=$((10 << 30))
per_sz=$((total_sz / $NR_VILLAINS))
dd_count=$((per_sz >> 20))
echo "create $NR_VILLAINS subvols with a file of size $per_sz bytes for a total of $total_sz bytes."
for i in $(seq $NR_VILLAINS)
do
btrfs subvol create $(_biggo_vol $i) &>/dev/null
dd if=/dev/zero of=$(_biggo_file $i) bs=1M count=$dd_count &>/dev/null
done
echo "done creating subvols."
}
_setup() {
[ -f .done ] && rm .done
findmnt -n $dev && exit 1
if [ -f .re-mkfs ]; then
mkfs.btrfs -f -m single -d single $dev >/dev/null || exit 2
else
echo "touch .re-mkfs to populate the test fs"
fi
mount -o noatime $dev $mnt || exit 3
[ -f .re-mkfs ] && _subvoled_biggos
_setup_cgs
}
_my_cleanup() {
echo "CLEANUP!"
_kill_cg $BAD_CG
_kill_cg $GOOD_CG
sleep 1
umount $mnt
}
_bad_exit() {
local ret=$?
echo "Unexpected Exit! $ret" >&2
_stats
exit $ret
}
trap _my_cleanup EXIT
trap _bad_exit INT TERM
_setup
# Use a lot of page cache reading the big file
_villain() {
local i=$1
echo $BASHPID > $BAD_CG/cgroup.procs
$DIR/big-read $(_biggo_file $i)
}
# Hit del_csum a lot by overwriting lots of small new files
_victim() {
echo $BASHPID > $GOOD_CG/cgroup.procs
i=0
while true
do
local tmp=$mnt/tmp.$i
dd if=/dev/zero of=$tmp bs=4k count=2 >/dev/null 2>&1
i=$((i+1))
[ $i -eq $NR_LITTLE ] && i=0
done
}
_one_sync() {
echo "sync..."
before=$(date +%s)
sync
after=$(date +%s)
echo "sync done in $((after - before))s"
_stats
}
# sync in a loop
_sync() {
echo "start sync loop"
syncs=0
echo $BASHPID > $GOOD_CG/cgroup.procs
while true
do
[ -f .done ] && break
_one_sync
syncs=$((syncs + 1))
[ -f .done ] && break
sleep 10
done
if [ $syncs -eq 0 ]; then
echo "do at least one sync!"
_one_sync
fi
echo "sync loop done."
}
_sleep() {
local time=${1-60}
local now=$(date +%s)
local end=$((now + time))
while [ $now -lt $end ];
do
echo "SLEEP: $((end - now))s left. Sleep 10."
sleep 10
now=$(date +%s)
done
}
echo "start $NR_VILLAINS villains"
for i in $(seq $NR_VILLAINS)
do
_villain $i &
disown # get rid of annoying log on kill (done via cgroup anyway)
done
echo "start $NR_VICTIMS victims"
for i in $(seq $NR_VICTIMS)
do
_victim &
disown
done
_sync &
SYNC_PID=$!
_sleep $1
_elapsed
touch .done
wait $SYNC_PID
echo "OK"
exit 0
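For reference, a typical invocation looks like the following (device
path, mount point and runtime are examples; the script must run as root
since it manages cgroups, mkfs and mounts):
touch .re-mkfs   # populate the test fs on the first run
./repro.sh /dev/vdb /mnt/repro 60
The third argument is how long to run the villains and victims before
the final sync; it defaults to 60 seconds if omitted.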
Without this patch, that reproducer:
- Ran for 6+ minutes instead of 60s
- Hung hundreds of threads in D state on the csum reader lock
- Got a commit stuck for 3 minutes
sync done in 388s
================
Wed Jul 9 09:52:31 PM UTC 2025
elapsed: 420
commits 2
cur_commit_ms 0
last_commit_ms 159446
max_commit_ms 159446
total_commit_ms 160058
some avg10=99.03 avg60=98.97 avg300=75.43 total=418033386
full avg10=82.79 avg60=80.52 avg300=59.45 total=324995274
419 hits state R, D comms big-read
btrfs_tree_read_lock_nested
btrfs_read_lock_root_node
btrfs_search_slot
btrfs_lookup_csum
btrfs_lookup_bio_sums
btrfs_submit_bbio
1 hits state D comms btrfs-transacti
btrfs_tree_lock_nested
btrfs_lock_root_node
btrfs_search_slot
btrfs_del_csums
__btrfs_run_delayed_refs
btrfs_run_delayed_refs
With the patch, the reproducer exits naturally in 65s, completing a
pretty decent 4 commits despite heavy memory pressure. Occasionally you
can still trigger a rather long commit (a couple of seconds), but never
one that is minutes long.
sync done in 3s
================
elapsed: 65
commits 4
cur_commit_ms 0
last_commit_ms 485
max_commit_ms 689
total_commit_ms 2453
some avg10=98.28 avg60=64.54 avg300=19.39 total=64849893
full avg10=74.43 avg60=48.50 avg300=14.53 total=48665168
Some random rwalker samples showed that the most common stack was in
reclaim, rather than in the csum tree:
145 hits state R comms bash, sleep, dd, shuf
shrink_folio_list
shrink_lruvec
shrink_node
do_try_to_free_pages
try_to_free_mem_cgroup_pages
reclaim_high
Link: https://lpc.events/event/18/contributions/1883/
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Signed-off-by: David Sterba <dsterba@suse.com>
refcount_inc(&orig_bbio->ordered->refs);
bbio->ordered = orig_bbio->ordered;
}
+ bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
/* Save the first error status of split bio. */
blk_status_t status;
+ /* Use the commit root to look up csums (data read bio only). */
+ bool csum_search_commit_root;
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
cb->compressed_len = compressed_len;
cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
+ cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
btrfs_free_extent_map(em);
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
+ /*
+ * For data read bios, we attempt to optimize csum lookups if the extent
+ * generation is older than the current one. To make this possible, we
+ * need to track the maximum generation of an extent in a bio_ctrl to
+ * make the decision when submitting the bio.
+ *
+ * The pattern between do_readpage(), submit_one_bio() and
+ * submit_extent_folio() is quite subtle, so tracking this is tricky.
+ *
+ * As we process extent E, we might submit a bio with existing built up
+ * extents before adding E to a new bio, or we might just add E to the
+ * bio. As a result, E's generation could apply to the current bio or
+ * to the next one, so we need to be careful to update the bio_ctrl's
+ * generation with E's only when we are sure E is added to bio_ctrl->bbio
+ * in submit_extent_folio().
+ *
+ * See the comment in btrfs_lookup_bio_sums() for more detail on the
+ * need for this optimization.
+ */
+ u64 generation;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
u64 last_em_start;
};
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+ ASSERT(bbio);
+
+ if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+ return;
+
+ bio_ctrl->bbio->csum_search_commit_root =
+ (bio_ctrl->generation &&
+ bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
+ bio_set_csum_search_commit_root(bio_ctrl);
+
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
+ /*
+ * We used the generation to decide whether to lookup csums in the
+ * commit_root or not when we called bio_set_csum_search_commit_root()
+ * above. Now, reset the generation for the next bio.
+ */
+ bio_ctrl->generation = 0;
}
/*
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
+ * @read_em_generation: generation of the extent_map we are submitting
+ * (only used for read)
*
* The will either add the page into the existing @bio_ctrl->bbio, or allocate a
* new one in @bio_ctrl->bbio.
*/
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset)
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
loff_t file_offset = folio_pos(folio) + pg_offset;
submit_one_bio(bio_ctrl);
continue;
}
+ /*
+ * Now that the folio is definitely added to the bio, include its
+ * generation in the max generation calculation.
+ */
+ bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
+ u64 em_gen;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
bio_ctrl->last_em_start = em->start;
+ em_gen = em->generation;
btrfs_free_extent_map(em);
em = NULL;
if (force_bio_submit)
submit_one_bio(bio_ctrl);
submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
- pg_offset);
+ pg_offset, em_gen);
}
return 0;
}
ASSERT(folio_test_writeback(folio));
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio));
+ sectorsize, filepos - folio_pos(folio), 0);
return 0;
}
path->skip_locking = 1;
}
+ /*
+ * If we are searching for a csum of an extent from a past
+ * transaction, we can search in the commit root and reduce
+ * lock contention on the csum tree extent buffers.
+ *
+ * This is important because that lock is an rwsem which gets
+ * pretty heavy write load under memory pressure and sustained
+ * csum overwrites, unlike the commit_root_sem. (Memory pressure
+ * makes us writeback the nodes multiple times per transaction,
+ * which makes us cow them each time, taking the write lock.)
+ *
+ * Due to how rwsem is implemented, there is a possible
+ * priority inversion where the readers holding the lock don't
+ * get scheduled (say they're in a cgroup stuck in heavy reclaim)
+ * which then blocks writers, including transaction commit. By
+ * using a semaphore with fewer writers (only a commit switching
+ * the roots), we make this issue less likely.
+ *
+ * Note that we don't rely on btrfs_search_slot to lock the
+ * commit root csum tree for us. We call search_slot multiple times,
+ * which would create a potential race where a commit comes in between
+ * searches while we are not holding the commit_root_sem, and we would
+ * get csums from across transactions.
+ */
+ if (bbio->csum_search_commit_root) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ down_read(&fs_info->commit_root_sem);
+ }
+
while (bio_offset < orig_len) {
int count;
u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
bio_offset += count * sectorsize;
}
+ if (bbio->csum_search_commit_root)
+ up_read(&fs_info->commit_root_sem);
return ret;
}