]> www.infradead.org Git - users/hch/xfs.git/commitdiff
xfs: support write life time based data placement
authorHans Holmberg <hans.holmberg@wdc.com>
Tue, 5 Nov 2024 07:51:29 +0000 (08:51 +0100)
committerChristoph Hellwig <hch@lst.de>
Tue, 5 Nov 2024 08:29:54 +0000 (09:29 +0100)
Add a file write life time based data placement allocation scheme that
aims to minimize fragmentation, and thereby do two things:

a) Completely separate file data into different zones when
   possible.
b) Colocate file data of similar life times when feasible.

To get best results, average file sizes should align with average
zone capacity.

Benchmarked with RocksDB using leveled compaction, observing ~10%
throughput improvement for overwrite workloads at 80% file system
utilization.

Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
fs/xfs/xfs_zone_alloc.c

index 9cdce2cde89a83089a89c55b5aa03869546288f7..332d01c3a65a623b80ebb3ab47f6d6d4cfd960b1 100644 (file)
@@ -311,6 +311,56 @@ xfs_select_open_zone_lru(
                if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
                        continue;
                list_move_tail(&rtg->rtg_entry, &mp->m_open_zones);
+
+               return rtg;
+       }
+
+       return NULL;
+}
+
+static bool
+xfs_good_hint_match(
+       struct xfs_rtgroup      *rtg,
+       enum rw_hint            file_hint)
+{
+       switch (rtg->rtg_write_hint) {
+       case WRITE_LIFE_LONG:
+       case WRITE_LIFE_EXTREME:
+               /*
+                * The zone already holds cold (long/extreme) data: never
+                * report a match here, whatever hint the file carries, so
+                * that cold data is not colocated.
+                */
+               break;
+       case WRITE_LIFE_MEDIUM:
+               /* a medium zone only matches a file that is hinted medium */
+               if (file_hint == WRITE_LIFE_MEDIUM)
+                       return true;
+               break;
+       case WRITE_LIFE_SHORT:
+       case WRITE_LIFE_NONE:
+       case WRITE_LIFE_NOT_SET:
+               /*
+                * Short, none and not-set zones are handled as one class:
+                * they match any file hint that compares less than or
+                * equal to WRITE_LIFE_SHORT.
+                */
+               if (file_hint <= WRITE_LIFE_SHORT)
+                       return true;
+               break;
+       }
+       return false;
+}
+
+static struct xfs_rtgroup *
+xfs_select_open_zone_hint(
+       struct xfs_mount        *mp,
+       enum rw_hint            file_hint,
+       unsigned int            minlen)
+{
+       struct xfs_rtgroup      *rtg;
+
+       /* look for an open zone with enough room and a compatible hint */
+       list_for_each_entry(rtg, &mp->m_open_zones, rtg_entry) {
+               if (rtg->rtg_extents - rtg->rtg_write_pointer < minlen)
+                       continue;
+               if (!xfs_good_hint_match(rtg, file_hint))
+                       continue;
+               if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+                       continue;
+
                return rtg;
        }
 
@@ -332,21 +382,50 @@ xfs_select_zone_nowait(
        xfs_filblks_t           count_fsb)
 {
        struct xfs_mount        *mp = ip->i_mount;
+       enum rw_hint            hint = VFS_I(ip)->i_write_hint;
        struct xfs_rtgroup      *rtg;
 
+       if (!xfs_has_lifetime(mp))
+               hint = WRITE_LIFE_NOT_SET;
+
+       /*
+        * Try to fill up open zones with matching temperature if available.  It
+        * is better to try to co-locate data when this is favorable, so we can
+        * activate empty zones when it is statistically better to separate
+        * data.
+        */
+       if (hint != WRITE_LIFE_NOT_SET) {
+               rtg = xfs_select_open_zone_hint(mp, hint, count_fsb);
+               if (rtg)
+                       goto done;
+       }
+
        /*
         * If we are below the open limit try to activate a zone.
         */
        if (mp->m_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
                rtg = xfs_activate_zone(mp);
                if (rtg)
-                       return rtg;
+                       goto done;
        }
 
        rtg = xfs_select_open_zone_lru(mp, count_fsb);
        if (rtg)
-               return rtg;
-       return xfs_select_open_zone_lru(mp, 1);
+               goto done;
+
+       rtg = xfs_select_open_zone_lru(mp, 1);
+done:
+       /*
+        * If we have a hint set for the data, use that for the zone even if
+        * some data was written already without any hint set, but don't change
+        * the temperature after that as that would make little sense without
+        * tracking per-temperature class written block counts, which is
+        * probably overkill anyway.
+        */
+       if (rtg && rtg->rtg_write_hint == WRITE_LIFE_NOT_SET &&
+           hint != WRITE_LIFE_NOT_SET)
+               rtg->rtg_write_hint = hint;
+       return rtg;
 }
 
 static struct xfs_rtgroup *