xfs: allow sysadmins to specify a maximum atomic write limit at mount time

author Darrick J. Wong <djwong@kernel.org>

Wed, 7 May 2025 21:18:34 +0000 (14:18 -0700)

committer Darrick J. Wong <djwong@kernel.org>

Wed, 7 May 2025 21:25:33 +0000 (14:25 -0700)
author Darrick J. Wong <djwong@kernel.org>
Wed, 7 May 2025 21:18:34 +0000 (14:18 -0700)
committer Darrick J. Wong <djwong@kernel.org>
Wed, 7 May 2025 21:25:33 +0000 (14:25 -0700)
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst

index 5becb441c3cba0fe8d1e9a4f0f1bc16bd713810d..a18328a5fb93be33b943b9941bfba6fa09c745cd 100644 (file)
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -151,6 +151,17 @@ When mounting an XFS filesystem, the following options are accepted.
         optional, and the log section can be separate from the data
         section or contained within it.
  
+  max_atomic_write=value
+       Set the maximum size of an atomic write.  The size may be
+       specified in bytes, in kilobytes with a "k" suffix, in megabytes
+       with a "m" suffix, or in gigabytes with a "g" suffix.  The size
+       cannot be larger than the maximum write size, larger than the
+       size of any allocation group, or larger than the size of a
+       remapping operation that the log can complete atomically.
+
+       By default, the maximum atomic write size is chosen so that
+       each CPU can handle one atomic write I/O completion at a time.
+
    max_open_zones=value
         Specify the max number of zones to keep open for writing on a
         zoned rt device. Many open zones aids file data separation
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c

index e73c09fbd24c30323f29a91d870e30b92e1dcdf7..86a111d0f2fc7cfa59c50da2420f023a6fe813b8 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -1488,3 +1488,72 @@ xfs_calc_max_atomic_write_fsblocks(
  
         return ret;
  }
+
+/*
+ * Compute the log blocks and transaction reservation needed to complete an
+ * atomic write of a given number of blocks.  Worst case, each block requires
+ * separate handling.  A return value of 0 means something went wrong.
+ *
+ * The candidate reservation is blockcount * per_intent + step_size, where both
+ * factors come from xfs_calc_atomic_write_ioend_geometry().  That value is
+ * installed in tr_atomic_ioend.tr_logres only for the duration of the
+ * xfs_log_calc_minimum_size() call; afterwards the tr_logres value sampled on
+ * entry is written back.  Callers that want to adopt the candidate must store
+ * *new_logres themselves.
+ *
+ * @blockcount must be nonzero.  *new_logres is only written when the return
+ * value comes from xfs_log_calc_minimum_size(), i.e. not on the overflow path.
+ *
+ * NOTE(review): the overflow return happens after
+ * xfs_calc_default_atomic_ioend_reservation() has run and without writing the
+ * saved tr_logres back -- confirm that leaving whatever that helper set in
+ * place is intended.
+ */
+xfs_extlen_t
+xfs_calc_atomic_write_log_geometry(
+       struct xfs_mount        *mp,
+       xfs_extlen_t            blockcount,
+       unsigned int            *new_logres)
+{
+       struct xfs_trans_res    *curr_res = &M_RES(mp)->tr_atomic_ioend;
+       uint                    old_logres = curr_res->tr_logres;
+       unsigned int            per_intent, step_size;
+       unsigned int            logres;
+       xfs_extlen_t            min_logblocks;
+
+       ASSERT(blockcount > 0);
+
+       xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+
+       per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
+
+       /* Check for overflows */
+       if (check_mul_overflow(blockcount, per_intent, &logres) ||
+           check_add_overflow(logres, step_size, &logres))
+               return 0;
+
+       curr_res->tr_logres = logres;
+       min_logblocks = xfs_log_calc_minimum_size(mp);
+       curr_res->tr_logres = old_logres;
+
+       trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
+                       blockcount, min_logblocks, logres);
+
+       *new_logres = logres;
+       return min_logblocks;
+}
+
+/*
+ * Install the log reservation for out of place atomic writes of up to
+ * @blockcount blocks.  A @blockcount of zero selects the default reservation.
+ * Returns 0 on success, or -EINVAL if the log geometry could not be computed
+ * or needs more log blocks than the filesystem has.
+ */
+int
+xfs_calc_atomic_write_reservation(
+       struct xfs_mount        *mp,
+       xfs_extlen_t            blockcount)
+{
+       unsigned int            logres;
+       xfs_extlen_t            need_logblocks;
+
+       if (blockcount != 0) {
+               need_logblocks = xfs_calc_atomic_write_log_geometry(mp,
+                               blockcount, &logres);
+
+               /* Zero is the helper's failure indication. */
+               if (need_logblocks == 0)
+                       return -EINVAL;
+               if (need_logblocks > mp->m_sb.sb_logblocks)
+                       return -EINVAL;
+
+               M_RES(mp)->tr_atomic_ioend.tr_logres = logres;
+               return 0;
+       }
+
+       /* No specific atomic write size was requested; use the defaults. */
+       xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+       return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h

index a6d303b836883feead0db7536006d327eb9218e8..336279e0fc61371ea469e8b66b0143af3362d18c 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -122,5 +122,9 @@ unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
  unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
  
  xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
+               xfs_extlen_t blockcount, unsigned int *new_logres);
+int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
+               xfs_extlen_t blockcount);
  
  #endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c

index 86089e27b8e765a40688e546e4471734dd8713d8..29276fe60df9c6d21e3ad0a3de551c92a46ef239 100644 (file)
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -742,6 +742,82 @@ xfs_calc_atomic_write_unit_max(
                         max_agsize, max_rgsize);
  }
  
+/*
+ * Try to set the atomic write maximum to a new value that we got from
+ * userspace via mount option.
+ *
+ * A @new_max_bytes of zero skips validation entirely; the value is passed
+ * through as a block count to xfs_calc_atomic_write_reservation(), which uses
+ * the default reservation for a block count of zero.  Any other value must be
+ * a power of two, aligned to the filesystem block size, and no larger than the
+ * maximum write size, the larger of the allocation group and rtgroup sizes,
+ * and the larger of the per-AG and per-rtgroup atomic write limits.  Each
+ * failed check logs a warning and returns -EINVAL.
+ *
+ * On success the atomic write unit maximum is recomputed, @new_max_bytes is
+ * recorded in mp->m_awu_max_bytes, and 0 is returned.  If the log reservation
+ * cannot be set up, that error is returned and m_awu_max_bytes is left alone.
+ */
+int
+xfs_set_max_atomic_write_opt(
+       struct xfs_mount        *mp,
+       unsigned long long      new_max_bytes)
+{
+       const xfs_filblks_t     new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
+       const xfs_extlen_t      max_write = xfs_calc_atomic_write_max(mp);
+       const xfs_extlen_t      max_group =
+               max(mp->m_groups[XG_TYPE_AG].blocks,
+                   mp->m_groups[XG_TYPE_RTG].blocks);
+       const xfs_extlen_t      max_group_write =
+               max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
+       int                     error;
+
+       if (new_max_bytes == 0)
+               goto set_limit;
+
+       ASSERT(max_write <= U32_MAX);
+
+       /* generic_atomic_write_valid enforces power of two length */
+       if (!is_power_of_2(new_max_bytes)) {
+               xfs_warn(mp,
+ "max atomic write size of %llu bytes is not a power of 2",
+                               new_max_bytes);
+               return -EINVAL;
+       }
+
+       if (new_max_bytes & mp->m_blockmask) {
+               xfs_warn(mp,
+ "max atomic write size of %llu bytes not aligned with fsblock",
+                               new_max_bytes);
+               return -EINVAL;
+       }
+
+       if (new_max_fsbs > max_write) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max write size %lluk",
+                               new_max_bytes >> 10,
+                               XFS_FSB_TO_B(mp, max_write) >> 10);
+               return -EINVAL;
+       }
+
+       if (new_max_fsbs > max_group) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than allocation group size %lluk",
+                               new_max_bytes >> 10,
+                               XFS_FSB_TO_B(mp, max_group) >> 10);
+               return -EINVAL;
+       }
+
+       if (new_max_fsbs > max_group_write) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
+                               new_max_bytes >> 10,
+                               XFS_FSB_TO_B(mp, max_group_write) >> 10);
+               return -EINVAL;
+       }
+
+set_limit:
+       error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
+       if (error) {
+               xfs_warn(mp,
+ "cannot support completing atomic writes of %lluk",
+                               new_max_bytes >> 10);
+               return error;
+       }
+
+       xfs_calc_atomic_write_unit_max(mp);
+       mp->m_awu_max_bytes = new_max_bytes;
+       return 0;
+}
+
  /* Compute maximum possible height for realtime btree types for this fs. */
  static inline void
  xfs_rtbtree_compute_maxlevels(
@@ -1163,7 +1239,9 @@ xfs_mountfs(
          * derived from transaction reservations, so we must do this after the
          * log is fully initialized.
          */
-       xfs_calc_atomic_write_unit_max(mp);
+       error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
+       if (error)
+               goto out_agresv;
  
         return 0;
  
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index e2abf31438e0e2d1060f16714f30b4b6ee717b51..5b5df70570c0db88c1a404df28118bace5a12d59 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -237,6 +237,9 @@ typedef struct xfs_mount {
         unsigned int            m_max_open_zones;
         unsigned int            m_zonegc_low_space;
  
+       /* max_atomic_write mount option value */
+       unsigned long long      m_awu_max_bytes;
+
         /*
          * Bitsets of per-fs metadata that have been checked and/or are sick.
          * Callers must hold m_sb_lock to access these two fields.
@@ -804,4 +807,7 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
         percpu_counter_add(&mp->m_delalloc_blks, delta);
  }
  
+int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
+               unsigned long long new_max_bytes);
+
  #endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 77a3c003fc4ff7c6f1d04fe1df04ca4ea95166e1..8e3ae1749855adcbb0a1c04de9a0aeb15a1ae135 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -111,7 +111,7 @@ enum {
         Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
         Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
         Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
-       Opt_lifetime, Opt_nolifetime,
+       Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
  };
  
  static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
         fsparam_u32("max_open_zones",   Opt_max_open_zones),
         fsparam_flag("lifetime",        Opt_lifetime),
         fsparam_flag("nolifetime",      Opt_nolifetime),
+       fsparam_string("max_atomic_write",      Opt_max_atomic_write),
         {}
  };
  
@@ -241,6 +242,9 @@ xfs_fs_show_options(
  
         if (mp->m_max_open_zones)
                 seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+       if (mp->m_awu_max_bytes)
+               seq_printf(m, ",max_atomic_write=%lluk",
+                               mp->m_awu_max_bytes >> 10);
  
         return 0;
  }
@@ -1343,6 +1347,42 @@ suffix_kstrtoint(
         return ret;
  }
  
+/*
+ * Parse an unsigned integer with an optional binary size suffix.
+ *
+ * @s is parsed in the given @base.  A single trailing 'k', 'm' or 'g' (either
+ * case) scales the number by 2^10, 2^20 or 2^30.  On success the scaled value
+ * is stored in *@res and 0 is returned.  On failure *@res is left untouched
+ * and a negative errno is returned: -ENOMEM if the working copy of @s cannot
+ * be allocated, -EINVAL if @s is empty, is not a valid number, or does not
+ * fit in an unsigned long long once scaled.
+ */
+static int
+suffix_kstrtoull(
+       const char              *s,
+       unsigned int            base,
+       unsigned long long      *res)
+{
+       unsigned int            shift_left_factor = 0;
+       unsigned long long      _res;
+       size_t                  len;
+       char                    *value;
+       int                     ret = 0;
+
+       value = kstrdup(s, GFP_KERNEL);
+       if (!value)
+               return -ENOMEM;
+
+       /* Don't index before the start of an empty string. */
+       len = strlen(value);
+       if (len == 0) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       switch (value[len - 1]) {
+       case 'K':
+       case 'k':
+               shift_left_factor = 10;
+               break;
+       case 'M':
+       case 'm':
+               shift_left_factor = 20;
+               break;
+       case 'G':
+       case 'g':
+               shift_left_factor = 30;
+               break;
+       default:
+               break;
+       }
+
+       /* Strip the suffix so that only the digits are handed to kstrtoull. */
+       if (shift_left_factor)
+               value[len - 1] = '\0';
+
+       if (kstrtoull(value, base, &_res)) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       /* Reject values whose high bits would be shifted out. */
+       if (_res > (ULLONG_MAX >> shift_left_factor)) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       /* Only publish a result that was actually parsed. */
+       *res = _res << shift_left_factor;
+out_free:
+       kfree(value);
+       return ret;
+}
+
  static inline void
  xfs_fs_warn_deprecated(
         struct fs_context       *fc,
@@ -1527,6 +1567,14 @@ xfs_fs_parse_param(
         case Opt_nolifetime:
                 parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
                 return 0;
+       case Opt_max_atomic_write:
+               if (suffix_kstrtoull(param->string, 10,
+                                    &parsing_mp->m_awu_max_bytes)) {
+                       xfs_warn(parsing_mp,
+ "max atomic write size must be positive integer");
+                       return -EINVAL;
+               }
+               return 0;
         default:
                 xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
                 return -EINVAL;
@@ -2137,6 +2185,14 @@ xfs_fs_reconfigure(
         if (error)
                 return error;
  
+       /* Validate new max_atomic_write option before making other changes */
+       if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+               error = xfs_set_max_atomic_write_opt(mp,
+                               new_mp->m_awu_max_bytes);
+               if (error)
+                       return error;
+       }
+
         /* inode32 -> inode64 */
         if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
                 mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index d5ae00f8e04cf43f94540a7afa3f4ad1b6487daf..01d284a1c75961a528dd4386a1c4ac9c005b535d 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -230,6 +230,39 @@ TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
                   __entry->blockcount)
  );
  
+TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, /* atomic write log sizing */
+       TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+                unsigned int step_size, unsigned int blockcount,
+                unsigned int min_logblocks, unsigned int logres),
+       TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned int, per_intent)
+               __field(unsigned int, step_size)
+               __field(unsigned int, blockcount)
+               __field(unsigned int, min_logblocks)
+               __field(unsigned int, cur_logblocks) /* from mp->m_sb.sb_logblocks */
+               __field(unsigned int, logres)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->per_intent = per_intent;
+               __entry->step_size = step_size;
+               __entry->blockcount = blockcount;
+               __entry->min_logblocks = min_logblocks;
+               __entry->cur_logblocks = mp->m_sb.sb_logblocks;
+               __entry->logres = logres;
+       ),
+       TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->per_intent,
+                 __entry->step_size,
+                 __entry->blockcount,
+                 __entry->min_logblocks,
+                 __entry->cur_logblocks, /* printed as "logblocks" */
+                 __entry->logres)
+);
+
  TRACE_EVENT(xlog_intent_recovery_failed,
         TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
                  int error),
author	Darrick J. Wong <djwong@kernel.org>
	Wed, 7 May 2025 21:18:34 +0000 (14:18 -0700)
committer	Darrick J. Wong <djwong@kernel.org>
	Wed, 7 May 2025 21:25:33 +0000 (14:25 -0700)
Documentation/admin-guide/xfs.rst		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_resv.c		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_resv.h		patch \| blob \| history
fs/xfs/xfs_mount.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history