#include "xfs_trace.h"
struct xfs_zone_reservation {
+ struct list_head entry;
struct task_struct *task;
- uint64_t target;
- struct list_head reservation_entry;
+ xfs_rtxnum_t rtxlen;
+ bool done;
};
/*
XFS_B_TO_FSB(mp, mp->m_zoned_op);
}
-static int64_t
-xfs_zoned_user_to_reclaim_count(
- struct xfs_mount *mp,
- size_t count)
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
{
- /*
- * We give the gc reclaim a slight 10% advantage to
- * drift towards a state where we can stop throtteling
- */
- return div_u64(READ_ONCE(mp->m_reclaim_ratio) * count, 100 - 10);
+ if (!xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE))
+ return false;
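+ /* Somebody is already waiting for space to be freed up. */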
+ if (!list_empty_careful(&mp->m_reclaim_reservations))
+ return true;
+ if (xfs_estimate_freecounter(mp, FREE_RTAVAILABLE) <
+ mp->m_rgblocks * (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+ return false;
}
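+/*
+ * Wake up all waiters for reclaimed space so that they can recheck for a
+ * shutdown or for the lack of reclaimable zones.
+ */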
static void
-xfs_zoned_require_reservations(
- struct xfs_mount *mp)
+xfs_zoned_wake_all(
+ struct xfs_mount *mp)
{
- uint64_t start_bytes = 0;
-
- if (xfs_is_zone_reservation_required(mp))
- return;
+ struct xfs_zone_reservation *reservation;
spin_lock(&mp->m_reservation_lock);
- /*
- * We're low on space and require reservations.
- *
- * Make sure we won't let any user reservations through before we have
- * free space for them. E.g. if we're completely out of free zones, we
- * will have to wait for a whole zone to be reclaimed before starting to
- * release waiters, as the minimum space we can reclaim is one zone.
- */
- if (atomic_read(&mp->m_nr_free_zones) < 1)
- start_bytes = XFS_FSB_TO_B(mp, mp->m_rgblocks);
-
- atomic64_set(&mp->m_zone_reservation_head,
- xfs_zoned_user_to_reclaim_count(mp, start_bytes));
- atomic64_set(&mp->m_zone_reclaim_head, 0);
- xfs_set_zone_reservation_required(mp);
+ list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
spin_unlock(&mp->m_reservation_lock);
}
-static void
-xfs_zoned_release_all_reservations(
- struct xfs_mount *mp)
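+/*
+ * Add newly available space to the FREE_RTAVAILABLE counter and hand it out
+ * to waiting reservations in FIFO order.
+ */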
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
{
+ xfs_rtxnum_t rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
struct xfs_zone_reservation *reservation;
- struct xfs_zone_reservation *tmp;
- if (!xfs_is_zone_reservation_required(mp))
+ if (list_empty_careful(&mp->m_reclaim_reservations)) {
+ xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
return;
+ }
spin_lock(&mp->m_reservation_lock);
- xfs_clear_zone_reservation_required(mp);
- list_for_each_entry_safe(reservation, tmp, &mp->m_reclaim_reservations,
- reservation_entry) {
+ xfs_add_freecounter(mp, FREE_RTAVAILABLE, rtxlen);
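+ /* Reuse rtxlen for the total available space left to hand out below. */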
+ rtxlen = xfs_sum_freecounter(mp, FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &mp->m_reclaim_reservations, entry) {
+ if (reservation->rtxlen > rtxlen)
+ break;
wake_up_process(reservation->task);
+ rtxlen -= reservation->rtxlen;
}
spin_unlock(&mp->m_reservation_lock);
}
-static void
-xfs_zoned_move_reclaim_head(
- struct xfs_mount *mp,
- size_t count)
-
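+/*
+ * Wait on the reservation list until enough FREE_RTAVAILABLE space has been
+ * freed up by garbage collection, the file system is shut down, a fatal
+ * signal is pending, or no reclaimable zones are left.
+ */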
+static int
+xfs_zoned_space_wait(
+ struct xfs_inode *ip,
+ xfs_rtxnum_t rtxlen)
{
- struct xfs_zone_reservation *reservation;
- struct xfs_zone_reservation *tmp;
- int64_t reclaim_head;
- int64_t reservation_head;
-
- if (!xfs_is_zone_reservation_required(mp))
- return;
-
- reclaim_head = atomic64_add_return(count,
- &mp->m_zone_reclaim_head);
- reservation_head = atomic64_read(&mp->m_zone_reservation_head);
-
- /*
- * If the previous reclaim head was ahead of the reservation head, no
- * user waits should be waiting, so avoid taking the lock. In the very
- * unlikely case of a race, we'll wake up the user write on the next
- * reclaim write.
- */
- if (reclaim_head - count > reservation_head)
- return;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .rtxlen = rtxlen,
+ };
+ int error;
spin_lock(&mp->m_reservation_lock);
- if (xfs_is_zone_reservation_required(mp)) {
- list_for_each_entry_safe(reservation, tmp,
- &mp->m_reclaim_reservations,
- reservation_entry) {
- if (reservation->target < reclaim_head)
- wake_up_process(reservation->task);
+ do {
+ if (xfs_is_shutdown(mp)) {
+ error = -EIO;
+ break;
}
+ list_add_tail(&reservation.entry, &mp->m_reclaim_reservations);
+ set_current_state(TASK_KILLABLE);
+ spin_unlock(&mp->m_reservation_lock);
- }
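+ /*
+ * Retry taking the space with the lock dropped.  The entry is
+ * already on the reservation list, so xfs_zoned_add_available
+ * will find it and wake us once GC frees up enough space.
+ */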
+ error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+ false);
+ if (error == -ENOSPC &&
+ xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
+ schedule();
+ if (fatal_signal_pending(current))
+ error = -EINTR;
+ }
+
+ spin_lock(&mp->m_reservation_lock);
+ list_del(&reservation.entry);
+ } while (error == -ENOSPC &&
+ xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE));
spin_unlock(&mp->m_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
}
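+/*
+ * Reserve space for a write.
+ *
+ * Blocks are first taken out of the global free extent counter and then out
+ * of FREE_RTAVAILABLE, which only accounts for space that can be written
+ * right away.  If the latter is exhausted, wait for garbage collection to
+ * make more space available unless the caller asked not to block.
+ */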
int
xfs_zoned_space_reserve(
struct xfs_inode *ip,
size_t count,
- struct xfs_zone_alloc_ctx *ac)
+ struct xfs_zone_alloc_ctx *ac,
+ bool nowait)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_filblks_t count_fsb;
- struct xfs_zone_reservation reservation;
- int64_t needed;
+ xfs_rtxnum_t rtxlen;
int error;
/*
* Round up the block count as a write could hit partial blocks at the
* start and the end.
*/
- count_fsb = XFS_B_TO_FSB(mp, count) + 1;
+ ac->reserved_blocks = XFS_B_TO_FSB(mp, count) + 1;
+ ac->cached_rtg = NULL;
- error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+ rtxlen = xfs_rtb_to_rtx(mp, ac->reserved_blocks);
+ error = xfs_dec_frextents(mp, rtxlen);
if (error)
return error;
- if (!xfs_is_zone_reservation_required(mp))
- goto done;
-
- needed = xfs_zoned_user_to_reclaim_count(mp,
- XFS_FSB_TO_B(mp, count_fsb));
- reservation.target = atomic64_add_return(needed,
- &mp->m_zone_reservation_head);
- if (reservation.target < atomic64_read(&mp->m_zone_reclaim_head))
- goto done;
-
- reservation.task = current;
-
- spin_lock(&mp->m_reservation_lock);
- if (!xfs_is_zone_reservation_required(mp)) {
- spin_unlock(&mp->m_reservation_lock);
- goto done;
- }
-
- list_add_tail(&reservation.reservation_entry,
- &mp->m_reclaim_reservations);
-
- __set_current_state(TASK_KILLABLE);
- spin_unlock(&mp->m_reservation_lock);
-
- schedule();
-
- spin_lock(&mp->m_reservation_lock);
- list_del(&reservation.reservation_entry);
- spin_unlock(&mp->m_reservation_lock);
-
- if (fatal_signal_pending(current)) {
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
- return -EINTR;
+ if (list_empty_careful(&mp->m_reclaim_reservations)) {
+ error = xfs_dec_freecounter(mp, FREE_RTAVAILABLE, rtxlen,
+ false);
+ if (error != -ENOSPC)
+ goto out;
+ if (nowait) {
+ error = -EAGAIN;
+ goto out;
+ }
}
-done:
- ac->reserved_blocks = count_fsb;
- ac->cached_rtg = NULL;
- return 0;
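+ /* Queue up behind earlier reservations and wait for GC to free up space. */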
+ error = xfs_zoned_space_wait(ip, rtxlen);
+out:
+ if (error)
+ xfs_add_frextents(mp, rtxlen);
+ return error;
}
void
struct xfs_inode *ip,
struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_filblks_t count_fsb = ac->reserved_blocks;
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
- if (count_fsb > 0) {
- xfs_zoned_move_reclaim_head(mp,
- xfs_zoned_user_to_reclaim_count(mp,
- XFS_FSB_TO_B(mp, count_fsb)));
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, count_fsb));
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, FREE_RTEXTENTS,
+ xfs_rtb_to_rtx(mp, ac->reserved_blocks));
}
xfs_zone_finish_alloc(ac->cached_rtg);
}
unsigned long index = 0;
bool easy = false;
+ if (!xfs_zoned_need_gc(mp))
+ return false;
+
rcu_read_lock();
xa_for_each_marked(&mp->m_rtgroups, index, rtg, XFS_RTG_RECLAIMABLE) {
u64 used = *xfs_zone_used_counter(rtg);
xa_clear_mark(&mp->m_rtgroups, victim_rtg->rtg_rgno,
XFS_RTG_RECLAIMABLE);
- WRITE_ONCE(mp->m_reclaim_ratio,
- div_u64(100 * victim_used,
- victim_rtg->rtg_blockcount - victim_used));
xfs_info(mp, "reclaiming zone %d, used = %lld/%u (%s)",
victim_rtg->rtg_rgno, victim_used,
{
struct xfs_rtgroup *rtg = data->mp->m_open_gc_zone;
struct xfs_mount *mp = rtg->rtg_mount;
+ xfs_rtxnum_t rtxlen = xfs_rtb_to_rtx(mp, count_fsb);
xfs_rgblock_t rgbno = 0;
- int error;
/* the caller must have ensured there is enough space */
ASSERT(rtg->rtg_blockcount - rtg->rtg_write_pointer >= count_fsb);
- error = xfs_dec_freecounter(mp, FREE_RTEXTENTS,
- xfs_rtb_to_rtx(mp, count_fsb), true);
- if (error) {
- ASSERT(!error);
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ * If we'd take them from the normal pool, we could be stealing
+ * blocks from a regular writer, which would then have to wait for
+ * GC and deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ if (rtxlen > mp->m_resblks[FREE_RTEXTENTS].avail ||
+ rtxlen > mp->m_resblks[FREE_RTAVAILABLE].avail) {
+ xfs_crit(mp,
+"out of space for garbage collection (rtxlen = %lld, free = %lld, avail = %lld).",
+ rtxlen,
+ mp->m_resblks[FREE_RTEXTENTS].avail,
+ mp->m_resblks[FREE_RTAVAILABLE].avail);
+ spin_unlock(&mp->m_sb_lock);
return NULLFSBLOCK;
}
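+ /* Charge both reserve pools, as GC writes consume writable space as well. */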
+ mp->m_resblks[FREE_RTEXTENTS].avail -= rtxlen;
+ mp->m_resblks[FREE_RTAVAILABLE].avail -= rtxlen;
+ spin_unlock(&mp->m_sb_lock);
*is_seq = test_bit(RTG_F_SEQUENTIAL, &rtg->rtg_flags);
if (!*is_seq)
struct xfs_mount *mp = data->mp;
struct xfs_rtgroup *rtg = mp->m_open_gc_zone;
- if (!iter->victim_rtg) {
- if (!xfs_zoned_need_gc(mp) ||
- !xa_marked(&mp->m_rtgroups, XFS_RTG_RECLAIMABLE)) {
- xfs_zoned_release_all_reservations(mp);
- return 0;
- }
-
- xfs_zoned_require_reservations(mp);
- if (!xfs_zone_reclaim_pick(mp, iter))
- return 0;
- }
+ if (!iter->victim_rtg && !xfs_zone_reclaim_pick(mp, iter))
+ return 0;
if (rtg && rtg->rtg_write_pointer == rtg->rtg_blockcount) {
/*
XFS_BB_TO_FSB(mp, bio->bi_iter.bi_sector);
error = xfs_zoned_end_cow(ip, chunk->offset, chunk->len,
chunk->new_startblock, chunk->old_startblock);
- if (!error)
- xfs_zoned_move_reclaim_head(mp, chunk->len);
free:
if (error)
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
atomic_inc(&mp->m_nr_free_zones);
spin_unlock(&mp->m_zone_list_lock);
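+ /* The zone is free again, so its whole capacity becomes writable. */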
+ xfs_zoned_add_available(mp, rtg->rtg_blockcount);
+
wake_up_all(&mp->m_zone_wait);
out:
bio_put(bio);
continue;
if (!data->inflight) {
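+ /* No GC work is in flight, so let waiters recheck their exit conditions. */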
+ xfs_zoned_wake_all(mp);
+
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
break;
if (mp->m_open_gc_zone)
xfs_rtgroup_rele(mp->m_open_gc_zone);
- xfs_zoned_release_all_reservations(mp);
memalloc_nofs_restore(nofs_flag);
kfree(iter);
out_free_data:
static int
xfs_init_zone(
struct xfs_rtgroup *rtg,
- uint64_t *freeblocks)
+ uint64_t *available,
+ uint64_t *freedblocks)
{
struct xfs_mount *mp = rtg->rtg_mount;
uint64_t used = *xfs_zone_used_counter(rtg);
/* zone is free */
list_add_tail(&rtg->rtg_entry, &mp->m_free_zones);
atomic_inc(&mp->m_nr_free_zones);
- *freeblocks += rtg->rtg_blockcount;
+ *available += rtg->rtg_blockcount;
} else if (rtg->rtg_write_pointer < rtg->rtg_blockcount) {
/* zone is open */
list_add(&rtg->rtg_entry, &mp->m_open_zones);
mp->m_nr_open_zones++;
set_bit(RTG_F_OPEN, &rtg->rtg_flags);
- *freeblocks += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
+ *available += (rtg->rtg_blockcount - rtg->rtg_write_pointer);
} else if (used < rtg->rtg_blockcount) {
/* zone fully written, but has freed blocks */
xa_set_mark(&mp->m_rtgroups, rtg->rtg_rgno,
XFS_RTG_RECLAIMABLE);
- *freeblocks += (rtg->rtg_blockcount - used);
+ *freedblocks += (rtg->rtg_blockcount - used);
}
return 0;
{
struct xfs_buftarg *bt = mp->m_rtdev_targp;
unsigned int bdev_open_zones;
- int64_t freeblocks = 0;
+ int64_t available = 0, freedblocks = 0;
struct xfs_rtgroup *rtg;
xfs_rgnumber_t rgno;
int error;
spin_lock_init(&mp->m_reservation_lock);
init_waitqueue_head(&mp->m_zone_wait);
- atomic64_set(&mp->m_zone_reclaim_head, 0);
- atomic64_set(&mp->m_zone_reservation_head, 0);
-
- /*
- * Assume a one-to-one reclaim ratio until we pick
- * a zone for reclaim and update the estimate.
- */
- WRITE_ONCE(mp->m_reclaim_ratio, 100);
-
xfs_info(mp, "%u zones of %u blocks size (%d max open)",
mp->m_sb.sb_rgcount, mp->m_rgblocks, mp->m_max_open_zones);
kthread_park(mp->m_zone_gc_thread);
for_each_rtgroup(mp, rgno, rtg) {
- error = xfs_init_zone(rtg, &freeblocks);
+ error = xfs_init_zone(rtg, &available, &freedblocks);
if (error)
goto out_unlink_zones;
}
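+ /*
+ * FREE_RTAVAILABLE only counts space that can be written right away
+ * (free and open zones), while FREE_RTEXTENTS also includes blocks
+ * that were freed in fully written zones and need to be reclaimed by
+ * garbage collection before they can be reused.
+ */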
+ percpu_counter_set(&mp->m_free[FREE_RTAVAILABLE],
+ xfs_rtb_to_rtx(mp, available));
percpu_counter_set(&mp->m_free[FREE_RTEXTENTS],
- xfs_rtb_to_rtx(mp, freeblocks));
+ xfs_rtb_to_rtx(mp, available + freedblocks));
/*
* If there are no free zones available for GC, pick the open zone with