F: mm/page_io.c
F: mm/swap.c
F: mm/swap.h
+F: mm/swap_table.h
F: mm/swap_state.c
F: mm/swapfile.c
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct backing_dev_info;
-extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
-extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- struct address_space *swap_cache = NULL;
+ struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
int expected_refs;
goto fail;
}
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
+ ci = swap_cluster_get_and_lock(folio);
}
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
* Anonymous folio with swap cache.
* NOTE: shmem in swap cache is not supported yet.
*/
- if (swap_cache) {
- __swap_cache_replace_folio(folio, new_folio);
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
continue;
}
unlock_page_lruvec(lruvec);
- if (swap_cache)
- xa_unlock(&swap_cache->i_pages);
+ if (ci)
+ swap_cluster_unlock(ci);
} else {
spin_unlock(&ds_queue->split_queue_lock);
ret = -EAGAIN;
struct folio *newfolio, struct folio *folio, int expected_count)
{
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ struct swap_cluster_info *ci = NULL;
struct zone *oldzone, *newzone;
int dirty;
long nr = folio_nr_pages(folio);
oldzone = folio_zone(folio);
newzone = folio_zone(newfolio);
- xas_lock_irq(&xas);
+ if (folio_test_swapcache(folio))
+ ci = swap_cluster_get_and_lock_irq(folio);
+ else
+ xas_lock_irq(&xas);
+
if (!folio_ref_freeze(folio, expected_count)) {
- xas_unlock_irq(&xas);
+ if (ci)
+ swap_cluster_unlock(ci);
+ else
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
}
if (folio_test_swapcache(folio))
- __swap_cache_replace_folio(folio, newfolio);
+ __swap_cache_replace_folio(ci, folio, newfolio);
else
xas_store(&xas, newfolio);
*/
folio_ref_unfreeze(folio, expected_count - nr);
- xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
+ if (ci)
+ swap_cluster_unlock(ci);
+ else
+ xas_unlock(&xas);
/*
* If moved to a different zone then also account
struct shmem_inode_info *info, pgoff_t index,
struct vm_area_struct *vma)
{
+ struct swap_cluster_info *ci;
struct folio *new, *old = *foliop;
swp_entry_t entry = old->swap;
- struct address_space *swap_mapping = swap_address_space(entry);
int nr_pages = folio_nr_pages(old);
int error = 0;
new->swap = entry;
folio_set_swapcache(new);
- xa_lock_irq(&swap_mapping->i_pages);
- __swap_cache_replace_folio(old, new);
- xa_unlock_irq(&swap_mapping->i_pages);
+ ci = swap_cluster_get_and_lock_irq(old);
+ __swap_cache_replace_folio(ci, old, new);
+ swap_cluster_unlock(ci);
mem_cgroup_replace_folio(old, new);
shmem_update_stats(new, nr_pages);
#ifndef _MM_SWAP_H
#define _MM_SWAP_H
+#include <linux/atomic.h> /* for atomic_long_t */
struct mempolicy;
struct swap_iocb;
u16 count;
u8 flags;
u8 order;
+ atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */
struct list_head list;
};
#include <linux/swapops.h> /* for swp_offset */
#include <linux/blk_types.h> /* for bio_end_io_t */
+static inline unsigned int swp_cluster_offset(swp_entry_t entry)
+{
+ return swp_offset(entry) % SWAPFILE_CLUSTER;
+}
+
/*
* Callers of all helpers below must ensure the entry, type, or offset is
* valid, and protect the swap device with reference count or locks.
return &si->cluster_info[offset / SWAPFILE_CLUSTER];
}
+static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
+{
+ return __swap_offset_to_cluster(__swap_entry_to_info(entry),
+ swp_offset(entry));
+}
+
+static __always_inline struct swap_cluster_info *__swap_cluster_lock(
+ struct swap_info_struct *si, unsigned long offset, bool irq)
+{
+ struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
+
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ if (irq)
+ spin_lock_irq(&ci->lock);
+ else
+ spin_lock(&ci->lock);
+ return ci;
+}
+
/**
* swap_cluster_lock - Lock and return the swap cluster of given offset.
* @si: swap device the cluster belongs to.
static inline struct swap_cluster_info *swap_cluster_lock(
struct swap_info_struct *si, unsigned long offset)
{
- struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
+ return __swap_cluster_lock(si, offset, false);
+}
- VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
- spin_lock(&ci->lock);
- return ci;
+static inline struct swap_cluster_info *__swap_cluster_get_and_lock(
+ const struct folio *folio, bool irq)
+{
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ return __swap_cluster_lock(__swap_entry_to_info(folio->swap),
+ swp_offset(folio->swap), irq);
+}
+
+/*
+ * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * This locks and returns the swap cluster that contains a folio's swap
+ * entries. The swap entries of a folio are always in one single cluster.
+ * The folio has to be locked so its swap entries won't change and the
+ * cluster won't be freed.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, false);
+}
+
+/*
+ * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * Same as swap_cluster_get_and_lock but also disable IRQ.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, true);
}
static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
spin_unlock(&ci->lock);
}
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+ spin_unlock_irq(&ci->lock);
+}
+
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1)
-extern struct address_space *swapper_spaces[];
-#define swap_address_space(entry) \
- (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
- >> SWAP_ADDRESS_SPACE_SHIFT])
+extern struct address_space swap_space;
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return &swap_space;
+}
/*
* Return the swap device position of the swap entry.
return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
}
-/*
- * Return the swap cache index of the swap entry.
- */
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
-{
- BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
- return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
-}
-
/**
* folio_matches_swap_entry - Check if a folio matches a given swap entry.
* @folio: The folio.
*/
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
-int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadow);
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
void swap_cache_del_folio(struct folio *folio);
-void __swap_cache_del_folio(struct folio *folio,
- swp_entry_t entry, void *shadow);
-void __swap_cache_replace_folio(struct folio *old, struct folio *new);
-void swap_cache_clear_shadow(int type, unsigned long begin,
- unsigned long end);
+/* Below helpers require the caller to lock and pass in the swap cluster. */
+void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow);
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new);
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents);
void show_swap_cache_info(void);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
#else /* CONFIG_SWAP */
struct swap_iocb;
+static inline struct swap_cluster_info *swap_cluster_lock(
+ struct swap_info_struct *si, pgoff_t offset, bool irq)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+}
+
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+}
+
static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
{
return NULL;
return NULL;
}
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
-{
- return 0;
-}
-
static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry)
{
return false;
return NULL;
}
-static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
- gfp_t gfp, void **shadow)
+static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
{
- return -EINVAL;
}
static inline void swap_cache_del_folio(struct folio *folio)
{
}
-static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow)
+static inline void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow)
{
}
-static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new)
+static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
}
*/
static inline pgoff_t folio_index(struct folio *folio)
{
+#ifdef CONFIG_SWAP
if (unlikely(folio_test_swapcache(folio)))
- return swap_cache_index(folio->swap);
+ return swp_offset(folio->swap);
+#endif
return folio->index;
}
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
/*
#endif
};
-struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+struct address_space swap_space __read_mostly = {
+ .a_ops = &swap_aops,
+};
+
static bool enable_vma_readahead __read_mostly = true;
#define SWAP_RA_ORDER_CEILING 5
*/
struct folio *swap_cache_get_folio(swp_entry_t entry)
{
- struct folio *folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (IS_ERR(folio))
- return NULL;
- return folio;
+ unsigned long swp_tb;
+ struct folio *folio;
+
+ for (;;) {
+ swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (!swp_tb_is_folio(swp_tb))
+ return NULL;
+ folio = swp_tb_to_folio(swp_tb);
+ if (likely(folio_try_get(folio)))
+ return folio;
+ }
+
+ return NULL;
}
/**
*/
void *swap_cache_get_shadow(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- void *shadow;
+ unsigned long swp_tb;
+
+ swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_is_shadow(swp_tb))
+ return swp_tb_to_shadow(swp_tb);
- shadow = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(shadow))
- return shadow;
return NULL;
}
*
* Context: Caller must ensure @entry is valid and protect the swap device
* with reference count or locks.
- * The caller also needs to mark the corresponding swap_map slots with
- * SWAP_HAS_CACHE to avoid race or conflict.
- * Return: Returns 0 on success, error code otherwise.
+ * The caller also needs to update the corresponding swap_map slots with
+ * SWAP_HAS_CACHE bit to avoid race or conflict.
*/
-int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp)
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
- unsigned long i, nr = folio_nr_pages(folio);
- void *old;
-
- xas_set_update(&xas, workingset_update_node);
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+ void *shadow = NULL;
+ unsigned long old_tb, new_tb;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+ new_tb = folio_to_swp_tb(folio);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ if (swp_tb_is_shadow(old_tb))
+ shadow = swp_tb_to_shadow(old_tb);
+ } while (++ci_off < ci_end);
- folio_ref_add(folio, nr);
+ folio_ref_add(folio, nr_pages);
folio_set_swapcache(folio);
folio->swap = entry;
+ swap_cluster_unlock(ci);
- do {
- xas_lock_irq(&xas);
- xas_create_range(&xas);
- if (xas_error(&xas))
- goto unlock;
- for (i = 0; i < nr; i++) {
- VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
- if (shadowp) {
- old = xas_load(&xas);
- if (xa_is_value(old))
- *shadowp = old;
- }
- xas_store(&xas, folio);
- xas_next(&xas);
- }
- address_space->nrpages += nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
- xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
-
- if (!xas_error(&xas))
- return 0;
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
- folio_clear_swapcache(folio);
- folio_ref_sub(folio, nr);
- return xas_error(&xas);
+ if (shadowp)
+ *shadowp = shadow;
}
/**
* __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
* @folio: The folio.
* @entry: The first swap entry that the folio corresponds to.
* @shadow: shadow value to be filled in the swap cache.
* Removes a folio from the swap cache and fills a shadow in place.
* This won't put the folio's refcount. The caller has to do that.
*
- * Context: Caller must hold the xa_lock, ensure the folio is
- * locked and in the swap cache, using the index of @entry.
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
*/
-void __swap_cache_del_folio(struct folio *folio,
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
swp_entry_t entry, void *shadow)
{
- struct address_space *address_space = swap_address_space(entry);
- int i;
- long nr = folio_nr_pages(folio);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE(xas, &address_space->i_pages, idx);
-
- xas_set_update(&xas, workingset_update_node);
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
-
- for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != folio, entry);
- xas_next(&xas);
- }
+ unsigned long old_tb, new_tb;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+
+ new_tb = shadow_swp_to_tb(shadow);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ do {
+ /* If shadow is NULL, we sets an empty shadow */
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
+ swp_tb_to_folio(old_tb) != folio);
+ } while (++ci_off < ci_end);
+
folio->swap.val = 0;
folio_clear_swapcache(folio);
- address_space->nrpages -= nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
}
/**
*/
void swap_cache_del_folio(struct folio *folio)
{
+ struct swap_cluster_info *ci;
swp_entry_t entry = folio->swap;
- struct address_space *address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- __swap_cache_del_folio(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ __swap_cache_del_folio(ci, folio, entry, NULL);
+ swap_cluster_unlock(ci);
put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
/**
* __swap_cache_replace_folio - Replace a folio in the swap cache.
+ * @ci: The locked swap cluster.
* @old: The old folio to be replaced.
* @new: The new folio.
*
* entries. Replacement will take the new folio's swap entry value as
* the starting offset to override all slots covered by the new folio.
*
- * Context: Caller must ensure both folios are locked, also lock the
- * swap address_space that holds the old folio to avoid races.
+ * Context: Caller must ensure both folios are locked, and lock the
+ * cluster that holds the old folio to be replaced.
*/
-void __swap_cache_replace_folio(struct folio *old, struct folio *new)
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
swp_entry_t entry = new->swap;
unsigned long nr_pages = folio_nr_pages(new);
- unsigned long offset = swap_cache_index(entry);
- unsigned long end = offset + nr_pages;
-
- XA_STATE(xas, &swap_address_space(entry)->i_pages, offset);
+ unsigned int ci_off = swp_cluster_offset(entry);
+ unsigned int ci_end = ci_off + nr_pages;
+ unsigned long old_tb, new_tb;
VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
VM_WARN_ON_ONCE(!entry.val);
/* Swap cache still stores N entries instead of a high-order entry */
+ new_tb = folio_to_swp_tb(new);
do {
- WARN_ON_ONCE(xas_store(&xas, new) != old);
- xas_next(&xas);
- } while (++offset < end);
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+ } while (++ci_off < ci_end);
+
+ /*
+ * If the old folio is partially replaced (e.g., splitting a large
+ * folio, the old folio is shrunk, and new split sub folios replace
+ * the shrunk part), ensure the new folio doesn't overlap it.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ folio_order(old) != folio_order(new)) {
+ ci_off = swp_cluster_offset(old->swap);
+ ci_end = ci_off + folio_nr_pages(old);
+ while (ci_off++ < ci_end)
+ WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
+ }
}
/**
* swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
- * @type: Indicates the swap device.
- * @begin: Beginning offset of the range.
- * @end: Ending offset of the range.
+ * @entry: The starting index entry.
+ * @nr_ents: How many slots need to be cleared.
*
- * Context: Caller must ensure the range is valid and hold a reference to
- * the swap device.
+ * Context: Caller must ensure the range is valid, not occupied by,
+ * any folio and protect the swap device with reference count or locks.
*/
-void swap_cache_clear_shadow(int type, unsigned long begin,
- unsigned long end)
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
{
- unsigned long curr = begin;
- void *old;
-
- for (;;) {
- swp_entry_t entry = swp_entry(type, curr);
- unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
- struct address_space *address_space = swap_address_space(entry);
- XA_STATE(xas, &address_space->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
-
- xa_lock_irq(&address_space->i_pages);
- xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
- if (!xa_is_value(old))
- continue;
- xas_store(&xas, NULL);
- }
- xa_unlock_irq(&address_space->i_pages);
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry), ci_end;
+ unsigned long old;
- /* search the next swapcache until we meet end */
- curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
- if (curr > end)
- break;
- }
+ ci_end = ci_off + nr_ents;
+ do {
+ old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
+ WARN_ON_ONCE(swp_tb_is_folio(old));
+ } while (++ci_off < ci_end);
}
/*
if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
- goto fail_unlock;
-
+ swap_cache_add_folio(new_folio, entry, &shadow);
memcg1_swapin(entry, 1);
if (shadow)
return folio;
}
-int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-{
- struct address_space *spaces, *space;
- unsigned int i, nr;
-
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
- if (!spaces)
- return -ENOMEM;
- for (i = 0; i < nr; i++) {
- space = spaces + i;
- xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
- atomic_set(&space->i_mmap_writable, 0);
- space->a_ops = &swap_aops;
- /* swap cache doesn't use writeback related tags */
- mapping_set_no_writeback_tags(space);
- }
- nr_swapper_spaces[type] = nr;
- swapper_spaces[type] = spaces;
-
- return 0;
-}
-
-void exit_swap_address_space(unsigned int type)
-{
- int i;
- struct address_space *spaces = swapper_spaces[type];
-
- for (i = 0; i < nr_swapper_spaces[type]; i++)
- VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
- kvfree(spaces);
- nr_swapper_spaces[type] = 0;
- swapper_spaces[type] = NULL;
-}
-
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
unsigned long *end)
{
.attrs = swap_attrs,
};
-static int __init swap_init_sysfs(void)
+static int __init swap_init(void)
{
int err;
struct kobject *swap_kobj;
pr_err("failed to register swap group\n");
goto delete_obj;
}
+ /* Swap cache writeback is LRU based, no tags for it */
+ mapping_set_no_writeback_tags(&swap_space);
return 0;
delete_obj:
kobject_put(swap_kobj);
return err;
}
-subsys_initcall(swap_init_sysfs);
+subsys_initcall(swap_init);
#endif
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_TABLE_H
+#define _MM_SWAP_TABLE_H
+
+#include "swap.h"
+
+/*
+ * A swap table entry represents the status of a swap slot on a swap
+ * (physical or virtual) device. The swap table in each cluster is a
+ * 1:1 map of the swap slots in this cluster.
+ *
+ * Each swap table entry could be a pointer (folio), a XA_VALUE
+ * (shadow), or NULL.
+ */
+
+/*
+ * Helpers for casting one type of info into a swap table entry.
+ */
+static inline unsigned long null_to_swp_tb(void)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t));
+ return 0;
+}
+
+static inline unsigned long folio_to_swp_tb(struct folio *folio)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
+ return (unsigned long)folio;
+}
+
+static inline unsigned long shadow_swp_to_tb(void *shadow)
+{
+ BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
+ BITS_PER_BYTE * sizeof(unsigned long));
+ VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
+ return (unsigned long)shadow;
+}
+
+/*
+ * Helpers for swap table entry type checking.
+ */
+static inline bool swp_tb_is_null(unsigned long swp_tb)
+{
+ return !swp_tb;
+}
+
+static inline bool swp_tb_is_folio(unsigned long swp_tb)
+{
+ return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb);
+}
+
+static inline bool swp_tb_is_shadow(unsigned long swp_tb)
+{
+ return xa_is_value((void *)swp_tb);
+}
+
+/*
+ * Helpers for retrieving info from swap table.
+ */
+static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_folio(swp_tb));
+ return (void *)swp_tb;
+}
+
+static inline void *swp_tb_to_shadow(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
+ return (void *)swp_tb;
+}
+
+/*
+ * Helpers for accessing or modifying the swap table of a cluster,
+ * the swap cluster must be locked.
+ */
+static inline void __swap_table_set(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ atomic_long_set(&ci->table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ /* Ordering is guaranteed by cluster lock, relax */
+ return atomic_long_xchg_relaxed(&ci->table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ return atomic_long_read(&ci->table[off]);
+}
+#endif
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap_table.h"
#include "internal.h"
#include "swap.h"
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
+static int swap_cluster_alloc_table(struct swap_cluster_info *ci)
+{
+ WARN_ON(ci->table);
+ ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL);
+ if (!ci->table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
+{
+ unsigned int ci_off;
+ unsigned long swp_tb;
+
+ if (!ci->table)
+ return;
+
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) {
+ swp_tb = __swap_table_get(ci, ci_off);
+ if (!swp_tb_is_null(swp_tb))
+ pr_err_once("swap: unclean swap space on swapoff: 0x%lx",
+ swp_tb);
+ }
+
+ kfree(ci->table);
+ ci->table = NULL;
+}
+
static void move_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci, struct list_head *list,
enum swap_cluster_flags new_flags)
return true;
}
+/*
+ * Currently, the swap table is not used for count tracking, just
+ * do a sanity check here to ensure nothing leaked, so the swap
+ * table should be empty upon freeing.
+ */
+static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
+ unsigned int start, unsigned int nr)
+{
+ unsigned int ci_off = start % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr;
+ unsigned long swp_tb;
+
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ do {
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ } while (++ci_off < ci_end);
+ }
+}
+
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
ci->order = order;
memset(si->swap_map + start, usage, nr_pages);
+ swap_cluster_assert_table_empty(ci, start, nr_pages);
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
swap_slot_free_notify(si->bdev, offset);
offset++;
}
- swap_cache_clear_shadow(si->type, begin, end);
+ __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
/*
* Make sure that try_to_unuse() observes si->inuse_pages reaching 0
if (!entry.val)
return -ENOMEM;
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
- goto out_free;
+ swap_cache_add_folio(folio, entry, NULL);
return 0;
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
+ swap_cluster_assert_table_empty(ci, offset, nr_pages);
if (!ci->count)
free_cluster(si, ci);
}
}
+static void free_cluster_info(struct swap_cluster_info *cluster_info,
+ unsigned long maxpages)
+{
+ int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+
+ if (!cluster_info)
+ return;
+ for (i = 0; i < nr_clusters; i++)
+ swap_cluster_free_table(&cluster_info[i]);
+ kvfree(cluster_info);
+}
+
/*
* Called after swap device's reference count is dead, so
* neither scan nor allocation will use it.
swap_file = p->swap_file;
p->swap_file = NULL;
- p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
cluster_info = p->cluster_info;
+ free_cluster_info(cluster_info, p->max);
+ p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
- exit_swap_address_space(p->type);
inode = mapping->host;
if (!cluster_info)
goto err;
- for (i = 0; i < nr_clusters; i++)
+ for (i = 0; i < nr_clusters; i++) {
spin_lock_init(&cluster_info[i].lock);
+ if (swap_cluster_alloc_table(&cluster_info[i]))
+ goto err_free;
+ }
if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
}
return cluster_info;
-
err_free:
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
err:
return ERR_PTR(err);
}
}
}
- error = init_swap_address_space(si->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
error = zswap_swapon(si->type, maxpages);
if (error)
- goto free_swap_address_space;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
goto out;
free_swap_zswap:
zswap_swapoff(si->type);
-free_swap_address_space:
- exit_swap_address_space(si->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ if (cluster_info)
+ free_cluster_info(cluster_info, maxpages);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
{
int refcount;
void *shadow = NULL;
+ struct swap_cluster_info *ci;
BUG_ON(!folio_test_locked(folio));
BUG_ON(mapping != folio_mapping(folio));
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ ci = swap_cluster_get_and_lock_irq(folio);
+ } else {
spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xa_lock_irq(&mapping->i_pages);
+ }
+
/*
* The non racy check for a busy folio.
*
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __swap_cache_del_folio(folio, swap, shadow);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
memcg1_swapout(folio, swap);
- xa_unlock_irq(&mapping->i_pages);
+ swap_cluster_unlock_irq(ci);
put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
return 1;
cannot_free:
- xa_unlock_irq(&mapping->i_pages);
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ swap_cluster_unlock_irq(ci);
+ } else {
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
+ }
return 0;
}