struct swap_info_struct *swap_info[MAX_SWAPFILES];
+static struct kmem_cache *swap_table_cachep;
+
static DEFINE_MUTEX(swapon_mutex);
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
return info->flags == CLUSTER_FLAG_DISCARD;
}
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+ return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
return false;
+ if (!cluster_table_is_alloced(ci))
+ return false;
if (!order)
return true;
return cluster_is_empty(ci) || order == ci->order;
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static int swap_cluster_alloc_table(struct swap_cluster_info *ci)
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
- WARN_ON(ci->table);
- ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL);
- if (!ci->table)
- return -ENOMEM;
- return 0;
+ unsigned int ci_off;
+ struct swap_table *table;
+
+ /* Only empty cluster's table is allow to be freed */
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+ VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+ table = (void *)rcu_dereference_protected(ci->table, true);
+ rcu_assign_pointer(ci->table, NULL);
+
+ kmem_cache_free(swap_table_cachep, table);
}
-static void swap_cluster_free_table(struct swap_cluster_info *ci)
+/*
+ * Allocate a swap table may need to sleep, which leads to migration,
+ * so attempt an atomic allocation first then fallback and handle
+ * potential race.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ int order)
{
- unsigned int ci_off;
- unsigned long swp_tb;
+ struct swap_cluster_info *pcp_ci;
+ struct swap_table *table;
+ unsigned long offset;
- if (!ci->table)
- return;
+ /*
+ * Only cluster isolation from the allocator does table allocation.
+ * Swap allocator uses a percpu cluster and holds the local lock.
+ */
+ lockdep_assert_held(&ci->lock);
+ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+ table = kmem_cache_zalloc(swap_table_cachep,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (table) {
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+ }
+
+ /*
+ * Try a sleep allocation. Each isolated free cluster may cause
+ * a sleep allocation, but there is a limited number of them, so
+ * the potential recursive allocation should be limited.
+ */
+ spin_unlock(&ci->lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
+ local_unlock(&percpu_swap_cluster.lock);
+ table = kmem_cache_zalloc(swap_table_cachep, __GFP_HIGH | GFP_KERNEL);
- for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) {
- swp_tb = __swap_table_get(ci, ci_off);
- if (!swp_tb_is_null(swp_tb))
- pr_err_once("swap: unclean swap space on swapoff: 0x%lx",
- swp_tb);
+ local_lock(&percpu_swap_cluster.lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_lock(&si->global_cluster_lock);
+ /*
+ * Back to atomic context. First, check if we migrated to a new
+ * CPU with a usable percpu cluster. If so, try using that instead.
+ * No need to check it for the spinning device, as swap is
+ * serialized by the global lock on them.
+ *
+ * The is_usable check is a bit rough, but ensures order 0 success.
+ */
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if ((si->flags & SWP_SOLIDSTATE) && offset) {
+ pcp_ci = swap_cluster_lock(si, offset);
+ if (cluster_is_usable(pcp_ci, order) &&
+ pcp_ci->count < SWAPFILE_CLUSTER) {
+ ci = pcp_ci;
+ goto free_table;
+ }
+ swap_cluster_unlock(pcp_ci);
}
- kfree(ci->table);
- ci->table = NULL;
+ if (!table)
+ return NULL;
+
+ spin_lock(&ci->lock);
+ /* Nothing should have touched the dangling empty cluster. */
+ if (WARN_ON_ONCE(cluster_table_is_alloced(ci)))
+ goto free_table;
+
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+
+free_table:
+ if (table)
+ kmem_cache_free(swap_table_cachep, table);
+ return ci;
}
static void move_cluster(struct swap_info_struct *si,
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&ci->lock);
+ swap_cluster_free_table(ci);
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
* this returns NULL for an non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list)
+ struct swap_info_struct *si, struct list_head *list, int order)
{
- struct swap_cluster_info *ci, *ret = NULL;
+ struct swap_cluster_info *ci, *found = NULL;
spin_lock(&si->lock);
-
- if (unlikely(!(si->flags & SWP_WRITEOK)))
- goto out;
-
list_for_each_entry(ci, list, list) {
if (!spin_trylock(&ci->lock))
continue;
list_del(&ci->list);
ci->flags = CLUSTER_FLAG_NONE;
- ret = ci;
+ found = ci;
break;
}
-out:
spin_unlock(&si->lock);
- return ret;
+ if (found && !cluster_table_is_alloced(found)) {
+ /* Only an empty free cluster's swap table can be freed. */
+ VM_WARN_ON_ONCE(list != &si->free_clusters);
+ VM_WARN_ON_ONCE(!cluster_is_empty(found));
+ return swap_cluster_alloc_table(si, found, order);
+ }
+
+ return found;
}
/*
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
*/
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_table *table;
struct swap_cluster_info *ci;
ci = cluster_info + idx;
+ if (!ci->table) {
+ table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ rcu_assign_pointer(ci->table, table);
+ }
+
ci->count++;
VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags);
+
+ return 0;
}
static bool cluster_reclaim_range(struct swap_info_struct *si,
unsigned int found = SWAP_ENTRY_INVALID;
do {
- struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
unsigned long offset;
if (!ci)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
+
return found;
}
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
+ /*
+ * Grab the local lock to be complaint
+ * with swap table allocation.
+ */
+ local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
+ local_unlock(&percpu_swap_cluster.lock);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
static void free_cluster_info(struct swap_cluster_info *cluster_info,
unsigned long maxpages)
{
+ struct swap_cluster_info *ci;
int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
if (!cluster_info)
return;
- for (i = 0; i < nr_clusters; i++)
- swap_cluster_free_table(&cluster_info[i]);
+ for (i = 0; i < nr_clusters; i++) {
+ ci = cluster_info + i;
+ /* Cluster with bad marks count will have a remaining table */
+ spin_lock(&ci->lock);
+ if (rcu_dereference_protected(ci->table, true)) {
+ ci->count = 0;
+ swap_cluster_free_table(ci);
+ }
+ spin_unlock(&ci->lock);
+ }
kvfree(cluster_info);
}
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
+ unsigned int maxpages;
int err, found = 0;
if (!capable(CAP_SYS_ADMIN))
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
+ maxpages = p->max;
cluster_info = p->cluster_info;
- free_cluster_info(cluster_info, p->max);
p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
+ free_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
if (!cluster_info)
goto err;
- for (i = 0; i < nr_clusters; i++) {
+ for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (swap_cluster_alloc_table(&cluster_info[i]))
- goto err_free;
- }
if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- inc_cluster_info_page(si, cluster_info, 0);
+ err = inc_cluster_info_page(si, cluster_info, 0);
+ if (err)
+ goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr >= maxpages)
continue;
- inc_cluster_info_page(si, cluster_info, page_nr);
+ err = inc_cluster_info_page(si, cluster_info, page_nr);
+ if (err)
+ goto err;
+ }
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+ err = inc_cluster_info_page(si, cluster_info, i);
+ if (err)
+ goto err;
}
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(si, cluster_info, i);
INIT_LIST_HEAD(&si->free_clusters);
INIT_LIST_HEAD(&si->full_clusters);
swapfile_maximum_size = arch_max_swapfile_size();
+ /*
+ * Once a cluster is freed, it's swap table content is read
+ * only, and all swap cache readers (swap_cache_*) verifies
+ * the content before use. So it's safe to use RCU slab here.
+ */
+ swap_table_cachep = kmem_cache_create("swap_table",
+ sizeof(struct swap_table),
+ 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
#ifdef CONFIG_MIGRATION
if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
swap_migration_ad_supported = true;