diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c index 504498de536e..68bb9486d75e 100644 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ b/drivers/staging/lustre/lustre/llite/glimpse.c @@ -65,12 +65,9 @@ blkcnt_t dirty_cnt(struct inode *inode) { blkcnt_t cnt = 0; struct vvp_object *vob = cl_inode2vvp(inode); - void *results[1]; if (inode->i_mapping) - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, - results, 0, 1, - PAGECACHE_TAG_DIRTY); + cnt = xa_tagged(&inode->i_mapping->pages, PAGECACHE_TAG_DIRTY); if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) cnt = 1; diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index cbfac6fccf66..f08fe8598aeb 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -926,73 +926,74 @@ static void mdc_release_page(struct page *page, int remove) static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, __u64 *start, __u64 *end, int hash64) { + struct xa_state xas; /* - * Complement of hash is used as an index so that - * radix_tree_gang_lookup() can be used to find a page with starting - * hash _smaller_ than one we are looking for. + * Complement of hash is used as an index so that xas_next() + * can be used to find a page with starting hash _smaller_ than + * one we are looking for. + * + * XXX: Not needed any more; xas_prev() exists. */ unsigned long offset = hash_x_index(*hash, hash64); struct page *page; - int found; + xas_init(&xas, offset); mapping_lock_irq(mapping); - found = radix_tree_gang_lookup(&mapping->page_tree, - (void **)&page, offset, 1); - if (found > 0 && !radix_tree_exceptional_entry(page)) { - struct lu_dirpage *dp; - - get_page(page); + page = xas_next(&mapping->pages, &xas, ~0UL); + if (page == XA_WALK_END || xa_is_exceptional(page)) { mapping_unlock_irq(mapping); - /* - * In contrast to find_lock_page() we are sure that directory - * page cannot be truncated (while DLM lock is held) and, - * hence, can avoid restart. - * - * In fact, page cannot be locked here at all, because - * mdc_read_page_remote does synchronous io. - */ - wait_on_page_locked(page); - if (PageUptodate(page)) { - dp = kmap(page); - if (BITS_PER_LONG == 32 && hash64) { - *start = le64_to_cpu(dp->ldp_hash_start) >> 32; - *end = le64_to_cpu(dp->ldp_hash_end) >> 32; - *hash = *hash >> 32; - } else { - *start = le64_to_cpu(dp->ldp_hash_start); - *end = le64_to_cpu(dp->ldp_hash_end); - } - if (unlikely(*start == 1 && *hash == 0)) - *hash = *start; - else - LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", - *start, *end, *hash); - CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx], hash %#llx\n", - offset, *start, *end, *hash); - if (*hash > *end) { - kunmap(page); - mdc_release_page(page, 0); - page = NULL; - } else if (*end != *start && *hash == *end) { - /* - * upon hash collision, remove this page, - * otherwise put page reference, and - * mdc_read_page_remote() will issue RPC to - * fetch the page we want. - */ - kunmap(page); - mdc_release_page(page, - le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - page = NULL; - } + return NULL; + } + + get_page(page); + mapping_unlock_irq(mapping); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. 
+ * + * In fact, page cannot be locked here at all, because + * mdc_read_page_remote does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + struct lu_dirpage *dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; } else { - put_page(page); - page = ERR_PTR(-EIO); + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + if (unlikely(*start == 1 && *hash == 0)) + *hash = *start; + else + LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", + *start, *end, *hash); + CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx], hash %#llx\n", + offset, *start, *end, *hash); + if (*hash > *end) { + kunmap(page); + mdc_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * mdc_read_page_remote() will issue RPC to + * fetch the page we want. + */ + kunmap(page); + mdc_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = NULL; } } else { - mapping_unlock_irq(mapping); - page = NULL; + put_page(page); + page = ERR_PTR(-EIO); } + return page; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c7721a6aa3bb..e7c675cb82e9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -482,10 +482,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->pages, pg_index); + if (page && !xa_is_exceptional(page)) { misses++; if (misses > 4) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 46e9e8964fa6..984565dbbdb8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5096,12 +5096,14 @@ void free_extent_buffer_stale(struct extent_buffer *eb) void clear_extent_buffer_dirty(struct extent_buffer *eb) { + struct xa_state xas; unsigned long i; unsigned long num_pages; struct page *page; num_pages = num_extent_pages(eb->start, eb->len); + xas_init(&xas, 0); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (!PageDirty(page)) @@ -5112,9 +5114,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) clear_page_dirty_for_io(page); mapping_lock_irq(page->mapping); + xas_jump(&xas, page_index(page)); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), + xas_clear_tag(&page->mapping->pages, &xas, PAGECACHE_TAG_DIRTY); } mapping_unlock_irq(page->mapping); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c40060cc481f..231162fc570b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7341,70 +7341,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { - struct radix_tree_root *root = &inode->i_mapping->page_tree; + struct xa_state xas; + void *entry; int found = false; - void **pagep = NULL; - struct page *page = NULL; - int start_idx; - int end_idx; - - start_idx = start >> PAGE_SHIFT; - - /* - * end is the last byte in the last page. 
end == start is legal - */ - end_idx = end >> PAGE_SHIFT; + unsigned long end_idx = end >> PAGE_SHIFT; + xas_init(&xas, start >> PAGE_SHIFT); rcu_read_lock(); - - /* Most of the code in this while loop is lifted from - * find_get_page. It's been modified to begin searching from a - * page and return just the first page found in that range. If the - * found idx is less than or equal to the end idx then we know that - * a page exists. If no pages are found or if those pages are - * outside of the range then we're fine (yay!) */ - while (page == NULL && - radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - page = NULL; - continue; - } - /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. - */ - page = NULL; - break; /* TODO: Is this relevant for this use case? */ - } - - if (!page_cache_get_speculative(page)) { - page = NULL; + xas_for_each(&inode->i_mapping->pages, &xas, entry, end_idx) { + /* Shadow entries do not block O_DIRECT */ + if (xa_is_exceptional(entry)) continue; - } - - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(page); - page = NULL; - } - } - - if (page) { - if (page->index <= end_idx) - found = true; - put_page(page); + found = true; + break; } - rcu_read_unlock(); return found; } diff --git a/fs/buffer.c b/fs/buffer.c index b5eef7f531a8..7b08691dc31f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -618,28 +618,30 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode - * dirty. + * Mark the page dirty, and set it dirty in the address space, + * and mark the inode dirty. * * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. * * The caller must hold lock_page_memcg(). */ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { + struct xa_state xas; unsigned long flags; + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); + xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); } mapping_unlock_irqrestore(mapping, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. @@ -1116,7 +1118,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. 
If the page has buffers, the page dirty bit is @@ -1139,9 +1141,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in its address_space + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 78da65ff3277..8aae52b4ac37 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2032,7 +2032,7 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, } /* - * This actually clears the dirty bit in the radix tree. + * This actually clears the dirty bit in the page cache. * See cifs_writepage() for more commentary. */ set_page_writeback(page); @@ -2222,13 +2222,13 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) cifs_dbg(FYI, "ppw - page not up to date\n"); /* - * Set the "writeback" flag, and clear "dirty" in the radix tree. + * Set the "writeback" flag, and clear "dirty" in the page cache. * * A writepage() implementation always needs to do either this, * or re-dirty the page with "redirty_page_for_writepage()" in * the case of a failure. * - * Just unlocking the page will cause the radix tree tag-bits + * Just unlocking the page will cause the tag bits * to fail to update with the state of the page correctly. */ set_page_writeback(page); diff --git a/fs/dax.c b/fs/dax.c index f29536667e94..90ca0391b43d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -171,39 +171,39 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, * Check whether the given slot is locked. The function must be called with * mapping_lock held */ -static inline int slot_locked(struct address_space *mapping, void **slot) +static inline bool slot_locked(struct address_space *mapping, + struct xa_state *xas) { - unsigned long entry = (unsigned long) - mapping_deref_protected(mapping, slot); - return entry & RADIX_DAX_ENTRY_LOCK; + void *entry = xas_load(&mapping->pages, xas); + return xa_exceptional_value(entry) & DAX_LOCK_BIT; } /* - * Mark the given slot is locked. The function must be called with - * mapping_lock held + * Lock the slot. The mapping_lock must be held. */ -static inline void *lock_slot(struct address_space *mapping, void **slot) +static inline void *lock_slot(struct address_space *mapping, + struct xa_state *xas) { - unsigned long entry = (unsigned long) - mapping_deref_protected(mapping, slot); - - entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); - return (void *)entry; + void *entry = xas_load(&mapping->pages, xas); + unsigned long v = xa_exceptional_value(entry); + v |= DAX_LOCK_BIT; + entry = xa_mk_exceptional(v); + xas_store(&mapping->pages, xas, entry); + return entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping_lock held + * Unlock the slot. The mapping_lock must be held. 
*/ -static inline void *unlock_slot(struct address_space *mapping, void **slot) +static inline void *unlock_slot(struct address_space *mapping, + struct xa_state *xas) { - unsigned long entry = (unsigned long) - mapping_deref_protected(mapping, slot); - - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); - return (void *)entry; + void *entry = xas_load(&mapping->pages, xas); + unsigned long v = xa_exceptional_value(entry); + v &= ~DAX_LOCK_BIT; + entry = xa_mk_exceptional(v); + xas_store(&mapping->pages, xas, entry); + return entry; } /* @@ -216,9 +216,9 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * The function must be called with mapping_lock held. */ static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) + struct xa_state *xas) { - void *entry, **slot; + void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; @@ -226,18 +226,16 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, - &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot)) { - if (slotp) - *slotp = slot; + entry = xas_load(&mapping->pages, xas); + if (!entry || !xa_is_exceptional(entry) || + !slot_locked(mapping, xas)) return entry; - } - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + wq = dax_entry_waitqueue(mapping, xas->xa_index, entry, + &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + xas_pause(xas); mapping_unlock_irq(mapping); schedule(); finish_wait(wq, &ewait.wait); @@ -248,16 +246,18 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, static void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) { - void *entry, **slot; + struct xa_state xas; + void *entry; + xas_init(&xas, index); mapping_lock_irq(mapping); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { + entry = xas_load(&mapping->pages, &xas); + if (WARN_ON_ONCE(!entry || !xa_is_exceptional(entry) || + !slot_locked(mapping, &xas))) { mapping_unlock_irq(mapping); return; } - unlock_slot(mapping, slot); + unlock_slot(mapping, &xas); mapping_unlock_irq(mapping); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -280,7 +280,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!xa_is_exceptional(entry)) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -316,12 +316,14 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, unsigned long size_flag) { + struct xa_state xas; bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? 
*/ - void *entry, **slot; + void *entry; + xas_init(&xas, index); restart: mapping_lock_irq(mapping); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + entry = get_unlocked_mapping_entry(mapping, &xas); if (entry) { if (size_flag & RADIX_DAX_PMD) { @@ -354,6 +356,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, entry = lock_slot(mapping, slot); } + xas_pause(&xas); mapping_unlock_irq(mapping); /* * Besides huge zero pages the only other thing that gets @@ -364,17 +367,10 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, unmap_mapping_range(mapping, (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); - err = radix_tree_preload( - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) { - if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); - return ERR_PTR(err); - } mapping_lock_irq(mapping); if (pmd_downgrade) { - radix_tree_delete(&mapping->page_tree, index); + xas_store(&mapping->pages, &xas, NULL); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -384,7 +380,6 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, err = __radix_tree_insert(&mapping->page_tree, index, dax_radix_order(entry), entry); - radix_tree_preload_end(); if (err) { mapping_unlock_irq(mapping); /* @@ -478,6 +473,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, mapping_unlock_irq(mapping); return ret; } + /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 586f33282f81..b16327d32500 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1978,6 +1978,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) */ void f2fs_set_page_dirty_nobuffers(struct page *page) { + struct xa_state xas; struct address_space *mapping = page->mapping; unsigned long flags; @@ -1989,11 +1990,11 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); + xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); mapping_unlock_irqrestore(mapping, flags); unlock_page_memcg(page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 312648d6f2f0..876501d60101 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -81,14 +81,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { + struct xa_state xas; struct address_space *mapping = page->mapping; unsigned int long flags; if (PageDirty(page)) { + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); mapping_unlock_irqrestore(mapping, flags); clear_page_dirty_for_io(page); @@ -1122,9 +1122,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->pages, nid); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 27181058ffab..138aa96695c2 100644 --- a/fs/fs-writeback.c 
+++ b/fs/fs-writeback.c @@ -319,16 +319,17 @@ struct inode_switch_wbs_context { static void inode_switch_wbs_work_fn(struct work_struct *work) { + struct xa_state xas; struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; - struct radix_tree_iter iter; + struct page *page; bool switched = false; - void **slot; + xas_init(&xas, 0); /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -361,23 +362,20 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, - PAGECACHE_TAG_DIRTY) { - struct page *page = mapping_deref_protected(mapping, slot); - if (likely(page) && PageDirty(page)) { + xas_for_each_tag(&mapping->pages, &xas, page, ~0UL, + PAGECACHE_TAG_DIRTY) { + if (PageDirty(page)) { __dec_wb_stat(old_wb, WB_RECLAIMABLE); __inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, - PAGECACHE_TAG_WRITEBACK) { - struct page *page = mapping_deref_protected(mapping, slot); - if (likely(page)) { - WARN_ON_ONCE(!PageWriteback(page)); - __dec_wb_stat(old_wb, WB_WRITEBACK); - __inc_wb_stat(new_wb, WB_WRITEBACK); - } + xas_jump(&xas, 0); + xas_for_each_tag(&mapping->pages, &xas, page, ~0UL, + PAGECACHE_TAG_WRITEBACK) { + WARN_ON_ONCE(!PageWriteback(page)); + __dec_wb_stat(old_wb, WB_WRITEBACK); + __inc_wb_stat(new_wb, WB_WRITEBACK); } wb_get(new_wb); diff --git a/fs/inode.c b/fs/inode.c index 453efc1bbc17..38b8161d8408 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -347,8 +347,7 @@ EXPORT_SYMBOL(inc_nlink); void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + xa_init(&mapping->pages); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index d27fb2ac77fd..80f6393bf264 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -177,15 +177,9 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { + struct page *curr; + lock_page(obh->b_page); - /* - * We cannot call radix_tree_preload for the kernels older - * than 2.6.23, because it is not exported for modules. - */ -retry: - err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (err) - goto failed_unlock; /* BUG_ON(oldkey != obh->b_page->index); */ if (unlikely(oldkey != obh->b_page->index)) NILFS_PAGE_BUG(obh->b_page, @@ -193,20 +187,17 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, (unsigned long long)oldkey, (unsigned long long)newkey); - mapping_lock_irq(btnc); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - mapping_unlock_irq(btnc); +retry: + curr = xa_replace(&btnc->pages, newkey, obh->b_page, NULL, + GFP_NOWAIT); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. * To protect the page in intermediate state, the page lock * is held. 
*/ - radix_tree_preload_end(); - if (!err) + if (!curr) return 0; - else if (err != -EEXIST) - goto failed_unlock; err = invalidate_inode_pages2_range(btnc, newkey, newkey); if (!err) @@ -222,10 +213,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, BUG_ON(nbh == obh); ctxt->newbh = nbh; return 0; - - failed_unlock: - unlock_page(obh->b_page); - return err; } /** @@ -235,6 +222,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, void nilfs_btnode_commit_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { + struct xa_state xas; struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; struct page *opage; @@ -251,10 +239,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); + xas_init(&xas, oldkey); mapping_lock_irq(btnc); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, - PAGECACHE_TAG_DIRTY); + xas_store(&btnc->pages, &xas, NULL); + xas_jump(&xas, newkey); + xas_set_tag(&btnc->pages, &xas, PAGECACHE_TAG_DIRTY); mapping_unlock_irq(btnc); opage->index = obh->b_blocknr = newkey; @@ -283,9 +272,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - mapping_lock_irq(btnc); - radix_tree_delete(&btnc->page_tree, newkey); - mapping_unlock_irq(btnc); + xa_store(&btnc->pages, newkey, NULL, GFP_NOWAIT); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index c9e631cb135a..cd39214c3947 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -305,16 +305,20 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, void nilfs_copy_back_pages(struct address_space *dmap, struct address_space *smap) { + struct xa_state xas; struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; int err; + xas_init(&xas, 0); pagevec_init(&pvec, 0); repeat: n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); - if (!n) + if (!n) { + xas_destroy(&xas); return; + } index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -333,28 +337,35 @@ void nilfs_copy_back_pages(struct address_space *dmap, struct page *page2; /* move the page to the destination cache */ + xas_jump(&xas, offset); mapping_lock_irq(smap); - page2 = radix_tree_delete(&smap->page_tree, offset); + page2 = xas_store(&smap->pages, &xas, NULL); WARN_ON(page2 != page); smap->nrpages--; mapping_unlock_irq(smap); + xas_jump(&xas, offset); +memalloc: mapping_lock_irq(dmap); - err = radix_tree_insert(&dmap->page_tree, offset, page); - if (unlikely(err < 0)) { - WARN_ON(err == -EEXIST); + page2 = xas_store(&dmap->pages, &xas, page); + WARN_ON(page2); + err = xas_error(&xas); + if (err) { + mapping_unlock_irq(dmap); + /* XXX: Correct GFP flags? 
*/ + if (xas_nomem(&xas, GFP_NOFS)) + goto memalloc; page->mapping = NULL; put_page(page); /* for cache */ } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, - offset, + xas_set_tag(&dmap->pages, &xas, PAGECACHE_TAG_DIRTY); + mapping_unlock_irq(dmap); } - mapping_unlock_irq(dmap); } unlock_page(page); } @@ -473,13 +484,14 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) */ int __nilfs_clear_page_dirty(struct page *page) { + struct xa_state xas; struct address_space *mapping = page->mapping; if (mapping) { + xas_init(&xas, page_index(page)); mapping_lock_irq(mapping); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + xas_clear_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); mapping_unlock_irq(mapping); return clear_page_dirty_for_io(page); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..9416327b954d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -545,7 +545,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - if (radix_tree_exceptional_entry(page)) + if (xa_is_exceptional(page)) mss->swap += PAGE_SIZE; else put_page(page); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9db736bb8cec..4a326c6fdf13 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1396,17 +1396,7 @@ xfs_vm_set_page_dirty( spin_unlock(&mapping->private_lock); if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - mapping_lock_irqsave(mapping, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - mapping_unlock_irqrestore(mapping, flags); + __set_page_dirty(page, mapping, 1); } unlock_page_memcg(page); if (newly_dirty) diff --git a/include/linux/fs.h b/include/linux/fs.h index 5943bd938b83..9378259230f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -378,12 +378,12 @@ int pagecache_write_end(struct file *, struct address_space *mapping, struct address_space { struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t tree_lock; /* and lock protecting it */ + struct xarray pages; /* xarray of all pages */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ + gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct rb_root i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by tree_lock together with the radix tree */ + /* Protected by mapping_lock */ unsigned long nrpages; /* number of total pages */ /* number of shadow or DAX exceptional entries */ unsigned long nrexceptional; @@ -391,7 +391,6 @@ struct address_space { const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits */ spinlock_t private_lock; /* for use by the address_space */ - gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ } __attribute__((aligned(sizeof(long)))); @@ -440,27 +439,25 @@ struct block_device { }; /* - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * Pagecache 
xarray tags */ -#define PAGECACHE_TAG_DIRTY 0 -#define PAGECACHE_TAG_WRITEBACK 1 -#define PAGECACHE_TAG_TOWRITE 2 +#define PAGECACHE_TAG_DIRTY XA_TAG_0 +#define PAGECACHE_TAG_WRITEBACK XA_TAG_1 +#define PAGECACHE_TAG_TOWRITE XA_TAG_2 -int mapping_tagged(struct address_space *mapping, int tag); +int mapping_tagged(struct address_space *mapping, xa_tag_t tag); -#define mapping_lock(mapping) spin_lock(&(mapping)->tree_lock) -#define mapping_unlock(mapping) spin_unlock(&(mapping)->tree_lock) -#define mapping_lock_irq(mapping) spin_lock_irq(&(mapping)->tree_lock) -#define mapping_unlock_irq(mapping) spin_unlock_irq(&(mapping)->tree_lock) +#define mapping_lock(mapping) xa_lock(&(mapping)->pages) +#define mapping_unlock(mapping) xa_unlock(&(mapping)->pages) +#define mapping_lock_irq(mapping) xa_lock_irq(&(mapping)->pages) +#define mapping_unlock_irq(mapping) xa_unlock_irq(&(mapping)->pages) #define mapping_lock_irqsave(mapping, flags) \ - spin_lock_irqsave(&(mapping)->tree_lock, flags) + xa_lock_irqsave(&(mapping)->pages, flags) #define mapping_unlock_irqrestore(mapping, flags) \ - spin_unlock_irqrestore(&(mapping)->tree_lock, flags) -#define mapping_trylock(mapping) spin_trylock(&(mapping)->tree_lock) -#define mapping_lock_held(mapping) lockdep_is_held(&(mapping)->tree_lock) -#define mapping_deref_protected(mapping, slot) \ - radix_tree_deref_slot_protected(slot, &mapping->tree_lock) + xa_unlock_irqrestore(&(mapping)->pages, flags) +#define mapping_trylock(mapping) xa_trylock(&(mapping)->pages) +#define mapping_lock_held(mapping) \ + lockdep_is_held(&(mapping)->pages.xa_lock) static inline void i_mmap_lock_write(struct address_space *mapping) { diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 115bb81912cc..04b3dafbb37b 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -22,6 +22,7 @@ #include #include #include +#include #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE) #define fscache_available() (1) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..06475c5dbe4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -692,10 +692,10 @@ int finish_mkwrite_fault(struct vm_fault *vmf); * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * - * The pagecache pages are stored in a per-mapping radix tree, which is - * rooted at mapping->page_tree, and indexed by offset. + * The pagecache pages are stored in a per-mapping xarray, which is + * rooted at mapping->pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space - * lists, we instead now tag pages as dirty/writeback in the radix tree. + * lists, we instead now tag pages as dirty/writeback in the xarray. 
* * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, @@ -1365,6 +1365,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); +int __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c0a945ee81aa..dad06c3fe062 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -341,9 +341,9 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); + xa_tag_t tag, unsigned int nr_pages, struct page **pages); unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_tag_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, diff --git a/include/linux/swap.h b/include/linux/swap.h index e1680e649250..85b848ffc8b1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -254,7 +254,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -void workingset_update_node(struct radix_tree_node *node); +void workingset_update_node(struct xa_node *node); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@ -355,7 +355,6 @@ extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, struct list_head *list); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); -extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 5c3a5f3e7eec..c7a9d39b4eea 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -9,15 +9,15 @@ * get good packing density in that tree, so the index should be dense in * the low-order bits. * - * We arrange the `type' and `offset' fields so that `type' is at the seven + * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for - * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry(). + * shmem/tmpfs to shift it all up one bit : see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. 
*/ #define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ - (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) + (MAX_SWAPFILES_SHIFT + 1)) #define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1) /* @@ -84,20 +84,17 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) return __swp_entry_to_pte(arch_entry); } -static inline swp_entry_t radix_to_swp_entry(void *arg) +static inline swp_entry_t xa_to_swp_entry(void *arg) { swp_entry_t entry; - entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT; + entry.val = xa_exceptional_value(arg); return entry; } -static inline void *swp_to_radix_entry(swp_entry_t entry) +static inline void *xa_mk_swp_entry(swp_entry_t entry) { - unsigned long value; - - value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT; - return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_exceptional(entry.val); } #ifdef CONFIG_MIGRATION diff --git a/include/linux/tty.h b/include/linux/tty.h index 1017e904c0a3..217396ca595c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -1,7 +1,6 @@ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H -#include #include #include #include diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b742b5e47cc2..1e7d5be12b93 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -239,7 +239,6 @@ */ #include -#include #include #include #include diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 3971cf0eb467..26b41a1239cd 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -116,9 +116,9 @@ * If assigned, prefer this function for automatic flow control. */ -#include #include +struct poll_table_struct; /* * the semaphore definition @@ -164,7 +164,6 @@ extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, ldsem_down_write(sem, timeout) #endif - struct tty_ldisc_ops { int magic; char *name; diff --git a/mm/filemap.c b/mm/filemap.c index 140d1de4277a..e3d1a4c9f377 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -111,67 +111,25 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->page_tree, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } - } - __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node); - mapping->nrpages++; - return 0; -} - -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_array_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + struct xa_state xas; + struct page *curr; + int nr; - /* hugetlb pages are represented by one entry in the radix tree */ + /* hugetlb pages are represented by one entry in the cache */ nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->page_tree, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node); + xas_init(&xas, page->index); + xas.xa_update = workingset_update_node; + xas_for_each_slot(&mapping->pages, &xas, curr, page->index + nr) { + xas_init_tags(&mapping->pages, &xas); + xas_store(&mapping->pages, &xas, shadow); } if (shadow) { @@ -233,7 +191,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) } } - page_cache_tree_delete(mapping, page, shadow); + page_cache_array_delete(mapping, page, shadow); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -398,7 +356,7 @@ static int __filemap_fdatawait_range(struct address_space *mapping, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ + /* until pagevec lookup accepts end_index */ if (page->index > end) continue; @@ -557,51 +515,45 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct xa_state xas; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; + get_page(new); + new->mapping = mapping; + new->index = offset; - get_page(new); - new->mapping = mapping; - new->index = offset; - - mapping_lock_irqsave(mapping, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); + xas_init(&xas, offset); + mapping_lock_irqsave(mapping, flags); + xas_store(&mapping->pages, &xas, new); - /* - * hugetlb pages do not participate in page cache accounting. - */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - mapping_unlock_irqrestore(mapping, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. 
*/ + if (!PageHuge(old)) + __dec_node_page_state(old, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(old, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + mapping_unlock_irqrestore(mapping, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -610,9 +562,11 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + struct xa_state xas; int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); @@ -624,35 +578,50 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + xas_init(&xas, offset); +retry: mapping_lock_irq(mapping); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + old = xas_create(&mapping->pages, &xas); + error = xas_error(&xas); + if (error) { + mapping_unlock_irq(mapping); + if (xas_nomem(&xas, gfp_mask & ~__GFP_HIGHMEM)) + goto retry; + goto error; + } + if (xa_is_exceptional(old)) { + if (shadowp) + *shadowp = old; + mapping->nrexceptional--; + } else if (old) { + goto exist; + } + + xas_store(&mapping->pages, &xas, page); + mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting. */ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); mapping_unlock_irq(mapping); + + xas_destroy(&xas); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: + +exist: + error = -EEXIST; + mapping_unlock_irq(mapping); +error: + xas_destroy(&xas); page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - mapping_unlock_irq(mapping); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1085,84 +1054,84 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) + * page_cache_next_hole() - find the next hole (not-present entry) * @mapping: mapping * @index: index * @max_scan: maximum range to search * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * hole with the lowest index. * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. + * This function does not atomically search a snapshot of the cache at a + * single point in time. For example, if a hole is created at index 5 + * then subsequently a hole is created at index 10, page_cache_next_hole() + * covering both indices may return 10. * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. 
For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * Return: The index of the hole if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * If (index + max_scan) >= ULONG_MAX, and pages are present all the way to + * ULONG_MAX, this function will return 0. */ pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + struct xa_state xas; + struct page *page; + unsigned long max = index + max_scan; - for (i = 0; i < max_scan; i++) { - struct page *page; + if (max < index) + max = ~0UL; - page = radix_tree_lookup(&mapping->page_tree, index); - if (!page || radix_tree_exceptional_entry(page)) - break; - index++; - if (index == 0) + xas_init(&xas, index); + rcu_read_lock(); + xas_for_each_slot(&mapping->pages, &xas, page, max) { + if (!page || xa_is_exceptional(page)) break; } + rcu_read_unlock(); - return index; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_hole); /** - * page_cache_prev_hole - find the prev hole (not-present entry) + * page_cache_prev_hole - find the previous hole (not-present entry) * @mapping: mapping * @index: index * @max_scan: maximum range to search * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. + * Search the range [max(index - max_scan + 1, 0), index] for the + * hole with the highest index. * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. + * This function does not atomically search a snapshot of the cache at a + * single point in time. For example, if a hole is created at index 10 + * then subsequently a hole is created at index 5, page_cache_prev_hole() + * covering both indices may return 5. * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * Return: The index of the hole if found, otherwise an index outside the + * range specified (in which case 'index - range >= max_scan' will be true). + * If (index - max_scan) < 0, and pages are present all the way to + * 0, this function will return ULONG_MAX. 
*/ pgoff_t page_cache_prev_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + struct xa_state xas; + struct page *page; + unsigned long min = index - max_scan; - for (i = 0; i < max_scan; i++) { - struct page *page; + if (min > index) + min = 0; - page = radix_tree_lookup(&mapping->page_tree, index); - if (!page || radix_tree_exceptional_entry(page)) - break; - index--; - if (index == ULONG_MAX) + xas_init(&xas, index); + rcu_read_lock(); + xas_for_each_slot_rev(&mapping->pages, &xas, page, min) { + if (!page || xa_is_exceptional(page)) break; } + rcu_read_unlock(); - return index; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_hole); @@ -1171,7 +1140,7 @@ EXPORT_SYMBOL(page_cache_prev_hole); * @mapping: the address_space to search * @offset: the page cache index * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a @@ -1181,49 +1150,27 @@ EXPORT_SYMBOL(page_cache_prev_hole); */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; - struct page *head, *page; + struct xa_state xas; + struct page *head, *page, *page2; + xas_init(&xas, offset); rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + page = xas_load(&mapping->pages, &xas); + if (xas_retry(&xas, page)) + goto repeat; + if (page && !xa_is_exceptional(page)) { head = compound_head(page); if (!page_cache_get_speculative(head)) goto repeat; - - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { put_page(head); goto repeat; } } -out: rcu_read_unlock(); return page; @@ -1252,7 +1199,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_exceptional(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page_mapping(page) != mapping)) { @@ -1296,7 +1243,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_exceptional(page)) page = NULL; if (!page) goto no_page; @@ -1357,7 +1304,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, EXPORT_SYMBOL(pagecache_get_page); /** - * find_get_entries - gang pagecache lookup + * find_get_entries - bulk pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @nr_entries: The maximum number of entries @@ -1373,60 +1320,49 @@ EXPORT_SYMBOL(pagecache_get_page); * with ascending indexes. 
There may be holes in the indices due to * not-present pages. * - * Any shadow entries of evicted pages, or swap entries from - * shmem/tmpfs, are included in the returned array. + * Exceptional entries (shadow entries of evicted pages, swap entries + * from shmem/tmpfs, or DAX entries) are included in the returned array. * - * find_get_entries() returns the number of pages and shadow entries - * which were found. + * find_get_entries() returns the number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + struct xa_state xas; + struct page *head, *page, *page2; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; + xas_init(&xas, start); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *head, *page; + xas_for_each(&mapping->pages, &xas, page, ~0UL) { repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; + if (!xa_is_exceptional(page)) { + head = compound_head(page); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); + if (unlikely(!page)) + continue; + goto repeat; } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - goto export; - } - - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { + put_page(head); + if (unlikely(!page2)) + continue; + page = page2; + goto repeat; + } } - /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } -export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; @@ -1454,47 +1390,37 @@ unsigned find_get_entries(struct address_space *mapping, unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; + struct page *head, *page, *page2; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; + xas_init(&xas, start); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *head, *page; + xas_for_each(&mapping->pages, &xas, page, ~0UL) { repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + if (xa_is_exceptional(page)) continue; - } head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; - - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); + if (unlikely(!page)) + continue; goto repeat; } - /* Has the page moved? 
*/ - if (unlikely(page != *slot)) { + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { put_page(head); + if (unlikely(!page2)) + continue; + page = page2; goto repeat; } @@ -1522,61 +1448,36 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; + struct page *head, *page, *page2; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; + xas_init(&xas, index); rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { - struct page *head, *page; + xas_for_each_slot(&mapping->pages, &xas, page, ~0UL) { repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + if (xas_retry(&xas, page)) + continue; + if (!page || xa_is_exceptional(page)) break; - } head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; - - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); goto repeat; } - /* Has the page moved? */ - if (unlikely(page != *slot)) { + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { put_page(head); + page = page2; goto repeat; } - /* - * must check mapping and index after taking the ref. - * otherwise we can get both false positives and false - * negatives, which is just confusing to the caller. - */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { - put_page(page); - break; - } - pages[ret] = page; if (++ret == nr_pages) break; @@ -1598,56 +1499,37 @@ EXPORT_SYMBOL(find_get_pages_contig); * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) + xa_tag_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; + struct page *head, *page, *page2; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; + xas_init(&xas, *index); rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { - struct page *head, *page; + xas_for_each_tag(&mapping->pages, &xas, page, ~0UL, tag) { repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + if (!page) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. 
- */ + if (xas_retry(&xas, page)) + continue; + if (xa_is_exceptional(page)) continue; - } head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; - - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); goto repeat; } - /* Has the page moved? */ - if (unlikely(page != *slot)) { + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { put_page(head); + page = page2; goto repeat; } @@ -1678,55 +1560,41 @@ EXPORT_SYMBOL(find_get_pages_tag); * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_tag_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + struct xa_state xas; unsigned int ret = 0; - struct radix_tree_iter iter; + struct page *head, *page, *page2; if (!nr_entries) return 0; + xas_init(&xas, start); rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { - struct page *head, *page; + xas_for_each_tag(&mapping->pages, &xas, page, ~0UL, tag) { repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + if (!page) + continue; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - goto export; - } - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + if (!xa_is_exceptional(page)) { + head = compound_head(page); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); + goto repeat; + } - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { + put_page(head); + goto repeat; + } } - /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } -export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; @@ -2318,44 +2186,33 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; - struct page *head, *page; + struct page *head, *page, *page2; + xas_init(&xas, start_pgoff); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { - if (iter.index > end_pgoff) - break; + xas_for_each(&mapping->pages, &xas, page, end_pgoff) { repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - goto next; - } + if (xa_is_exceptional(page)) + continue; + if (xas_retry(&xas, page)) + continue; head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; - - /* The page was split under us? 
*/ - if (compound_head(page) != head) { - put_page(head); + if (!page_cache_get_speculative(head)) { + page = xas_load(&mapping->pages, &xas); goto repeat; } - /* Has the page moved? */ - if (unlikely(page != *slot)) { + /* Did the page move or get split? */ + page2 = xas_load(&mapping->pages, &xas); + if (page != page2 || compound_head(page) != head) { put_page(head); + page = page2; goto repeat; } @@ -2376,10 +2233,10 @@ void filemap_map_pages(struct vm_fault *vmf, if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2392,8 +2249,6 @@ void filemap_map_pages(struct vm_fault *vmf, /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } @@ -2499,7 +2354,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4df7e1785045..1d66d780e41a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2358,6 +2358,7 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) */ int split_huge_page_to_list(struct page *page, struct list_head *list) { + struct xa_state xas; struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma = NULL; @@ -2424,16 +2425,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { - void **pslot; + struct page *curr; + xas_init(&xas, page_index(head)); mapping_lock(mapping); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(head)); + curr = xas_load(&mapping->pages, &xas); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. 
*/ - if (mapping_deref_protected(mapping, pslot) != head) + if (curr != head) goto fail; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7915e7273f7..eda4e9577f6b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1282,10 +1282,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * * Basic scheme is simple, details are more complex: * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one + * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; - * + keep old pages around in case if rollback is required; + * + keep old pages around in case rollback is required; * - if replacing succeed: * + copy data over; * + free old pages; @@ -1299,13 +1299,12 @@ static void collapse_shmem(struct mm_struct *mm, struct address_space *mapping, pgoff_t start, struct page **hpage, int node) { + struct xa_state xas; gfp_t gfp; struct page *page, *new_page, *tmp; struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - struct radix_tree_iter iter; - void **slot; int nr_none = 0, result = SCAN_SUCCEED; VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1339,7 +1338,9 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; + xas_init_order(&xas, index, compound_order(new_page)); mapping_lock_irq(mapping); + xas_for_each(&mapping->pages, &xas, entry) { radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { int n = min(iter.index, end) - index; diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..86c2301a535c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -244,7 +244,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, put_page(page); continue; } - swap = radix_to_swp_entry(page); + swap = xa_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0); if (page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8a2f639d3922..29a1da6bf8e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4454,7 +4454,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); if (radix_tree_exceptional_entry(page)) { - swp_entry_t swp = radix_to_swp_entry(page); + swp_entry_t swp = xa_to_swp_entry(page); if (do_memsw_account()) *entry = swp; page = find_get_page(swap_address_space(swp), diff --git a/mm/migrate.c b/mm/migrate.c index 4ae1b4999ae3..8858fa1508e8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -294,10 +294,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count - * *must* be zero. And, we don't want to call wait_on_page_locked() + * Once page cache replacement of page migration started, page_count + * *must* be zero. And we don't want to call wait_on_page_locked() * against a page without get_page(). - * So, we use get_page_unless_zero(), here. Even failed, page fault + * So we use get_page_unless_zero() here. Even failed, page fault * will occur again. 
*/ if (!get_page_unless_zero(page)) @@ -388,10 +388,11 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + struct xa_state xas; + struct page *oldpage; struct zone *oldzone, *newzone; int dirty; int expected_count = 1 + extra_count; - void **pslot; if (!mapping) { /* Anonymous page without mapping */ @@ -410,14 +411,12 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); + xas_init(&xas, page_index(page)); mapping_lock_irq(mapping); - - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + oldpage = xas_load(&mapping->pages, &xas); expected_count += 1 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { + if (page_count(page) != expected_count || oldpage != page) { mapping_unlock_irq(mapping); return -EAGAIN; } @@ -465,7 +464,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + xas_store(&mapping->pages, &xas, newpage); /* * Drop cache reference from old page by unfreezing @@ -514,17 +513,16 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + struct xa_state xas; + struct page *oldpage; int expected_count; - void **pslot; + xas_init(&xas, page_index(page)); mapping_lock_irq(mapping); - - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + oldpage = xas_load(&mapping->pages, &xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { + if (page_count(page) != expected_count || oldpage != page) { mapping_unlock_irq(mapping); return -EAGAIN; } @@ -539,7 +537,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + xas_store(&mapping->pages, &xas, newpage); page_ref_unfreeze(page, expected_count - 1); diff --git a/mm/mincore.c b/mm/mincore.c index c5687c45c326..455e4bcb1ace 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * page too. 
*/ if (radix_tree_exceptional_entry(page)) { - swp_entry_t swp = radix_to_swp_entry(page); + swp_entry_t swp = xa_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b4f160298632..e9f3f7422346 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2106,20 +2106,18 @@ void tag_pages_for_writeback(struct address_space *mapping, { #define WRITEBACK_TAG_BATCH 4096 unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; + void *entry; + xas_init(&xas, start); mapping_lock_irq(mapping); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xas_for_each_tag(&mapping->pages, &xas, entry, end, PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, - PAGECACHE_TAG_TOWRITE); + xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); mapping_unlock_irq(mapping); cond_resched(); mapping_lock_irq(mapping); @@ -2458,7 +2456,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * its xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@ -2472,6 +2470,7 @@ int __set_page_dirty_nobuffers(struct page *page) { lock_page_memcg(page); if (!TestSetPageDirty(page)) { + struct xa_state xas; struct address_space *mapping = page_mapping(page); unsigned long flags; @@ -2480,12 +2479,12 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); mapping_unlock_irqrestore(mapping, flags); unlock_page_memcg(page); @@ -2647,13 +2646,13 @@ EXPORT_SYMBOL(cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. 
*/ int clear_page_dirty_for_io(struct page *page) @@ -2725,15 +2724,16 @@ int test_clear_page_writeback(struct page *page) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + struct xa_state xas; struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + xas_clear_tag(&mapping->pages, &xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2768,10 +2768,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + struct xa_state xas; struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; + xas_init(&xas, page_index(page)); mapping_lock_irqsave(mapping, flags); ret = TestSetPageWriteback(page); if (!ret) { @@ -2780,8 +2782,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2795,12 +2796,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + xas_clear_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + xas_clear_tag(&mapping->pages, &xas, PAGECACHE_TAG_TOWRITE); mapping_unlock_irqrestore(mapping, flags); } else { @@ -2821,9 +2820,9 @@ EXPORT_SYMBOL(__test_set_page_writeback); * Return true if any of the pages in the mapping are marked with the * passed tag. */ -int mapping_tagged(struct address_space *mapping, int tag) +int mapping_tagged(struct address_space *mapping, xa_tag_t tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return xa_tagged(&mapping->pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/readahead.c b/mm/readahead.c index c4ca70239233..0544fc1a9364 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -174,10 +174,9 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (page_offset > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) + /* Turn this into a xas loop later */ + page = xa_load(&mapping->pages, page_offset); + if (page && !xa_is_exceptional(page)) continue; page = __page_cache_alloc(gfp_mask); diff --git a/mm/shmem.c b/mm/shmem.c index 56c94223437d..3e355c4f0795 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -306,28 +306,6 @@ void shmem_uncharge(struct inode *inode, long pages) } /* - * Replace item expected in radix tree by a new item, while holding tree lock. 
- */ -static int shmem_radix_tree_replace(struct address_space *mapping, - pgoff_t index, void *expected, void *replacement) -{ - struct radix_tree_node *node; - void **pslot; - void *item; - - VM_BUG_ON(!expected); - VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); - if (!item) - return -ENOENT; - if (item != expected) - return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, - replacement, NULL); - return 0; -} - -/* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back from swap by a racing thread. * @@ -337,12 +315,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->pages, index) == xa_mk_swp_entry(swap); } /* @@ -552,10 +525,13 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp) { + struct xa_state xas; + void *entry; int error, nr = hpage_nr_pages(page); + gfp &= ~__GFP_HIGHMEM; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -566,46 +542,34 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + xas_init_order(&xas, index, compound_order(page)); +retry: mapping_lock_irq(mapping); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, - &results, &idx, index, 1) && - idx < index + HPAGE_PMD_NR) { - error = -EEXIST; - } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, - index + i, page + i); - VM_BUG_ON(error); - } - count_vm_event(THP_FILE_ALLOC); - } - } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); - } else { - error = shmem_radix_tree_replace(mapping, index, expected, - page); + error = 0; + /* Assumes page cache already supports positive order pages */ + xas_store_for_each(&mapping->pages, &xas, entry, page) { + if (entry != expected) + xas_set_err(&xas, EEXIST); } - if (!error) { + error = xas_error(&xas); + if (error) { + mapping_unlock_irq(mapping); + if (xas_nomem(&xas, gfp)) + goto retry; + page->mapping = NULL; + page_ref_sub(page, nr); + } else { mapping->nrpages += nr; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_ALLOC); __inc_node_page_state(page, NR_SHMEM_THPS); + } __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); mapping_unlock_irq(mapping); - } else { - page->mapping = NULL; - mapping_unlock_irq(mapping); - page_ref_sub(page, nr); } + xas_destroy(&xas); return error; } @@ -614,20 +578,24 @@ static int shmem_add_to_page_cache(struct page *page, */ static void shmem_delete_from_page_cache(struct page *page, void *radswap) { + struct xa_state xas; struct address_space *mapping = page->mapping; - int error; + void *curr; VM_BUG_ON_PAGE(PageCompound(page), page); + xas_init(&xas, page->index); mapping_lock_irq(mapping); - error = 
shmem_radix_tree_replace(mapping, page->index, page, radswap); + curr = xas_load(&mapping->pages, &xas); + if (curr == page) + xas_store(&mapping->pages, &xas, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); mapping_unlock_irq(mapping); put_page(page); - BUG_ON(error); + BUG_ON(page != curr); } /* @@ -638,12 +606,10 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - mapping_lock_irq(mapping); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - mapping_unlock_irq(mapping); + old = xa_replace(&mapping->pages, index, NULL, radswap, GFP_NOWAIT); if (old != radswap) return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); + free_swap_and_cache(xa_to_swp_entry(radswap)); return 0; } @@ -657,29 +623,24 @@ static int shmem_free_swap(struct address_space *mapping, unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; struct page *page; unsigned long swapped = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_init(&xas, start); + xas_for_each(&mapping->pages, &xas, page, end - 1) { + if (xa_is_retry(page)) { + xas_restart(&xas); continue; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_exceptional(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -790,7 +751,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -887,7 +848,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1067,24 +1028,28 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +/* + * The return value is ambiguous -- did we not find it, or did we find it at + * index ~0UL ? 
+ */ +static unsigned long find_swap_entry(struct xarray *xa, void *item) { - struct radix_tree_iter iter; - void **slot; + struct xa_state xas; + void *entry; unsigned long found = -1; - unsigned int checked = 0; + xas_init(&xas, 0); rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { - found = iter.index; + xas_for_each(xa, &xas, entry, ~0UL) { + if (entry == item) { + found = xas.xa_index; break; } - checked++; - if ((checked % 4096) != 0) - continue; - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } } rcu_read_unlock(); @@ -1103,8 +1068,8 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, gfp_t gfp; int error = 0; - radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + radswap = xa_mk_swp_entry(swap); + index = find_swap_entry(&mapping->pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1151,7 +1116,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); + radswap, gfp); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1316,7 +1281,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(page, xa_mk_swp_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); @@ -1408,21 +1373,15 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct vm_area_struct pvma; struct inode *inode = &info->vfs_inode; struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + pgoff_t hindex; struct page *page; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return NULL; hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->pages, &hindex, hindex + HPAGE_PMD_NR - 1)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, @@ -1506,7 +1465,8 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp) static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; + struct xa_state xas; + struct page *oldpage, *newpage, *currpage; struct address_space *swap_mapping; pgoff_t swap_index; int error; @@ -1538,23 +1498,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ + xas_init(&xas, swap_index); mapping_lock_irq(swap_mapping); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); - if (!error) { + currpage = xas_load(&swap_mapping->pages, &xas); + if (currpage == oldpage) { + xas_store(&swap_mapping->pages, &xas, newpage); __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); - } - mapping_unlock_irq(swap_mapping); - - if (unlikely(error)) { + } else { /* * Is this possible? 
I think not, now that our callers check * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. */ oldpage = newpage; - } else { + } + mapping_unlock_irq(swap_mapping); + + if (oldpage != newpage) { mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; @@ -1603,8 +1564,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, repeat: swap.val = 0; page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { - swap = radix_to_swp_entry(page); + if (xa_is_exceptional(page)) { + swap = xa_to_swp_entry(page); page = NULL; } @@ -1679,7 +1640,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); + xa_mk_swp_entry(swap), gfp); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1786,13 +1747,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, PageTransHuge(page)); if (error) goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } + error = shmem_add_to_page_cache(page, mapping, hindex, NULL, + gfp); if (error) { mem_cgroup_cancel_charge(page, memcg, PageTransHuge(page)); @@ -2259,11 +2215,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, gfp); if (ret) goto out_release_uncharge; @@ -2519,7 +2471,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_exceptional(page)) { if (!PageUptodate(page)) page = NULL; } @@ -2585,35 +2537,27 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) static void shmem_tag_pins(struct address_space *mapping) { - struct radix_tree_iter iter; - void **slot; - pgoff_t start; + struct xa_state xas; struct page *page; lru_add_drain(); - start = 0; - rcu_read_lock(); + xas_init(&xas, 0); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - mapping_lock_irq(mapping); - radix_tree_tag_set(&mapping->page_tree, iter.index, - SHMEM_TAG_PINNED); - mapping_unlock_irq(mapping); - } + mapping_lock_irq(mapping); + xas_for_each(&mapping->pages, &xas, page, ~0UL) { + if (xa_is_exceptional(page)) + continue; + if (page_count(page) - page_mapcount(page) > 1) + xas_set_tag(&mapping->pages, &xas, SHMEM_TAG_PINNED); if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); + xas_pause(&xas); + mapping_unlock_irq(mapping); + cond_resched(); + mapping_lock_irq(mapping); } } - rcu_read_unlock(); + mapping_unlock_irq(mapping); } /* @@ -2627,9 +2571,7 @@ static void shmem_tag_pins(struct address_space *mapping) */ static int shmem_wait_for_pins(struct address_space *mapping) { - struct radix_tree_iter iter; 
- void **slot; - pgoff_t start; + struct xa_state xas; struct page *page; int error, scan; @@ -2637,7 +2579,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!xa_tagged(&mapping->pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2645,25 +2587,14 @@ static int shmem_wait_for_pins(struct address_space *mapping) else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, - start, SHMEM_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { + xas_init(&xas, 0); + mapping_lock_irq(mapping); + xas_for_each_tag(&mapping->pages, &xas, page, ~0UL, + SHMEM_TAG_PINNED) { + BUG_ON(xa_is_exceptional(page)); + if (page_count(page) - page_mapcount(page) != 1) { if (scan < LAST_SCAN) - goto continue_resched; + continue; /* * On the last scan, we clean up all those tags @@ -2673,17 +2604,15 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - mapping_lock_irq(mapping); - radix_tree_tag_clear(&mapping->page_tree, - iter.index, SHMEM_TAG_PINNED); - mapping_unlock_irq(mapping); -continue_resched: + xas_clear_tag(&mapping->pages, &xas, SHMEM_TAG_PINNED); if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); + mapping_unlock_irq(mapping); + xas_pause(&xas); + cond_resched(); + mapping_lock_irq(mapping); } } - rcu_read_unlock(); + mapping_unlock_irq(mapping); } return error; diff --git a/mm/swap_state.c b/mm/swap_state.c index 4b652101aa06..e2332f878dc5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,11 +85,12 @@ void show_swap_cache_info(void) } /* - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { + struct xa_state xas; int error; struct address_space *address_space; @@ -102,41 +103,26 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) set_page_private(page, entry.val); address_space = swap_address_space(entry); + xas_init(&xas, swp_offset(entry)); +repeat: mapping_lock_irq(address_space); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); - } - mapping_unlock_irq(address_space); - + xas_store(&address_space->pages, &xas, page); + error = xas_error(&xas); if (unlikely(error)) { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. 
- */ - VM_BUG_ON(error == -EEXIST); + mapping_unlock_irq(address_space); + if (xas_nomem(&xas, gfp_mask)) + goto repeat; set_page_private(page, 0UL); ClearPageSwapCache(page); put_page(page); + } else { + address_space->nrpages++; + __inc_node_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + mapping_unlock_irq(address_space); } - return error; -} - - -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) -{ - int error; - - error = radix_tree_maybe_preload(gfp_mask); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } + xas_destroy(&xas); return error; } @@ -146,6 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + struct xa_state xas; swp_entry_t entry; struct address_space *address_space; @@ -155,7 +142,8 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); + xas_init(&xas, swp_offset(entry)); + xas_store(&address_space->pages, &xas, NULL); set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; @@ -345,18 +333,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@ -375,17 +355,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, cond_resched(); continue; } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); + if (err) /* swp entry is obsolete ? */ break; - } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); /* * Initiate read into locked page and return. 
*/ @@ -393,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -538,12 +514,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + xa_init(&space->pages); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); diff --git a/mm/truncate.c b/mm/truncate.c index 6fb93d6d62b3..4de08d8ff67f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -28,23 +28,22 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_node *node; - void **slot; + struct xa_state xas; + void *curr; + xas_init(&xas, index); + xas.xa_update = workingset_update_node; mapping_lock_irq(mapping); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, - workingset_update_node); - mapping->nrexceptional--; -unlock: + curr = xas_load(&mapping->pages, &xas); + if (entry == curr) { + xas_store(&mapping->pages, &xas, NULL); + mapping->nrexceptional--; + } mapping_unlock_irq(mapping); } @@ -304,7 +303,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { truncate_exceptional_entry(mapping, index, page); continue; @@ -394,7 +393,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { truncate_exceptional_entry(mapping, index, page); continue; @@ -513,7 +512,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -636,7 +635,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_exceptional(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; diff --git a/mm/vmscan.c b/mm/vmscan.c index ccdb78c09b81..393cb432a18a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -729,7 +729,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same mapping. */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) diff --git a/mm/workingset.c b/mm/workingset.c index 9e77f06d9db8..b98924172217 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -154,14 +154,12 @@ * refault distance will immediately activate the refaulting page. 
*/ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT (1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the exceptional * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -174,18 +172,16 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_exceptional(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_exceptional_value(shadow); int memcgid, nid; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -339,9 +335,9 @@ void workingset_activation(struct page *page) static struct list_lru shadow_nodes; #define node_mapping(node) \ - container_of(node->root, struct address_space, page_tree) + container_of(node->array, struct address_space, pages) -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { struct address_space *mapping = node_mapping(node); @@ -379,7 +375,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, local_irq_enable(); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the xa_nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -394,11 +390,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ if (sc->memcg) { cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, @@ -407,7 +403,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + max_nodes = cache >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; @@ -419,9 +415,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { + struct xa_state xas; struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; + struct xa_node *node; int ret; /* @@ -429,14 +425,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the mapping_lock and the * lru_lock. 
Because the page cache is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has xarray nodes on the LRU. * * We can then safely transition to the mapping_lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); + node = container_of(item, struct xa_node, private_list); mapping = node_mapping(node); /* Coming from the list, invert the lock order */ @@ -458,25 +454,17 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->exceptional)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) - goto out_invalid; + mapping->nrexceptional -= node->exceptional; + xas.xa_node = node->parent; + xas.xa_offset = node->offset; + xas.xa_update = workingset_update_node; + /* + * If we chose to, we could store a shadow entry here which was the + * minimum of the shadow entries we were tracking. Probably wouldn't + * help us make better refault decisions though. + */ + xas_store(node->array, &xas, NULL); inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, - workingset_update_node); out_invalid: mapping_unlock(mapping);
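Notes on the conversion idioms used above. The radix_tree_deref_slot() / radix_tree_deref_retry() / "has the page moved?" sequence collapses into xas_retry() plus a re-read of the slot through xas_load() once the speculative reference has been taken. Below is a minimal sketch of that pattern, assuming the prototype xa_state/xas_* API exactly as the hunks above use it; the helper name and signature are illustrative only and are not something this patch adds.

/*
 * Illustrative only: collect up to @nr_pages page pointers starting at
 * @start, following the lookup/retry shape of find_get_pages() above.
 * Assumes the prototype xas_* API from this patch.
 */
static unsigned xas_gather_pages_sketch(struct address_space *mapping,
		pgoff_t start, unsigned nr_pages, struct page **pages)
{
	struct xa_state xas;
	struct page *head, *page, *page2;
	unsigned ret = 0;

	xas_init(&xas, start);
	rcu_read_lock();
	xas_for_each(&mapping->pages, &xas, page, ~0UL) {
repeat:
		/* Slot is being rearranged; the state restarts the walk. */
		if (xas_retry(&xas, page))
			continue;
		/* Shadow, swap or DAX entry: not a page, skip it. */
		if (xa_is_exceptional(page))
			continue;

		head = compound_head(page);
		if (!page_cache_get_speculative(head)) {
			/* Lost the race; reread the slot and try again. */
			page = xas_load(&mapping->pages, &xas);
			if (unlikely(!page))
				continue;
			goto repeat;
		}

		/* Did the page move or get split while we took the ref? */
		page2 = xas_load(&mapping->pages, &xas);
		if (page != page2 || compound_head(page) != head) {
			put_page(head);
			if (unlikely(!page2))
				continue;
			page = page2;
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

find_get_entries(), find_get_pages_tag() and filemap_map_pages() above are variations of this loop that differ only in how exceptional entries and the upper index bound are handled.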
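Shadow, swap and DAX entries are still stored directly in the page cache as exceptional values; only the accessors change name (radix_to_swp_entry() becomes xa_to_swp_entry(), swp_to_radix_entry() becomes xa_mk_swp_entry(), and the workingset shadow packing uses xa_mk_exceptional() / xa_exceptional_value()). A sketch of reading back an index that may hold a swap entry, in the style of the mincore and readahead hunks above; the helper name is illustrative:

/*
 * Illustrative only: an index in a shmem/swap-backed mapping may hold a
 * swap entry instead of a page, as mincore_page() handles above.
 */
static struct page *lookup_maybe_swapped_sketch(struct address_space *mapping,
		pgoff_t index)
{
	struct page *page = xa_load(&mapping->pages, index);

	if (xa_is_exceptional(page)) {
		/* Decode the swap entry and look in the swap cache instead. */
		swp_entry_t swp = xa_to_swp_entry(page);

		page = find_get_page(swap_address_space(swp), swp_offset(swp));
	}
	return page;
}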
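Setting or clearing a page-cache tag no longer takes a separate index argument: the xa_state is positioned at page_index(page) once and then reused for every tag operation under the mapping lock, as in __set_page_dirty_nobuffers() and __test_set_page_writeback() above. A minimal sketch with the accounting trimmed down to the one call that matters for the tag; the helper is illustrative:

/*
 * Illustrative only: tag one page dirty in its mapping, in the shape of
 * __set_page_dirty_nobuffers() above (memcg locking and the page flag
 * itself are left out).
 */
static void tag_page_dirty_sketch(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	struct xa_state xas;
	unsigned long flags;

	xas_init(&xas, page_index(page));
	mapping_lock_irqsave(mapping, flags);
	account_page_dirtied(page, mapping);
	xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_DIRTY);
	mapping_unlock_irqrestore(mapping, flags);
}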
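Long tagged walks that need to drop the lock use xas_pause() where the old code used radix_tree_iter_resume(); the walk then resumes from the next index once the lock is retaken, as tag_pages_for_writeback(), shmem_tag_pins() and shmem_wait_for_pins() do above. A sketch, with the batch size chosen arbitrarily and the helper name illustrative:

/*
 * Illustrative only: retag every DIRTY page in [@start, @end] as TOWRITE,
 * dropping the lock periodically, mirroring tag_pages_for_writeback()
 * above.
 */
static void retag_for_writeback_sketch(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	struct xa_state xas;
	void *entry;
	unsigned long tagged = 0;

	xas_init(&xas, start);
	mapping_lock_irq(mapping);
	xas_for_each_tag(&mapping->pages, &xas, entry, end,
			PAGECACHE_TAG_DIRTY) {
		xas_set_tag(&mapping->pages, &xas, PAGECACHE_TAG_TOWRITE);
		if ((++tagged % 4096) != 0)
			continue;
		/* Remember our place so the walk can resume after relocking. */
		xas_pause(&xas);
		mapping_unlock_irq(mapping);
		cond_resched();
		mapping_lock_irq(mapping);
	}
	mapping_unlock_irq(mapping);
}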
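The lookup-slot-then-replace-slot pair used for migration and for shmem page replacement becomes xas_load() followed by xas_store() on the same cursor, so the "is the expected page still here?" check and the store share one walk, as in migrate_page_move_mapping() and shmem_replace_page() above. Refcounting and statistics are omitted from this sketch, and the helper name is illustrative:

/*
 * Illustrative only: replace @oldpage with @newpage at @oldpage's index,
 * failing with -EAGAIN if the slot no longer holds @oldpage -- the shape
 * used by migrate_page_move_mapping() above, minus refcounting and
 * dirty/writeback state transfer.
 */
static int replace_page_sketch(struct address_space *mapping,
		struct page *oldpage, struct page *newpage)
{
	struct xa_state xas;

	xas_init(&xas, page_index(oldpage));
	mapping_lock_irq(mapping);
	if (xas_load(&mapping->pages, &xas) != oldpage) {
		mapping_unlock_irq(mapping);
		return -EAGAIN;
	}
	/* The xa_state is already positioned on the slot; just store. */
	xas_store(&mapping->pages, &xas, newpage);
	mapping_unlock_irq(mapping);
	return 0;
}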
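radix_tree_maybe_preload() and radix_tree_preload_end() disappear entirely. The store is attempted under the mapping lock; if the xarray could not allocate a node, xas_error() reports it, the lock is dropped, xas_nomem() allocates with the caller's gfp mask, and the store is retried, as in add_to_swap_cache() and shmem_add_to_page_cache() above. My reading of the xas_nomem() contract (it returns true when it allocated memory and the operation should be retried) is an assumption based on those two hunks, and the helper below is illustrative:

/*
 * Illustrative only: insert @page at @index, allocating xarray nodes with
 * @gfp outside the lock, in the style of add_to_swap_cache() above.  The
 * surrounding refcount, flag and statistics updates are omitted.
 */
static int store_page_sketch(struct address_space *mapping, pgoff_t index,
		struct page *page, gfp_t gfp)
{
	struct xa_state xas;
	int error;

	xas_init(&xas, index);
repeat:
	mapping_lock_irq(mapping);
	xas_store(&mapping->pages, &xas, page);
	error = xas_error(&xas);
	mapping_unlock_irq(mapping);
	/* Out of nodes?  Allocate outside the lock and try again. */
	if (error && xas_nomem(&xas, gfp))
		goto repeat;
	xas_destroy(&xas);
	return error;
}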