/* list_lru_walk_cb has to always return one of those */
 enum lru_status {
        LRU_REMOVED,            /* item removed from list */
+       LRU_REMOVED_RETRY,      /* item removed, but lock has been
+                                  dropped and reacquired */
        LRU_ROTATE,             /* item referenced, give another pass */
        LRU_SKIP,               /* item cannot be locked, skip */
        LRU_RETRY,              /* item not freeable. May drop the lock
                                   internally, but has to return locked. */
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru);
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
+static inline int list_lru_init(struct list_lru *lru)
+{
+       return list_lru_init_key(lru, NULL);
+}
 
 /**
  * list_lru_add: add an element to the lru list's tail
 
 #endif
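
The new LRU_REMOVED_RETRY status and list_lru_init_key() are aimed at list_lru users whose isolate callback may need to drop the per-node lru lock while disposing of an item; the workingset shrinker added further below is the first such user. A minimal sketch of such a callback (my_item, my_isolate and teardown() are hypothetical names, not part of this patch):

        /* hypothetical object with a list_lru linkage and its own lock */
        struct my_item {
                spinlock_t lock;
                struct list_head lru;
        };

        static enum lru_status my_isolate(struct list_head *item,
                                          spinlock_t *lru_lock, void *arg)
        {
                struct my_item *obj = container_of(item, struct my_item, lru);

                if (!spin_trylock(&obj->lock))
                        return LRU_SKIP;        /* cannot lock it, move on */

                list_del_init(item);
                /* Drop the lru lock for the expensive part, but return
                 * with it held and tell the walker it was dropped. */
                spin_unlock(lru_lock);
                teardown(obj);                  /* hypothetical */
                spin_unlock(&obj->lock);
                spin_lock(lru_lock);
                return LRU_REMOVED_RETRY;
        }

On LRU_REMOVED_RETRY the walker counts the item as isolated but restarts its list traversal, since the cursor may have been invalidated while the lock was dropped; list_lru_init_key() additionally lets such users give the lru locks their own lockdep class, as workingset_init() does below.
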
        WORKINGSET_REFAULT,
        WORKINGSET_ACTIVATE,
+       WORKINGSET_NODERECLAIM,
        NR_ANON_TRANSPARENT_HUGEPAGES,
        NR_FREE_CMA_PAGES,
        NR_VM_ZONE_STAT_ITEMS };
 
 #define RADIX_TREE_TAG_LONGS   \
        ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+                                         RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT        (RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK  ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
 struct radix_tree_node {
-       unsigned int    height;         /* Height from the bottom */
+       unsigned int    path;   /* Offset in parent & height from the bottom */
        unsigned int    count;
        union {
-               struct radix_tree_node *parent; /* Used when ascending tree */
-               struct rcu_head rcu_head;       /* Used when freeing node */
+               struct {
+                       /* Used when ascending tree */
+                       struct radix_tree_node *parent;
+                       /* For tree user */
+                       void *private_data;
+               };
+               /* Used when freeing node */
+               struct rcu_head rcu_head;
        };
+       /* For tree user */
+       struct list_head private_list;
        void __rcu      *slots[RADIX_TREE_MAP_SIZE];
        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
-                                         RADIX_TREE_MAP_SHIFT))
-
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
        unsigned int            height;
                          struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
                              struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
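
With the usual RADIX_TREE_MAP_SHIFT of 6 on a 64-bit build (assumed values; CONFIG_BASE_SMALL shrinks them), RADIX_TREE_MAX_PATH is DIV_ROUND_UP(64, 6) = 11 and RADIX_TREE_HEIGHT_SHIFT is 12, so the new node->path field keeps the height in its low 12 bits and the node's slot offset within its parent in the bits above. A worked example of the encoding:

        height = node->path & RADIX_TREE_HEIGHT_MASK;     /* low 12 bits */
        offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;   /* slot in parent */

        /* a node at height 2 sitting in slot 5 of its parent: */
        path = (5 << 12) | 2 = 0x5002
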
 
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
+extern struct list_lru workingset_shadow_nodes;
+
+static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
+{
+       return node->count & RADIX_TREE_COUNT_MASK;
+}
+
+static inline void workingset_node_pages_inc(struct radix_tree_node *node)
+{
+       node->count++;
+}
+
+static inline void workingset_node_pages_dec(struct radix_tree_node *node)
+{
+       node->count--;
+}
+
+static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
+{
+       return node->count >> RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
+{
+       node->count += 1U << RADIX_TREE_COUNT_SHIFT;
+}
+
+static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
+{
+       node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+}
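
These helpers split node->count into two counters: the low RADIX_TREE_COUNT_SHIFT bits count page entries, and the bits above count shadow (exceptional) entries. A worked example, again assuming the default RADIX_TREE_MAP_SHIFT of 6 so that RADIX_TREE_COUNT_SHIFT is 7:

        /* a node holding 3 pages and 2 shadow entries */
        node->count = (2U << 7) | 3;    /* == 0x103 */

        workingset_node_pages(node)   == 0x103 & 0x7f == 3
        workingset_node_shadows(node) == 0x103 >> 7   == 2
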
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 
 
                /* Increase the height.  */
                newheight = root->height+1;
-               node->height = newheight;
+               BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
+               node->path = newheight;
                node->count = 1;
                node->parent = NULL;
                slot = root->rnode;
                        /* Have to add a child node.  */
                        if (!(slot = radix_tree_node_alloc(root)))
                                return -ENOMEM;
-                       slot->height = height;
+                       slot->path = height;
                        slot->parent = node;
                        if (node) {
                                rcu_assign_pointer(node->slots[offset], slot);
                                node->count++;
+                               slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
                        } else
                                rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
                }
        }
        node = indirect_to_ptr(node);
 
-       height = node->height;
+       height = node->path & RADIX_TREE_HEIGHT_MASK;
        if (index > radix_tree_maxindex(height))
                return NULL;
 
                return (index == 0);
        node = indirect_to_ptr(node);
 
-       height = node->height;
+       height = node->path & RADIX_TREE_HEIGHT_MASK;
        if (index > radix_tree_maxindex(height))
                return 0;
 
 {
        unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *rnode, *node;
-       unsigned long index, offset;
+       unsigned long index, offset, height;
 
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;
                return NULL;
 
 restart:
-       shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+       height = rnode->path & RADIX_TREE_HEIGHT_MASK;
+       shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
        offset = index >> shift;
 
        /* Index outside of the tree */
        unsigned int shift, height;
        unsigned long i;
 
-       height = slot->height;
+       height = slot->path & RADIX_TREE_HEIGHT_MASK;
        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
        for ( ; height > 1; height--) {
                }
 
                node = indirect_to_ptr(node);
-               max_index = radix_tree_maxindex(node->height);
+               max_index = radix_tree_maxindex(node->path &
+                                               RADIX_TREE_HEIGHT_MASK);
                if (cur_index > max_index) {
                        rcu_read_unlock();
                        break;
  *
  *     Returns %true if @node was freed, %false otherwise.
  */
-bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+bool __radix_tree_delete_node(struct radix_tree_root *root,
                              struct radix_tree_node *node)
 {
        bool deleted = false;
 
                parent = node->parent;
                if (parent) {
-                       index >>= RADIX_TREE_MAP_SHIFT;
+                       unsigned int offset;
 
-                       parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+                       offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
+                       parent->slots[offset] = NULL;
                        parent->count--;
                } else {
                        root_tag_clear_all(root);
        node->slots[offset] = NULL;
        node->count--;
 
-       __radix_tree_delete_node(root, index, node);
+       __radix_tree_delete_node(root, node);
 
        return entry;
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(void *node)
+radix_tree_node_ctor(void *arg)
 {
-       memset(node, 0, sizeof(struct radix_tree_node));
+       struct radix_tree_node *node = arg;
+
+       memset(node, 0, sizeof(*node));
+       INIT_LIST_HEAD(&node->private_list);
 }
 
 static __init unsigned long __maxindex(unsigned int height)
 
 static void page_cache_tree_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
 {
-       if (shadow) {
-               void **slot;
+       struct radix_tree_node *node;
+       unsigned long index;
+       unsigned int offset;
+       unsigned int tag;
+       void **slot;
 
-               slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-               radix_tree_replace_slot(slot, shadow);
+       VM_BUG_ON(!PageLocked(page));
+
+       __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+       if (shadow) {
                mapping->nrshadows++;
                /*
                 * Make sure the nrshadows update is committed before
                 * same time and miss a shadow entry.
                 */
                smp_wmb();
-       } else
-               radix_tree_delete(&mapping->page_tree, page->index);
+       }
        mapping->nrpages--;
+
+       if (!node) {
+               /* Clear direct pointer tags in root node */
+               mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+               radix_tree_replace_slot(slot, shadow);
+               return;
+       }
+
+       /* Clear tree tags for the removed page */
+       index = page->index;
+       offset = index & RADIX_TREE_MAP_MASK;
+       for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+               if (test_bit(offset, node->tags[tag]))
+                       radix_tree_tag_clear(&mapping->page_tree, index, tag);
+       }
+
+       /* Delete page, swap shadow entry */
+       radix_tree_replace_slot(slot, shadow);
+       workingset_node_pages_dec(node);
+       if (shadow)
+               workingset_node_shadows_inc(node);
+       else
+               if (__radix_tree_delete_node(&mapping->page_tree, node))
+                       return;
+
+       /*
+        * Track node that only contains shadow entries.
+        *
+        * Avoid acquiring the list_lru lock if already tracked.  The
+        * list_empty() test is safe as node->private_list is
+        * protected by mapping->tree_lock.
+        */
+       if (!workingset_node_pages(node) &&
+           list_empty(&node->private_list)) {
+               node->private_data = mapping;
+               list_lru_add(&workingset_shadow_nodes, &node->private_list);
+       }
 }
 
 /*
 static int page_cache_tree_insert(struct address_space *mapping,
                                  struct page *page, void **shadowp)
 {
+       struct radix_tree_node *node;
        void **slot;
        int error;
 
-       slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
-       if (slot) {
+       error = __radix_tree_create(&mapping->page_tree, page->index,
+                                   &node, &slot);
+       if (error)
+               return error;
+       if (*slot) {
                void *p;
 
                p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
                if (!radix_tree_exceptional_entry(p))
                        return -EEXIST;
-               radix_tree_replace_slot(slot, page);
-               mapping->nrshadows--;
-               mapping->nrpages++;
                if (shadowp)
                        *shadowp = p;
-               return 0;
+               mapping->nrshadows--;
+               if (node)
+                       workingset_node_shadows_dec(node);
        }
-       error = radix_tree_insert(&mapping->page_tree, page->index, page);
-       if (!error)
-               mapping->nrpages++;
-       return error;
+       radix_tree_replace_slot(slot, page);
+       mapping->nrpages++;
+       if (node) {
+               workingset_node_pages_inc(node);
+               /*
+                * Don't track node that contains actual pages.
+                *
+                * Avoid acquiring the list_lru lock if already
+                * untracked.  The list_empty() test is safe as
+                * node->private_list is protected by
+                * mapping->tree_lock.
+                */
+               if (!list_empty(&node->private_list))
+                       list_lru_del(&workingset_shadow_nodes,
+                                    &node->private_list);
+       }
+       return 0;
 }
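
Taken together, page_cache_tree_delete() and page_cache_tree_insert() maintain the invariant that the shadow-node shrinker below relies on: a radix tree node sits on workingset_shadow_nodes exactly when it holds shadow entries and no page entries. Spelled out as a hypothetical predicate (not part of the patch), evaluated under mapping->tree_lock:

        static bool node_should_be_tracked(struct radix_tree_node *node)
        {
                return workingset_node_shadows(node) &&
                       !workingset_node_pages(node);
        }
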
 
 static int __add_to_page_cache_locked(struct page *page,
 
 
                ret = isolate(item, &nlru->lock, cb_arg);
                switch (ret) {
+               case LRU_REMOVED_RETRY:
+                       assert_spin_locked(&nlru->lock);
                case LRU_REMOVED:
                        if (--nlru->nr_items == 0)
                                node_clear(nid, lru->active_nodes);
                        WARN_ON_ONCE(nlru->nr_items < 0);
                        isolated++;
+                       /*
+                        * If the lru lock has been dropped, our list
+                        * traversal is now invalid and so we have to
+                        * restart from scratch.
+                        */
+                       if (ret == LRU_REMOVED_RETRY)
+                               goto restart;
                        break;
                case LRU_ROTATE:
                        list_move_tail(item, &nlru->list);
                         * The lru lock has been dropped, our list traversal is
                         * now invalid and so we have to restart from scratch.
                         */
+                       assert_spin_locked(&nlru->lock);
                        goto restart;
                default:
                        BUG();
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
 {
        int i;
        size_t size = sizeof(*lru->node) * nr_node_ids;
        nodes_clear(lru->active_nodes);
        for (i = 0; i < nr_node_ids; i++) {
                spin_lock_init(&lru->node[i].lock);
+               if (key)
+                       lockdep_set_class(&lru->node[i].lock, key);
                INIT_LIST_HEAD(&lru->node[i].list);
                lru->node[i].nr_items = 0;
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
 
 void list_lru_destroy(struct list_lru *lru)
 {
 
 static void clear_exceptional_entry(struct address_space *mapping,
                                    pgoff_t index, void *entry)
 {
+       struct radix_tree_node *node;
+       void **slot;
+
        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return;
         * without the tree itself locked.  These unlocked entries
         * need verification under the tree lock.
         */
-       if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
-               mapping->nrshadows--;
+       if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+               goto unlock;
+       if (*slot != entry)
+               goto unlock;
+       radix_tree_replace_slot(slot, NULL);
+       mapping->nrshadows--;
+       if (!node)
+               goto unlock;
+       workingset_node_shadows_dec(node);
+       /*
+        * Don't track node without shadow entries.
+        *
+        * Avoid acquiring the list_lru lock if already untracked.
+        * The list_empty() test is safe as node->private_list is
+        * protected by mapping->tree_lock.
+        */
+       if (!workingset_node_shadows(node) &&
+           !list_empty(&node->private_list))
+               list_lru_del(&workingset_shadow_nodes, &node->private_list);
+       __radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
        spin_unlock_irq(&mapping->tree_lock);
 }
 
 
 #endif
        "workingset_refault",
        "workingset_activate",
+       "workingset_nodereclaim",
        "nr_anon_transparent_hugepages",
        "nr_free_cma",
        "nr_dirty_threshold",
 
 {
        atomic_long_inc(&page_zone(page)->inactive_age);
 }
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload.  In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes.  To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+                                       struct shrink_control *sc)
+{
+       unsigned long shadow_nodes;
+       unsigned long max_nodes;
+       unsigned long pages;
+
+       /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+       local_irq_disable();
+       shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+       local_irq_enable();
+
+       pages = node_present_pages(sc->nid);
+       /*
+        * Active cache pages are limited to 50% of memory, and shadow
+        * entries that represent a refault distance bigger than that
+        * do not have any effect.  Limit the number of shadow nodes
+        * such that shadow entries do not exceed the number of active
+        * cache pages, assuming a worst-case node population density
+        * of 1/8th on average.
+        *
+        * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+        * each, this will reclaim shadow entries when they consume
+        * ~2% of available memory:
+        *
+        * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+        */
+       max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+       if (shadow_nodes <= max_nodes)
+               return 0;
+
+       return shadow_nodes - max_nodes;
+}
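
A worked example with assumed numbers: a NUMA node with 4GB of 4KB pages has pages == 1048576, and with the default RADIX_TREE_MAP_SHIFT of 6

        max_nodes = pages >> (1 + 6 - 3) = pages >> 4 = 65536

so count_shadow_nodes() reports pressure to the shrinker core only once more than 64K radix tree nodes on that NUMA node contain nothing but shadow entries.
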
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+                                         spinlock_t *lru_lock,
+                                         void *arg)
+{
+       struct address_space *mapping;
+       struct radix_tree_node *node;
+       unsigned int i;
+       int ret;
+
+       /*
+        * Page cache insertions and deletions synchronously maintain
+        * the shadow node LRU under the mapping->tree_lock and the
+        * lru_lock.  Because the page cache tree is emptied before
+        * the inode can be destroyed, holding the lru_lock pins any
+        * address_space that has radix tree nodes on the LRU.
+        *
+        * We can then safely transition to the mapping->tree_lock to
+        * pin only the address_space of the particular node we want
+        * to reclaim, take the node off-LRU, and drop the lru_lock.
+        */
+
+       node = container_of(item, struct radix_tree_node, private_list);
+       mapping = node->private_data;
+
+       /* Coming from the list, invert the lock order */
+       if (!spin_trylock(&mapping->tree_lock)) {
+               spin_unlock(lru_lock);
+               ret = LRU_RETRY;
+               goto out;
+       }
+
+       list_del_init(item);
+       spin_unlock(lru_lock);
+
+       /*
+        * The nodes should only contain one or more shadow entries,
+        * no pages, so we expect to be able to remove them all and
+        * delete and free the empty node afterwards.
+        */
+
+       BUG_ON(!node->count);
+       BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+       for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+               if (node->slots[i]) {
+                       BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+                       node->slots[i] = NULL;
+                       BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+                       node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+                       BUG_ON(!mapping->nrshadows);
+                       mapping->nrshadows--;
+               }
+       }
+       BUG_ON(node->count);
+       inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+       if (!__radix_tree_delete_node(&mapping->page_tree, node))
+               BUG();
+
+       spin_unlock(&mapping->tree_lock);
+       ret = LRU_REMOVED_RETRY;
+out:
+       local_irq_enable();
+       cond_resched();
+       local_irq_disable();
+       spin_lock(lru_lock);
+       return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+                                      struct shrink_control *sc)
+{
+       unsigned long ret;
+
+       /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+       local_irq_disable();
+       ret =  list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+                                 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+       local_irq_enable();
+       return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+       .count_objects = count_shadow_nodes,
+       .scan_objects = scan_shadow_nodes,
+       .seeks = DEFAULT_SEEKS,
+       .flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+       int ret;
+
+       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+       if (ret)
+               goto err;
+       ret = register_shrinker(&workingset_shadow_shrinker);
+       if (ret)
+               goto err_list_lru;
+       return 0;
+err_list_lru:
+       list_lru_destroy(&workingset_shadow_nodes);
+err:
+       return ret;
+}
+module_init(workingset_init);