8. LRU
         Each memcg has its own private LRU. Now, its handling is under global
-       VM's control (means that it's handled under global zone->lru_lock).
+       VM's control (meaning it is handled under the global zone_lru_lock).
        Almost all routines around memcg's LRU are called by global LRU's
-       list management functions under zone->lru_lock().
+       list management functions under zone_lru_lock().
 
        A special function is mem_cgroup_isolate_pages(). This scans
        memcg's private LRU and calls __isolate_lru_page() to extract a page
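
   In code terms, the memcg-private LRU is reached through the page's lruvec
   and is only ever touched while the global per-node lock is held. A minimal
   editorial sketch (not part of the patch; memcg_lru_del_example() is
   hypothetical and simply mirrors the helpers used in the hunks below):

	/*
	 * Editorial sketch: pulling a page off its memcg-private LRU under
	 * the global per-node lock.  Assumes the caller holds the last
	 * reference, so nobody else can reach the page.
	 */
	static void memcg_lru_del_example(struct page *page)
	{
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(zone_lru_lock(zone), flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);	/* memcg's private LRU */
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
	}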
 
    The other lock ordering is as follows:
    PG_locked.
    mm->page_table_lock
-       zone->lru_lock
+       zone_lru_lock
          lock_page_cgroup.
   In many cases, just lock_page_cgroup() is called.
   per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  zone->lru_lock, it has no lock of its own.
+  zone_lru_lock; it has no lock of its own.
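
   To make the nesting above concrete, here is an editorial sketch of the full
   ordering (lock_order_example() is hypothetical and exists only to show
   which lock sits inside which; the page-cgroup locking is left as a
   placeholder):

	static void lock_order_example(struct mm_struct *mm, struct page *page)
	{
		struct zone *zone = page_zone(page);

		lock_page(page);			/* PG_locked */
		spin_lock(&mm->page_table_lock);	/* mm->page_table_lock */
		spin_lock_irq(zone_lru_lock(zone));	/* per-node LRU lock */
		/* ... page-cgroup locking nests innermost ... */
		spin_unlock_irq(zone_lru_lock(zone));
		spin_unlock(&mm->page_table_lock);
		unlock_page(page);
	}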
 
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
 
         */
        union {
                struct list_head lru;   /* Pageout list, eg. active_list
-                                        * protected by zone->lru_lock !
+                                        * protected by zone_lru_lock !
                                         * Can be used as a generic list
                                         * by the page owner.
                                         */
 
 struct pglist_data;
 
 /*
- * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
+ * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
  * So add a wild amount of padding here to ensure that they fall into separate
  * cachelines.  There are very few zone structures in the machine, so space
  * consumption is not a concern here.
        /* Write-intensive fields used by page reclaim */
 
        /* Fields commonly accessed by the page reclaim scanner */
-       spinlock_t              lru_lock;
        struct lruvec           lruvec;
 
        /*
        /* Number of pages migrated during the rate limiting time interval */
        unsigned long numabalancing_migrate_nr_pages;
 #endif
+       /* Write-intensive fields used by page reclaim */
+       ZONE_PADDING(_pad1_)
+       spinlock_t              lru_lock;
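
The hot-lock comment from mmzone.h above is why a ZONE_PADDING(_pad1_) is
placed in front of the relocated lru_lock: the padding forces the lock onto
its own cacheline, so contention on it does not bounce a line shared with
other write-intensive fields. An editorial sketch of the idiom
(padding_example is made up; ZONE_PADDING() is the existing mmzone.h macro):

	/*
	 * The empty padding member is cacheline-aligned, so lock_b starts
	 * on a fresh cacheline and contention on it does not bounce the
	 * line holding lock_a.
	 */
	struct padding_example {
		spinlock_t	lock_a;		/* e.g. zone->lock */
		ZONE_PADDING(_pad_)
		spinlock_t	lock_b;		/* e.g. pgdat->lru_lock */
	};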
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
 
 #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
+static inline spinlock_t *zone_lru_lock(struct zone *zone)
+{
+       return &zone->zone_pgdat->lru_lock;
+}
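
Every caller below is converted to go through this accessor instead of
reaching into struct zone. The pattern, sketched on a hypothetical caller
(lru_lock_example() does not exist in the kernel):

	static void lru_lock_example(struct page *page)
	{
		struct zone *zone = page_zone(page);

		/* before this patch: spin_lock_irq(&zone->lru_lock); */
		spin_lock_irq(zone_lru_lock(zone));
		/* ... LRU manipulation ... */
		spin_unlock_irq(zone_lru_lock(zone));
	}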
 
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
 
                 * if contended.
                 */
                if (!(low_pfn % SWAP_CLUSTER_MAX)
-                   && compact_unlock_should_abort(&zone->lru_lock, flags,
+                   && compact_unlock_should_abort(zone_lru_lock(zone), flags,
                                                                &locked, cc))
                        break;
 
                        if (unlikely(__PageMovable(page)) &&
                                        !PageIsolated(page)) {
                                if (locked) {
-                                       spin_unlock_irqrestore(&zone->lru_lock,
+                                       spin_unlock_irqrestore(zone_lru_lock(zone),
                                                                        flags);
                                        locked = false;
                                }
 
                /* If we already hold the lock, we can skip some rechecking */
                if (!locked) {
-                       locked = compact_trylock_irqsave(&zone->lru_lock,
+                       locked = compact_trylock_irqsave(zone_lru_lock(zone),
                                                                &flags, cc);
                        if (!locked)
                                break;
                 */
                if (nr_isolated) {
                        if (locked) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                                locked = false;
                        }
                        acct_isolated(zone, cc);
                low_pfn = end_pfn;
 
        if (locked)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 
        /*
         * Update the pageblock-skip information and cached scanner pfn,
 
  *    ->swap_lock              (try_to_unmap_one)
  *    ->private_lock           (try_to_unmap_one)
  *    ->tree_lock              (try_to_unmap_one)
- *    ->zone.lru_lock          (follow_page->mark_page_accessed)
- *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
+ *    ->zone_lru_lock(zone)    (follow_page->mark_page_accessed)
+ *    ->zone_lru_lock(zone)    (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
 
                spin_unlock(&head->mapping->tree_lock);
        }
 
-       spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+       spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
        unfreeze_page(head);
 
                lru_add_drain();
 
        /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+       spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
 
        if (mapping) {
                void **pslot;
                spin_unlock(&pgdata->split_queue_lock);
 fail:          if (mapping)
                        spin_unlock(&mapping->tree_lock);
-               spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
                unfreeze_page(head);
                ret = -EBUSY;
        }
 
 {
        struct zone *zone = page_zone(page);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        if (PageLRU(page)) {
                struct lruvec *lruvec;
 
                SetPageLRU(page);
                add_page_to_lru_list(page, lruvec, page_lru(page));
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 }
 
 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock and migration entries setup in all page mappings.
+ * zone_lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
 
         * might otherwise copy PageMlocked to part of the tail pages before
         * we clear it in the head page. It also stabilizes hpage_nr_pages().
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
 
        nr_pages = hpage_nr_pages(page);
        if (!TestClearPageMlocked(page))
        __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
 
        if (__munlock_isolate_lru_page(page, true)) {
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
                __munlock_isolated_page(page);
                goto out;
        }
        __munlock_isolation_failed(page);
 
 unlock_out:
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
 out:
        return nr_pages - 1;
        pagevec_init(&pvec_putback, 0);
 
        /* Phase 1: page isolation */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        for (i = 0; i < nr; i++) {
                struct page *page = pvec->pages[i];
 
        }
        delta_munlocked = -nr + pagevec_count(&pvec_putback);
        __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        /* Now we can release pins of pages that we are not munlocking */
        pagevec_release(&pvec_putback);
 
        init_waitqueue_head(&pgdat->kcompactd_wait);
 #endif
        pgdat_page_ext_init(pgdat);
+       spin_lock_init(&pgdat->lru_lock);
 
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
 #endif
                zone->name = zone_names[j];
+               zone->zone_pgdat = pgdat;
                spin_lock_init(&zone->lock);
-               spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
-               zone->zone_pgdat = pgdat;
                zone_pcp_init(zone);
 
                /* For bootup, initialized properly in watermark setup */
 
                return NULL;
 
        zone = page_zone(page);
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        if (unlikely(!PageLRU(page))) {
                put_page(page);
                page = NULL;
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
        return page;
 }
 
 
  *         mapping->i_mmap_rwsem
  *           anon_vma->rwsem
  *             mm->page_table_lock or pte_lock
- *               zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
  *               swap_lock (in swap_duplicate, swap_info_get)
  *                 mmlist_lock (in mmput, drain_mmlist and others)
  *                 mapping->private_lock (in __set_page_dirty_buffers)
 
                struct lruvec *lruvec;
                unsigned long flags;
 
-               spin_lock_irqsave(&zone->lru_lock, flags);
+               spin_lock_irqsave(zone_lru_lock(zone), flags);
                lruvec = mem_cgroup_page_lruvec(page, zone);
                VM_BUG_ON_PAGE(!PageLRU(page), page);
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
        }
        mem_cgroup_uncharge(page);
 }
 
                if (pagezone != zone) {
                        if (zone)
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                        zone = pagezone;
-                       spin_lock_irqsave(&zone->lru_lock, flags);
+                       spin_lock_irqsave(zone_lru_lock(zone), flags);
                }
 
                lruvec = mem_cgroup_page_lruvec(page, zone);
                (*move_fn)(page, lruvec, arg);
        }
        if (zone)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
 }
        struct zone *zone = page_zone(page);
 
        page = compound_head(page);
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 }
 #endif
 
        struct zone *zone = page_zone(page);
        struct lruvec *lruvec;
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        lruvec = mem_cgroup_page_lruvec(page, zone);
        ClearPageActive(page);
        SetPageUnevictable(page);
        SetPageLRU(page);
        add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 }
 
 /**
                 * same zone. The lock is held only if zone != NULL.
                 */
                if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
-                       spin_unlock_irqrestore(&zone->lru_lock, flags);
+                       spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                        zone = NULL;
                }
 
 
                if (PageCompound(page)) {
                        if (zone) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                                zone = NULL;
                        }
                        __put_compound_page(page);
 
                        if (pagezone != zone) {
                                if (zone)
-                                       spin_unlock_irqrestore(&zone->lru_lock,
+                                       spin_unlock_irqrestore(zone_lru_lock(zone),
                                                                        flags);
                                lock_batch = 0;
                                zone = pagezone;
-                               spin_lock_irqsave(&zone->lru_lock, flags);
+                               spin_lock_irqsave(zone_lru_lock(zone), flags);
                        }
 
                        lruvec = mem_cgroup_page_lruvec(page, zone);
                list_add(&page->lru, &pages_to_free);
        }
        if (zone)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 
        mem_cgroup_uncharge_list(&pages_to_free);
        free_hot_cold_page_list(&pages_to_free, cold);
        VM_BUG_ON_PAGE(PageCompound(page_tail), page);
        VM_BUG_ON_PAGE(PageLRU(page_tail), page);
        VM_BUG_ON(NR_CPUS != 1 &&
-                 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
+                 !spin_is_locked(zone_lru_lock(lruvec_zone(lruvec))));
 
        if (!list)
                SetPageLRU(page_tail);
 
 }
 
 /*
- * zone->lru_lock is heavily contended.  Some of the functions that
+ * zone_lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
  * and working on them outside the LRU lock.
  *
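
The batching the comment describes has a common locking shape: take the lock,
move a batch of pages onto a private list, drop the lock, do the expensive
work, then retake the lock only to put survivors back. An editorial sketch of
that shape (lru_batch_example() is hypothetical, not the real isolate/shrink
code; the isolation and putback steps are elided to comments):

	static void lru_batch_example(struct zone *zone)
	{
		LIST_HEAD(batch);
		struct page *page, *next;

		/* 1) pull up to SWAP_CLUSTER_MAX pages off the LRU under the lock */
		spin_lock_irq(zone_lru_lock(zone));
		/* ... isolate pages onto 'batch' ... */
		spin_unlock_irq(zone_lru_lock(zone));

		/* 2) the expensive per-page work runs with the lock dropped */
		list_for_each_entry_safe(page, next, &batch, lru) {
			/* e.g. page_referenced(), try_to_unmap(), pageout() */
		}

		/* 3) retake the lock only to put survivors back on the LRU */
		spin_lock_irq(zone_lru_lock(zone));
		/* ... add_page_to_lru_list() for whatever is left on 'batch' ... */
		spin_unlock_irq(zone_lru_lock(zone));
	}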
                struct zone *zone = page_zone(page);
                struct lruvec *lruvec;
 
-               spin_lock_irq(&zone->lru_lock);
+               spin_lock_irq(zone_lru_lock(zone));
                lruvec = mem_cgroup_page_lruvec(page, zone);
                if (PageLRU(page)) {
                        int lru = page_lru(page);
                        del_page_from_lru_list(page, lruvec, lru);
                        ret = 0;
                }
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
        }
        return ret;
 }
                VM_BUG_ON_PAGE(PageLRU(page), page);
                list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
-                       spin_unlock_irq(&zone->lru_lock);
+                       spin_unlock_irq(zone_lru_lock(zone));
                        putback_lru_page(page);
-                       spin_lock_irq(&zone->lru_lock);
+                       spin_lock_irq(zone_lru_lock(zone));
                        continue;
                }
 
                        del_page_from_lru_list(page, lruvec, lru);
 
                        if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(zone_lru_lock(zone));
                                mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(zone_lru_lock(zone));
                        } else
                                list_add(&page->lru, &pages_to_free);
                }
        if (!sc->may_writepage)
                isolate_mode |= ISOLATE_CLEAN;
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                     &nr_scanned, sc, isolate_mode, lru);
                else
                        __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        if (nr_taken == 0)
                return 0;
                                &nr_writeback, &nr_immediate,
                                false);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
 
        if (global_reclaim(sc)) {
                if (current_is_kswapd())
 
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        mem_cgroup_uncharge_list(&page_list);
        free_hot_cold_page_list(&page_list, true);
  * processes, from rmap.
  *
  * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * appropriate to hold zone_lru_lock across the whole operation.  But if
  * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page.  It's impossible to balance
+ * should drop zone_lru_lock around each page.  It's impossible to balance
  * this, so instead we remove the pages from the LRU while processing them.
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
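
Even with that batching, a few per-page operations still cannot run under the
lock, so the putback loops below briefly drop and retake it around just that
page. An editorial sketch of the pattern, mirroring the unevictable-page case
shown earlier in this patch (lru_putback_example() is hypothetical):

	static void lru_putback_example(struct zone *zone, struct list_head *page_list)
	{
		struct page *page, *next;
		struct lruvec *lruvec;

		spin_lock_irq(zone_lru_lock(zone));
		list_for_each_entry_safe(page, next, page_list, lru) {
			list_del(&page->lru);
			if (unlikely(!page_evictable(page))) {
				/*
				 * putback_lru_page() may take the LRU lock
				 * itself, so release it around the call.
				 */
				spin_unlock_irq(zone_lru_lock(zone));
				putback_lru_page(page);
				spin_lock_irq(zone_lru_lock(zone));
				continue;
			}
			lruvec = mem_cgroup_page_lruvec(page, zone);
			SetPageLRU(page);
			add_page_to_lru_list(page, lruvec, page_lru(page));
		}
		spin_unlock_irq(zone_lru_lock(zone));
	}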
                        del_page_from_lru_list(page, lruvec, lru);
 
                        if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(zone_lru_lock(zone));
                                mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(zone_lru_lock(zone));
                        } else
                                list_add(&page->lru, pages_to_free);
                }
        if (!sc->may_writepage)
                isolate_mode |= ISOLATE_CLEAN;
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                     &nr_scanned, sc, isolate_mode, lru);
                __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
        __count_zone_vm_events(PGREFILL, zone, nr_scanned);
 
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        while (!list_empty(&l_hold)) {
                cond_resched();
        /*
         * Move pages back to the lru list.
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        /*
         * Count referenced pages from currently used mappings as rotated,
         * even though only some of them are actually re-activated.  This
        move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
        move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        mem_cgroup_uncharge_list(&l_hold);
        free_hot_cold_page_list(&l_hold, true);
        file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
                lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                reclaim_stat->recent_scanned[0] /= 2;
                reclaim_stat->recent_rotated[0] /= 2;
 
        fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
        fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        fraction[0] = ap;
        fraction[1] = fp;
                pagezone = page_zone(page);
                if (pagezone != zone) {
                        if (zone)
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(zone_lru_lock(zone));
                        zone = pagezone;
-                       spin_lock_irq(&zone->lru_lock);
+                       spin_lock_irq(zone_lru_lock(zone));
                }
                lruvec = mem_cgroup_page_lruvec(page, zone);
 
        if (zone) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
        }
 }
 #endif /* CONFIG_SHMEM */