blkg->q = disk->queue;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
+       blkg->iostat.blkg = blkg;
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spin_lock_init(&blkg->async_bio_lock);
        bio_list_init(&blkg->async_bios);
                smp_mb();
 
                WRITE_ONCE(bisc->lqueued, false);
+               if (bisc == &blkg->iostat)
+                       goto propagate_up; /* propagate up to parent only */
 
                /* fetch the current per-cpu values */
                do {
 
                blkcg_iostat_update(blkg, &cur, &bisc->last);
 
+propagate_up:
                /* propagate global delta to parent (unless that's root) */
-               if (parent && parent->parent)
+               if (parent && parent->parent) {
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);
+                       /*
+                        * Queue parent->iostat to its blkcg's lockless
+                        * list to propagate up to the grandparent if the
+                        * iostat hasn't been queued yet.
+                        */
+                       if (!parent->iostat.lqueued) {
+                               struct llist_head *plhead;
+
+                               plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
+                               llist_add(&parent->iostat.lnode, plhead);
+                               parent->iostat.lqueued = true;
+                       }
+               }
        }
        raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
 out: