spin_unlock_irq(&q->queue_lock);
 }
 
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+       /* throtl is FIFO - if bios are already queued, this one must queue too */
+       if (tg->service_queue.nr_queued[rw])
+               return false;
+
+       return tg_may_dispatch(tg, bio, NULL);
+}
+
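+/*
+ * Account @bio, which is dispatched while @tg is over its limit, as debt in
+ * the carryover counters; the debt is repaid when wait times are calculated
+ * for later bios. Bytes already accounted for bps throttling
+ * (BIO_BPS_THROTTLED) are not charged again.
+ */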
+static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+       if (!bio_flagged(bio, BIO_BPS_THROTTLED))
+               tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
+       tg->carryover_ios[rw]--;
+}
+
 bool __blk_throtl_bio(struct bio *bio)
 {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        sq = &tg->service_queue;
 
        while (true) {
-               /* throtl is FIFO - if bios are already queued, should queue */
-               if (sq->nr_queued[rw])
+               if (tg_within_limit(tg, bio, rw)) {
+                       /* within limits, let's charge and dispatch directly */
+                       throtl_charge_bio(tg, bio);
+
+                       /*
+                        * We need to trim slice even when bios are not being
+                        * queued otherwise it might happen that a bio is not
+                        * queued for a long time and slice keeps on extending
+                        * and trim is not called for a long time. Now if limits
+                        * are reduced suddenly we take into account all the IO
+                        * dispatched so far at new low rate and newly queued
+                        * IO gets a really long dispatch time.
+                        *
+                        * So keep on trimming slice even if bio is not queued.
+                        */
+                       throtl_trim_slice(tg, rw);
+               } else if (bio_issue_as_root_blkg(bio)) {
+                       /*
+                        * IOs which may cause priority inversions are
+                        * dispatched directly, even if they're over limit.
+                        * Debts are handled by carryover_bytes/ios while
+                        * calculating wait time.
+                        */
+                       tg_dispatch_in_debt(tg, bio, rw);
+               } else {
+                       /* if above limits, break to queue */
                        break;
-
-               /* if above limits, break to queue */
-               if (!tg_may_dispatch(tg, bio, NULL))
-                       break;
-
-               /* within limits, let's charge and dispatch directly */
-               throtl_charge_bio(tg, bio);
-
-               /*
-                * We need to trim slice even when bios are not being queued
-                * otherwise it might happen that a bio is not queued for
-                * a long time and slice keeps on extending and trim is not
-                * called for a long time. Now if limits are reduced suddenly
-                * we take into account all the IO dispatched so far at new
-                * low rate and * newly queued IO gets a really long dispatch
-                * time.
-                *
-                * So keep on trimming slice even if bio is not queued.
-                */
-               throtl_trim_slice(tg, rw);
+               }
 
                /*
                 * @bio passed through this layer without being throttled.