static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
        unsigned long pflags;
+       bool clamp = false;
        u64 now = ktime_to_ns(ktime_get());
        u64 exp;
        u64 delay_nsec = 0;
        int tok;
 
        while (blkg->parent) {
-               if (atomic_read(&blkg->use_delay)) {
+               int use_delay = atomic_read(&blkg->use_delay);
+
+               if (use_delay) {
+                       u64 this_delay;
+
                        blkcg_scale_delay(blkg, now);
-                       delay_nsec = max_t(u64, delay_nsec,
-                                          atomic64_read(&blkg->delay_nsec));
+                       this_delay = atomic64_read(&blkg->delay_nsec);
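+                       /*
+                        * Track the largest delay up the hierarchy. A negative
+                        * use_delay means blkcg_set_delay() was used and the
+                        * delay must not be clamped below.
+                        */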
+                       if (this_delay > delay_nsec) {
+                               delay_nsec = this_delay;
+                               clamp = use_delay > 0;
+                       }
                }
                blkg = blkg->parent;
        }
        /*
         * Let's not sleep for all eternity if we've amassed a huge delay.
         * Swapping or metadata IO can accumulate 10's of seconds worth of
         * delay, and we want userspace to be able to do _something_ so cap the
-        * delays at 1 second.  If there's 10's of seconds worth of delay then
-        * the tasks will be delayed for 1 second for every syscall.
+        * delays at 0.25s. If there's 10's of seconds worth of delay then the
+        * tasks will be delayed for 0.25s for every syscall. If
+        * blkcg_set_delay() was used as indicated by negative use_delay, the
+        * caller is responsible for regulating the range.
         */
-       delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+       if (clamp)
+               delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
 
        if (use_memdelay)
                psi_memstall_enter(&pflags);
 
        /* unbusy hysteresis */
        UNBUSY_THR_PCT          = 75,
 
+       /*
+        * The effect of delay is indirect and non-linear and a huge amount of
+        * future debt can accumulate abruptly while unthrottled. Linearly scale
+        * up delay as debt goes up and then let it decay exponentially.
+        * This gives us quick ramp-ups while delay is accumulating and long
+        * tails which can help reduce the frequency of debt explosions on
+        * unthrottle. The parameters are experimentally determined.
+        *
+        * The delay mechanism provides adequate protection and behavior in many
+        * cases. However, this is far from ideal and falls short on both
+        * fronts. The debtors are often throttled too harshly, costing a
+        * significant level of fairness and possibly total work while the
+        * protection against their impacts on the system can be choppy and
+        * unreliable.
+        *
+        * The shortcoming primarily stems from the fact that, unlike for page
+        * cache, the kernel doesn't have a well-defined back-pressure
+        * propagation mechanism and policies for anonymous memory. Fully
+        * addressing this issue will likely require substantial improvements
+        * in the area.
+        */
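+       /*
+        * The thresholds below are in pct of one period's worth of vtime (see
+        * iocg_kick_delay()) and the delays in usecs - ie. delay kicks in
+        * once debt exceeds five periods and saturates at 250ms past 250
+        * periods.
+        */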
+       MIN_DELAY_THR_PCT       = 500,
+       MAX_DELAY_THR_PCT       = 25000,
+       MIN_DELAY               = 250,
+       MAX_DELAY               = 250 * USEC_PER_MSEC,
+
        /* don't let cmds which take a very long time pin lagging for too long */
        MAX_LAGGING_PERIODS     = 10,
 
        atomic64_t                      done_vtime;
        u64                             abs_vdebt;
 
+       /* current delay in effect in usecs and when it started */
+       u64                             delay;
+       u64                             delay_at;
+
        /*
         * The period this iocg was last active in.  Used for deactivation
         * and invalidating `vtime`.
         */
 
        struct wait_queue_head          waitq;
        struct hrtimer                  waitq_timer;
-       struct hrtimer                  delay_timer;
 
        /* timestamp at the latest activation */
        u64                             activated_at;
 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
 {
        struct ioc *ioc = iocg->ioc;
        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
-       u64 vtime = atomic64_read(&iocg->vtime);
-       u64 delta_ns, expires, oexpires;
+       u64 tdelta, delay, new_delay;
+       s64 vover, vover_pct;
        u32 hwa;
 
        lockdep_assert_held(&iocg->waitq.lock);
 
-       /* debt-adjust vtime */
+       /* calculate the current delay in effect - halved every second */
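+       /* e.g. a 2000us delay applied 2s ago has decayed to 2000 >> 2 = 500us */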
+       tdelta = now->now - iocg->delay_at;
+       if (iocg->delay)
+               delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
+       else
+               delay = 0;
+
+       /* calculate the new delay from the debt amount */
        current_hweight(iocg, &hwa, NULL);
-       vtime += abs_cost_to_cost(iocg->abs_vdebt, hwa);
+       vover = atomic64_read(&iocg->vtime) +
+               abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
+       vover_pct = div64_s64(100 * vover, ioc->period_us * now->vrate);
+
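+       /*
+        * Scale linearly between the two thresholds - e.g. debt midway
+        * between them (vover_pct == 12750) maps to roughly 125ms of delay.
+        */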
+       if (vover_pct <= MIN_DELAY_THR_PCT)
+               new_delay = 0;
+       else if (vover_pct >= MAX_DELAY_THR_PCT)
+               new_delay = MAX_DELAY;
+       else
+               new_delay = MIN_DELAY +
+                       div_u64((MAX_DELAY - MIN_DELAY) *
+                               (vover_pct - MIN_DELAY_THR_PCT),
+                               MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
 
-       /*
-        * Clear or maintain depending on the overage. Non-zero vdebt is what
-        * guarantees that @iocg is online and future iocg_kick_delay() will
-        * clear use_delay. Don't leave it on when there's no vdebt.
-        */
-       if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
-               blkcg_clear_delay(blkg);
-               return false;
+       /* pick the higher one and apply */
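+       /*
+        * Keeping the higher of the decayed and the debt-based delays is what
+        * yields the linear ramp-up and exponential decay described above.
+        */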
+       if (new_delay > delay) {
+               iocg->delay = new_delay;
+               iocg->delay_at = now->now;
+               delay = new_delay;
        }
-       if (!atomic_read(&blkg->use_delay) &&
-           time_before_eq64(vtime, now->vnow + ioc->margins.target))
-               return false;
-
-       /* use delay */
-       delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
-                                     now->vrate) * NSEC_PER_USEC;
-       blkcg_set_delay(blkg, delta_ns);
-       expires = now->now_ns + delta_ns;
 
-       /* if already active and close enough, don't bother */
-       oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
-       if (hrtimer_is_queued(&iocg->delay_timer) &&
-           abs(oexpires - expires) <= ioc->timer_slack_ns)
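+       /* a delay that decayed below MIN_DELAY is as good as none - clear it */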
+       if (delay >= MIN_DELAY) {
+               blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
                return true;
-
-       hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
-                              ioc->timer_slack_ns, HRTIMER_MODE_ABS);
-       return true;
-}
-
-static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
-{
-       struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
-       struct ioc_now now;
-       unsigned long flags;
-
-       spin_lock_irqsave(&iocg->waitq.lock, flags);
-       ioc_now(iocg->ioc, &now);
-       iocg_kick_delay(iocg, &now);
-       spin_unlock_irqrestore(&iocg->waitq.lock, flags);
-
-       return HRTIMER_NORESTART;
+       } else {
+               iocg->delay = 0;
+               blkcg_clear_delay(blkg);
+               return false;
+       }
 }
 
 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
                atomic64_add(vpay, &iocg->done_vtime);
                iocg_pay_debt(iocg, abs_vpay, now);
                vbudget -= vpay;
+       }
 
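+       /* kick while debt or delay remains so an applied delay keeps decaying */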
+       if (iocg->abs_vdebt || iocg->delay)
                iocg_kick_delay(iocg, now);
-       }
 
        /*
         * Debt can still be outstanding if we haven't paid all yet or the
         * caller raced and called without @pay_debt. Shouldn't wake up waiters
         * before all the debt is paid off.
         */
        list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
                if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
-                   !iocg_is_idle(iocg))
+                   !iocg->delay && !iocg_is_idle(iocg))
                        continue;
 
                spin_lock(&iocg->waitq.lock);
 
-               if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+               if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
+                   iocg->delay) {
                        /* might be oversleeping vtime / hweight changes, kick */
                        iocg_kick_waitq(iocg, true, &now);
                } else if (iocg_is_idle(iocg)) {
        init_waitqueue_head(&iocg->waitq);
        hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        iocg->waitq_timer.function = iocg_waitq_timer_fn;
-       hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-       iocg->delay_timer.function = iocg_delay_timer_fn;
 
        iocg->level = blkg->blkcg->css.cgroup->level;
 
                spin_unlock_irqrestore(&ioc->lock, flags);
 
                hrtimer_cancel(&iocg->waitq_timer);
-               hrtimer_cancel(&iocg->delay_timer);
        }
        free_percpu(iocg->pcpu_stat);
        kfree(iocg);