block, bfq: fix decrement of num_active_groups

author Paolo Valente <paolo.valente@linaro.org>

Thu, 6 Dec 2018 18:18:18 +0000 (19:18 +0100)

committer Jens Axboe <axboe@kernel.dk>

Fri, 7 Dec 2018 14:40:07 +0000 (07:40 -0700)
author Paolo Valente <paolo.valente@linaro.org>
Thu, 6 Dec 2018 18:18:18 +0000 (19:18 +0100)
committer Jens Axboe <axboe@kernel.dk>
Fri, 7 Dec 2018 14:40:07 +0000 (07:40 -0700)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

index 3a27d31fcda60250854ced98c68fe3db32ac587c..97337214bec4286afa212cf5f271ce26b578d399 100644 (file)
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
                  bfqd->queue_weights_tree.rb_node->rb_right)
  #ifdef CONFIG_BFQ_GROUP_IOSCHED
                ) ||
-               (bfqd->num_active_groups > 0
+               (bfqd->num_groups_with_pending_reqs > 0
  #endif
                );
  }
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                          */
                         break;
                 }
-               bfqd->num_active_groups--;
+
+               /*
+                * The decrement of num_groups_with_pending_reqs is
+                * not performed immediately upon the deactivation of
+                * entity, but it is delayed to when it also happens
+                * that the first leaf descendant bfqq of entity gets
+                * all its pending requests completed. The following
+                * instructions perform this delayed decrement, if
+                * needed. See the comments on
+                * num_groups_with_pending_reqs for details.
+                */
+               if (entity->in_groups_with_pending_reqs) {
+                       entity->in_groups_with_pending_reqs = false;
+                       bfqd->num_groups_with_pending_reqs--;
+               }
         }
  }
  
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * fact, if there are active groups, then, for condition (i)
          * to become false, it is enough that an active group contains
          * more active processes or sub-groups than some other active
-        * group. We address this issue with the following bi-modal
-        * behavior, implemented in the function
+        * group. More precisely, for condition (i) to hold because of
+        * such a group, it is not even necessary that the group is
+        * (still) active: it is sufficient that, even if the group
+        * has become inactive, some of its descendant processes still
+        * have some request already dispatched but still waiting for
+        * completion. In fact, requests have still to be guaranteed
+        * their share of the throughput even after being
+        * dispatched. In this respect, it is easy to show that, if a
+        * group frequently becomes inactive while still having
+        * in-flight requests, and if, when this happens, the group is
+        * not considered in the calculation of whether the scenario
+        * is asymmetric, then the group may fail to be guaranteed its
+        * fair share of the throughput (basically because idling may
+        * not be performed for the descendant processes of the group,
+        * but it had to be).  We address this issue with the
+        * following bi-modal behavior, implemented in the function
          * bfq_symmetric_scenario().
          *
-        * If there are active groups, then the scenario is tagged as
+        * If there are groups with requests waiting for completion
+        * (as commented above, some of these groups may even be
+        * already inactive), then the scenario is tagged as
          * asymmetric, conservatively, without checking any of the
          * conditions (i) and (ii). So the device is idled for bfqq.
          * This behavior matches also the fact that groups are created
-        * exactly if controlling I/O (to preserve bandwidth and
-        * latency guarantees) is a primary concern.
+        * exactly if controlling I/O is a primary concern (to
+        * preserve bandwidth and latency guarantees).
          *
-        * On the opposite end, if there are no active groups, then
-        * only condition (i) is actually controlled, i.e., provided
-        * that condition (i) holds, idling is not performed,
-        * regardless of whether condition (ii) holds. In other words,
-        * only if condition (i) does not hold, then idling is
-        * allowed, and the device tends to be prevented from queueing
-        * many requests, possibly of several processes. Since there
-        * are no active groups, then, to control condition (i) it is
-        * enough to check whether all active queues have the same
-        * weight.
+        * On the opposite end, if there are no groups with requests
+        * waiting for completion, then only condition (i) is actually
+        * controlled, i.e., provided that condition (i) holds, idling
+        * is not performed, regardless of whether condition (ii)
+        * holds. In other words, only if condition (i) does not hold,
+        * then idling is allowed, and the device tends to be
+        * prevented from queueing many requests, possibly of several
+        * processes. Since there are no groups with requests waiting
+        * for completion, then, to control condition (i) it is enough
+        * to check just whether all the queues with requests waiting
+        * for completion also have the same weight.
          *
          * Not checking condition (ii) evidently exposes bfqq to the
          * risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * bfqq is weight-raised is checked explicitly here. More
          * precisely, the compound condition below takes into account
          * also the fact that, even if bfqq is being weight-raised,
-        * the scenario is still symmetric if all active queues happen
-        * to be weight-raised. Actually, we should be even more
-        * precise here, and differentiate between interactive weight
-        * raising and soft real-time weight raising.
+        * the scenario is still symmetric if all queues with requests
+        * waiting for completion happen to be
+        * weight-raised. Actually, we should be even more precise
+        * here, and differentiate between interactive weight raising
+        * and soft real-time weight raising.
          *
          * As a side note, it is worth considering that the above
          * device-idling countermeasures may however fail in the
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
         bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
  
         bfqd->queue_weights_tree = RB_ROOT;
-       bfqd->num_active_groups = 0;
+       bfqd->num_groups_with_pending_reqs = 0;
  
         INIT_LIST_HEAD(&bfqd->active_list);
         INIT_LIST_HEAD(&bfqd->idle_list);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h

index 77651d817ecd36fe59827f2aa55f9c4ec5ffb979..0b02bf302de07706fbfdc5b5ef66da2545e01eee 100644 (file)
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
  
         /* flag, set to request a weight, ioprio or ioprio_class change  */
         int prio_changed;
+
+       /* flag, set if the entity is counted in groups_with_pending_reqs */
+       bool in_groups_with_pending_reqs;
  };
  
  struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
          * bfq_weights_tree_[add|remove] for further details).
          */
         struct rb_root queue_weights_tree;
+
         /*
-        * number of groups with requests still waiting for completion
+        * Number of groups with at least one descendant process that
+        * has at least one request waiting for completion. Note that
+        * this accounts for also requests already dispatched, but not
+        * yet completed. Therefore this number of groups may differ
+        * (be larger) than the number of active groups, as a group is
+        * considered active only if its corresponding entity has
+        * descendant queues with at least one request queued. This
+        * number is used to decide whether a scenario is symmetric.
+        * For a detailed explanation see comments on the computation
+        * of the variable asymmetric_scenario in the function
+        * bfq_better_to_idle().
+        *
+        * However, it is hard to compute this number exactly, for
+        * groups with multiple descendant processes. Consider a group
+        * that is inactive, i.e., that has no descendant process with
+        * pending I/O inside BFQ queues. Then suppose that
+        * num_groups_with_pending_reqs is still accounting for this
+        * group, because the group has descendant processes with some
+        * I/O request still in flight. num_groups_with_pending_reqs
+        * should be decremented when the in-flight request of the
+        * last descendant process is finally completed (assuming that
+        * nothing else has changed for the group in the meantime, in
+        * terms of composition of the group and active/inactive state of child
+        * groups and processes). To accomplish this, an additional
+        * pending-request counter must be added to entities, and must
+        * be updated correctly. To avoid this additional field and operations,
+        * we resort to the following tradeoff between simplicity and
+        * accuracy: for an inactive group that is still counted in
+        * num_groups_with_pending_reqs, we decrement
+        * num_groups_with_pending_reqs when the first descendant
+        * process of the group remains with no request waiting for
+        * completion.
+        *
+        * Even this simpler decrement strategy requires a little
+        * carefulness: to avoid multiple decrements, we flag a group,
+        * more precisely an entity representing a group, as still
+        * counted in num_groups_with_pending_reqs when it becomes
+        * inactive. Then, when the first descendant queue of the
+        * entity remains with no request waiting for completion,
+        * num_groups_with_pending_reqs is decremented, and this flag
+        * is reset. After this flag is reset for the entity,
+        * num_groups_with_pending_reqs won't be decremented any
+        * longer in case a new descendant queue of the entity remains
+        * with no request waiting for completion.
          */
-       unsigned int num_active_groups;
+       unsigned int num_groups_with_pending_reqs;
  
         /*
          * Number of bfq_queues containing requests (including the
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c

index 4b0d5fb6916005571d4d4b9885e5a24d194e7a7d..63e0f12be7c98fe7770eb9f1e817f5319690c392 100644 (file)
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
                         container_of(entity, struct bfq_group, entity);
                 struct bfq_data *bfqd = bfqg->bfqd;
  
-               bfqd->num_active_groups++;
+               if (!entity->in_groups_with_pending_reqs) {
+                       entity->in_groups_with_pending_reqs = true;
+                       bfqd->num_groups_with_pending_reqs++;
+               }
         }
  #endif
author	Paolo Valente <paolo.valente@linaro.org>
	Thu, 6 Dec 2018 18:18:18 +0000 (19:18 +0100)
committer	Jens Axboe <axboe@kernel.dk>
	Fri, 7 Dec 2018 14:40:07 +0000 (07:40 -0700)
block/bfq-iosched.c		patch \| blob \| history
block/bfq-iosched.h		patch \| blob \| history
block/bfq-wf2q.c		patch \| blob \| history