sched/fair: Add util_est on top of PELT

author Patrick Bellasi <patrick.bellasi@arm.com>

Fri, 9 Mar 2018 09:52:42 +0000 (09:52 +0000)

committer Ingo Molnar <mingo@kernel.org>

Tue, 20 Mar 2018 07:11:06 +0000 (08:11 +0100)
author Patrick Bellasi <patrick.bellasi@arm.com>
Fri, 9 Mar 2018 09:52:42 +0000 (09:52 +0000)
committer Ingo Molnar <mingo@kernel.org>
Tue, 20 Mar 2018 07:11:06 +0000 (08:11 +0100)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 21b1168da9513659018134886f2f9c0b369f4ee7..f228c6033832cf1c960a77c59ef786121e296fc5 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,6 +274,34 @@ struct load_weight {
         u32                             inv_weight;
  };
  
+/**
+ * struct util_est - Estimated utilization of FAIR tasks
+ * @enqueued: instantaneous estimated utilization of a task/cpu
+ * @ewma:     the Exponential Weighted Moving Average (EWMA)
+ *            utilization of a task
+ *
+ * Support data structure to track an Exponential Weighted Moving Average
+ * (EWMA) of a FAIR task's utilization. New samples are added to the moving
+ * average each time a task completes an activation. Sample's weight is chosen
+ * so that the EWMA will be relatively insensitive to transient changes to the
+ * task's workload.
+ *
+ * The enqueued attribute has a slightly different meaning for tasks and cpus:
+ * - task:   the task's util_avg at last task dequeue time
+ * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
+ * Thus, the util_est.enqueued of a task represents the contribution on the
+ * estimated utilization of the CPU where that task is currently enqueued.
+ *
+ * Only for tasks we track a moving average of the past instantaneous
+ * estimated utilization. This allows absorbing sporadic drops in utilization
+ * of an otherwise almost periodic task.
+ */
+struct util_est {
+       unsigned int                    enqueued;
+       unsigned int                    ewma;
+#define UTIL_EST_WEIGHT_SHIFT          2
+};
+
  /*
   * The load_avg/util_avg accumulates an infinite geometric series
   * (see __update_load_avg() in kernel/sched/fair.c).
@@ -335,6 +363,7 @@ struct sched_avg {
         unsigned long                   load_avg;
         unsigned long                   runnable_load_avg;
         unsigned long                   util_avg;
+       struct util_est                 util_est;
  };
  
  struct sched_statistics {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 644d9a464380f8c607856bc0e6684be8d5ab0889..332303be4beba15d0b3375dff649fb70cdc071fe 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -541,6 +541,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                         cfs_rq->avg.runnable_load_avg);
         SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
                         cfs_rq->avg.util_avg);
+       SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
+                       cfs_rq->avg.util_est.enqueued);
         SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
                         cfs_rq->removed.load_avg);
         SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -989,6 +991,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
         P(se.avg.runnable_load_avg);
         P(se.avg.util_avg);
         P(se.avg.last_update_time);
+       P(se.avg.util_est.ewma);
+       P(se.avg.util_est.enqueued);
  #endif
         P(policy);
         P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 3582117e1580857a2945c0d91a57529a6fe58821..22b59a7facd2eb6244d26c64501bb9c9ca7da974 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3873,6 +3873,113 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
  
  static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
  
+static inline unsigned long task_util(struct task_struct *p)
+{
+       return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+       struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+       return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+       return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+                                   struct task_struct *p)
+{
+       unsigned int enqueued;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Update root cfs_rq's estimated utilization */
+       enqueued  = cfs_rq->avg.util_est.enqueued;
+       enqueued += _task_util_est(p);
+       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+       return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+       long last_ewma_diff;
+       struct util_est ue;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /*
+        * Update root cfs_rq's estimated utilization
+        *
+        * If *p is the last task then the root cfs_rq's estimated utilization
+        * of a CPU is 0 by definition.
+        */
+       ue.enqueued = 0;
+       if (cfs_rq->nr_running) {
+               ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+               ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                                    _task_util_est(p));
+       }
+       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+       /*
+        * Skip update of task's estimated utilization when the task has not
+        * yet completed an activation, e.g. being migrated.
+        */
+       if (!task_sleep)
+               return;
+
+       /*
+        * Skip update of task's estimated utilization when its EWMA is already
+        * within ~1% of SCHED_CAPACITY_SCALE of its last activation value.
+        */
+       ue = p->se.avg.util_est;
+       ue.enqueued = task_util(p);
+       last_ewma_diff = ue.enqueued - ue.ewma;
+       if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+               return;
+
+       /*
+        * Update Task's estimated utilization
+        *
+        * When *p completes an activation we can consolidate another sample
+        * of the task size. This is done by storing the current PELT value
+        * as ue.enqueued and by using this value to update the Exponential
+        * Weighted Moving Average (EWMA):
+        *
+        *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+        *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+        *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+        *          = w * (      last_ewma_diff            ) +     ewma(t-1)
+        *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+        *
+        * Where 'w' is the weight of new samples, which is configured to be
+        * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+        */
+       ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+       ue.ewma  += last_ewma_diff;
+       ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+       WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
  #else /* CONFIG_SMP */
  
  static inline int
@@ -3902,6 +4009,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
         return 0;
  }
  
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+                bool task_sleep) {}
+
  #endif /* CONFIG_SMP */
  
  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5249,6 +5363,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
  
+       util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
  }
  
@@ -5308,6 +5423,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 sub_nr_running(rq, 1);
  
+       util_est_dequeue(&rq->cfs, p, task_sleep);
         hrtick_update(rq);
  }
  
@@ -5835,7 +5951,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
  }
  
-static inline unsigned long task_util(struct task_struct *p);
  static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
  
  static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6351,11 +6466,6 @@ static unsigned long cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
  }
  
-static inline unsigned long task_util(struct task_struct *p)
-{
-       return p->se.avg.util_avg;
-}
-
  /*
   * cpu_util_wake: Compute CPU utilization with any contributions from
   * the waking task p removed.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h

index 9552fd5854bffc2c07ec77225a1aed9176da53cb..c459a4b6154440c4b373523bcb521924e96846c0 100644 (file)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
  SCHED_FEAT(WA_IDLE, true)
  SCHED_FEAT(WA_WEIGHT, true)
  SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, false)
author	Patrick Bellasi <patrick.bellasi@arm.com>
	Fri, 9 Mar 2018 09:52:42 +0000 (09:52 +0000)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 20 Mar 2018 07:11:06 +0000 (08:11 +0100)
include/linux/sched.h		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/features.h		patch \| blob \| history