sched/uclamp: Add a new sysctl to control RT default boost value

author Qais Yousef <qais.yousef@arm.com>

Thu, 16 Jul 2020 11:03:45 +0000 (12:03 +0100)

committer Peter Zijlstra <peterz@infradead.org>

Wed, 29 Jul 2020 11:51:47 +0000 (13:51 +0200)
author Qais Yousef <qais.yousef@arm.com>
Thu, 16 Jul 2020 11:03:45 +0000 (12:03 +0100)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 29 Jul 2020 11:51:47 +0000 (13:51 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index adf0125190d412c320a9c764b67cc6034bbad652..a6bf77c34687645d8174e097634d0b97b5bf5019 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -686,9 +686,15 @@ struct task_struct {
         struct sched_dl_entity          dl;
  
  #ifdef CONFIG_UCLAMP_TASK
-       /* Clamp values requested for a scheduling entity */
+       /*
+        * Clamp values requested for a scheduling entity.
+        * Must be updated with task_rq_lock() held.
+        */
         struct uclamp_se                uclamp_req[UCLAMP_CNT];
-       /* Effective clamp values used for a scheduling entity */
+       /*
+        * Effective clamp values used for a scheduling entity.
+        * Must be updated with task_rq_lock() held.
+        */
         struct uclamp_se                uclamp[UCLAMP_CNT];
  #endif
  
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h

index 24be30a4081450e422cd7c038771f0adb31aea1e..3c31ba88aca59eb04dab769aad927fcfad7057ca 100644 (file)
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -67,6 +67,7 @@ extern unsigned int sysctl_sched_dl_period_min;
  #ifdef CONFIG_UCLAMP_TASK
  extern unsigned int sysctl_sched_uclamp_util_min;
  extern unsigned int sysctl_sched_uclamp_util_max;
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
  #endif
  
  #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h

index 38359071236ad7d81602f4f896335630c1f80035..e7ddab095baf4ca9677c82c5c5b7980d60f24924 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
  extern void init_idle(struct task_struct *idle, int cpu);
  
  extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p);
  extern void sched_dead(struct task_struct *p);
  
  void __noreturn do_task_dead(void);
diff --git a/kernel/fork.c b/kernel/fork.c

index efc5493203ae0b744d0684ec10f531bb5c7b9557..e75c2e41f3d1e40682a545eb171b1c85bb4ae16c 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process(
         write_unlock_irq(&tasklist_lock);
  
         proc_fork_connector(p);
+       sched_post_fork(p);
         cgroup_post_fork(p, args);
         perf_event_fork(p);
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index e44d83f3e0e6bb9514ad12cc585e73553a885f6e..12e1f3a2cabc649aa90a06d54fbd90b180534bcc 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  /* Max allowed maximum utilization */
  unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
  
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
  /* All clamps are required to be less or equal than these values */
  static struct uclamp_se uclamp_default[UCLAMP_CNT];
  
@@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
         return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+       unsigned int default_util_min;
+       struct uclamp_se *uc_se;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+       /* Only sync if user didn't override the default */
+       if (uc_se->user_defined)
+               return;
+
+       default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+       uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       if (!rt_task(p))
+               return;
+
+       /* Protect updates to p->uclamp_* */
+       rq = task_rq_lock(p, &rf);
+       __uclamp_update_util_min_rt_default(p);
+       task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+       struct task_struct *g, *p;
+
+       /*
+        * copy_process()                       sysctl_uclamp
+        *                                        uclamp_min_rt = X;
+        *   write_lock(&tasklist_lock)           read_lock(&tasklist_lock)
+        *   // link thread                       smp_mb__after_spinlock()
+        *   write_unlock(&tasklist_lock)         read_unlock(&tasklist_lock);
+        *   sched_post_fork()                    for_each_process_thread()
+        *     __uclamp_sync_rt()                   __uclamp_sync_rt()
+        *
+        * Ensures that either sched_post_fork() will observe the new
+        * uclamp_min_rt or for_each_process_thread() will observe the new
+        * task.
+        */
+       read_lock(&tasklist_lock);
+       smp_mb__after_spinlock();
+       read_unlock(&tasklist_lock);
+
+       rcu_read_lock();
+       for_each_process_thread(g, p)
+               uclamp_update_util_min_rt_default(p);
+       rcu_read_unlock();
+}
+
  static inline struct uclamp_se
  uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
  {
@@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void *buffer, size_t *lenp, loff_t *ppos)
  {
         bool update_root_tg = false;
-       int old_min, old_max;
+       int old_min, old_max, old_min_rt;
         int result;
  
         mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
+       old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
  
         result = proc_dointvec(table, write, buffer, lenp, ppos);
         if (result)
@@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                 goto done;
  
         if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
-           sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+           sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+           sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
                 result = -EINVAL;
                 goto undo;
         }
@@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                 uclamp_update_root_tg();
         }
  
+       if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+               static_branch_enable(&sched_uclamp_used);
+               uclamp_sync_util_min_rt_default();
+       }
+
         /*
          * We update all RUNNABLE tasks only when task groups are in use.
          * Otherwise, keep it simple and do just a lazy update at each next
@@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
  undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
+       sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
  done:
         mutex_unlock(&uclamp_mutex);
  
@@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
          */
         for_each_clamp_id(clamp_id) {
                 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
-               unsigned int clamp_value = uclamp_none(clamp_id);
  
                 /* Keep using defined clamps across class changes */
                 if (uc_se->user_defined)
                         continue;
  
-               /* By default, RT tasks always get 100% boost */
+               /*
+                * By default, RT tasks have a 100% boost value, which can be
+                * modified at runtime.
+                */
                 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
-                       clamp_value = uclamp_none(UCLAMP_MAX);
+                       __uclamp_update_util_min_rt_default(p);
+               else
+                       uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
  
-               uclamp_se_set(uc_se, clamp_value, false);
         }
  
         if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
  {
         enum uclamp_id clamp_id;
  
+       /*
+        * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+        * as the task is still at its early fork stages.
+        */
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
  
@@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p)
         }
  }
  
+static void uclamp_post_fork(struct task_struct *p)
+{
+       uclamp_update_util_min_rt_default(p);
+}
+
  static void __init init_uclamp_rq(struct rq *rq)
  {
         enum uclamp_id clamp_id;
@@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
  static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr) { }
  static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
  static inline void init_uclamp(void) { }
  #endif /* CONFIG_UCLAMP_TASK */
  
@@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         return 0;
  }
  
+void sched_post_fork(struct task_struct *p)
+{
+       uclamp_post_fork(p);
+}
+
  unsigned long to_ratio(u64 period, u64 runtime)
  {
         if (runtime == RUNTIME_INF)
@@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
                 kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
+       /*
+        * This could race with another potential updater, but this is fine
+        * because it'll correctly read the old or the new value. We don't need
+        * to guarantee who wins the race as long as it doesn't return garbage.
+        */
         kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
         kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 4aea67d3d55263981fdb558b4ec0d6f92e893e54..1b4d2dc270a59f55d121e8ccf10007a159fe133d 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = sysctl_sched_uclamp_handler,
         },
+       {
+               .procname       = "sched_util_clamp_min_rt_default",
+               .data           = &sysctl_sched_uclamp_util_min_rt_default,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_sched_uclamp_handler,
+       },
  #endif
  #ifdef CONFIG_SCHED_AUTOGROUP
         {
author	Qais Yousef <qais.yousef@arm.com>
	Thu, 16 Jul 2020 11:03:45 +0000 (12:03 +0100)
committer	Peter Zijlstra <peterz@infradead.org>
	Wed, 29 Jul 2020 11:51:47 +0000 (13:51 +0200)
include/linux/sched.h		patch \| blob \| history
include/linux/sched/sysctl.h		patch \| blob \| history
include/linux/sched/task.h		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history