*/
 int sysctl_sched_rt_runtime = 950000;
 
+/*
+ * Maximum bandwidth available for all -deadline tasks and groups
+ * (if group scheduling is configured) on each CPU.
+ *
+ * default: 5%
+ */
+unsigned int sysctl_sched_dl_period = 1000000;
+int sysctl_sched_dl_runtime = 50000;
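+/*
+ * Note: like sysctl_sched_rt_runtime above, both values are expressed
+ * in microseconds, so the defaults allow -deadline entities to consume
+ * at most 50000us of CPU time every 1000000us on each CPU (5%).
+ */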
+
 
 
 /*
        return 0;
 }
 
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 20;
+
+       /*
+        * Doing this here saves a lot of checks in all
+        * the calling paths, and returning zero seems
+        * safe for them anyway.
+        */
+       if (period == 0)
+               return 0;
+
+       return div64_u64(runtime << 20, period);
+}
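+/*
+ * Note that the ratio is returned in 20-bit fixed point, i.e. 1 << 20
+ * corresponds to 100% of a CPU. For example, the default -deadline
+ * bandwidth of 50ms every 1s gives (in nanoseconds)
+ * to_ratio(1000000000, 50000000) == 52428, roughly 5% of 1048576.
+ */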
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+       return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+       return cpumask_weight(rq->rd->span);
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+       return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+       return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+       return dl_b->bw != -1 &&
+              dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If so, this function also updates the currently
+ * allocated bandwidth accordingly, to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+                      const struct sched_attr *attr)
+{
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       u64 period = attr->sched_period;
+       u64 runtime = attr->sched_runtime;
+       u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+       int cpus = __dl_span_weight(task_rq(p));
+       int err = -1;
+
+       if (new_bw == p->dl.dl_bw)
+               return 0;
+
+       /*
+        * Whether a task enters, leaves, or stays -deadline but changes
+        * its parameters, we may need to update the total allocated
+        * bandwidth of the container accordingly.
+        */
+       raw_spin_lock(&dl_b->lock);
+       if (dl_policy(policy) && !task_has_dl_policy(p) &&
+           !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+                  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+               __dl_clear(dl_b, p->dl.dl_bw);
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+               __dl_clear(dl_b, p->dl.dl_bw);
+               err = 0;
+       }
+       raw_spin_unlock(&dl_b->lock);
+
+       return err;
+}
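+/*
+ * A purely illustrative example: on a 4-CPU root_domain with the
+ * default 5% per-CPU limit (dl_b->bw == 52428), the capacity checked
+ * by __dl_overflow() is 4 * 52428 == 209712. A task asking, say, for
+ * 10ms of runtime every 200ms carries new_bw == 52428, so up to four
+ * such tasks can be admitted before dl_overflow() starts failing.
+ */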
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
        dl_se->dl_deadline = attr->sched_deadline;
        dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
        dl_se->flags = attr->sched_flags;
+       dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
        dl_se->dl_throttled = 0;
        dl_se->dl_new = 1;
 }
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
- * greater than deadline.
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
        return attr && attr->sched_deadline != 0 &&
                (attr->sched_period == 0 ||
                (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
-               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0;
+               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
+               attr->sched_runtime >= (2 << (DL_SCALE - 1));
 }
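+/*
+ * With DL_SCALE == 10, the bound above is 2 << (DL_SCALE - 1) ==
+ * 1 << DL_SCALE == 1024ns, i.e. runtimes below (roughly) 1us are
+ * rejected by __checkparam_dl().
+ */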
 
 /*
        }
 change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
        if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
                /*
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
                        task_rq_unlock(rq, p, &flags);
                        return -EPERM;
                }
-       }
 #endif
+#ifdef CONFIG_SMP
+               if (dl_bandwidth_enabled() && dl_policy(policy)) {
+                       cpumask_t *span = rq->rd->span;
+                       cpumask_t act_affinity;
+
+                       /*
+                        * The cpus_allowed mask is statically initialized
+                        * to CPU_MASK_ALL, while span is dynamic. Here we
+                        * compute the task's "dynamic" affinity.
+                        */
+                       cpumask_and(&act_affinity, &p->cpus_allowed,
+                                   cpu_active_mask);
+
+                       /*
+                        * Don't allow tasks with an affinity mask smaller than
+                        * the entire root_domain to become SCHED_DEADLINE. We
+                        * will also fail if there's no bandwidth available.
+                        */
+                       if (!cpumask_equal(&act_affinity, span) ||
+                                          rq->rd->dl_bw.bw == 0) {
+                               task_rq_unlock(rq, p, &flags);
+                               return -EPERM;
+                       }
+               }
+#endif
+       }
 
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                task_rq_unlock(rq, p, &flags);
                goto recheck;
        }
+
+       /*
+        * If setscheduling to SCHED_DEADLINE (or changing the parameters
+        * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+        * is available.
+        */
+       if ((dl_policy(policy) || dl_task(p)) &&
+           dl_overflow(p, policy, attr)) {
+               task_rq_unlock(rq, p, &flags);
+               return -EBUSY;
+       }
+
        on_rq = p->on_rq;
        running = task_current(rq, p);
        if (on_rq)
        if (retval)
                goto out_unlock;
 
+       /*
+        * Since bandwidth control happens on a per-root_domain basis,
+        * if the admission test is enabled we only admit -deadline
+        * tasks that are allowed to run on all the CPUs in their
+        * root_domain.
+        */
+#ifdef CONFIG_SMP
+       if (task_has_dl_policy(p)) {
+               const struct cpumask *span = task_rq(p)->rd->span;
+
+               if (dl_bandwidth_enabled() &&
+                   !cpumask_equal(in_mask, span)) {
+                       retval = -EBUSY;
+                       goto out_unlock;
+               }
+       }
+#endif
+
        cpuset_cpus_allowed(p, cpus_allowed);
        cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
+/*
+ * When dealing with a -deadline task, we have to check whether moving
+ * it to a new CPU is possible or not. In fact, this is only true if
+ * there is enough bandwidth available on the destination CPU; otherwise
+ * we want the whole migration procedure to fail.
+ */
+static inline
+bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu)
+{
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       struct dl_bw *cpu_b = dl_bw_of(cpu);
+       int ret = 1;
+       u64 bw;
+
+       if (dl_b == cpu_b)
+               return 1;
+
+       raw_spin_lock(&dl_b->lock);
+       raw_spin_lock(&cpu_b->lock);
+
+       bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span);
+       if (dl_bandwidth_enabled() &&
+           bw < cpu_b->total_bw + p->dl.dl_bw) {
+               ret = 0;
+               goto unlock;
+       }
+       dl_b->total_bw -= p->dl.dl_bw;
+       cpu_b->total_bw += p->dl.dl_bw;
+
+unlock:
+       raw_spin_unlock(&cpu_b->lock);
+       raw_spin_unlock(&dl_b->lock);
+
+       return ret;
+}
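+/*
+ * For example (illustrative numbers only): moving a 5% task
+ * (dl_bw == 52428) into a 2-CPU root_domain with the default per-CPU
+ * limit gives bw == 2 * 52428 == 104856, so with admission control
+ * enabled the migration succeeds only if the destination's total_bw
+ * does not exceed 52428.
+ */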
+
 /*
  * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
        if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                goto fail;
 
+       /*
+        * If p is -deadline, proceed only if there is enough
+        * bandwidth available on dest_cpu.
+        */
+       if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu))
+               goto fail;
+
        /*
         * If we're not on a rq, the next wake-up will ensure we're
         * placed properly.
        if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                goto free_dlo_mask;
 
+       init_dl_bw(&rd->dl_bw);
+
        if (cpupri_init(&rd->cpupri) != 0)
                goto free_rto_mask;
        return 0;
 #endif /* CONFIG_CPUMASK_OFFSTACK */
        }
 
+       init_rt_bandwidth(&def_rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+       init_dl_bandwidth(&def_dl_bandwidth,
+                       global_dl_period(), global_dl_runtime());
+
 #ifdef CONFIG_SMP
        init_defrootdomain();
 #endif
 
-       init_rt_bandwidth(&def_rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-       if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
-
-       return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
        do_div(rt_period_us, NSEC_PER_USEC);
        return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -rt and -deadline bandwidth.
+ *
+ * Here we check if the new -rt bandwidth value is consistent
+ * with the system settings for the bandwidth available
+ * to -deadline tasks.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_rt_dl_global_constraints(u64 rt_bw)
+{
+       unsigned long flags;
+       u64 dl_bw;
+       bool ret;
+
+       raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags);
+       if (global_rt_runtime() == RUNTIME_INF ||
+           global_dl_runtime() == RUNTIME_INF) {
+               ret = true;
+               goto unlock;
+       }
+
+       dl_bw = to_ratio(def_dl_bandwidth.dl_period,
+                        def_dl_bandwidth.dl_runtime);
+
+       ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+       raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags);
+
+       return ret;
+}
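+/*
+ * With the defaults, rt_bw == to_ratio(1s, 950ms) == 996147 and
+ * dl_bw == 52428, so 996147 + 52428 == 1048575 <= 1 << 20 and the
+ * check passes. The same invariant is enforced in the other direction
+ * by __sched_dl_rt_global_constraints() below: e.g. raising
+ * sched_dl_runtime to 100000us (10%) while -rt keeps its default 95%
+ * is rejected.
+ */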
+
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-       u64 runtime, period;
+       u64 runtime, period, bw;
        int ret = 0;
 
        if (sysctl_sched_rt_period <= 0)
        if (runtime > period && runtime != RUNTIME_INF)
                return -EINVAL;
 
+       bw = to_ratio(period, runtime);
+       if (!__sched_rt_dl_global_constraints(bw))
+               return -EINVAL;
+
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
        ret = __rt_schedulable(NULL, 0, 0);
 static int sched_rt_global_constraints(void)
 {
        unsigned long flags;
-       int i;
+       int i, ret = 0;
+       u64 bw;
 
        if (sysctl_sched_rt_period <= 0)
                return -EINVAL;
 
-       /*
-        * There's always some RT tasks in the root group
-        * -- migration, kstopmachine etc..
-        */
-       if (sysctl_sched_rt_runtime == 0)
-               return -EBUSY;
-
        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+       bw = to_ratio(global_rt_period(), global_rt_runtime());
+       if (!__sched_rt_dl_global_constraints(bw)) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
 
                rt_rq->rt_runtime = global_rt_runtime();
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
+unlock:
        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-       return 0;
+       return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+/*
+ * Coupling of -dl and -rt bandwidth.
+ *
+ * Here we check, while setting the system-wide bandwidth available
+ * to -dl tasks and groups, whether the new values are consistent with
+ * the system settings for the bandwidth available to -rt entities.
+ *
+ * IOW, we want to enforce that
+ *
+ *   rt_bandwidth + dl_bandwidth <= 100%
+ *
+ * is always true.
+ */
+static bool __sched_dl_rt_global_constraints(u64 dl_bw)
+{
+       u64 rt_bw;
+       bool ret;
+
+       raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
+       if (global_dl_runtime() == RUNTIME_INF ||
+           global_rt_runtime() == RUNTIME_INF) {
+               ret = true;
+               goto unlock;
+       }
+
+       rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period),
+                        def_rt_bandwidth.rt_runtime);
+
+       ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
+unlock:
+       raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
+
+       return ret;
+}
+
+static int __sched_dl_global_constraints(u64 runtime, u64 period)
+{
+       if (!period || (runtime != RUNTIME_INF && runtime > period))
+               return -EINVAL;
+
+       return 0;
+}
+
+static int sched_dl_global_constraints(void)
+{
+       u64 runtime = global_dl_runtime();
+       u64 period = global_dl_period();
+       u64 new_bw = to_ratio(period, runtime);
+       int ret, i;
+
+       ret = __sched_dl_global_constraints(runtime, period);
+       if (ret)
+               return ret;
+
+       if (!__sched_dl_rt_global_constraints(new_bw))
+               return -EINVAL;
+
+       /*
+        * Here we want to make sure the bandwidth is not being set to
+        * a value smaller than the bandwidth currently allocated in
+        * any of the root_domains.
+        *
+        * FIXME: Cycling over all the CPUs is overkill, but simpler than
+        * cycling over root_domains... Discussion on different/better
+        * solutions is welcome!
+        */
+       for_each_possible_cpu(i) {
+               struct dl_bw *dl_b = dl_bw_of(i);
+
+               raw_spin_lock(&dl_b->lock);
+               if (new_bw < dl_b->total_bw) {
+                       raw_spin_unlock(&dl_b->lock);
+                       return -EBUSY;
+               }
+               raw_spin_unlock(&dl_b->lock);
+       }
+
+       return 0;
+}
+
 int sched_rr_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
        return ret;
 }
 
+int sched_dl_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+       int old_period, old_runtime;
+       static DEFINE_MUTEX(mutex);
+       unsigned long flags;
+
+       mutex_lock(&mutex);
+       old_period = sysctl_sched_dl_period;
+       old_runtime = sysctl_sched_dl_runtime;
+
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (!ret && write) {
+               raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock,
+                                     flags);
+
+               ret = sched_dl_global_constraints();
+               if (ret) {
+                       sysctl_sched_dl_period = old_period;
+                       sysctl_sched_dl_runtime = old_runtime;
+               } else {
+                       u64 new_bw;
+                       int i;
+
+                       def_dl_bandwidth.dl_period = global_dl_period();
+                       def_dl_bandwidth.dl_runtime = global_dl_runtime();
+                       if (global_dl_runtime() == RUNTIME_INF)
+                               new_bw = -1;
+                       else
+                               new_bw = to_ratio(global_dl_period(),
+                                                 global_dl_runtime());
+                       /*
+                        * FIXME: As above...
+                        */
+                       for_each_possible_cpu(i) {
+                               struct dl_bw *dl_b = dl_bw_of(i);
+
+                               raw_spin_lock(&dl_b->lock);
+                               dl_b->bw = new_bw;
+                               raw_spin_unlock(&dl_b->lock);
+                       }
+               }
+
+               raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
+                                          flags);
+       }
+       mutex_unlock(&mutex);
+
+       return ret;
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 
 #define NICE_0_LOAD            SCHED_LOAD_SCALE
 #define NICE_0_SHIFT           SCHED_LOAD_SHIFT
 
+/*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9  -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  */
        return dl_policy(p->policy);
 }
 
-static inline int dl_time_before(u64 a, u64 b)
+static inline bool dl_time_before(u64 a, u64 b)
 {
        return (s64)(a - b) < 0;
 }
 /*
  * Tells if entity @a should preempt entity @b.
  */
-static inline
-int dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 {
        return dl_time_before(a->deadline, b->deadline);
 }
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
 };
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where we can:
+ *  - store the maximum -deadline bandwidth of the system (the group);
+ *  - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", nor do we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ *  - bw (< 100%) is the maximum bandwidth of the system (group) on
+ *    each CPU;
+ *  - total_bw caches the bandwidth currently allocated on the CPUs
+ *    spanned by the corresponding root_domain.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ */
+struct dl_bandwidth {
+       raw_spinlock_t dl_runtime_lock;
+       u64 dl_runtime;
+       u64 dl_period;
+};
+
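+/*
+ * -deadline admission control is in effect only as long as the sysctl
+ * runtime is not set to -1 (i.e. RUNTIME_INF); in that case
+ * sched_dl_handler() propagates bw == -1 to every dl_bw and
+ * __dl_overflow() lets everything through.
+ */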
+static inline int dl_bandwidth_enabled(void)
+{
+       return sysctl_sched_dl_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+       raw_spinlock_t lock;
+       u64 bw, total_bw;
+};
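+/*
+ * For instance, with the default 5% limit, bw == 52428 on each CPU and
+ * total_bw may grow up to bw times the number of CPUs spanned by the
+ * root_domain (see __dl_overflow()) before new -deadline tasks are
+ * rejected.
+ */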
+
+static inline u64 global_dl_period(void);
+static inline u64 global_dl_runtime(void);
 
 extern struct mutex sched_domains_mutex;
 
         */
        struct rb_root pushable_dl_tasks_root;
        struct rb_node *pushable_dl_tasks_leftmost;
+#else
+       struct dl_bw dl_bw;
 #endif
 };
 
         */
        cpumask_var_t dlo_mask;
        atomic_t dlo_count;
+       struct dl_bw dl_bw;
 
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline u64 global_dl_period(void)
+{
+       return (u64)sysctl_sched_dl_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_dl_runtime(void)
+{
+       if (sysctl_sched_dl_runtime < 0)
+               return RUNTIME_INF;
 
+       return (u64)sysctl_sched_dl_runtime * NSEC_PER_USEC;
+}
 
 static inline int task_current(struct rq *rq, struct task_struct *p)
 {
 extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
 extern void resched_task(struct task_struct *p);
 extern void resched_cpu(int cpu);
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
+unsigned long to_ratio(u64 period, u64 runtime);
+
 extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);