* through the jobs entity pointer.
  */
 
+/**
+ * DOC: Flow Control
+ *
+ * The DRM GPU scheduler provides a flow control mechanism to regulate the rate
+ * in which the jobs fetched from scheduler entities are executed.
+ *
+ * In this context the &drm_gpu_scheduler keeps track of a driver specified
+ * credit limit representing the capacity of this scheduler and a credit count;
+ * every &drm_sched_job carries a driver specified number of credits.
+ *
+ * Once a job is executed (but not yet finished), the job's credits contribute
+ * to the scheduler's credit count until the job is finished. If by executing
+ * one more job the scheduler's credit count would exceed the scheduler's
+ * credit limit, the job won't be executed. Instead, the scheduler will wait
+ * until the credit count has decreased enough to not overflow its credit limit.
+ * This implies waiting for previously executed jobs.
+ *
+ * Optionally, drivers may register a callback (update_job_credits) provided by
+ * struct drm_sched_backend_ops to update the job's credits dynamically. The
+ * scheduler executes this callback every time the scheduler considers a job for
+ * execution and subsequently checks whether the job fits the scheduler's credit
+ * limit.
+ */
+
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/completion.h>
 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
 module_param_named(sched_policy, drm_sched_policy, int, 0444);
 
+static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
+{
+       u32 credits;
+
+       drm_WARN_ON(sched, check_sub_overflow(sched->credit_limit,
+                                             atomic_read(&sched->credit_count),
+                                             &credits));
+
+       return credits;
+}
+
+/**
+ * drm_sched_can_queue -- Can we queue more to the hardware?
+ * @sched: scheduler instance
+ * @entity: the scheduler entity
+ *
+ * Return true if we can push at least one more job from @entity, false
+ * otherwise.
+ */
+static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
+                               struct drm_sched_entity *entity)
+{
+       struct drm_sched_job *s_job;
+
+       s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
+       if (!s_job)
+               return false;
+
+       if (sched->ops->update_job_credits) {
+               s_job->credits = sched->ops->update_job_credits(s_job);
+
+               drm_WARN(sched, !s_job->credits,
+                        "Jobs with zero credits bypass job-flow control.\n");
+       }
+
+       /* If a job exceeds the credit limit, truncate it to the credit limit
+        * itself to guarantee forward progress.
+        */
+       if (drm_WARN(sched, s_job->credits > sched->credit_limit,
+                    "Jobs may not exceed the credit limit, truncate.\n"))
+               s_job->credits = sched->credit_limit;
+
+       return drm_sched_available_credits(sched) >= s_job->credits;
+}
+
 static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
                                                            const struct rb_node *b)
 {
 /**
  * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
  *
+ * @sched: the gpu scheduler
  * @rq: scheduler run queue to check.
  *
- * Try to find a ready entity, returns NULL if none found.
+ * Try to find the next ready entity.
+ *
+ * Return an entity if one is found; return an error-pointer (!NULL) if an
+ * entity was ready, but the scheduler had insufficient credits to accommodate
+ * its job; return NULL, if no ready entity was found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
+                             struct drm_sched_rq *rq)
 {
        struct drm_sched_entity *entity;
 
        if (entity) {
                list_for_each_entry_continue(entity, &rq->entities, list) {
                        if (drm_sched_entity_is_ready(entity)) {
+                               /* If we can't queue yet, preserve the current
+                                * entity in terms of fairness.
+                                */
+                               if (!drm_sched_can_queue(sched, entity)) {
+                                       spin_unlock(&rq->lock);
+                                       return ERR_PTR(-ENOSPC);
+                               }
+
                                rq->current_entity = entity;
                                reinit_completion(&entity->entity_idle);
                                spin_unlock(&rq->lock);
        }
 
        list_for_each_entry(entity, &rq->entities, list) {
-
                if (drm_sched_entity_is_ready(entity)) {
+                       /* If we can't queue yet, preserve the current entity in
+                        * terms of fairness.
+                        */
+                       if (!drm_sched_can_queue(sched, entity)) {
+                               spin_unlock(&rq->lock);
+                               return ERR_PTR(-ENOSPC);
+                       }
+
                        rq->current_entity = entity;
                        reinit_completion(&entity->entity_idle);
                        spin_unlock(&rq->lock);
 /**
  * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
  *
+ * @sched: the gpu scheduler
  * @rq: scheduler run queue to check.
  *
- * Find oldest waiting ready entity, returns NULL if none found.
+ * Find oldest waiting ready entity.
+ *
+ * Return an entity if one is found; return an error-pointer (!NULL) if an
+ * entity was ready, but the scheduler had insufficient credits to accommodate
+ * its job; return NULL, if no ready entity was found.
  */
 static struct drm_sched_entity *
-drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
+drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
+                               struct drm_sched_rq *rq)
 {
        struct rb_node *rb;
 
 
                entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
                if (drm_sched_entity_is_ready(entity)) {
+                       /* If we can't queue yet, preserve the current entity in
+                        * terms of fairness.
+                        */
+                       if (!drm_sched_can_queue(sched, entity)) {
+                               spin_unlock(&rq->lock);
+                               return ERR_PTR(-ENOSPC);
+                       }
+
                        rq->current_entity = entity;
                        reinit_completion(&entity->entity_idle);
                        break;
        struct drm_sched_fence *s_fence = s_job->s_fence;
        struct drm_gpu_scheduler *sched = s_fence->sched;
 
-       atomic_dec(&sched->hw_rq_count);
+       atomic_sub(s_job->credits, &sched->credit_count);
        atomic_dec(sched->score);
 
        trace_drm_sched_process_job(s_fence);
                                              &s_job->cb)) {
                        dma_fence_put(s_job->s_fence->parent);
                        s_job->s_fence->parent = NULL;
-                       atomic_dec(&sched->hw_rq_count);
+                       atomic_sub(s_job->credits, &sched->credit_count);
                } else {
                        /*
                         * remove job from pending_list.
        list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
                struct dma_fence *fence = s_job->s_fence->parent;
 
-               atomic_inc(&sched->hw_rq_count);
+               atomic_add(s_job->credits, &sched->credit_count);
 
                if (!full_recovery)
                        continue;
  * drm_sched_job_init - init a scheduler job
  * @job: scheduler job to init
  * @entity: scheduler entity to use
+ * @credits: the number of credits this job contributes to the schedulers
+ * credit limit
  * @owner: job owner for debugging
  *
  * Refer to drm_sched_entity_push_job() documentation
  */
 int drm_sched_job_init(struct drm_sched_job *job,
                       struct drm_sched_entity *entity,
-                      void *owner)
+                      u32 credits, void *owner)
 {
        if (!entity->rq) {
                /* This will most likely be followed by missing frames
                return -ENOENT;
        }
 
+       if (unlikely(!credits)) {
+               pr_err("*ERROR* %s: credits cannot be 0!\n", __func__);
+               return -EINVAL;
+       }
+
        job->entity = entity;
+       job->credits = credits;
        job->s_fence = drm_sched_fence_alloc(entity, owner);
        if (!job->s_fence)
                return -ENOMEM;
 }
 EXPORT_SYMBOL(drm_sched_job_cleanup);
 
-/**
- * drm_sched_can_queue -- Can we queue more to the hardware?
- * @sched: scheduler instance
- *
- * Return true if we can push more jobs to the hw, otherwise false.
- */
-static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched)
-{
-       return atomic_read(&sched->hw_rq_count) <
-               sched->hw_submission_limit;
-}
-
 /**
  * drm_sched_wakeup - Wake up the scheduler if it is ready to queue
  * @sched: scheduler instance
+ * @entity: the scheduler entity
  *
  * Wake up the scheduler if we can queue jobs.
  */
                      struct drm_sched_entity *entity)
 {
        if (drm_sched_entity_is_ready(entity))
-               if (drm_sched_can_queue(sched))
+               if (drm_sched_can_queue(sched, entity))
                        drm_sched_run_job_queue(sched);
 }
 
  *
  * @sched: scheduler instance
  *
- * Returns the entity to process or NULL if none are found.
+ * Return an entity to process or NULL if none are found.
+ *
+ * Note, that we break out of the for-loop when "entity" is non-null, which can
+ * also be an error-pointer--this assures we don't process lower priority
+ * run-queues. See comments in the respectively called functions.
  */
 static struct drm_sched_entity *
 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
        struct drm_sched_entity *entity;
        int i;
 
-       if (!drm_sched_can_queue(sched))
-               return NULL;
-
        /* Kernel run queue has higher priority than normal run queue*/
        for (i = sched->num_rqs - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
                entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
-                       drm_sched_rq_select_entity_fifo(sched->sched_rq[i]) :
-                       drm_sched_rq_select_entity_rr(sched->sched_rq[i]);
+                       drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) :
+                       drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]);
                if (entity)
                        break;
        }
 
-       return entity;
+       return IS_ERR(entity) ? NULL : entity;
 }
 
 /**
 
        s_fence = sched_job->s_fence;
 
-       atomic_inc(&sched->hw_rq_count);
+       atomic_add(sched_job->credits, &sched->credit_count);
        drm_sched_job_begin(sched_job);
 
        trace_drm_run_job(sched_job, entity);
  * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is
  *            allocated and used
  * @num_rqs: number of runqueues, one for each priority, up to DRM_SCHED_PRIORITY_COUNT
- * @hw_submission: number of hw submissions that can be in flight
+ * @credit_limit: the number of credits this scheduler can hold from all jobs
  * @hang_limit: number of times to allow a job to hang before dropping it
  * @timeout: timeout value in jiffies for the scheduler
  * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
 int drm_sched_init(struct drm_gpu_scheduler *sched,
                   const struct drm_sched_backend_ops *ops,
                   struct workqueue_struct *submit_wq,
-                  u32 num_rqs, uint32_t hw_submission, unsigned int hang_limit,
+                  u32 num_rqs, u32 credit_limit, unsigned int hang_limit,
                   long timeout, struct workqueue_struct *timeout_wq,
                   atomic_t *score, const char *name, struct device *dev)
 {
        int i, ret;
 
        sched->ops = ops;
-       sched->hw_submission_limit = hw_submission;
+       sched->credit_limit = credit_limit;
        sched->name = name;
        sched->timeout = timeout;
        sched->timeout_wq = timeout_wq ? : system_wq;
        init_waitqueue_head(&sched->job_scheduled);
        INIT_LIST_HEAD(&sched->pending_list);
        spin_lock_init(&sched->job_list_lock);
-       atomic_set(&sched->hw_rq_count, 0);
+       atomic_set(&sched->credit_count, 0);
        INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
        INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
        INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
 
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
+ * @credits: the number of credits this job contributes to the scheduler
  * @work: Helper to reschdeule job kill to different context.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
        struct drm_gpu_scheduler        *sched;
        struct drm_sched_fence          *s_fence;
 
+       u32                             credits;
+
        /*
         * work is used only after finish_cb has been used and will not be
         * accessed anymore.
          * and it's time to clean it up.
         */
        void (*free_job)(struct drm_sched_job *sched_job);
+
+       /**
+        * @update_job_credits: Called when the scheduler is considering this
+        * job for execution.
+        *
+        * This callback returns the number of credits the job would take if
+        * pushed to the hardware. Drivers may use this to dynamically update
+        * the job's credit count. For instance, deduct the number of credits
+        * for already signalled native fences.
+        *
+        * This callback is optional.
+        */
+       u32 (*update_job_credits)(struct drm_sched_job *sched_job);
 };
 
 /**
  * struct drm_gpu_scheduler - scheduler instance-specific data
  *
  * @ops: backend operations provided by the driver.
- * @hw_submission_limit: the max size of the hardware queue.
+ * @credit_limit: the credit limit of this scheduler
+ * @credit_count: the current credit count of this scheduler
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
  * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
  *                 waits on this wait queue until all the scheduled jobs are
  *                 finished.
- * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
  * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
  */
 struct drm_gpu_scheduler {
        const struct drm_sched_backend_ops      *ops;
-       uint32_t                        hw_submission_limit;
+       u32                             credit_limit;
+       atomic_t                        credit_count;
        long                            timeout;
        const char                      *name;
        u32                             num_rqs;
        struct drm_sched_rq             **sched_rq;
        wait_queue_head_t               job_scheduled;
-       atomic_t                        hw_rq_count;
        atomic64_t                      job_id_count;
        struct workqueue_struct         *submit_wq;
        struct workqueue_struct         *timeout_wq;
 int drm_sched_init(struct drm_gpu_scheduler *sched,
                   const struct drm_sched_backend_ops *ops,
                   struct workqueue_struct *submit_wq,
-                  u32 num_rqs, uint32_t hw_submission, unsigned int hang_limit,
+                  u32 num_rqs, u32 credit_limit, unsigned int hang_limit,
                   long timeout, struct workqueue_struct *timeout_wq,
                   atomic_t *score, const char *name, struct device *dev);
 
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
                       struct drm_sched_entity *entity,
-                      void *owner);
+                      u32 credits, void *owner);
 void drm_sched_job_arm(struct drm_sched_job *job);
 int drm_sched_job_add_dependency(struct drm_sched_job *job,
                                 struct dma_fence *fence);