]> www.infradead.org Git - nvme.git/commitdiff
drm/i915: Request watchdog infrastructure
authorTvrtko Ursulin <tvrtko.ursulin@intel.com>
Wed, 24 Mar 2021 12:13:33 +0000 (12:13 +0000)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Thu, 25 Mar 2021 23:58:52 +0000 (00:58 +0100)
Prepares the plumbing for setting a request/fence expiration time. All the
code is put in place but is never activated, because the ability to actually
configure the timer is still missing.

Outline of the basic operation:

A timer is started when a request is ready for execution. If the request
completes (retires) before the timer fires, the timer is cancelled and
nothing further happens.

If the timer fires, the request is added to a lockless list and a worker is
queued. The purpose of this is twofold: a) it allows request cancellation
from a more friendly context and b) it coalesces multiple expirations into a
single event of consuming the list.

The worker locklessly consumes the list of expired requests and cancels them
all using the previously added i915_request_cancel().

The associated timeout value is stored in rq->context->watchdog.timeout_us.

v2:
 * Log expiration.

v3:
 * Include more information about user timeline in the log message.

v4:
 * Remove obsolete comment and fix formatting. (Matt)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210324121335.2307063-6-tvrtko.ursulin@linux.intel.com
drivers/gpu/drm/i915/gt/intel_context_types.h
drivers/gpu/drm/i915/gt/intel_execlists_submission.h
drivers/gpu/drm/i915/gt/intel_gt.c
drivers/gpu/drm/i915/gt/intel_gt.h
drivers/gpu/drm/i915/gt/intel_gt_requests.c
drivers/gpu/drm/i915/gt/intel_gt_types.h
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_request.h

index e10d78601bbd830f84a740dd4507bd3be093db89..b457d6c493252ba57b86420be6c99744ce203dca 100644 (file)
@@ -97,6 +97,10 @@ struct intel_context {
 #define CONTEXT_FORCE_SINGLE_SUBMISSION        7
 #define CONTEXT_NOPREEMPT              8
 
+       struct {
+               u64 timeout_us;
+       } watchdog;
+
        u32 *lrc_reg_state;
        union {
                struct {
index a8fd7adefd8240cc4229acf8b361fe1ca1dd76c0..fd61dae820e9e940a34539523c279d3f76718d9d 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
 #define __INTEL_EXECLISTS_SUBMISSION_H__
 
+#include <linux/llist.h>
 #include <linux/types.h>
 
 struct drm_printer;
@@ -13,6 +14,7 @@ struct drm_printer;
 struct i915_request;
 struct intel_context;
 struct intel_engine_cs;
+struct intel_gt;
 
 enum {
        INTEL_CONTEXT_SCHEDULE_IN = 0,
index d8e1ab41263430b5c9447ed6f35973ef524208ca..ff63034cff9c0f1b45510ce2381d49b991691620 100644 (file)
@@ -29,6 +29,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
        INIT_LIST_HEAD(&gt->closed_vma);
        spin_lock_init(&gt->closed_lock);
 
+       init_llist_head(&gt->watchdog.list);
+       INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
+
        intel_gt_init_buffer_pool(gt);
        intel_gt_init_reset(gt);
        intel_gt_init_requests(gt);
index 9157c7411f60398cf6daabb029a7135cbd5b70fa..35d3bb13372f68e60f8c53f8a4dd18759977da7a 100644 (file)
@@ -77,4 +77,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
 void intel_gt_info_print(const struct intel_gt_info *info,
                         struct drm_printer *p);
 
+void intel_gt_watchdog_work(struct work_struct *work);
+
 #endif /* __INTEL_GT_H__ */
index dc06c78c9eebca3898e2c3355b457e42f55f90fb..f7e5ce2e22919f942bb07636b3142b7041169363 100644 (file)
@@ -9,6 +9,7 @@
 #include "i915_drv.h" /* for_each_engine() */
 #include "i915_request.h"
 #include "intel_engine_heartbeat.h"
+#include "intel_execlists_submission.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
 #include "intel_gt_requests.h"
@@ -243,4 +244,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
 {
        /* Wait until the work is marked as finished before unloading! */
        cancel_delayed_work_sync(&gt->requests.retire_work);
+
+       flush_work(&gt->watchdog.work);
+}
+
+/*
+ * Deferred handler for requests whose watchdog timer has fired.
+ *
+ * Detaches everything currently queued on gt->watchdog.list in one go and
+ * walks the detached entries. A request which has not completed is logged
+ * and cancelled with -EINTR. In every case one request reference is dropped
+ * per list entry.
+ */
+void intel_gt_watchdog_work(struct work_struct *work)
+{
+       struct intel_gt *gt =
+               container_of(work, struct intel_gt, watchdog.work);
+       struct llist_node *expired = llist_del_all(&gt->watchdog.list);
+       struct i915_request *rq, *next;
+
+       /* Nothing was queued by the time the worker ran. */
+       if (!expired)
+               return;
+
+       llist_for_each_entry_safe(rq, next, expired, watchdog.link) {
+               struct dma_fence *f = &rq->fence;
+
+               if (i915_request_completed(rq)) {
+                       /* Completed in the meantime; just drop the reference. */
+                       i915_request_put(rq);
+                       continue;
+               }
+
+               pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
+                         f->ops->get_driver_name(f),
+                         f->ops->get_timeline_name(f),
+                         f->seqno);
+               i915_request_cancel(rq, -EINTR);
+               i915_request_put(rq);
+       }
+}
index f7dab4fc926c964d0b427639553023edca8b5af7..0caf6ca0a784f143addd06f3e6713ff5f5255e64 100644 (file)
@@ -8,10 +8,12 @@
 
 #include <linux/ktime.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include "uc/intel_uc.h"
 
@@ -52,6 +54,11 @@ struct intel_gt {
                struct delayed_work retire_work;
        } requests;
 
+       struct {
+               struct llist_head list;
+               struct work_struct work;
+       } watchdog;
+
        struct intel_wakeref wakeref;
        atomic_t user_wakeref;
 
index a031b86f850830f738389f9a77a021fab864083d..63968d163c14829148c580b2b42266f9e36f0154 100644 (file)
@@ -321,6 +321,53 @@ static void remove_from_engine(struct i915_request *rq)
        __notify_execute_cb_imm(rq);
 }
 
+/*
+ * Put the watchdog of a request into its "not armed" state. A NULL timer
+ * callback is what __rq_cancel_watchdog() checks before touching the timer.
+ */
+static void __rq_init_watchdog(struct i915_request *rq)
+{
+       struct i915_request_watchdog *wdg = &rq->watchdog;
+
+       wdg->timer.function = NULL;
+}
+
+/*
+ * hrtimer callback invoked when the watchdog of a request expires.
+ *
+ * A request that has already completed only has its reference dropped.
+ * Otherwise the request is pushed onto gt->watchdog.list, and the worker is
+ * scheduled when llist_add() returns true. The timer is never restarted.
+ */
+static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
+{
+       struct i915_request *rq =
+               container_of(hrtimer, typeof(*rq), watchdog.timer);
+       struct intel_gt *gt = rq->engine->gt;
+
+       if (i915_request_completed(rq)) {
+               i915_request_put(rq);
+               return HRTIMER_NORESTART;
+       }
+
+       /* Cancellation itself is left to intel_gt_watchdog_work(). */
+       if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
+               schedule_work(&gt->watchdog.work);
+
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * Start the expiration timer for @rq if its context has a non-zero
+ * watchdog.timeout_us; otherwise do nothing.
+ *
+ * A request reference is held on behalf of the armed timer. It is dropped by
+ * whichever path disposes of the timer: __rq_cancel_watchdog() after a
+ * successful cancel, __rq_watchdog_expired() if the request has already
+ * completed, or intel_gt_watchdog_work() once the expired request has been
+ * processed.
+ */
+static void __rq_arm_watchdog(struct i915_request *rq)
+{
+       struct i915_request_watchdog *wdg = &rq->watchdog;
+       struct intel_context *ce = rq->context;
+
+       if (!ce->watchdog.timeout_us)
+               return;
+
+       /*
+        * Take the reference before the timer can possibly fire. The expiry
+        * path drops a reference, so acquiring it only after
+        * hrtimer_start_range_ns() would leave a window in which that put is
+        * not yet balanced by a get.
+        */
+       i915_request_get(rq);
+
+       hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       wdg->timer.function = __rq_watchdog_expired;
+       /* Timeout is kept in microseconds; allow 1ms of slack on expiry. */
+       hrtimer_start_range_ns(&wdg->timer,
+                              ns_to_ktime(ce->watchdog.timeout_us *
+                                          NSEC_PER_USEC),
+                              NSEC_PER_MSEC,
+                              HRTIMER_MODE_REL);
+}
+
+/*
+ * Stop the watchdog of a request, if one was armed.
+ *
+ * The request reference is dropped here only when hrtimer_try_to_cancel()
+ * returns a positive value. For any other return value the reference is
+ * left alone.
+ */
+static void __rq_cancel_watchdog(struct i915_request *rq)
+{
+       struct hrtimer *timer = &rq->watchdog.timer;
+
+       /* A NULL callback means the watchdog was never armed. */
+       if (!timer->function)
+               return;
+
+       if (hrtimer_try_to_cancel(timer) > 0)
+               i915_request_put(rq);
+}
+
 bool i915_request_retire(struct i915_request *rq)
 {
        if (!__i915_request_is_complete(rq))
@@ -332,6 +379,8 @@ bool i915_request_retire(struct i915_request *rq)
        trace_i915_request_retire(rq);
        i915_request_mark_complete(rq);
 
+       __rq_cancel_watchdog(rq);
+
        /*
         * We know the GPU must have read the request to have
         * sent us the seqno + interrupt, so use the position
@@ -761,6 +810,8 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 
                if (unlikely(fence->error))
                        i915_request_set_error_once(request, fence->error);
+               else
+                       __rq_arm_watchdog(request);
 
                /*
                 * We need to serialize use of the submit_request() callback
@@ -947,6 +998,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
 
        /* No zalloc, everything must be cleared after use */
        rq->batch = NULL;
+       __rq_init_watchdog(rq);
        GEM_BUG_ON(rq->capture_list);
        GEM_BUG_ON(!llist_empty(&rq->execute_cb));
 
index e4d190ab76b29786c214d8ff8774d62e8873a0af..36071d3d383da9372c4737dddc215112212f3afc 100644 (file)
@@ -26,7 +26,9 @@
 #define I915_REQUEST_H
 
 #include <linux/dma-fence.h>
+#include <linux/hrtimer.h>
 #include <linux/irq_work.h>
+#include <linux/llist.h>
 #include <linux/lockdep.h>
 
 #include "gem/i915_gem_context_types.h"
@@ -277,6 +279,12 @@ struct i915_request {
        /** timeline->request entry for this request */
        struct list_head link;
 
+       /** Watchdog support fields. */
+       struct i915_request_watchdog {
+               struct llist_node link;
+               struct hrtimer timer;
+       } watchdog;
+
        I915_SELFTEST_DECLARE(struct {
                struct list_head link;
                unsigned long delay;