www.infradead.org Git - users/hch/configfs.git/commitdiff
drm/xe: Store process name and pid in xe file
author Matthew Brost <matthew.brost@intel.com>
Tue, 23 Jul 2024 15:10:45 +0000 (08:10 -0700)
committer Matthew Brost <matthew.brost@intel.com>
Tue, 23 Jul 2024 17:45:40 +0000 (10:45 -0700)
An xe file can outlive the associated process as the GPU cleanup is just
triggered upon file close (process kill) and completes sometime later.
If the file close triggers error conditions (GPU hangs) the process
cannot be safely referenced to retrieve the name and pid for debug
information. Store the process name and pid directly in the xe file to
be safe.

v2:
 - Access file->pid via rcu_access_pointer (Matthew Auld)

Fixes: b10d0c5e9df7 ("drm/xe: Add process name to devcoredump")
Fixes: f6ca930d974e ("drm/xe: Add process name and PID to job timedout message")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240723151045.1725417-1-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_devcoredump.c
drivers/gpu/drm/xe/xe_device.c
drivers/gpu/drm/xe/xe_device_types.h
drivers/gpu/drm/xe/xe_guc_submit.c

index 62c2b10fbf1d2e4166600437ebabcdf408f7d09d..d8d8ca2c19d368c774bf04a523cc5f4810b6cf56 100644 (file)
@@ -171,7 +171,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
        u32 adj_logical_mask = q->logical_mask;
        u32 width_mask = (0x1 << q->width) - 1;
        const char *process_name = "no process";
-       struct task_struct *task = NULL;
 
        int i;
        bool cookie;
@@ -179,14 +178,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
        ss->snapshot_time = ktime_get_real();
        ss->boot_time = ktime_get_boottime();
 
-       if (q->vm && q->vm->xef) {
-               task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
-               if (task)
-                       process_name = task->comm;
-       }
+       if (q->vm && q->vm->xef)
+               process_name = q->vm->xef->process_name;
        strscpy(ss->process_name, process_name);
-       if (task)
-               put_task_struct(task);
 
        ss->gt = q->gt;
        INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
index b677608eb592e810f09c4a4a7a39df484abbce22..1aba6f9eaa192b75220283b1cf67431011360708 100644 (file)
@@ -64,6 +64,7 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
        struct xe_drm_client *client;
        struct xe_file *xef;
        int ret = -ENOMEM;
+       struct task_struct *task = NULL;
 
        xef = kzalloc(sizeof(*xef), GFP_KERNEL);
        if (!xef)
@@ -92,6 +93,13 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
        file->driver_priv = xef;
        kref_init(&xef->refcount);
 
+       task = get_pid_task(rcu_access_pointer(file->pid), PIDTYPE_PID);
+       if (task) {
+               xef->process_name = kstrdup(task->comm, GFP_KERNEL);
+               xef->pid = task->pid;
+               put_task_struct(task);
+       }
+
        return 0;
 }
 
@@ -110,6 +118,7 @@ static void xe_file_destroy(struct kref *ref)
        spin_unlock(&xe->clients.lock);
 
        xe_drm_client_put(xef->client);
+       kfree(xef->process_name);
        kfree(xef);
 }
 
index 36252d5b1663d916e34dd11f4cf2eacb645e023b..5b7292a9a66dc93c0d60e62a025d76aa67cad3b4 100644 (file)
@@ -582,6 +582,18 @@ struct xe_file {
        /** @client: drm client */
        struct xe_drm_client *client;
 
+       /**
+        * @process_name: process name for file handle, used to safely output
+        * during error situations where xe file can outlive process
+        */
+       char *process_name;
+
+       /**
+        * @pid: pid for file handle, used to safely output during error
+        * situations where xe file can outlive process
+        */
+       pid_t pid;
+
        /** @refcount: ref count of this xe file */
        struct kref refcount;
 };
index da2ead86b9ae43726174aed9e3b875368844ebf5..a4570631926f6f7df2151a3d86c8dc1093563818 100644 (file)
@@ -1072,7 +1072,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
        struct xe_gpu_scheduler *sched = &q->guc->sched;
        struct xe_guc *guc = exec_queue_to_guc(q);
        const char *process_name = "no process";
-       struct task_struct *task = NULL;
        int err = -ETIME;
        pid_t pid = -1;
        int i = 0;
@@ -1172,17 +1171,12 @@ trigger_reset:
        }
 
        if (q->vm && q->vm->xef) {
-               task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
-               if (task) {
-                       process_name = task->comm;
-                       pid = task->pid;
-               }
+               process_name = q->vm->xef->process_name;
+               pid = q->vm->xef->pid;
        }
        xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
                     xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
                     q->guc->id, q->flags, process_name, pid);
-       if (task)
-               put_task_struct(task);
 
        trace_xe_sched_job_timedout(job);