www.infradead.org Git - linux.git/commitdiff
drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization
author: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Thu, 6 Jun 2024 07:58:02 +0000 (13:28 +0530)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Aug 2024 02:07:35 +0000 (22:07 -0400)
This commit introduces the Enforce Isolation Handler designed to enforce
shader isolation on AMD GPUs, which helps to prevent data leakage
between different processes.

The handler counts the number of emitted fences for each GFX and compute
ring. If there are any fences, it schedules the `enforce_isolation_work`
to be run after a delay of `GFX_SLICE_PERIOD`. If there are no fences,
it signals the Kernel Fusion Driver (KFD) to resume the runqueue.

The function is synchronized using the `enforce_isolation_mutex`.

This commit also introduces a reference count mechanism
(kfd_sch_req_count) to keep track of the number of requests to enable
the KFD scheduler. When a request to enable the KFD scheduler is made,
the reference count is decremented. When the reference count reaches
zero, a delayed work is scheduled to enforce isolation after a delay of
GFX_SLICE_PERIOD.

When a request to disable the KFD scheduler is made, the function first
checks if the reference count is zero. If it is, it cancels the delayed
work for enforcing isolation and checks if the KFD scheduler is active.
If the KFD scheduler is active, it sends a request to stop the KFD
scheduler and sets the KFD scheduler state to inactive. Then, it
increments the reference count.

The function is synchronized using the kfd_sch_mutex to ensure that the
KFD scheduler state and reference count are updated atomically.

Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h

index aa97bbefe934b2ec071844c9f377e9497f2f915c..e8c284aea1f25eea73605460bc3fc37a0e183fa7 100644 (file)
 
 #define MAX_GPU_INSTANCE               64
 
+#define GFX_SLICE_PERIOD               msecs_to_jiffies(250)
+
 struct amdgpu_gpu_instance {
        struct amdgpu_device            *adev;
        int                             mgpu_fan_enabled;
index 2f1bc02309fe9ce97497d71999baa2118cbdb035..ad97f03f135853ebafca16ae63d17d9aafa9402a 100644 (file)
@@ -4067,6 +4067,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        mutex_init(&adev->gfx.reset_sem_mutex);
        /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
        mutex_init(&adev->enforce_isolation_mutex);
+       mutex_init(&adev->gfx.kfd_sch_mutex);
 
        amdgpu_device_init_apu_flags(adev);
 
@@ -4098,6 +4099,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                          amdgpu_device_delayed_init_work_handler);
        INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                          amdgpu_device_delay_enable_gfx_off);
+       /*
+        * Initialize the enforce_isolation work structures for each XCP
+        * partition.  This work handler is responsible for enforcing shader
+        * isolation on AMD GPUs.  It counts the number of emitted fences for
+        * each GFX and compute ring.  If there are any fences, it schedules
+        * the `enforce_isolation_work` to be run after a delay.  If there are
+        * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
+        * runqueue.
+        */
+       for (i = 0; i < MAX_XCP; i++) {
+               INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
+                                 amdgpu_gfx_enforce_isolation_handler);
+               adev->gfx.enforce_isolation[i].adev = adev;
+               adev->gfx.enforce_isolation[i].xcp_id = i;
+       }
 
        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
index 76f77cf562afcea9da63c445e715f253290e1db4..b4efeef848de7a96ae0c9df08556e608e5e798cc 100644 (file)
@@ -1686,3 +1686,170 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
                memcpy_toio(adev->gfx.cleaner_shader_cpu_ptr, cleaner_shader_ptr,
                            cleaner_shader_size);
 }
+
+/**
+ * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
+ * @adev: amdgpu_device pointer
+ * @idx: Index of the scheduler to control
+ * @enable: Whether to enable or disable the KFD scheduler
+ *
+ * This function is used to control the KFD (Kernel Fusion Driver) scheduler
+ * from the KGD. It is part of the cleaner shader feature. This function plays
+ * a key role in enforcing process isolation on the GPU.
+ *
+ * The function uses a reference count mechanism (kfd_sch_req_count) to keep
+ * track of the number of requests to enable the KFD scheduler. When a request
+ * to enable the KFD scheduler is made, the reference count is decremented.
+ * When the reference count reaches zero, a delayed work is scheduled to
+ * enforce isolation after a delay of GFX_SLICE_PERIOD.
+ *
+ * When a request to disable the KFD scheduler is made, the function first
+ * checks if the reference count is zero. If it is, it cancels the delayed work
+ * for enforcing isolation and checks if the KFD scheduler is active. If the
+ * KFD scheduler is active, it sends a request to stop the KFD scheduler and
+ * sets the KFD scheduler state to inactive. Then, it increments the reference
+ * count.
+ *
+ * The function is synchronized using the kfd_sch_mutex to ensure that the KFD
+ * scheduler state and reference count are updated atomically.
+ *
+ * Note: If the reference count is already zero when a request to enable the
+ * KFD scheduler is made, it means there's an imbalance bug somewhere. The
+ * function triggers a warning in this case.
+ */
+static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
+                                   bool enable)
+{
+       /* Serializes kfd_sch_req_count[idx] and kfd_sch_inactive[idx] updates */
+       mutex_lock(&adev->gfx.kfd_sch_mutex);
+
+       if (enable) {
+               /* If the count is already 0, it means there's an imbalance bug somewhere.
+                * Note that the bug may be in a different caller than the one which triggers the
+                * WARN_ON_ONCE.
+                */
+               if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) {
+                       dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n");
+                       goto unlock;
+               }
+
+               adev->gfx.kfd_sch_req_count[idx]--;
+
+               /* Last outstanding disable request dropped while KFD is stopped:
+                * defer the actual re-enable to the delayed isolation handler so
+                * in-flight KGD work can drain first.
+                */
+               if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
+                   adev->gfx.kfd_sch_inactive[idx]) {
+                       schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
+                                             GFX_SLICE_PERIOD);
+               }
+       } else {
+               /* First disable request: stop any pending re-enable work and,
+                * if KFD is still running, stop its scheduler now. Subsequent
+                * disable requests only bump the reference count.
+                */
+               if (adev->gfx.kfd_sch_req_count[idx] == 0) {
+                       cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work);
+                       if (!adev->gfx.kfd_sch_inactive[idx]) {
+                               amdgpu_amdkfd_stop_sched(adev, idx);
+                               adev->gfx.kfd_sch_inactive[idx] = true;
+                       }
+               }
+
+               adev->gfx.kfd_sch_req_count[idx]++;
+       }
+
+unlock:
+       mutex_unlock(&adev->gfx.kfd_sch_mutex);
+}
+
+/**
+ * amdgpu_gfx_enforce_isolation_handler - work handler for enforcing shader isolation
+ *
+ * @work: work_struct.
+ *
+ * This function is the work handler for enforcing shader isolation on AMD GPUs.
+ * It counts the number of emitted fences for each GFX and compute ring. If there
+ * are any fences, it schedules the `enforce_isolation_work` to be run after a
+ * delay of `GFX_SLICE_PERIOD`. If there are no fences, it signals the Kernel Fusion
+ * Driver (KFD) to resume the runqueue. The function is synchronized using the
+ * `enforce_isolation_mutex`.
+ */
+void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
+{
+       struct amdgpu_isolation_work *isolation_work =
+               container_of(work, struct amdgpu_isolation_work, work.work);
+       struct amdgpu_device *adev = isolation_work->adev;
+       u32 i, idx, fences = 0;
+
+       /* AMDGPU_XCP_NO_PARTITION maps to slot 0 of the per-XCP state arrays */
+       if (isolation_work->xcp_id == AMDGPU_XCP_NO_PARTITION)
+               idx = 0;
+       else
+               idx = isolation_work->xcp_id;
+
+       if (idx >= MAX_XCP)
+               return;
+
+       mutex_lock(&adev->enforce_isolation_mutex);
+       /* Count outstanding fences on all GFX and compute rings that belong
+        * to this partition; a non-zero total means KGD work is still in flight.
+        */
+       for (i = 0; i < AMDGPU_MAX_GFX_RINGS; ++i) {
+               if (isolation_work->xcp_id == adev->gfx.gfx_ring[i].xcp_id)
+                       fences += amdgpu_fence_count_emitted(&adev->gfx.gfx_ring[i]);
+       }
+       for (i = 0; i < (AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES); ++i) {
+               if (isolation_work->xcp_id == adev->gfx.compute_ring[i].xcp_id)
+                       fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
+       }
+       if (fences) {
+               /* KGD work still pending; check again after another slice */
+               schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
+                                     GFX_SLICE_PERIOD);
+       } else {
+               /* Tell KFD to resume the runqueue */
+               if (adev->kfd.init_complete) {
+                       WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]);
+                       WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]);
+                       amdgpu_amdkfd_start_sched(adev, idx);
+                       adev->gfx.kfd_sch_inactive[idx] = false;
+               }
+       }
+       mutex_unlock(&adev->enforce_isolation_mutex);
+}
+
+/* Called when a KGD ring starts being used: if isolation is enforced for this
+ * ring's partition and KFD is up, request the KFD scheduler be stopped so the
+ * partition is exclusive to KGD work.
+ */
+void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
+{
+       struct amdgpu_device *adev = ring->adev;
+       u32 xcp_idx;
+
+       if (!adev->gfx.enable_cleaner_shader)
+               return;
+
+       /* AMDGPU_XCP_NO_PARTITION shares slot 0 of the per-XCP state */
+       xcp_idx = (ring->xcp_id == AMDGPU_XCP_NO_PARTITION) ? 0 : ring->xcp_id;
+       if (xcp_idx >= MAX_XCP)
+               return;
+
+       mutex_lock(&adev->enforce_isolation_mutex);
+       if (adev->enforce_isolation[xcp_idx] && adev->kfd.init_complete)
+               amdgpu_gfx_kfd_sch_ctrl(adev, xcp_idx, false);
+       mutex_unlock(&adev->enforce_isolation_mutex);
+}
+
+/* Called when a KGD ring is done being used: if isolation is enforced for this
+ * ring's partition and KFD is up, drop our request to keep the KFD scheduler
+ * stopped (the re-enable itself is deferred by amdgpu_gfx_kfd_sch_ctrl()).
+ */
+void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
+{
+       struct amdgpu_device *adev = ring->adev;
+       u32 xcp_idx;
+
+       if (!adev->gfx.enable_cleaner_shader)
+               return;
+
+       /* AMDGPU_XCP_NO_PARTITION shares slot 0 of the per-XCP state */
+       xcp_idx = (ring->xcp_id == AMDGPU_XCP_NO_PARTITION) ? 0 : ring->xcp_id;
+       if (xcp_idx >= MAX_XCP)
+               return;
+
+       mutex_lock(&adev->enforce_isolation_mutex);
+       if (adev->enforce_isolation[xcp_idx] && adev->kfd.init_complete)
+               amdgpu_gfx_kfd_sch_ctrl(adev, xcp_idx, true);
+       mutex_unlock(&adev->enforce_isolation_mutex);
+}
index f7b37c340e3665bab43195e957c4843f9907a5e5..e28c1ebfa98f0cd8109a06f1d700b4de74ce509a 100644 (file)
@@ -34,6 +34,7 @@
 #include "soc15.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_ring_mux.h"
+#include "amdgpu_xcp.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE                 0x00000000L
@@ -343,6 +344,12 @@ struct amdgpu_me {
        DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_GFX_QUEUES);
 };
 
+/* Per-XCP-partition context for the delayed enforce-isolation work item */
+struct amdgpu_isolation_work {
+       struct amdgpu_device            *adev;   /* device owning this partition */
+       u32                             xcp_id;  /* partition this work item covers */
+       struct delayed_work             work;    /* runs amdgpu_gfx_enforce_isolation_handler */
+};
+
 struct amdgpu_gfx {
        struct mutex                    gpu_clock_mutex;
        struct amdgpu_gfx_config        config;
@@ -454,6 +461,11 @@ struct amdgpu_gfx {
        void                            *cleaner_shader_cpu_ptr;
        const void                      *cleaner_shader_ptr;
        bool                            enable_cleaner_shader;
+       struct amdgpu_isolation_work    enforce_isolation[MAX_XCP];
+       /* Mutex for synchronizing KFD scheduler operations */
+       struct mutex                    kfd_sch_mutex;
+       u64                             kfd_sch_req_count[MAX_XCP];
+       bool                            kfd_sch_inactive[MAX_XCP];
 };
 
 struct amdgpu_gfx_ras_reg_entry {
@@ -563,6 +575,9 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
                                    const void *cleaner_shader_ptr);
 int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev);
 void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
+void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
+void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
+void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
 
 static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
 {