drm/amdkfd: Add GPU recoverable fault SMI event
author Philip Yang <Philip.Yang@amd.com>
Fri, 14 Jan 2022 00:22:54 +0000 (19:22 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 30 Jun 2022 19:30:54 +0000 (15:30 -0400)
Use ktime_get_boottime_ns() as the timestamp to correlate with other
APIs. Output the timestamp when the GPU recoverable fault starts and
when handling of the fault ends, whether migration happened or only
the GPU page table was updated to recover, the fault address, and
whether it was a read or write fault.
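
For illustration, a minimal userspace sketch of parsing the
event-specific payload emitted below (not part of this patch: the
struct and function names are made up, and any event-id prefix that
kfd_smi_event_add() prepends is assumed to be stripped already):

  #include <stdio.h>

  struct page_fault_event {
          long long ns;        /* ktime_get_boottime_ns() timestamp */
          int pid;             /* pid of the faulting process */
          unsigned long addr;  /* faulting address */
          unsigned int gpu_id; /* dev->id of the faulting GPU */
          char flag;           /* 'W'/'R' on start, 'M'/'U' on end */
  };

  static int parse_page_fault(const char *msg, struct page_fault_event *ev)
  {
          /* mirrors the "%lld -%d @%lx(%x) %c" format used below */
          return sscanf(msg, "%lld -%d @%lx(%x) %c", &ev->ns, &ev->pid,
                        &ev->addr, &ev->gpu_id, &ev->flag) == 5 ? 0 : -1;
  }

Pairing a start event with the matching end event (same pid, address
and GPU) yields the time spent recovering the fault, since both
timestamps come from the boottime clock.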

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.h

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 55ed026435e27a38e3a8cb99a2c07df31c2ae027..b7e68283925f66af480292b5b4dc2e8b1e89d01e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
                          task_info.pid, task_info.task_name);
 }
 
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+                                   unsigned long address, bool write_fault,
+                                   ktime_t ts)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
+                         "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
+                         address, dev->id, write_fault ? 'W' : 'R');
+}
+
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+                                 unsigned long address, bool migration)
+{
+       kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
+                         "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
+                         pid, address, dev->id, migration ? 'M' : 'U');
+}
+
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 {
        struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index dfe101c211666f6fa13f524c664090c9b802ce77..7903718cd9eb893bdc0354a0ca36296e89dc0524 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
                                             uint64_t throttle_bitmask);
 void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
-
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+                                   unsigned long address, bool write_fault,
+                                   ktime_t ts);
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+                                 unsigned long address, bool migration);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7b332246eda3e1cb71b14c61dbaaac1aee960dd6..a3c7dd411b772fd09dbebbbb449d53023e25b929 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -32,6 +32,7 @@
 #include "kfd_priv.h"
 #include "kfd_svm.h"
 #include "kfd_migrate.h"
+#include "kfd_smi_events.h"
 
 #ifdef dev_fmt
 #undef dev_fmt
@@ -43,7 +44,7 @@
 /* Long enough to ensure no retry fault comes after svm range is restored and
  * page table is updated.
  */
-#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING   2000
+#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING   (2UL * NSEC_PER_MSEC)
 
 struct criu_svm_metadata {
        struct list_head list;
@@ -1617,7 +1618,7 @@ unreserve_out:
        svm_range_unreserve_bos(&ctx);
 
        if (!r)
-               prange->validate_timestamp = ktime_to_us(ktime_get());
+               prange->validate_timestamp = ktime_get_boottime();
 
        return r;
 }
@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
        struct svm_range_list *svms;
        struct svm_range *prange;
        struct kfd_process *p;
-       uint64_t timestamp;
+       ktime_t timestamp = ktime_get_boottime();
        int32_t best_loc;
        int32_t gpuidx = MAX_GPU_INSTANCE;
        bool write_locked = false;
        struct vm_area_struct *vma;
+       bool migration = false;
        int r = 0;
 
        if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2775,9 +2777,9 @@ retry_write_locked:
                goto out_unlock_range;
        }
 
-       timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
        /* skip duplicate vm fault on different pages of same range */
-       if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
+       if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
+                               AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
                pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
                         svms, prange->start, prange->last);
                r = 0;
@@ -2813,7 +2815,11 @@ retry_write_locked:
                 svms, prange->start, prange->last, best_loc,
                 prange->actual_loc);
 
+       kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
+                                      write_fault, timestamp);
+
        if (prange->actual_loc != best_loc) {
+               migration = true;
                if (best_loc) {
                        r = svm_migrate_to_vram(prange, best_loc, mm);
                        if (r) {
@@ -2842,6 +2848,9 @@ retry_write_locked:
                pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
                         r, svms, prange->start, prange->last);
 
+       kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
+                                    migration);
+
 out_unlock_range:
        mutex_unlock(&prange->migrate_mutex);
 out_unlock_svms:
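
The duplicate-fault check above now compares ktime_t values directly
instead of converting to microseconds first. An illustrative userspace
analogue of what the ktime_before() form computes (plain integers, not
kernel code; names are made up):

  #include <stdbool.h>
  #include <stdint.h>

  /* 2ms in ns, mirroring AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING */
  #define RETRY_FAULT_PENDING_NS ((int64_t)2 * 1000 * 1000)

  /* True if the range was validated less than 2ms before this fault,
   * i.e. the retry fault is a duplicate and restoring can be skipped. */
  static bool already_restored(int64_t now_ns, int64_t validated_ns)
  {
          return now_ns < validated_ns + RETRY_FAULT_PENDING_NS;
  }
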
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 2d54147b4ddaac835c8fae3ffd324de6fa3145a1..eab7f6d3b13cab256df6c5a265418496c8606835 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -125,7 +125,7 @@ struct svm_range {
        uint32_t                        actual_loc;
        uint8_t                         granularity;
        atomic_t                        invalid;
-       uint64_t                        validate_timestamp;
+       ktime_t                         validate_timestamp;
        struct mmu_interval_notifier    notifier;
        struct svm_work_list_item       work_item;
        struct list_head                deferred_list;