amdgpu_reset_get_desc(reset_context, reset_cause,
sizeof(reset_cause));
- kfd_smi_event_add(0, dev, event, "%x %s\n",
- dev->reset_seq_num,
- reset_cause);
+ kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET(
+ dev->reset_seq_num, reset_cause));
}
void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
uint64_t throttle_bitmask)
{
- kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
+ kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING(
throttle_bitmask,
- amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+ amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
}
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
if (task_info) {
/* Report VM faults from user applications, not retry from kernel */
if (task_info->pid)
- kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
- task_info->pid, task_info->task_name);
+ kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
+ task_info->pid, task_info->task_name));
amdgpu_vm_put_task_info(task_info);
}
}
ktime_t ts)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
- "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
- address, node->id, write_fault ? 'W' : 'R');
+ KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+ address, node->id, write_fault ? 'W' : 'R'));
}
void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
unsigned long address, bool migration)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
- "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
- pid, address, node->id, migration ? 'M' : 'U');
+ KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+ pid, address, node->id, migration ? 'M' : 'U'));
}
void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
uint32_t trigger)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
- "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_START(
ktime_get_boottime_ns(), pid, start, end - start,
- from, to, prefetch_loc, preferred_loc, trigger);
+ from, to, prefetch_loc, preferred_loc, trigger));
}
void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
uint32_t from, uint32_t to, uint32_t trigger)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
- "%lld -%d @%lx(%lx) %x->%x %d\n",
+ KFD_EVENT_FMT_MIGRATE_END(
ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger);
+ from, to, trigger));
}
void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
uint32_t trigger)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
- "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
- node->id, trigger);
+ KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid,
+ node->id, trigger));
}
void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
- "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
- node->id);
+ KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
+ node->id, 0));
}
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
KFD_SMI_EVENT_QUEUE_RESTORE,
- "%lld -%d %x %c\n", ktime_get_boottime_ns(),
- p->lead_thread->pid, pdd->dev->id, 'R');
+ KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(),
+ p->lead_thread->pid, pdd->dev->id, 'R'));
}
kfd_unref_process(p);
}
uint32_t trigger)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU,
- "%lld -%d @%lx(%lx) %x %d\n", ktime_get_boottime_ns(),
- pid, address, last - address + 1, node->id, trigger);
+ KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(),
+ pid, address, last - address + 1, node->id, trigger));
}
int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
KFD_SMI_EVENT_ALL_PROCESS = 64
};
+/* The reason for the page migration event */
enum KFD_MIGRATE_TRIGGERS {
- KFD_MIGRATE_TRIGGER_PREFETCH,
- KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
- KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
- KFD_MIGRATE_TRIGGER_TTM_EVICTION
+ KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU VRAM or system memory */
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recovery */
+ KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recovery */
+ KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
};
+/* The reason for the user queue eviction event */
enum KFD_QUEUE_EVICTION_TRIGGERS {
- KFD_QUEUE_EVICTION_TRIGGER_SVM,
- KFD_QUEUE_EVICTION_TRIGGER_USERPTR,
- KFD_QUEUE_EVICTION_TRIGGER_TTM,
- KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,
- KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,
- KFD_QUEUE_EVICTION_CRIU_RESTORE
+ KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */
+ KFD_QUEUE_EVICTION_TRIGGER_USERPTR, /* userptr movement */
+ KFD_QUEUE_EVICTION_TRIGGER_TTM, /* TTM buffer move */
+ KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, /* GPU suspend */
+ KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, /* CRIU checkpoint */
+ KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
};
+/* The reason for the unmap-from-GPU event */
enum KFD_SVM_UNMAP_TRIGGERS {
- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
- KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
- KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */
+ KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */
+ KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
};
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
__u32 anon_fd; /* from KFD */
};
+/*
+ * SVM event tracing via the SMI system management interface
+ *
+ * Open event file descriptor
+ * Use the AMDKFD_IOC_SMI_EVENTS ioctl: pass in a gpuid and get back an
+ * anonymous file descriptor for receiving SMI events (see the sketch after
+ * this comment block). If opened with sudo permission, the file descriptor
+ * can receive SVM events from all processes; otherwise it only receives SVM
+ * events from the calling process.
+ *
+ * To enable the SVM events
+ * Write a KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap mask to the event file
+ * descriptor to start recording that event to the kfifo; OR the masks
+ * together to enable multiple events. A new event mask overwrites the
+ * previous one. The KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS)
+ * bit requires sudo permission to receive SVM events from all processes.
+ *
+ * To receive the events
+ * The application can poll the file descriptor to wait for events, then read
+ * events from the file into a buffer. Each event is a one-line string
+ * message, starting with the event id followed by the event-specific
+ * information.
+ *
+ * To decode event information
+ * The following event format string macros can be used with sscanf to decode
+ * the event-specific information (see the decode sketch following the
+ * macros).
+ * event triggers: the reason the event was generated, defined as enums for
+ * the unmap, eviction and migrate events.
+ * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory
+ * addr: user mode address, in pages
+ * size: in pages
+ * pid: the process ID that generated the event
+ * ns: timestamp in nanoseconds; starts at system boot time but stops during
+ * suspend
+ * migrate_update: how a GPU page fault was recovered: 'M' for migrate, 'U'
+ * for page table update
+ * rw: 'W' for a write page fault, 'R' for a read page fault
+ * rescheduled: 'R' if the queue restore failed and was rescheduled to try
+ * again
+ */
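
As an illustration of the open and enable steps described above, here is a minimal
user-space sketch. It assumes the uapi header is installed as <linux/kfd_ioctl.h>
and that the gpuid has already been read from sysfs topology; error handling and
fd cleanup are omitted, and the detail that the mask is written as a numeric
string is an assumption of this sketch, not something the comment above states.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kfd_ioctl.h>

    /* Open /dev/kfd, then ask KFD for the anonymous SMI event fd of one GPU.
     * The gpuid normally comes from
     * /sys/class/kfd/kfd/topology/nodes/<n>/gpu_id; here it is caller-supplied.
     */
    static int open_smi_fd(unsigned int gpuid)
    {
            struct kfd_ioctl_smi_events_args args = { .gpuid = gpuid };
            int kfd = open("/dev/kfd", O_RDWR);

            if (kfd < 0 || ioctl(kfd, AMDKFD_IOC_SMI_EVENTS, &args) < 0)
                    return -1;
            return args.anon_fd;    /* from KFD, delivers SMI events */
    }

    /* Enable page fault start/end events. Writing the mask as a decimal
     * string is an assumption of this sketch: the kernel-side write handler
     * parses a numeric string into the 64-bit event mask.
     */
    static int enable_page_fault_events(int smi_fd)
    {
            char mask[32];
            int len;

            len = snprintf(mask, sizeof(mask), "%llu", (unsigned long long)
                    (KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_START) |
                     KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_PAGE_FAULT_END)));
            return write(smi_fd, mask, len) == len ? 0 : -1;
    }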
+#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
+ "%x %s\n", (reset_seq_num), (reset_cause)
+
+#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\
+ "%llx:%llx\n", (bitmask), (counter)
+
+#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
+ "%x:%s\n", (pid), (task_name)
+
+#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
+
+#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
+ "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
+
+#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
+ preferred_loc, migrate_trigger)\
+ "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
+ (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
+
+#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
+ "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
+ (from), (to), (migrate_trigger)
+
+#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
+ "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
+
+#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\
+ "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
+
+#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
+ "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
+ (node), (unmap_trigger)
+
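
A matching sketch of the receive and decode steps, using poll(2), read(2) and
sscanf with the format macros above. For simplicity it assumes each read
returns a single complete event line and only decodes the page fault start
event; other event ids are skipped.

    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/kfd_ioctl.h>

    /* Wait for one SMI event and decode it if it is a page fault start event. */
    static void read_one_event(int smi_fd)
    {
            struct pollfd pfd = { .fd = smi_fd, .events = POLLIN };
            unsigned int event, node;
            unsigned long addr;
            long long ns;
            char buf[256], rw;
            ssize_t len;
            int pid, off;

            if (poll(&pfd, 1, -1) <= 0)
                    return;
            len = read(smi_fd, buf, sizeof(buf) - 1);
            if (len <= 0)
                    return;
            buf[len] = '\0';

            /* Each line starts with the event id in hex, then the payload. */
            if (sscanf(buf, "%x %n", &event, &off) != 1)
                    return;
            /* The format macros expand to a format string plus argument list,
             * so they drop straight into sscanf when given pointers.
             */
            if (event == KFD_SMI_EVENT_PAGE_FAULT_START &&
                sscanf(buf + off, KFD_EVENT_FMT_PAGEFAULT_START(&ns, &pid, &addr,
                                                                &node, &rw)) == 5)
                    printf("page fault @0x%lx node %x pid %d %c at %lld ns\n",
                           addr, node, pid, rw, ns);
    }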
/**************************************************************************************************
* CRIU IOCTLs (Checkpoint Restore In Userspace)
*