From 335d62ade5feaa46082f8da755ffdc569ae51768 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date: Tue, 14 May 2024 21:00:14 +0200
Subject: [PATCH] drm/xe/pf: Track adverse events notifications from GuC
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

When thresholds used to monitor VFs activities are configured,
then GuC may send GUC2PF_ADVERSE_EVENT messages informing the
PF driver about exceeded thresholds. Start handling such messages.

Reviewed-by: Piotr Piórkowski
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240514190015.2172-8-michal.wajdeczko@intel.com
---
 drivers/gpu/drm/xe/Makefile                   |   1 +
 drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c   | 147 ++++++++++++++++++
 drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h   |  27 ++++
 .../gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h |  22 +++
 drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h     |   5 +
 drivers/gpu/drm/xe/xe_guc_ct.c                |   4 +
 6 files changed, 206 insertions(+)
 create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c
 create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h
 create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 6acde66f0827..8fe7bb80501f 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -164,6 +164,7 @@ xe-$(CONFIG_PCI_IOV) += \
 	xe_gt_sriov_pf_config.o \
 	xe_gt_sriov_pf_control.o \
 	xe_gt_sriov_pf_debugfs.o \
+	xe_gt_sriov_pf_monitor.o \
 	xe_gt_sriov_pf_policy.o \
 	xe_gt_sriov_pf_service.o \
 	xe_lmtt.o \
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c
new file mode 100644
index 000000000000..7d532bded02a
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2023-2024 Intel Corporation
+ */
+
+#include "abi/guc_actions_sriov_abi.h"
+#include "abi/guc_messages_abi.h"
+
+#include "xe_gt_sriov_pf_config.h"
+#include "xe_gt_sriov_pf_helpers.h"
+#include "xe_gt_sriov_pf_monitor.h"
+#include "xe_gt_sriov_printk.h"
+#include "xe_guc_klv_helpers.h"
+#include "xe_guc_klv_thresholds_set.h"
+
+/**
+ * xe_gt_sriov_pf_monitor_flr - Cleanup VF data after VF FLR.
+ * @gt: the &xe_gt
+ * @vfid: the VF identifier
+ *
+ * On FLR this function will reset all event data related to the VF.
+ * This function is for PF only.
+ */
+void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid)
+{
+	int e;
+
+	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
+	xe_gt_sriov_pf_assert_vfid(gt, vfid);
+
+	for (e = 0; e < XE_GUC_KLV_NUM_THRESHOLDS; e++)
+		gt->sriov.pf.vfs[vfid].monitor.guc.events[e] = 0;
+}
+
+static void pf_update_event_counter(struct xe_gt *gt, u32 vfid,
+				    enum xe_guc_klv_threshold_index e)
+{
+	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
+	xe_gt_assert(gt, e < XE_GUC_KLV_NUM_THRESHOLDS);
+
+	gt->sriov.pf.vfs[vfid].monitor.guc.events[e]++;
+}
+
+static int pf_handle_vf_threshold_event(struct xe_gt *gt, u32 vfid, u32 threshold)
+{
+	char origin[8];
+	int e;
+
+	e = xe_guc_klv_threshold_key_to_index(threshold);
+	xe_sriov_function_name(vfid, origin, sizeof(origin));
+
+	/* was there a new KEY added that we missed? */
+	if (unlikely(e < 0)) {
+		xe_gt_sriov_notice(gt, "unknown threshold key %#x reported for %s\n",
+				   threshold, origin);
+		return -ENOTCONN;
+	}
+
+	xe_gt_sriov_dbg(gt, "%s exceeded threshold %u %s\n",
+			origin, xe_gt_sriov_pf_config_get_threshold(gt, vfid, e),
+			xe_guc_klv_key_to_string(threshold));
+
+	pf_update_event_counter(gt, vfid, e);
+
+	return 0;
+}
+
+/**
+ * xe_gt_sriov_pf_monitor_process_guc2pf - Handle adverse event notification from the GuC.
+ * @gt: the &xe_gt
+ * @msg: G2H event message
+ * @len: length of the message
+ *
+ * This function is intended for PF only.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len)
+{
+	struct xe_device *xe = gt_to_xe(gt);
+	u32 vfid;
+	u32 threshold;
+
+	xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN);
+	xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_ORIGIN, msg[0]) == GUC_HXG_ORIGIN_GUC);
+	xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[0]) == GUC_HXG_TYPE_EVENT);
+	xe_gt_assert(gt, FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[0]) ==
+		     GUC_ACTION_GUC2PF_ADVERSE_EVENT);
+
+	if (unlikely(!IS_SRIOV_PF(xe)))
+		return -EPROTO;
+
+	if (unlikely(FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_0_MBZ, msg[0])))
+		return -EPFNOSUPPORT;
+
+	if (unlikely(len < GUC2PF_ADVERSE_EVENT_EVENT_MSG_LEN))
+		return -EPROTO;
+
+	vfid = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_1_VFID, msg[1]);
+	threshold = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_2_THRESHOLD, msg[2]);
+
+	if (unlikely(vfid > xe_gt_sriov_pf_get_totalvfs(gt)))
+		return -EINVAL;
+
+	return pf_handle_vf_threshold_event(gt, vfid, threshold);
+}
+
+/**
+ * xe_gt_sriov_pf_monitor_print_events - Print adverse events counters.
+ * @gt: the &xe_gt to print events from
+ * @p: the &drm_printer
+ *
+ * Print adverse events counters for all VFs.
+ * VFs with no events are not printed.
+ *
+ * This function can only be called on PF.
+ */
+void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p)
+{
+	unsigned int n, total_vfs = xe_gt_sriov_pf_get_totalvfs(gt);
+	const struct xe_gt_sriov_monitor *data;
+	int e;
+
+	xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt)));
+
+	for (n = 1; n <= total_vfs; n++) {
+		data = &gt->sriov.pf.vfs[n].monitor;
+
+		for (e = 0; e < XE_GUC_KLV_NUM_THRESHOLDS; e++)
+			if (data->guc.events[e])
+				break;
+
+		/* skip empty unless in debug mode */
+		if (e >= XE_GUC_KLV_NUM_THRESHOLDS &&
+		    !IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV))
+			continue;
+
+#define __format(...) "%s:%u "
+#define __value(TAG, NAME, ...) , #NAME, data->guc.events[MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG)]
+
+		drm_printf(p, "VF%u:\t" MAKE_XE_GUC_KLV_THRESHOLDS_SET(__format) "\n",
+			   n MAKE_XE_GUC_KLV_THRESHOLDS_SET(__value));
+
+#undef __format
+#undef __value
+	}
+}
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h
new file mode 100644
index 000000000000..7ca9351a271b
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023-2024 Intel Corporation
+ */
+
+#ifndef _XE_GT_SRIOV_PF_MONITOR_H_
+#define _XE_GT_SRIOV_PF_MONITOR_H_
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+struct drm_printer;
+struct xe_gt;
+
+void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid);
+void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p);
+
+#ifdef CONFIG_PCI_IOV
+int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len);
+#else
+static inline int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len)
+{
+	return -EPROTO;
+}
+#endif
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h
new file mode 100644
index 000000000000..e27c0308c5db
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023-2024 Intel Corporation
+ */
+
+#ifndef _XE_GT_SRIOV_PF_MONITOR_TYPES_H_
+#define _XE_GT_SRIOV_PF_MONITOR_TYPES_H_
+
+#include "xe_guc_klv_thresholds_set_types.h"
+
+/**
+ * struct xe_gt_sriov_monitor - GT level per-VF monitoring data.
+ */
+struct xe_gt_sriov_monitor {
+	/** @guc: monitoring data related to the GuC. */
+	struct {
+		/** @guc.events: number of adverse events reported by the GuC. */
+		unsigned int events[XE_GUC_KLV_NUM_THRESHOLDS];
+	} guc;
+};
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h
index 880754f3e215..40cbaea3ef44 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 
 #include "xe_gt_sriov_pf_config_types.h"
+#include "xe_gt_sriov_pf_monitor_types.h"
 #include "xe_gt_sriov_pf_policy_types.h"
 #include "xe_gt_sriov_pf_service_types.h"
 
@@ -18,6 +19,10 @@
 struct xe_gt_sriov_metadata {
 	/** @config: per-VF provisioning data. */
 	struct xe_gt_sriov_config config;
+
+	/** @monitor: per-VF monitoring data. */
+	struct xe_gt_sriov_monitor monitor;
+
 	/** @version: negotiated VF/PF ABI version */
 	struct xe_gt_sriov_pf_service_version version;
 };
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 0151d29b3c58..c1f258348f5c 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -22,6 +22,7 @@
 #include "xe_gt_pagefault.h"
 #include "xe_gt_printk.h"
 #include "xe_gt_sriov_pf_control.h"
+#include "xe_gt_sriov_pf_monitor.h"
 #include "xe_gt_tlb_invalidation.h"
 #include "xe_guc.h"
 #include "xe_guc_relay.h"
@@ -1071,6 +1072,9 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY:
 		ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len);
 		break;
+	case GUC_ACTION_GUC2PF_ADVERSE_EVENT:
+		ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len);
+		break;
 	default:
 		xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
 	}
-- 
2.49.0