From 58f04d731ed92a0dc1caf84f98cfa2345b523d2f Mon Sep 17 00:00:00 2001 From: jeff-lien-wdc Date: Tue, 16 Jul 2024 15:26:49 -0500 Subject: [PATCH] wdc: OCP 2.5 Log Page Updates Add panic count and prev panic id to Error Recovery log page (0xC1) Add debug telemetry log size to Latency Monitor log page (0xc3) Signed-off-by: jeff-lien-wdc --- plugins/wdc/wdc-nvme.c | 143 +++++++++++++++++++++++++++-------------- plugins/wdc/wdc-nvme.h | 2 +- 2 files changed, 97 insertions(+), 48 deletions(-) diff --git a/plugins/wdc/wdc-nvme.c b/plugins/wdc/wdc-nvme.c index ac6e75c4..fa4157d2 100644 --- a/plugins/wdc/wdc-nvme.c +++ b/plugins/wdc/wdc-nvme.c @@ -1119,14 +1119,14 @@ struct __packed wdc_bd_ca_log_format { __u8 raw_value[8]; }; -#define LATENCY_LOG_BUCKET_READ 3 -#define LATENCY_LOG_BUCKET_WRITE 2 -#define LATENCY_LOG_BUCKET_TRIM 1 -#define LATENCY_LOG_BUCKET_RESERVED 0 +#define WDC_LATENCY_LOG_BUCKET_READ 3 +#define WDC_LATENCY_LOG_BUCKET_WRITE 2 +#define WDC_LATENCY_LOG_BUCKET_TRIM 1 +#define WDC_LATENCY_LOG_BUCKET_RESERVED 0 -#define LATENCY_LOG_MEASURED_LAT_READ 2 -#define LATENCY_LOG_MEASURED_LAT_WRITE 1 -#define LATENCY_LOG_MEASURED_LAT_TRIM 0 +#define WDC_LATENCY_LOG_MEASURED_LAT_READ 2 +#define WDC_LATENCY_LOG_MEASURED_LAT_WRITE 1 +#define WDC_LATENCY_LOG_MEASURED_LAT_TRIM 0 struct __packed wdc_ssd_latency_monitor_log { __u8 feature_status; /* 0x00 */ @@ -1151,8 +1151,9 @@ struct __packed wdc_ssd_latency_monitor_log { __le64 static_latency_timestamp[4][3]; /* 0x130 - 0x18F */ __le16 static_measured_latency[4][3]; /* 0x190 - 0x1A7 */ __le16 static_latency_stamp_units; /* 0x1A8 */ - __u8 rsvd4[0x16]; /* 0x1AA */ + __u8 rsvd4[10]; /* 0x1AA */ + __u8 debug_telemetry_log_size[12]; /* 0x1B4 */ __le16 debug_log_trigger_enable; /* 0x1C0 */ __le16 debug_log_measured_latency; /* 0x1C2 */ __le64 debug_log_latency_stamp; /* 0x1C4 */ @@ -1220,25 +1221,29 @@ struct __packed wdc_ssd_d0_smart_log { #define WDC_OCP_C1_GUID_LENGTH 16 #define WDC_ERROR_REC_LOG_BUF_LEN 512 #define WDC_ERROR_REC_LOG_ID 0xC1 -#define WDC_ERROR_REC_LOG_VERSION1 0001 -#define WDC_ERROR_REC_LOG_VERSION2 0002 struct __packed wdc_ocp_c1_error_recovery_log { - __le16 panic_reset_wait_time; /* 000 - Panic Reset Wait Time */ - __u8 panic_reset_action; /* 002 - Panic Reset Action */ - __u8 dev_recovery_action1; /* 003 - Device Recovery Action 1 */ - __le64 panic_id; /* 004 - Panic ID */ - __le32 dev_capabilities; /* 012 - Device Capabilities */ - __u8 vs_recovery_opc; /* 016 - Vendor Specific Recovery Opcode */ - __u8 rsvd1[3]; /* 017 - 3 Reserved Bytes */ - __le32 vs_cmd_cdw12; /* 020 - Vendor Specific Command CDW12 */ - __le32 vs_cmd_cdw13; /* 024 - Vendor Specific Command CDW13 */ - __u8 vs_cmd_to; /* 028 - Vendor Specific Command Timeout V2 */ - __u8 dev_recovery_action2; /* 029 - Device Recovery Action 2 V2 */ - __u8 dev_recovery_action2_to; /* 030 - Device Recovery Action 2 Timeout V2 */ - __u8 rsvd2[463]; /* 031 - 463 Reserved Bytes */ - __le16 log_page_version; /* 494 - Log Page Version */ - __u8 log_page_guid[WDC_OCP_C1_GUID_LENGTH]; /* 496 - Log Page GUID */ + __le16 panic_reset_wait_time; /* 000 - Panic Reset Wait Time */ + __u8 panic_reset_action; /* 002 - Panic Reset Action */ + __u8 dev_recovery_action1; /* 003 - Device Recovery Action 1 */ + __le64 panic_id; /* 004 - Panic ID */ + __le32 dev_capabilities; /* 012 - Device Capabilities */ + __u8 vs_recovery_opc; /* 016 - Vendor Specific Recovery Opcode */ + __u8 rsvd1[3]; /* 017 - 3 Reserved Bytes */ + __le32 vs_cmd_cdw12; /* 020 - Vendor Specific Command CDW12 */ + __le32 vs_cmd_cdw13; /* 024 - Vendor Specific Command CDW13 */ + __u8 vs_cmd_to; /* 028 - Vendor Specific Command Timeout V2 */ + __u8 dev_recovery_action2; /* 029 - Device Recovery Action 2 V2 */ + __u8 dev_recovery_action2_to; /* 030 - Device Recovery Action 2 Timeout V2 */ + __u8 panic_count; /* 031 - Number of panics encountered */ + __le64 prev_panic_ids[4]; /* 032 - 063 Previous Panic ID's */ + __u8 rsvd2[430]; /* 064 - 493 Reserved Bytes */ + /* 430 reserved bytes aligns with the rest */ + /* of the data structure. The size of 463 */ + /* bytes mentioned in the OCP spec */ + /* (version 2.5) would not fit here. */ + __le16 log_page_version; /* 494 - Log Page Version */ + __u8 log_page_guid[WDC_OCP_C1_GUID_LENGTH]; /* 496 - Log Page GUID */ }; static __u8 wdc_ocp_c1_guid[WDC_OCP_C1_GUID_LENGTH] = { 0x44, 0xD9, 0x31, 0x21, 0xFE, 0x30, 0x34, 0xAE, @@ -4652,20 +4657,30 @@ static int wdc_print_latency_monitor_log_normal(struct nvme_dev *dev, printf(" Active Latency Minimum Window %d ms\n", 100*log_data->active_latency_min_window); printf(" Active Latency Stamp Units %d\n", le16_to_cpu(log_data->active_latency_stamp_units)); printf(" Static Latency Stamp Units %d\n", le16_to_cpu(log_data->static_latency_stamp_units)); - printf(" Debug Log Trigger Enable %d\n", le16_to_cpu(log_data->debug_log_trigger_enable)); + if (le16_to_cpu(log_data->log_page_version) >= 4) + printf(" Debug Telemetry Log Size %"PRIu64"\n", + le64_to_cpu(*(uint64_t *)log_data->debug_telemetry_log_size)); + printf(" Debug Log Trigger Enable %d\n", + le16_to_cpu(log_data->debug_log_trigger_enable)); + printf(" Log Page Version %d\n", + le16_to_cpu(log_data->log_page_version)); + printf(" Log page GUID 0x"); + for (j = 0; j < WDC_C3_GUID_LENGTH; j++) + printf("%x", log_data->log_page_guid[j]); + printf("\n"); printf(" Read Write Deallocate/Trim\n"); for (i = 0; i <= 3; i++) printf(" Active Bucket Counter: Bucket %d %27d %27d %27d\n", - i, le32_to_cpu(log_data->active_bucket_counter[i][LATENCY_LOG_BUCKET_READ]), - le32_to_cpu(log_data->active_bucket_counter[i][LATENCY_LOG_BUCKET_WRITE]), - le32_to_cpu(log_data->active_bucket_counter[i][LATENCY_LOG_BUCKET_TRIM])); + i, le32_to_cpu(log_data->active_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_READ]), + le32_to_cpu(log_data->active_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_WRITE]), + le32_to_cpu(log_data->active_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_TRIM])); for (i = 3; i >= 0; i--) printf(" Active Measured Latency: Bucket %d %27d ms %27d ms %27d ms\n", - 3-i, le16_to_cpu(log_data->active_measured_latency[i][LATENCY_LOG_MEASURED_LAT_READ]), - le16_to_cpu(log_data->active_measured_latency[i][LATENCY_LOG_MEASURED_LAT_WRITE]), - le16_to_cpu(log_data->active_measured_latency[i][LATENCY_LOG_MEASURED_LAT_TRIM])); + 3-i, le16_to_cpu(log_data->active_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_READ]), + le16_to_cpu(log_data->active_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_WRITE]), + le16_to_cpu(log_data->active_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_TRIM])); for (i = 3; i >= 0; i--) { printf(" Active Latency Time Stamp: Bucket %d ", 3-i); @@ -4682,15 +4697,15 @@ static int wdc_print_latency_monitor_log_normal(struct nvme_dev *dev, for (i = 0; i <= 3; i++) printf(" Static Bucket Counter: Bucket %d %27d %27d %27d\n", - i, le32_to_cpu(log_data->static_bucket_counter[i][LATENCY_LOG_BUCKET_READ]), - le32_to_cpu(log_data->static_bucket_counter[i][LATENCY_LOG_BUCKET_WRITE]), - le32_to_cpu(log_data->static_bucket_counter[i][LATENCY_LOG_BUCKET_TRIM])); + i, le32_to_cpu(log_data->static_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_READ]), + le32_to_cpu(log_data->static_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_WRITE]), + le32_to_cpu(log_data->static_bucket_counter[i][WDC_LATENCY_LOG_BUCKET_TRIM])); for (i = 3; i >= 0; i--) printf(" Static Measured Latency: Bucket %d %27d ms %27d ms %27d ms\n", - 3-i, le16_to_cpu(log_data->static_measured_latency[i][LATENCY_LOG_MEASURED_LAT_READ]), - le16_to_cpu(log_data->static_measured_latency[i][LATENCY_LOG_MEASURED_LAT_WRITE]), - le16_to_cpu(log_data->static_measured_latency[i][LATENCY_LOG_MEASURED_LAT_TRIM])); + 3-i, le16_to_cpu(log_data->static_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_READ]), + le16_to_cpu(log_data->static_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_WRITE]), + le16_to_cpu(log_data->static_measured_latency[i][WDC_LATENCY_LOG_MEASURED_LAT_TRIM])); for (i = 3; i >= 0; i--) { printf(" Static Latency Time Stamp: Bucket %d ", 3-i); @@ -4726,7 +4741,22 @@ static void wdc_print_latency_monitor_log_json(struct wdc_ssd_latency_monitor_lo json_object_add_value_int(root, "Active Lantency Minimum Window", 100*log_data->active_latency_min_window); json_object_add_value_int(root, "Active Latency Stamp Units", le16_to_cpu(log_data->active_latency_stamp_units)); json_object_add_value_int(root, "Static Latency Stamp Units", le16_to_cpu(log_data->static_latency_stamp_units)); - json_object_add_value_int(root, "Debug Log Trigger Enable", le16_to_cpu(log_data->debug_log_trigger_enable)); + if (le16_to_cpu(log_data->log_page_version) >= 4) { + json_object_add_value_int(root, "Debug Telemetry Log Size", + le64_to_cpu(*(uint64_t *)log_data->debug_telemetry_log_size)); + } + json_object_add_value_int(root, "Debug Log Trigger Enable", + le16_to_cpu(log_data->debug_log_trigger_enable)); + json_object_add_value_int(root, "Log Page Version", + le16_to_cpu(log_data->log_page_version)); + + char guid[40]; + + memset((void *)guid, 0, 40); + sprintf((char *)guid, "0x%"PRIx64"%"PRIx64"", + (uint64_t)le64_to_cpu(*(uint64_t *)&log_data->log_page_guid[8]), + (uint64_t)le64_to_cpu(*(uint64_t *)&log_data->log_page_guid[0])); + json_object_add_value_string(root, "Log page GUID", guid); for (i = 0; i <= 3; i++) { for (j = 2; j >= 0; j--) { @@ -4785,13 +4815,20 @@ static void wdc_print_error_rec_log_normal(struct wdc_ocp_c1_error_recovery_log printf(" Vendor Specific Recovery Opcode : 0x%x\n", log_data->vs_recovery_opc); printf(" Vendor Specific Command CDW12 : 0x%x\n", le32_to_cpu(log_data->vs_cmd_cdw12)); printf(" Vendor Specific Command CDW13 : 0x%x\n", le32_to_cpu(log_data->vs_cmd_cdw13)); - if (le16_to_cpu(log_data->log_page_version) == WDC_ERROR_REC_LOG_VERSION2) { + if (le16_to_cpu(log_data->log_page_version) >= 2) { printf(" Vendor Specific Command Timeout : 0x%x\n", log_data->vs_cmd_to); printf(" Device Recovery Action 2 : 0x%x\n", log_data->dev_recovery_action2); printf(" Device Recovery Action 2 Timeout : 0x%x\n", log_data->dev_recovery_action2_to); } - printf(" Log Page Version : 0x%x\n", le16_to_cpu(log_data->log_page_version)); - printf(" Log page GUID : 0x"); + if (le16_to_cpu(log_data->log_page_version) >= 3) { + printf(" Panic Count : 0x%x\n", log_data->panic_count); + for (j = 0; j < 4; j++) + printf(" Previous Panic ID N-%d : 0x%"PRIx64"\n", + j+1, le64_to_cpu(log_data->prev_panic_ids[j])); + } + printf(" Log Page Version : 0x%x\n", + le16_to_cpu(log_data->log_page_version)); + printf(" Log page GUID : 0x"); for (j = 0; j < WDC_OCP_C1_GUID_LENGTH; j++) printf("%x", log_data->log_page_guid[j]); printf("\n"); @@ -4799,6 +4836,8 @@ static void wdc_print_error_rec_log_normal(struct wdc_ocp_c1_error_recovery_log static void wdc_print_error_rec_log_json(struct wdc_ocp_c1_error_recovery_log *log_data) { + int j; + char buf[128]; struct json_object *root = json_create_object(); json_object_add_value_int(root, "Panic Reset Wait Time", le16_to_cpu(log_data->panic_reset_wait_time)); @@ -4809,12 +4848,21 @@ static void wdc_print_error_rec_log_json(struct wdc_ocp_c1_error_recovery_log *l json_object_add_value_int(root, "Vendor Specific Recovery Opcode", log_data->vs_recovery_opc); json_object_add_value_int(root, "Vendor Specific Command CDW12", le32_to_cpu(log_data->vs_cmd_cdw12)); json_object_add_value_int(root, "Vendor Specific Command CDW13", le32_to_cpu(log_data->vs_cmd_cdw13)); - if (le16_to_cpu(log_data->log_page_version) == WDC_ERROR_REC_LOG_VERSION2) { + if (le16_to_cpu(log_data->log_page_version) >= 2) { json_object_add_value_int(root, "Vendor Specific Command Timeout", log_data->vs_cmd_to); json_object_add_value_int(root, "Device Recovery Action 2", log_data->dev_recovery_action2); json_object_add_value_int(root, "Device Recovery Action 2 Timeout", log_data->dev_recovery_action2_to); } - json_object_add_value_int(root, "Log Page Version", le16_to_cpu(log_data->log_page_version)); + if (le16_to_cpu(log_data->log_page_version) >= 3) { + json_object_add_value_int(root, "Panic Count", log_data->panic_count); + for (j = 0; j < 4; j++) { + sprintf(buf, "Previous Panic ID N-%d", j+1); + json_object_add_value_int(root, buf, + le64_to_cpu(log_data->prev_panic_ids[j])); + } + } + json_object_add_value_int(root, "Log Page Version", + le16_to_cpu(log_data->log_page_version)); char guid[40]; @@ -7351,9 +7399,10 @@ static int wdc_get_ocp_c1_log_page(nvme_root_t r, struct nvme_dev *dev, char *fo log_data = (struct wdc_ocp_c1_error_recovery_log *)data; /* check log page version */ - if ((log_data->log_page_version != WDC_ERROR_REC_LOG_VERSION1) && - (log_data->log_page_version != WDC_ERROR_REC_LOG_VERSION2)) { - fprintf(stderr, "ERROR: WDC: invalid error recovery log version - %d\n", log_data->log_page_version); + if ((log_data->log_page_version < 1) || + (log_data->log_page_version > 3)) { + fprintf(stderr, "ERROR: WDC: invalid error recovery log version - %d\n", + log_data->log_page_version); ret = -1; goto out; } diff --git a/plugins/wdc/wdc-nvme.h b/plugins/wdc/wdc-nvme.h index e2343b76..a18da165 100644 --- a/plugins/wdc/wdc-nvme.h +++ b/plugins/wdc/wdc-nvme.h @@ -5,7 +5,7 @@ #if !defined(WDC_NVME) || defined(CMD_HEADER_MULTI_READ) #define WDC_NVME -#define WDC_PLUGIN_VERSION "2.9.0" +#define WDC_PLUGIN_VERSION "2.9.1" #include "cmd.h" PLUGIN(NAME("wdc", "Western Digital vendor specific extensions", WDC_PLUGIN_VERSION), -- 2.50.1