From ae1647624486fca0070b297d0e2fd4e53443c10b Mon Sep 17 00:00:00 2001
From: Shiju Jose
Date: Mon, 12 Feb 2024 10:35:25 +0000
Subject: [PATCH] rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events

Add support for CXL AER correctable events to the ras-mc-ctl tool.

Signed-off-by: Shiju Jose
Signed-off-by: Mauro Carvalho Chehab
---
 util/ras-mc-ctl.in | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1745a26..bcab43f 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1240,6 +1240,46 @@ sub get_cxl_ue_error_status_text
     return join (", ", @out);
 }
 
+use constant {
+    CXL_AER_CE_CACHE_DATA_ECC => 0x0001,
+    CXL_AER_CE_MEM_DATA_ECC => 0x0002,
+    CXL_AER_CE_CRC_THRESH => 0x0004,
+    CXL_AER_CE_RETRY_THRESH => 0x0008,
+    CXL_AER_CE_CACHE_POISON => 0x0010,
+    CXL_AER_CE_MEM_POISON => 0x0020,
+    CXL_AER_CE_PHYS_LAYER_ERR => 0x0040,
+};
+
+sub get_cxl_ce_error_status_text
+{
+    my $error_status = $_[0];
+    my @out;
+
+    if ($error_status & CXL_AER_CE_CACHE_DATA_ECC) {
+        push @out, (sprintf "\'Cache Data ECC Error\' ");
+    }
+    if ($error_status & CXL_AER_CE_MEM_DATA_ECC) {
+        push @out, (sprintf "\'Memory Data ECC Error\' ");
+    }
+    if ($error_status & CXL_AER_CE_CRC_THRESH) {
+        push @out, (sprintf "\'CRC Threshold Hit\' ");
+    }
+    if ($error_status & CXL_AER_CE_RETRY_THRESH) {
+        push @out, (sprintf "\'Retry Threshold\' ");
+    }
+    if ($error_status & CXL_AER_CE_CACHE_POISON) {
+        push @out, (sprintf "\'Received Cache Poison From Peer\' ");
+    }
+    if ($error_status & CXL_AER_CE_MEM_POISON) {
+        push @out, (sprintf "\'Received Memory Poison From Peer\' ");
+    }
+    if ($error_status & CXL_AER_CE_PHYS_LAYER_ERR) {
+        push @out, (sprintf "\'Received Error From Physical Layer\' ");
+    }
+
+    return join (", ", @out);
+}
+
 sub summary
 {
     require DBI;
@@ -1320,6 +1360,22 @@ sub summary
             print "No CXL AER uncorrectable errors.\n\n";
         }
         $query_handle->finish;
+
+        # CXL AER correctable errors
+        $query = "select memdev, count(*) from cxl_aer_ce_event$conf{opt}{since} group by memdev";
+        $query_handle = $dbh->prepare($query);
+        $query_handle->execute();
+        $query_handle->bind_columns(\($memdev, $count));
+        $out = "";
+        while($query_handle->fetch()) {
+            $out .= "\t$memdev errors: $count\n";
+        }
+        if ($out ne "") {
+            print "CXL AER correctable events summary:\n$out\n";
+        } else {
+            print "No CXL AER correctable errors.\n\n";
+        }
+        $query_handle->finish;
     }
 
     # extlog errors
@@ -1529,6 +1585,29 @@ sub errors
             print "No CXL AER uncorrectable errors.\n\n";
         }
         $query_handle->finish;
+
+        # CXL AER correctable errors
+        $query = "select id, timestamp, memdev, host, serial, error_status from cxl_aer_ce_event$conf{opt}{since} order by id";
+        $query_handle = $dbh->prepare($query);
+        $query_handle->execute();
+        $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status));
+        $out = "";
+        while($query_handle->fetch()) {
+            $out .= "$id $timestamp error: ";
+            $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+            $out .= "host=$host, " if (defined $host && length $host);
+            $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+            if (defined $error_status && length $error_status) {
+                $out .= sprintf "error_status: %s, ", get_cxl_ce_error_status_text($error_status);
+            }
+            $out .= "\n";
+        }
+        if ($out ne "") {
+            print "CXL AER correctable events:\n$out\n";
+        } else {
+            print "No CXL AER correctable errors.\n\n";
+        }
+        $query_handle->finish;
     }
 
     # Extlog errors
-- 
2.49.0