]> www.infradead.org Git - users/willy/pagecache.git/commitdiff
EDAC/i10nm: Skip the absent memory controllers
authorQiuxu Zhuo <qiuxu.zhuo@intel.com>
Mon, 10 Jul 2023 01:32:32 +0000 (09:32 +0800)
committerTony Luck <tony.luck@intel.com>
Mon, 24 Jul 2023 15:57:26 +0000 (08:57 -0700)
Some Sapphire Rapids workstations' absent memory controllers
still appear as PCIe devices that fool the i10nm_edac driver
and result in "shift exponent -66 is negative" call traces
from skx_get_dimm_info().

Skip the absent memory controllers to avoid the call traces.

Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reported-by: Koba Ko <koba.ko@canonical.com>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Tested-by: Koba Ko <koba.ko@canonical.com>
Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20230710013232.59712-1-qiuxu.zhuo@intel.com
drivers/edac/i10nm_base.c

index a897b6aff368681d2385bb4a1857e53c34631ca3..349ff6cfb3796e579348b7595b16370546f02da1 100644 (file)
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
        return mdev;
 }
 
+/**
+ * i10nm_imc_absent() - Check whether the memory controller @imc is absent
+ *
+ * @imc    : The pointer to the structure of memory controller EDAC device.
+ *
+ * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
+ */
+static bool i10nm_imc_absent(struct skx_imc *imc)
+{
+       u32 mcmtr;
+       int i;
+
+       switch (res_cfg->type) {
+       case SPR:
+               for (i = 0; i < res_cfg->ddr_chan_num; i++) {
+                       mcmtr = I10NM_GET_MCMTR(imc, i);
+                       edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
+                       if (mcmtr != ~0)
+                               return false;
+               }
+
+               /*
+                * Some workstations' absent memory controllers still
+                * appear as PCIe devices, misleading the EDAC driver.
+                * By observing that the MMIO registers of these absent
+                * memory controllers consistently hold the value of ~0.
+                *
+                * We identify a memory controller as absent by checking
+                * if its MMIO register "mcmtr" == ~0 in all its channels.
+                */
+               return true;
+       default:
+               return false;
+       }
+}
+
 static int i10nm_get_ddr_munits(void)
 {
        struct pci_dev *mdev;
        void __iomem *mbase;
        unsigned long size;
        struct skx_dev *d;
-       int i, j = 0;
+       int i, lmc, j = 0;
        u32 reg, off;
        u64 base;
 
@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
                edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
                         j++, base, reg);
 
-               for (i = 0; i < res_cfg->ddr_imc_num; i++) {
+               for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
                        mdev = get_ddr_munit(d, i, &off, &size);
 
                        if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
                        if (!mdev)
                                continue;
 
-                       d->imc[i].mdev = mdev;
-
                        edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
                                 i, base + off, size, reg);
 
@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
                                return -ENODEV;
                        }
 
-                       d->imc[i].mbase = mbase;
+                       d->imc[lmc].mbase = mbase;
+                       if (i10nm_imc_absent(&d->imc[lmc])) {
+                               pci_dev_put(mdev);
+                               iounmap(mbase);
+                               d->imc[lmc].mbase = NULL;
+                               edac_dbg(2, "Skip absent mc%d\n", i);
+                               continue;
+                       } else {
+                               d->imc[lmc].mdev = mdev;
+                               lmc++;
+                       }
                }
        }