From cbfa447edd6a3825fdb8a4ffae74ff7208f2d2c0 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:51 +0000 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types Add the (HWID, MCATYPE) tuples and names for the new MP5, NBIO, and PCIE SMCA bank types. Also, add their respective error descriptions to the MCE decoding module edac_mce_amd. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Arnd Bergmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Shirish S Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190201225534.8177-2-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c1a812bd5a27d..91b65d859ca8c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -312,6 +312,9 @@ enum smca_bank_types { SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_SMU, /* System Management Unit */ + SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ N_SMCA_BANK_TYPES }; -- cgit v1.2.3 From 3ad7e748c12cc771df6020a552def3e1727e8a17 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:52 +0000 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units The existing CS, PSP, and SMU SMCA bank types will see new versions (as indicated by their McaTypes) in future SMCA systems. Add the new (HWID, MCATYPE) tuples for these new versions. Reuse the same names as the older versions, since they are logically the same to the user. SMCA systems won't mix and match IP blocks with different McaType versions in the same system, so there isn't a need to distinguish them. The MCA_IPID register is saved when logging an MCA error, and that can be used to triage the error. Also, add the new error descriptions to edac_mce_amd. Some error types (positions in the list) are overloaded compared to the previous McaTypes. Therefore, just create new lists of the error descriptions to keep things simple even if some of the error descriptions are the same between versions. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Arnd Bergmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Shirish S Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190201225534.8177-3-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mce/amd.c | 6 +++++ drivers/edac/mce_amd.c | 55 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 91b65d859ca8c..299a385365679 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -307,11 +307,14 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ + SMCA_CS_V2, /* Coherent Slave */ SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ + SMCA_PSP_V2, /* Platform Security Processor */ SMCA_SMU, /* System Management Unit */ + SMCA_SMU_V2, /* System Management Unit */ SMCA_MP5, /* Microprocessor 5 Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 00f60b8c7e4f3..bd1331b241cad 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -88,11 +88,14 @@ static struct smca_bank_name smca_names[] = { [SMCA_FP] = { "floating_point", "Floating Point Unit" }, [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, [SMCA_UMC] = { "umc", "Unified Memory Controller" }, [SMCA_PB] = { "param_block", "Parameter Block" }, [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, + [SMCA_SMU_V2] = { "smu", "System Management Unit" }, [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, @@ -153,6 +156,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Data Fabric MCA types */ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, @@ -162,9 +166,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Platform Security Processor MCA type */ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, /* System Management Unit MCA type */ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, /* Microprocessor 5 Unit MCA type */ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5ab4ab3f0ce60..184c90172d176 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -257,6 +257,23 @@ static const char * const smca_cs_mce_desc[] = { "ECC error on probe filter access", }; +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", +}; + static const char * const smca_pie_mce_desc[] = { "HW assert", "Internal PIE register security violation", @@ -281,10 +298,45 @@ static const char * const smca_psp_mce_desc[] = { "PSP RAM ECC or parity error", }; +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", +}; + static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", +}; + static const char * const smca_mp5_mce_desc[] = { "High SRAM ECC or parity error", "Low SRAM ECC or parity error", @@ -328,11 +380,14 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, -- cgit v1.2.3 From 3f4da372ec8e4ce58c17ac4f2e3c8891bbfea17e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 12 Feb 2019 21:24:28 +0000 Subject: EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit Previous AMD systems have had a bit in MCA_STATUS to indicate that an error was detected on a scrub operation. However, this bit was defined differently within different banks and families/models. Starting with Family 17h, MCA_STATUS[40] is either Reserved/Read-as-Zero or defined as "Scrub", for all MCA banks and CPU models. Therefore, this bit can be defined as the "Scrub" bit. Define MCA_STATUS[40] as "Scrub" and decode it in the AMD MCE decoding module for Family 17h and newer systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190212212417.107049-1-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 1 + drivers/edac/mce_amd.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 299a385365679..22d05e3835f0b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -48,6 +48,7 @@ #define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ +#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */ /* * McaX field if set indicates a given bank supports MCA extensions: diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index f286b880f9812..b349c22bb3860 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1078,6 +1078,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (ecc) pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + if (fam >= 0x17) + pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); + pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) -- cgit v1.2.3