NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: kern/58775 (apei(4) spamming console)
The following reply was made to PR kern/58775; it has been noted by GNATS.
From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
To: Hauke Fath <hf%spg.tu-darmstadt.de@localhost>
Cc: gnats-bugs%netbsd.org@localhost, gnats-admin%netbsd.org@localhost, netbsd-bugs%netbsd.org@localhost
Subject: Re: kern/58775 (apei(4) spamming console)
Date: Sat, 26 Oct 2024 15:49:32 +0000
This is a multi-part message in MIME format.
--=_fDlpu8OzeqfFpAJUhARZcts9raUpcXJJ
Content-Transfer-Encoding: quoted-printable
> Date: Sat, 26 Oct 2024 03:21:49 +0200
> From: Hauke Fath <hf%spg.tu-darmstadt.de@localhost>
>=20
> On Fri, 25 Oct 2024 18:36:32 +0000, Taylor R Campbell wrote:
> > Can you share the output of the following commands?
>=20
> <ftp://ftp.causeuse.org/pub/NetBSD/kern-58775/kern-58775.pcidevs.gz>
Thanks! So, the new apei(4) code and pcictl(8) both confirm that your
PCI device is unhappy with lots of hardware errors -- corrected
errors, but still alarming. This is almost certainly an actual
hardware problem that you might want to address (once we're done
doing science!).
> Note the commands were run under a Dom0 kernel which does not have=20
> apei(4). With the regular kernel, the machine was so busy logging I=20
> couldn't even ssh in...
Sure, that's fine. I wanted to get independent confirmation from
other PCI code of the correctable errors, just in case the apei(4)
patch I drafted was misinterpreting any of the registers.
Can you revert the previous patch and try the attached patch instead,
which applies a rate limit to the console output?
https://www.NetBSD.org/~riastradh/tmp/20241026/pr58775-apeipcieerror-v2.patch
Previously, you were getting up to two messages every five seconds;
with the patch, since the errors are all corrected, it should be at
most once per minute.
(The patch rate-limits each of four severity levels -- corrected,
uncorrectable/recoverable, uncorrectable/fatal, and other or bad
firmware -- independently to one message per minute, so there are at
most four messages per minute if the system is correcting errors and
also crashing and burning, at which point it's probably moot.)
--=_fDlpu8OzeqfFpAJUhARZcts9raUpcXJJ
Content-Type: text/plain; charset="ISO-8859-1"; name="pr58775-apeipcieerror-v2"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment; filename="pr58775-apeipcieerror-v2.patch"
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/acpi/apei.c
--- a/sys/dev/acpi/apei.c Mon Oct 21 15:57:45 2024 +0000
+++ b/sys/dev/acpi/apei.c Thu Oct 24 20:08:59 2024 +0000
@@ -58,6 +58,7 @@
#include <dev/acpi/apei_hestvar.h>
#include <dev/acpi/apei_interp.h>
#include <dev/acpi/apeivar.h>
+#include <dev/pci/pci_error.h>
=20
#define _COMPONENT ACPI_RESOURCE_COMPONENT
ACPI_MODULE_NAME ("apei")
@@ -313,10 +314,10 @@ apei_format_guid(const struct uuid *uuid
{
=20
snprintf(guidstr, 69, "{0x%08x,0x%04x,0x%04x,"
- "0x%02x%02x,"
- "{0x%02x,0x%02x,0x%02x,0x%02x,0x%02x,0x%02x}}",
+ "{0x%02x,%02x,"
+ "0x%02x,0x%02x,0x%02x,0x%02x,0x%02x,0x%02x}}",
uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
- uuid->clock_seq_hi_and_reserved, uuid->clock_seq_hi_and_reserved,
+ uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
uuid->node[0], uuid->node[1], uuid->node[2],
uuid->node[3], uuid->node[4], uuid->node[5]);
}
@@ -356,6 +357,8 @@ static const char *const apei_gede_sever
};
=20
/*
+ * N.2.5. Memory Error Section
+ *
* https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.htm=
l#memory-error-section
*/
static const struct uuid CPER_MEMORY_ERROR_SECTION =3D
@@ -363,11 +366,17 @@ static const struct uuid CPER_MEMORY_ERR
=20
static void
apei_cper_memory_error_report(struct apei_softc *sc, const void *buf,
- size_t len, const char *ctx)
+ size_t len, const char *ctx, bool ratelimitok)
{
const struct cper_memory_error *ME =3D buf;
char bitbuf[1024];
=20
+ /*
+ * If we've hit the rate limit, skip printing the error.
+ */
+ if (!ratelimitok)
+ goto out;
+
snprintb(bitbuf, sizeof(bitbuf),
CPER_MEMORY_ERROR_VALIDATION_BITS_FMT, ME->ValidationBits);
aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=3D%s\n", ctx, bitbuf);
@@ -472,6 +481,110 @@ apei_cper_memory_error_report(struct ape
ctx, t);
}
}
+
+out: /*
+ * XXX pass this through to uvm(9) or userland for decisions
+ * like page retirement
+ */
+ return;
+}
+
+/*
+ * N.2.7. PCI Express Error Section
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.htm=
l#pci-express-error-section
+ */
+static const struct uuid CPER_PCIE_ERROR_SECTION =3D
+ {0xd995e954,0xbbc1,0x430f,0xad,0x91,{0xb4,0x4d,0xcb,0x3c,0x6f,0x35}};
+
+static const char *const cper_pcie_error_port_type[] =3D {
+#define F(LN, SN, V) [LN] =3D #SN,
+ CPER_PCIE_ERROR_PORT_TYPES(F)
+#undef F
+};
+
+static void
+apei_cper_pcie_error_report(struct apei_softc *sc, const void *buf, size_t=
len,
+ const char *ctx, bool ratelimitok)
+{
+ const struct cper_pcie_error *PE =3D buf;
+ char bitbuf[1024];
+
+ /*
+ * If we've hit the rate limit, skip printing the error.
+ */
+ if (!ratelimitok)
+ goto out;
+
+ snprintb(bitbuf, sizeof(bitbuf),
+ CPER_PCIE_ERROR_VALIDATION_BITS_FMT, PE->ValidationBits);
+ aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=3D%s\n", ctx, bitbuf);
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_PORT_TYPE) {
+ const uint32_t t =3D PE->PortType;
+ const char *n =3D t < __arraycount(cper_pcie_error_port_type)
+ ? cper_pcie_error_port_type[t] : NULL;
+
+ if (n) {
+ device_printf(sc->sc_dev, "%s: PortType=3D%"PRIu32
+ " (%s)\n", ctx, t, n);
+ } else {
+ device_printf(sc->sc_dev, "%s: PortType=3D%"PRIu32"\n",
+ ctx, t);
+ }
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_VERSION) {
+ /* XXX BCD */
+ device_printf(sc->sc_dev, "%s: Version=3D0x%"PRIx32"\n",
+ ctx, PE->Version);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_COMMAND_STATUS) {
+ device_printf(sc->sc_dev, "%s: CommandStatus=3D0x04%"PRIx32"\n",
+ ctx, PE->CommandStatus);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_ID) {
+ /* XXX decode vendor/product/class/fun/dev/seg/bus */
+ char hex[2*sizeof(PE->DeviceID) + 1];
+ const unsigned char *p =3D (const void *)&PE->DeviceID;
+ unsigned i;
+
+ for (i =3D 0; i < sizeof(PE->DeviceID); i++)
+ snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx", p[i]);
+ device_printf(sc->sc_dev, "%s: DeviceID=3D{%s}\n", ctx, hex);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_SERIAL) {
+ device_printf(sc->sc_dev, "%s: DeviceSerial=3D{%016"PRIx64"}\n",
+ ctx, PE->DeviceSerial);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS) {
+ device_printf(sc->sc_dev, "%s: BridgeControlStatus=3D%"PRIx32
+ "\n", ctx, PE->BridgeControlStatus);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE) {
+ char hex[2*sizeof(PE->CapabilityStructure) + 1];
+ unsigned i;
+
+ for (i =3D 0; i < sizeof(PE->CapabilityStructure); i++) {
+ snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+ PE->CapabilityStructure[i]);
+ }
+ device_printf(sc->sc_dev, "%s: CapabilityStructure=3D{%s}\n",
+ ctx, hex);
+ }
+ if (PE->ValidationBits & CPER_PCIE_ERROR_VALID_AER_INFO) {
+ char hex[2*sizeof(PE->AERInfo) + 1];
+ unsigned i;
+
+ for (i =3D 0; i < sizeof(PE->AERInfo); i++) {
+ snprintf(hex + 2*i, sizeof(hex) - 2*i, "%02hhx",
+ PE->AERInfo[i]);
+ }
+ device_printf(sc->sc_dev, "%s: AERInfo=3D{%s}\n", ctx, hex);
+ }
+
+out: /*
+ * Let the PCI subsystem handle it.
+ */
+ pci_cper_error(PE, ratelimitok ? PCI_ERROR_PRINT : 0);
}
=20
/*
@@ -489,18 +602,22 @@ static const struct apei_cper_report {
const char *name;
const struct uuid *type;
size_t minlength;
- void (*func)(struct apei_softc *, const void *, size_t, const char *);
+ void (*func)(struct apei_softc *, const void *, size_t, const char *,
+ bool);
} apei_cper_reports[] =3D {
{ "memory", &CPER_MEMORY_ERROR_SECTION,
sizeof(struct cper_memory_error),
apei_cper_memory_error_report },
+ { "PCIe", &CPER_PCIE_ERROR_SECTION,
+ sizeof(struct cper_pcie_error),
+ apei_cper_pcie_error_report },
};
=20
/*
- * apei_gede_report_header(sc, gede, ctx, &headerlen, &report)
+ * apei_gede_report_header(sc, gede, ctx, ratelimitok, &headerlen, &report)
*
* Report the header of the ith Generic Error Data Entry in the
- * given context.
+ * given context, if ratelimitok is true.
*
* Return the actual length of the header in headerlen, or 0 if
* not known because the revision isn't recognized.
@@ -510,7 +627,7 @@ static const struct apei_cper_report {
*/
static void
apei_gede_report_header(struct apei_softc *sc,
- const ACPI_HEST_GENERIC_DATA *gede, const char *ctx,
+ const ACPI_HEST_GENERIC_DATA *gede, const char *ctx, bool ratelimitok,
size_t *headerlenp, const struct apei_cper_report **reportp)
{
const ACPI_HEST_GENERIC_DATA_V300 *const gede_v3 =3D (const void *)gede;
@@ -538,14 +655,19 @@ apei_gede_report_header(struct apei_soft
=20
 if (memcmp(&sectype, report->type, sizeof(sectype)) !=3D 0)
continue;
- device_printf(sc->sc_dev, "%s: SectionType=3D%s (%s error)\n",
- ctx, guidstr, report->name);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " SectionType=3D%s (%s error)\n",
+ ctx, guidstr, report->name);
+ }
*reportp =3D report;
break;
}
if (i =3D=3D __arraycount(apei_cper_reports)) {
- device_printf(sc->sc_dev, "%s: SectionType=3D%s\n", ctx,
- guidstr);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: SectionType=3D%s\n", ctx,
+ guidstr);
+ }
*reportp =3D NULL;
}
=20
@@ -553,11 +675,14 @@ apei_gede_report_header(struct apei_soft
* Print the numeric severity and, if we have it, a symbolic
* name for it.
*/
- device_printf(sc->sc_dev, "%s: ErrorSeverity=3D%"PRIu32" (%s)\n", ctx,
- gede->ErrorSeverity,
- (gede->ErrorSeverity < __arraycount(apei_gede_severity)
- ? apei_gede_severity[gede->ErrorSeverity]
- : "unknown"));
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: ErrorSeverity=3D%"PRIu32" (%s)\n",
+ ctx,
+ gede->ErrorSeverity,
+ (gede->ErrorSeverity < __arraycount(apei_gede_severity)
+ ? apei_gede_severity[gede->ErrorSeverity]
+ : "unknown"));
+ }
=20
/*
* The Revision may not often be useful, but this is only ever
@@ -565,8 +690,10 @@ apei_gede_report_header(struct apei_soft
* you can glean at your convenience with acpidump. So print
* it anyway.
*/
- device_printf(sc->sc_dev, "%s: Revision=3D0x%"PRIx16"\n", ctx,
- gede->Revision);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s: Revision=3D0x%"PRIx16"\n", ctx,
+ gede->Revision);
+ }
=20
/*
* Don't touch anything past the Revision until we've
@@ -587,38 +714,49 @@ apei_gede_report_header(struct apei_soft
* Print the validation bits at debug level. Only really
* helpful if there are bits we _don't_ know about.
*/
- /* XXX define this format somewhere */
- snprintb(buf, sizeof(buf), "\177\020"
- "b\000" "FRU_ID\0"
- "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */
- "b\002" "TIMESTAMP\0"
- "\0", gede->ValidationBits);
- aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=3D%s\n", ctx, buf);
+ if (ratelimitok) {
+ /* XXX define this format somewhere */
+ snprintb(buf, sizeof(buf), "\177\020"
+ "b\000" "FRU_ID\0"
+ "b\001" "FRU_TEXT\0" /* `FRU string', sometimes */
+ "b\002" "TIMESTAMP\0"
+ "\0", gede->ValidationBits);
+ aprint_debug_dev(sc->sc_dev, "%s: ValidationBits=3D%s\n", ctx,
+ buf);
+ }
=20
/*
* Print the CPER section flags.
*/
- snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT, gede->Flags);
- device_printf(sc->sc_dev, "%s: Flags=3D%s\n", ctx, buf);
+ if (ratelimitok) {
+ snprintb(buf, sizeof(buf), CPER_SECTION_FLAGS_FMT,
+ gede->Flags);
+ device_printf(sc->sc_dev, "%s: Flags=3D%s\n", ctx, buf);
+ }
=20
/*
* The ErrorDataLength is unlikely to be useful for the log, so
* print it at debug level only.
*/
- aprint_debug_dev(sc->sc_dev, "%s: ErrorDataLength=3D0x%"PRIu32"\n",
- ctx, gede->ErrorDataLength);
+ if (ratelimitok) {
+ aprint_debug_dev(sc->sc_dev, "%s:"
+ " ErrorDataLength=3D0x%"PRIu32"\n",
+ ctx, gede->ErrorDataLength);
+ }
=20
/*
* Print the FRU Id and text, if available.
*/
- if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) {
+ if (ratelimitok &&
+ (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) !=3D 0) {
struct uuid fruid;
=20
apei_cper_guid_dec(gede->FruId, &fruid);
apei_format_guid(&fruid, guidstr);
device_printf(sc->sc_dev, "%s: FruId=3D%s\n", ctx, guidstr);
}
- if (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) {
+ if (ratelimitok &&
+ (gede->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING) !=3D 0) {
device_printf(sc->sc_dev, "%s: FruText=3D%.20s\n",
ctx, gede->FruText);
}
@@ -627,7 +765,8 @@ apei_gede_report_header(struct apei_soft
* Print the timestamp, if available by the revision number and
* the validation bits.
*/
- if (gede->Revision >=3D 0x0300 && gede->Revision < 0x0400 &&
+ if (ratelimitok &&
+ gede->Revision >=3D 0x0300 && gede->Revision < 0x0400 &&
gede->ValidationBits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
const uint8_t *const t =3D (const uint8_t *)&gede_v3->TimeStamp;
const uint8_t s =3D t[0];
@@ -648,6 +787,100 @@ apei_gede_report_header(struct apei_soft
}
=20
/*
+ * apei_gesb_ratelimit
+ *
+ * State to limit the rate of console log messages about hardware
+ * errors. For each of the four severity levels,
+ *
+ * - ACPI_HEST_GEN_ERROR_RECOVERABLE (uncorrectable but recoverable),
+ * - ACPI_HEST_GEN_ERROR_FATAL (uncorrectable and fatally so),
+ * - ACPI_HEST_GEN_ERROR_CORRECTED, and
+ * - ACPI_HEST_GEN_ERROR_NONE (including ill-formed errors),
+ *
+ * we record the last time it happened, protected by a CPU simple
+ * lock that we only try-acquire so it is safe to use in any
+ * context, including non-maskable interrupt context.
+ */
+
+static struct {
+ __cpu_simple_lock_t lock __aligned(COHERENCY_UNIT);
+ struct timeval lasttime;
+ volatile uint32_t suppressed;
+} apei_gesb_ratelimit[4] __cacheline_aligned;
+
+static void
+atomic_incsat_32(volatile uint32_t *p)
+{
+ uint32_t o, n;
+
+ do {
+ o =3D atomic_load_relaxed(p);
+ if (__predict_false(o =3D=3D UINT_MAX))
+ return;
+ n =3D o + 1;
+ } while (__predict_false(atomic_cas_32(p, o, n) !=3D o));
+}
+
+/*
+ * apei_gesb_ratecheck(sc, severity, suppressed)
+ *
+ * Check for a rate limit on errors of the specified severity.
+ *
+ * =3D> Return true if the error should be printed, and format into
+ * the buffer suppressed a message saying how many errors were
+ * previously suppressed.
+ *
+ * =3D> Return false if the error should be suppressed because the
+ * last one printed was too recent.
+ */
+static bool
+apei_gesb_ratecheck(struct apei_softc *sc, uint32_t severity,
+ char suppressed[static sizeof(" (4294967295 or more errors suppressed)=
")])
+{
+ /* one of each type per minute (XXX worth making configurable?) */
+ const struct timeval mininterval =3D {60, 0};
+ unsigned i =3D MIN(severity, ACPI_HEST_GEN_ERROR_NONE); /* paranoia */
+ bool ok =3D false;
+
+ /*
+ * If the lock is contended, the rate limit is probably
+ * exceeded, so it's not OK to print.
+ *
+ * Otherwise, with the lock held, ask ratecheck(9) whether it's
+ * OK to print.
+ */
+ if (!__cpu_simple_lock_try(&apei_gesb_ratelimit[i].lock))
+ goto out;
+ ok =3D ratecheck(&apei_gesb_ratelimit[i].lasttime, &mininterval);
+ __cpu_simple_unlock(&apei_gesb_ratelimit[i].lock);
+
+out: /*
+ * If it's OK to print, report the number of errors that were
+ * suppressed. If it's not OK to print, count a suppressed
+ * error.
+ */
+ if (ok) {
+ const uint32_t n =3D
+ atomic_swap_32(&apei_gesb_ratelimit[i].suppressed, 0);
+
+ if (n =3D=3D 0) {
+ suppressed[0] =3D '\0';
+ } else {
+ snprintf(suppressed,
+ sizeof(" (4294967295 or more errors suppressed)"),
+ " (%u%s error%s suppressed)",
+ n,
+ n =3D=3D UINT32_MAX ? " or more" : "",
+ n =3D=3D 1 ? "" : "s");
+ }
+ } else {
+ atomic_incsat_32(&apei_gesb_ratelimit[i].suppressed);
+ suppressed[0] =3D '\0';
+ }
+ return ok;
+}
+
+/*
* apei_gesb_report(sc, gesb, size, ctx)
*
* Check a Generic Error Status Block, of at most the specified
@@ -663,7 +896,8 @@ apei_gesb_report(struct apei_softc *sc,=20
uint32_t datalen, rawdatalen;
const ACPI_HEST_GENERIC_DATA *gede0, *gede;
const unsigned char *rawdata;
- char statusbuf[128];
+ bool ratelimitok =3D false;
+ char suppressed[sizeof(" (4294967295 or more errors suppressed)")];
bool fatal =3D false;
=20
/*
@@ -671,8 +905,13 @@ apei_gesb_report(struct apei_softc *sc,=20
* Block before we try to touch anything in it.
*/
if (size < sizeof(*gesb)) {
- device_printf(sc->sc_dev, "%s: truncated GESB, %zu < %zu\n",
- ctx, size, sizeof(*gesb));
+ ratelimitok =3D apei_gesb_ratecheck(sc, ACPI_HEST_GEN_ERROR_NONE,
+ suppressed);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev,
+ "%s: truncated GESB, %zu < %zu%s\n",
+ ctx, size, sizeof(*gesb), suppressed);
+ }
status =3D 0;
goto out;
}
@@ -696,29 +935,42 @@ apei_gesb_report(struct apei_softc *sc,=20
goto out;
}
=20
- /* XXX define this format somewhere */
- snprintb(statusbuf, sizeof(statusbuf), "\177\020"
- "b\000" "UE\0"
- "b\001" "CE\0"
- "b\002" "MULTI_UE\0"
- "b\003" "MULTI_CE\0"
- "f\004\010" "GEDE_COUNT\0"
- "\0", status);
+ /*
+ * Read out the severity and get the number of entries in this
+ * status block.
+ */
+ severity =3D gesb->ErrorSeverity;
+ nentries =3D __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT);
=20
/*
* Print a message to the console and dmesg about the severity
* of the error.
*/
- severity =3D gesb->ErrorSeverity;
- nentries =3D __SHIFTOUT(status, ACPI_HEST_ERROR_ENTRY_COUNT);
- if (severity < __arraycount(apei_gesb_severity)) {
- device_printf(sc->sc_dev, "%s reported hardware error:"
- " severity=3D%s nentries=3D%u status=3D%s\n",
- ctx, apei_gesb_severity[severity], nentries, statusbuf);
- } else {
- device_printf(sc->sc_dev, "%s reported error:"
- " severity=3D%"PRIu32" nentries=3D%u status=3D%s\n",
- ctx, severity, nentries, statusbuf);
+ ratelimitok =3D apei_gesb_ratecheck(sc, severity, suppressed);
+ if (ratelimitok) {
+ char statusbuf[128];
+
+ /* XXX define this format somewhere */
+ snprintb(statusbuf, sizeof(statusbuf), "\177\020"
+ "b\000" "UE\0"
+ "b\001" "CE\0"
+ "b\002" "MULTI_UE\0"
+ "b\003" "MULTI_CE\0"
+ "f\004\010" "GEDE_COUNT\0"
+ "\0", status);
+
+ if (severity < __arraycount(apei_gesb_severity)) {
+ device_printf(sc->sc_dev, "%s"
+ " reported hardware error%s:"
+ " severity=3D%s nentries=3D%u status=3D%s\n",
+ ctx, suppressed,
+ apei_gesb_severity[severity], nentries, statusbuf);
+ } else {
+ device_printf(sc->sc_dev, "%s reported error%s:"
+ " severity=3D%"PRIu32" nentries=3D%u status=3D%s\n",
+ ctx, suppressed,
+ severity, nentries, statusbuf);
+ }
}
=20
/*
@@ -750,9 +1002,8 @@ apei_gesb_report(struct apei_softc *sc,=20
unknownstatus &=3D ~ACPI_HEST_CORRECTABLE;
unknownstatus &=3D ~ACPI_HEST_MULTIPLE_CORRECTABLE;
unknownstatus &=3D ~ACPI_HEST_ERROR_ENTRY_COUNT;
- if (unknownstatus !=3D 0) {
+ if (ratelimitok && unknownstatus !=3D 0) {
/* XXX dtrace */
- /* XXX rate-limit? */
device_printf(sc->sc_dev, "%s: unknown BlockStatus bits:"
" 0x%"PRIx32"\n", ctx, unknownstatus);
}
@@ -769,9 +1020,12 @@ apei_gesb_report(struct apei_softc *sc,=20
*/
datalen =3D gesb->DataLength;
if (size < datalen) {
- device_printf(sc->sc_dev, "%s:"
- " GESB DataLength exceeds bounds: %zu < %"PRIu32"\n",
- ctx, size, datalen);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " GESB DataLength exceeds bounds:"
+ " %zu < %"PRIu32"\n",
+ ctx, size, datalen);
+ }
datalen =3D size;
}
size -=3D datalen;
@@ -795,9 +1049,11 @@ apei_gesb_report(struct apei_softc *sc,=20
* GEDE header, stop here.
*/
if (datalen < sizeof(*gede)) {
- device_printf(sc->sc_dev, "%s:"
- " truncated GEDE: %"PRIu32" < %zu bytes\n",
- subctx, datalen, sizeof(*gede));
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " truncated GEDE: %"PRIu32" < %zu bytes\n",
+ subctx, datalen, sizeof(*gede));
+ }
break;
}
=20
@@ -806,7 +1062,7 @@ apei_gesb_report(struct apei_softc *sc,=20
* vary from revision to revision of the GEDE) and the
* CPER report function if possible.
*/
- apei_gede_report_header(sc, gede, subctx,
+ apei_gede_report_header(sc, gede, subctx, ratelimitok,
&headerlen, &report);
=20
/*
@@ -814,9 +1070,11 @@ apei_gesb_report(struct apei_softc *sc,=20
* unfamiliar revision, stop here.
*/
if (headerlen =3D=3D 0) {
- device_printf(sc->sc_dev, "%s:"
- " unknown revision: 0x%"PRIx16"\n",
- subctx, gede->Revision);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " unknown revision: 0x%"PRIx16"\n",
+ subctx, gede->Revision);
+ }
break;
}
=20
@@ -826,9 +1084,12 @@ apei_gesb_report(struct apei_softc *sc,=20
*/
datalen -=3D headerlen;
if (datalen < gede->ErrorDataLength) {
- device_printf(sc->sc_dev, "%s: truncated GEDE payload:"
- " %"PRIu32" < %"PRIu32" bytes\n",
- subctx, datalen, gede->ErrorDataLength);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " truncated GEDE payload:"
+ " %"PRIu32" < %"PRIu32" bytes\n",
+ subctx, datalen, gede->ErrorDataLength);
+ }
break;
}
=20
@@ -837,10 +1098,14 @@ apei_gesb_report(struct apei_softc *sc,=20
* this Generic Error Data Entry.
*/
if (report =3D=3D NULL) {
- device_printf(sc->sc_dev, "%s: [unknown type]\n", ctx);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " [unknown type]\n", ctx);
+ }
} else {
+ /* XXX pass ratelimit through */
(*report->func)(sc, (const char *)gede + headerlen,
- gede->ErrorDataLength, subctx);
+ gede->ErrorDataLength, subctx, ratelimitok);
}
=20
/*
@@ -866,9 +1131,12 @@ apei_gesb_report(struct apei_softc *sc,=20
*/
rawdatalen =3D gesb->RawDataLength;
if (size < rawdatalen) {
- device_printf(sc->sc_dev, "%s:"
- " GESB RawDataLength exceeds bounds: %zu < %"PRIu32"\n",
- ctx, size, rawdatalen);
+ if (ratelimitok) {
+ device_printf(sc->sc_dev, "%s:"
+ " GESB RawDataLength exceeds bounds:"
+ " %zu < %"PRIu32"\n",
+ ctx, size, rawdatalen);
+ }
rawdatalen =3D size;
}
size -=3D rawdatalen;
@@ -876,7 +1144,7 @@ apei_gesb_report(struct apei_softc *sc,=20
/*
* Hexdump the raw data, if any.
*/
- if (rawdatalen > 0) {
+ if (ratelimitok && rawdatalen > 0) {
char devctx[128];
=20
snprintf(devctx, sizeof(devctx), "%s: %s: raw data",
@@ -887,7 +1155,7 @@ apei_gesb_report(struct apei_softc *sc,=20
/*
* If there's anything left after the raw data, warn.
*/
- if (size > 0) {
+ if (ratelimitok && size > 0) {
device_printf(sc->sc_dev, "%s: excess data: %zu bytes\n",
ctx, size);
}
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/acpi/apei_cper.h
--- a/sys/dev/acpi/apei_cper.h Mon Oct 21 15:57:45 2024 +0000
+++ b/sys/dev/acpi/apei_cper.h Thu Oct 24 20:08:59 2024 +0000
@@ -62,14 +62,14 @@ struct cper_header {
} __packed;
__CTASSERT(sizeof(struct cper_header) =3D=3D 128);
=20
-enum { /* struct cper_header::error_severity */
+enum { /* struct cper_header::ErrorSeverity */
CPER_ERROR_SEVERITY_RECOVERABLE =3D 0,
CPER_ERROR_SEVERITY_FATAL =3D 1,
CPER_ERROR_SEVERITY_CORRECTED =3D 2,
CPER_ERROR_SEVERITY_INFORMATIONAL =3D 3,
};
=20
-enum { /* struct cper_header::validation_bits */
+enum { /* struct cper_header::ValidationBits */
CPER_VALID_PLATFORM_ID =3D __BIT(0),
CPER_VALID_TIMESTAMP =3D __BIT(1),
CPER_VALID_PARTITION_ID =3D __BIT(2),
@@ -78,7 +78,7 @@ enum { /* struct cper_header::validat
/*
* https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.htm=
l#error-record-header-flags
*/
-enum { /* struct cper_header::flags */
+enum { /* struct cper_header::Flags */
CPER_HW_ERROR_FLAG_RECOVERED =3D __BIT(0),
CPER_HW_ERROR_FLAG_PREVERR =3D __BIT(1),
CPER_HW_ERROR_FLAG_SIMULATED =3D __BIT(2),
@@ -110,6 +110,8 @@ enum {
"\0"
=20
/*
+ * N.2.5. Memory Error Section
+ *
* https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.htm=
l#memory-error-section
*
* Type: {0xa5bc1114,0x6f64,0x4ede,{0xb8,0x63,0x3e,0x83,0xed,0x7c,0x83,0xb=
1}}
@@ -144,7 +146,7 @@ struct cper_memory_error_ext {
} __packed;
__CTASSERT(sizeof(struct cper_memory_error_ext) =3D=3D 80);
=20
-enum { /* struct cper_memory_error::validation_bits */
+enum { /* struct cper_memory_error::ValidationBits */
CPER_MEMORY_ERROR_VALID_ERROR_STATUS =3D __BIT(0),
CPER_MEMORY_ERROR_VALID_PHYSICAL_ADDRESS =3D __BIT(1),
CPER_MEMORY_ERROR_VALID_PHYSICAL_ADDRESS_MASK =3D __BIT(2),
@@ -194,7 +196,7 @@ enum { /* struct cper_memory_error::v
"b\025" "CHIP_ID\0" \
"\0"
=20
-enum { /* struct cper_memory_error::bank */
+enum { /* struct cper_memory_error::Bank */
CPER_MEMORY_ERROR_BANK_ADDRESS =3D __BITS(7,0),
CPER_MEMORY_ERROR_BANK_GROUP =3D __BITS(15,8),
};
@@ -219,16 +221,92 @@ enum { /* struct cper_memory_error::b
F(CPER_MEMORY_ERROR_PHYSMEM_MAPOUT_EVENT, PHYSMEM_MAPOUT_EVENT, 15) \
/* end of CPER_MEMORY_ERROR_TYPES */
=20
-enum cper_memory_error_type { /* struct cper_memory_error::memory_error_ty=
pe */
+enum cper_memory_error_type { /* struct cper_memory_error::MemoryErrorType=
*/
#define CPER_MEMORY_ERROR_TYPE_DEF(LN, SN, V) LN =3D V,
CPER_MEMORY_ERROR_TYPES(CPER_MEMORY_ERROR_TYPE_DEF)
#undef CPER_MEMORY_ERROR_TYPE_DEF
};
=20
-enum { /* struct cper_memory_error_ext::extended */
+enum { /* struct cper_memory_error_ext::Extended */
CPER_MEMORY_ERROR_EXTENDED_ROWBIT16 =3D __BIT(0),
CPER_MEMORY_ERROR_EXTENDED_ROWBIT17 =3D __BIT(1),
CPER_MEMORY_ERROR_EXTENDED_CHIPID =3D __BITS(7,5),
};
=20
+/*
+ * N.2.7. PCI Express Error Section
+ *
+ * https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.htm=
l#pci-express-error-section
+ *
+ * Type: {0xd995e954,0xbbc1,0x430f,{0xad,0x91,0xb4,0x4d,0xcb,0x3c,0x6f,0x3=
5}}
+ */
+
+struct cper_pcie_error {
+ uint64_t ValidationBits;
+ uint32_t PortType;
+ uint32_t Version;
+ uint32_t CommandStatus;
+ uint32_t Reserved0;
+ struct {
+ uint8_t VendorID[2];
+ uint8_t DeviceID[2]; /* product */
+ uint8_t ClassCode[3];
+ uint8_t Function;
+ uint8_t Device;
+ uint8_t Segment[2];
+ uint8_t PrimaryBus;
+ uint8_t SecondaryBus;
+ uint8_t Slot[2]; /* bits 0:2 resv, bits 3:15 slot */
+ uint8_t Reserved0;
+ } DeviceID;
+ uint64_t DeviceSerial;
+ uint32_t BridgeControlStatus;
+ uint8_t CapabilityStructure[60];
+ uint8_t AERInfo[96];
+};
+__CTASSERT(sizeof(struct cper_pcie_error) =3D=3D 208);
+
+enum { /* struct cper_pcie_error::ValidationBits */
+ CPER_PCIE_ERROR_VALID_PORT_TYPE =3D __BIT(0),
+ CPER_PCIE_ERROR_VALID_VERSION =3D __BIT(1),
+ CPER_PCIE_ERROR_VALID_COMMAND_STATUS =3D __BIT(2),
+ CPER_PCIE_ERROR_VALID_DEVICE_ID =3D __BIT(3),
+ CPER_PCIE_ERROR_VALID_DEVICE_SERIAL =3D __BIT(4),
+ CPER_PCIE_ERROR_VALID_BRIDGE_CONTROL_STATUS =3D __BIT(5),
+ CPER_PCIE_ERROR_VALID_CAPABILITY_STRUCTURE =3D __BIT(6),
+ CPER_PCIE_ERROR_VALID_AER_INFO =3D __BIT(7),
+};
+
+#define CPER_PCIE_ERROR_VALIDATION_BITS_FMT "\177\020" \
+ "b\000" "PORT_TYPE\0" \
+ "b\001" "VERSION\0" \
+ "b\002" "COMMAND_STATUS\0" \
+ "b\003" "DEVICE_ID\0" \
+ "b\004" "DEVICE_SERIAL\0" \
+ "b\005" "BRIDGE_CONTROL_STATUS\0" \
+ "b\006" "CAPABILITY_STRUCTURE\0" \
+ "b\007" "AER_INFO\0" \
+ "\0"
+
+#define CPER_PCIE_ERROR_PORT_TYPES(F) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_ENDPOINT, PCIE_ENDPOINT, 0) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_LEGACY_PCI_ENDPOINT, LEGACY_PCI_ENDPOINT, \
+ 1) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_ROOTPORT5_UPSTREAMSWITCH, \
+ ROOTPORT5_UPSTREAMSWITCH, 4) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_DOWNSTREAMSWITCH, DOWNSTREAMSWITCH, 6) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_PCIE_PCI_BRIDGE, PCIE_PCI_BRIDGE, 7) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_PCI_PCIE_BRIDGE, PCI_PCIE_BRIDGE, 8) \
+ F(CPER_PCIE_ERROR_PORT_TYPE_RCIEP_DEV, RCIEP_DEV, 9) \
+ /* Root Complex Integrated Endpoint Device */ \
+ F(CPER_PCIE_ERROR_PORT_TYPE_RCEC, RCEC, 10) \
+ /* Root Complex Event Collector */ \
+ /* end of CPER_PCIE_ERROR_PORT_TYPES */
+
+enum cper_pcie_error_port_type { /* struct cper_pcie_error::PortType */
+#define CPER_PCIE_ERROR_PORT_TYPE_DEF(LN, SN, V) LN =3D V,
+ CPER_PCIE_ERROR_PORT_TYPES(CPER_PCIE_ERROR_PORT_TYPE_DEF)
+#undef CPER_PCIE_ERROR_PORT_TYPE_DEF
+};
+
#endif /* _SYS_DEV_ACPI_APEI_CPER_H_ */
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/acpi/apei_hest.c
--- a/sys/dev/acpi/apei_hest.c Mon Oct 21 15:57:45 2024 +0000
+++ b/sys/dev/acpi/apei_hest.c Thu Oct 24 20:08:59 2024 +0000
@@ -265,7 +265,7 @@ apei_hest_ghes_v2_poll(void *cookie)
* confusion, let's try to have only one CPU process error
* notifications at a time.
*/
-static __cpu_simple_lock_t apei_hest_nmi_lock;
+static __cpu_simple_lock_t apei_hest_nmi_lock =3D __SIMPLELOCK_UNLOCKED;
=20
/*
* apei_hest_ghes_nmi(tf, cookie)
@@ -400,6 +400,8 @@ apei_hest_attach_ghes(struct apei_softc=20
*/
switch (ghes->Notify.Type) {
case ACPI_HEST_NOTIFY_POLLED:
+ if (ghes->Notify.PollInterval =3D=3D 0) /* paranoia */
+ break;
callout_init(&src->as_ch, CALLOUT_MPSAFE);
callout_setfunc(&src->as_ch, &apei_hest_ghes_poll, src);
callout_schedule(&src->as_ch, 0);
@@ -451,6 +453,8 @@ apei_hest_detach_ghes(struct apei_softc=20
*/
switch (ghes->Notify.Type) {
case ACPI_HEST_NOTIFY_POLLED:
+ if (ghes->Notify.PollInterval =3D=3D 0) /* paranoia */
+ break;
callout_halt(&src->as_ch, NULL);
callout_destroy(&src->as_ch);
break;
@@ -583,6 +587,8 @@ apei_hest_attach_ghes_v2(struct apei_sof
*/
switch (ghes_v2->Notify.Type) {
case ACPI_HEST_NOTIFY_POLLED:
+ if (ghes_v2->Notify.PollInterval =3D=3D 0) /* paranoia */
+ break;
callout_init(&src->as_ch, CALLOUT_MPSAFE);
callout_setfunc(&src->as_ch, &apei_hest_ghes_v2_poll, src);
callout_schedule(&src->as_ch, 0);
@@ -634,6 +640,8 @@ apei_hest_detach_ghes_v2(struct apei_sof
*/
switch (ghes_v2->Notify.Type) {
case ACPI_HEST_NOTIFY_POLLED:
+ if (ghes_v2->Notify.PollInterval =3D=3D 0) /* paranoia */
+ break;
callout_halt(&src->as_ch, NULL);
callout_destroy(&src->as_ch);
break;
@@ -907,7 +915,7 @@ apei_hest_attach(struct apei_softc *sc)
* limit on it; if you have gigabytes of HEST something is
* probably wrong.
*/
- if (n > INT32_MAX/sizeof(hsc->hsc_source[0])) {
+ if (n > MIN(SIZE_MAX, INT32_MAX)/sizeof(hsc->hsc_source[0])) {
aprint_error_dev(sc->sc_dev, "HEST: too many error sources\n");
return;
}
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/pci/files.pci
--- a/sys/dev/pci/files.pci Mon Oct 21 15:57:45 2024 +0000
+++ b/sys/dev/pci/files.pci Thu Oct 24 20:08:59 2024 +0000
@@ -19,6 +19,7 @@ defflag opt_pciide.h PCIIDE_CMD064x_DISA
device pci {[dev =3D -1], [function =3D -1]}
attach pci at pcibus
file dev/pci/pci.c pci needs-flag
+file dev/pci/pci_error.c pci
file dev/pci/pci_map.c pci
file dev/pci/pci_quirks.c pci
file dev/pci/pci_resource.c pci & pci_resource
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/pci/pci_error.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/pci/pci_error.c Thu Oct 24 20:08:59 2024 +0000
@@ -0,0 +1,277 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2024 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
RS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
ITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
LAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
RS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * PCI error reporting
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include <dev/acpi/apei_cper.h> /* XXX not APEI- or even ACPI-specific */
+#include <dev/pci/pci_error.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
+static int
+pci_cper_match(void *cookie, const struct pci_attach_args *pa)
+{
+ const struct cper_pcie_error *PE =3D cookie; /* CPER record to match */
+
+ if (le16dec(PE->DeviceID.Segment) !=3D pci_get_segment(pa->pa_pc))
+ return 0; /* different segment */
+ if (PE->DeviceID.PrimaryBus !=3D pa->pa_bus)
+ return 0; /* different bus */
+ if (PE->DeviceID.Device !=3D pa->pa_device)
+ return 0; /* different device */
+ if (PE->DeviceID.Function !=3D pa->pa_function)
+ return 0; /* different function */
+
+ return 1; /* segment, bus, device, and function all match */
+}
+
+/*
+ * pci_cper_error(PE, flags)
+ *
+ * Act on notification of a PCI error report via Common Platform
+ * Error Record.
+ *
+ * If flags has PCI_ERROR_PRINT set, also print to the console.
+ * Callers can use this to rate-limit error reports.
+ */
+void
+pci_cper_error(const struct cper_pcie_error *PE, int flags)
+{
+ struct pci_attach_args pa; /* filled in by pci_find_device1 */
+
+ /*
+ * If the record has no valid device ID, we can't tell which
+ * device it is about, so there's nothing for us to do.
+ * XXX Report this back to the caller?
+ */
+ if ((PE->ValidationBits & CPER_PCIE_ERROR_VALID_DEVICE_ID) =3D=3D 0)
+ return;
+
+ /*
+ * Look up the device by the record's device ID. If there is
+ * none, just report it (if asked) -- nothing to acknowledge.
+ */
+ if (!pci_find_device1(&pa, pci_cper_match, __UNCONST(PE))) {
+ if (flags & PCI_ERROR_PRINT) {
+ char devbuf[sizeof("PCI 0000:00:00.000")]; /* seg:bus:dev.fn */
+
+ snprintf(devbuf, sizeof(devbuf),
+ "PCI %04x:%02x:%02x.%u",
+ le16dec(PE->DeviceID.Segment),
+ PE->DeviceID.PrimaryBus,
+ PE->DeviceID.Device,
+ PE->DeviceID.Function);
+ aprint_error("%s: hardware error in unknown device\n",
+ devbuf);
+ }
+ return;
+ }
+
+ /*
+ * Found it: check and acknowledge errors via its pci_attach_args.
+ */
+ pci_error(&pa, flags);
+}
+
+/*
+ * pci_error(pa, flags)
+ *
+ * Check for, report, and acknowledge any errors in the PCI device
+ * described by pa.
+ *
+ * If flags has PCI_ERROR_PRINT set, also print to the console.
+ * Callers can use this to rate-limit error reports.
+ */
+void
+pci_error(const struct pci_attach_args *pa, int flags)
+{
+ char devbuf[sizeof("PCI 0000:00:00.000")]; /* seg:bus:dev.fn label */
+ const pci_chipset_tag_t pc =3D pa->pa_pc;
+ const pcitag_t tag =3D pa->pa_tag;
+ pcireg_t aer, pcie; /* capability base offsets */
+ char bitbuf[1024]; /* snprintb output */
+
+ snprintf(devbuf, sizeof(devbuf), "PCI %04x:%02x:%02x.%u",
+ pci_get_segment(pa->pa_pc),
+ pa->pa_bus, pa->pa_device, pa->pa_function);
+
+ /*
+ * If we have Advanced Error Reporting capability, read and
+ * write back any uncorrectable or corrected error status.
+ */
+ if (pci_get_ext_capability(pc, tag, PCI_EXTCAP_AER, &aer, NULL)) {
+ pcireg_t uc_status, uc_mask, uc_sev;
+ pcireg_t control;
+ pcireg_t cor_status, cor_mask;
+
+ /*
+ * Read the status, mask, severity, and control (which
+ * has the number of the first error bit).
+ */
+ uc_status =3D pci_conf_read(pc, tag, aer + PCI_AER_UC_STATUS);
+ uc_mask =3D pci_conf_read(pc, tag, aer + PCI_AER_UC_MASK);
+ uc_sev =3D pci_conf_read(pc, tag, aer + PCI_AER_UC_SEVERITY);
+
+ cor_status =3D pci_conf_read(pc, tag, aer + PCI_AER_COR_STATUS);
+ cor_mask =3D pci_conf_read(pc, tag, aer + PCI_AER_COR_MASK);
+
+ control =3D pci_conf_read(pc, tag, aer + PCI_AER_CAP_CONTROL);
+
+ /*
+ * Acknowledge error status by writing back what we read.
+ */
+ pci_conf_write(pc, tag, aer + PCI_AER_UC_STATUS, uc_status);
+ pci_conf_write(pc, tag, aer + PCI_AER_COR_STATUS, cor_status);
+
+ /* XXX move me to pcireg.h */
+#define PCI_AER_UC_STATUS_FMT "\177\020" \
+ "b\000" "UNDEFINED\0" \
+ "b\004" "DL_PROTOCOL_ERROR\0" \
+ "b\005" "SURPRISE_DOWN_ERROR\0" \
+ "b\014" "POISONED_TLP\0" \
+ "b\015" "FC_PROTOCOL_ERROR\0" \
+ "b\016" "COMPLETION_TIMEOUT\0" \
+ "b\017" "COMPLETION_ABORT\0" \
+ "b\020" "UNEXPECTED_COMPLETION\0" \
+ "b\021" "RECEIVER_OVERFLOW\0" \
+ "b\022" "MALFORMED_TLP\0" \
+ "b\023" "ECRC_ERROR\0" \
+ "b\024" "UNSUPPORTED_REQUEST_ERROR\0" \
+ "b\025" "ACS_VIOLATION\0" \
+ "b\026" "INTERNAL_ERROR\0" \
+ "b\027" "MC_BLOCKED_TLP\0" \
+ "b\030" "ATOMIC_OP_EGRESS_BLOCKED\0" \
+ "b\031" "TLP_PREFIX_BLOCKED_ERROR\0" \
+ "b\032" "POISONTLP_EGRESS_BLOCKED\0" \
+ "\0"
+
+ /*
+ * Report uncorrectable fatal errors (severity bit set).
+ */
+ if ((flags & PCI_ERROR_PRINT) !=3D 0 &&
+ (uc_status & uc_sev) !=3D 0) {
+ snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+ uc_status & uc_sev);
+ aprint_error("%s: hardware fatal uncorrectable error:"
+ " %s (mask=3D0x%"PRIx32")\n",
+ devbuf, bitbuf,
+ (uint32_t)uc_mask);
+ }
+
+ /*
+ * Report uncorrectable non-fatal errors (severity bit clear).
+ */
+ if ((flags & PCI_ERROR_PRINT) !=3D 0 &&
+ (uc_status & ~uc_sev) !=3D 0) {
+ snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+ uc_status & ~uc_sev);
+ aprint_error("%s: hardware uncorrectable error: %s"
+ " (mask=3D0x%"PRIx32")\n",
+ devbuf, bitbuf,
+ (uint32_t)uc_mask);
+ }
+
+ /*
+ * Show the first error, per the control register's pointer.
+ */
+ if ((flags & PCI_ERROR_PRINT) !=3D 0 &&
+ uc_status !=3D 0) {
+ pcireg_t first =3D __SHIFTOUT(control,
+ PCI_AER_FIRST_ERROR_PTR);
+ snprintb(bitbuf, sizeof(bitbuf), PCI_AER_UC_STATUS_FMT,
+ (uint32_t)1 << first);
+ aprint_error("%s: hardware first uncorrectable error:"
+ " %s\n",
+ devbuf, bitbuf);
+ }
+
+ /*
+ * Report corrected errors.
+ *
+ * XXX sysctl knob to suppress this
+ */
+ if ((flags & PCI_ERROR_PRINT) !=3D 0 &&
+ cor_status !=3D 0) {
+ /* XXX move me to pcireg.h */
+ snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+ "b\000" "RECEIVER_ERROR\0"
+ "b\006" "BAD_TLP\0"
+ "b\007" "BAD_DLLP\0"
+ "b\010" "REPLAY_NUM_ROLLOVER\0"
+ "b\014" "REPLAY_TIMER_TIMEOUT\0"
+ "b\015" "ADVISORY_NF_ERROR\0"
+ "b\016" "INTERNAL_ERROR\0"
+ "b\017" "HEADER_LOG_OVERFLOW\0"
+ "\0", cor_status);
+ aprint_error("%s: hardware corrected error: %s"
+ " (mask=3D0x%"PRIx32")\n",
+ devbuf, bitbuf, (uint32_t)cor_mask);
+ }
+ }
+
+ /*
+ * If we have PCIe at all, read and write back any error
+ * status.
+ */
+ if (pci_get_capability(pc, tag, PCI_CAP_PCIEXPRESS, &pcie, NULL)) {
+ pcireg_t dcsr =3D pci_conf_read(pc, tag, pcie + PCIE_DCSR);
+ uint16_t dsr =3D __SHIFTOUT(dcsr, __BITS(31,16)); /* status half */
+
+ /*
+ * If any status bits are set, acknowledge all status
+ * bits, write back control bits unchanged, and print
+ * the status.
+ */
+ if (dsr !=3D 0) {
+ pci_conf_write(pc, tag, pcie + PCIE_DCSR, dcsr);
+
+ if (flags & PCI_ERROR_PRINT) {
+ /*
+ * XXX move me to pcireg.h; note: high
+ * half of DCSR
+ */
+ snprintb(bitbuf, sizeof(bitbuf), "\177\020"
+ "b\000" "CORRECTABLE_ERROR\0"
+ "b\001"
+ "NONFATAL_UNCORRECTABLE_ERROR\0"
+ "b\002" "FATAL_ERROR\0"
+ "b\003" "UNSUPPORTED_REQUEST\0"
+ "b\004" "AUX_POWER\0"
+ "b\005" "TRANSACTIONS_PENDING\0"
+ "\0", dsr);
+ aprint_error("%s: hardware error: DSR=3D%s\n",
+ devbuf, bitbuf);
+ }
+ }
+ }
+}
diff -r b4e17a9d10b4 -r ceecab6e9cd9 sys/dev/pci/pci_error.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/dev/pci/pci_error.h Thu Oct 24 20:08:59 2024 +0000
@@ -0,0 +1,44 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2024 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTO=
RS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIM=
ITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICU=
LAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTO=
RS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF =
THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _DEV_PCI_PCI_ERROR_H_
+#define _DEV_PCI_PCI_ERROR_H_
+
+#include <sys/cdefs.h>
+
+struct cper_pcie_error;
+struct pci_attach_args;
+
+enum {
+ PCI_ERROR_PRINT =3D __BIT(0), /* also print errors to the console */
+};
+
+void pci_cper_error(const struct cper_pcie_error *, int);
+void pci_error(const struct pci_attach_args *, int);
+
+#endif /* _DEV_PCI_PCI_ERROR_H_ */
--=_fDlpu8OzeqfFpAJUhARZcts9raUpcXJJ--
Home |
Main Index |
Thread Index |
Old Index