Subject: Possible fix for resets under heavy load
To: None <port-amd64@netbsd.org>
From: Christopher SEKIYA <wileyc@rezrov.net>
List: port-amd64
Date: 06/03/2004 11:07:00
All,
For those who were experiencing spontaneous reboots under heavy load, please
try the appended patch. It brings the amd64 pmap into closer alignment with
the i386 pmap; namely, the use of splay tree macros and more cpu locking.
(Also, it removes i386-specific cruft).
It seems to do the right thing on my machine.
If there is no objection, I'll commit it over the weekend and request a
pullup.
-- Chris
GPG key FEB9DE7F (91AF 4534 4529 4BCC 31A5 938E 023E EEFB FEB9 DE7F)
Index: amd64/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.9.2.1
diff -u -r1.9.2.1 pmap.c
--- amd64/pmap.c 1 Jun 2004 04:34:52 -0000 1.9.2.1
+++ amd64/pmap.c 2 Jun 2004 08:00:37 -0000
@@ -435,6 +435,24 @@
#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
/* high water mark */
+static __inline int
+pv_compare(struct pv_entry *a, struct pv_entry *b)
+{
+ if (a->pv_pmap < b->pv_pmap)
+ return (-1);
+ else if (a->pv_pmap > b->pv_pmap)
+ return (1);
+ else if (a->pv_va < b->pv_va)
+ return (-1);
+ else if (a->pv_va > b->pv_va)
+ return (1);
+ else
+ return (0);
+}
+
+SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
+SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
+
/*
* linked list of all non-kernel pmaps
*/
@@ -693,13 +711,9 @@
if (pmap_is_active(pmap, ci->ci_cpuid)) {
pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
s = splipi();
-#ifdef MULTIPROCESSOR
__cpu_simple_lock(&pq->pq_slock);
-#endif
pq->pq_flushu++;
-#ifdef MULTIPROCESSOR
__cpu_simple_unlock(&pq->pq_slock);
-#endif
splx(s);
x86_send_ipi(ci, X86_IPI_TLB);
}
@@ -867,7 +881,8 @@
panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
va);
#endif
- pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+ if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
+ pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
}
pmap_tlb_shootnow(cpumask);
}
@@ -1094,12 +1109,6 @@
idt_paddr = avail_start; /* steal a page */
avail_start += 2 * PAGE_SIZE;
-#if defined(I586_CPU)
- /* pentium f00f bug stuff */
- pentium_idt_vaddr = virtual_avail; /* don't need pte */
- virtual_avail += PAGE_SIZE; pte++;
-#endif
-
#ifdef _LP64
/*
* Grab a page below 4G for things that need it (i.e.
@@ -1340,7 +1349,7 @@
}
pv = pvpage->pvinfo.pvpi_pvfree;
KASSERT(pv);
- pvpage->pvinfo.pvpi_pvfree = pv->pv_next;
+ pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
pv_nfpvents--; /* took one from pool */
} else {
pv = NULL; /* need more of them */
@@ -1400,7 +1409,7 @@
pvpage->pvinfo.pvpi_nfree--; /* can't go to zero */
pv = pvpage->pvinfo.pvpi_pvfree;
KASSERT(pv);
- pvpage->pvinfo.pvpi_pvfree = pv->pv_next;
+ pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
pv_nfpvents--; /* took one from pool */
return(pv);
}
@@ -1465,7 +1474,8 @@
pvp->pvinfo.pvpi_pvfree = NULL;
pvp->pvinfo.pvpi_nfree = tofree;
for (lcv = 0 ; lcv < tofree ; lcv++) {
- pvp->pvents[lcv].pv_next = pvp->pvinfo.pvpi_pvfree;
+ SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
+ pvp->pvinfo.pvpi_pvfree;
pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
}
if (need_entry)
@@ -1501,7 +1511,7 @@
}
/* free it */
- pv->pv_next = pvp->pvinfo.pvpi_pvfree;
+ SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
pvp->pvinfo.pvpi_pvfree = pv;
/*
@@ -1555,7 +1565,7 @@
simple_lock(&pvalloc_lock);
for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
- nextpv = pvs->pv_next;
+ nextpv = SPLAY_RIGHT(pvs, pv_node);
pmap_free_pv_doit(pvs);
}
@@ -1652,10 +1662,7 @@
pve->pv_pmap = pmap;
pve->pv_va = va;
pve->pv_ptp = ptp; /* NULL for kernel pmap */
- simple_lock(&pvh->pvh_lock); /* lock pv_head */
- pve->pv_next = pvh->pvh_list; /* add to ... */
- pvh->pvh_list = pve; /* ... locked list */
- simple_unlock(&pvh->pvh_lock); /* unlock, done! */
+ SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked tree */
}
/*
@@ -1674,18 +1681,14 @@
struct pmap *pmap;
vaddr_t va;
{
- struct pv_entry *pve, **prevptr;
+ struct pv_entry tmp, *pve;
- prevptr = &pvh->pvh_list; /* previous pv_entry pointer */
- pve = *prevptr;
- while (pve) {
- if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */
- *prevptr = pve->pv_next; /* remove it! */
- break;
- }
- prevptr = &pve->pv_next; /* previous pointer */
- pve = pve->pv_next; /* advance */
- }
+ tmp.pv_pmap = pmap;
+ tmp.pv_va = va;
+ pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
+ if (pve == NULL)
+ return (NULL);
+ SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
return(pve); /* return removed pve */
}
@@ -1703,13 +1706,11 @@
pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
return (pmap->pm_ptphint[lidx]);
}
- if (lidx == 0)
- pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
- else {
- simple_lock(&pmap->pm_obj[lidx].vmobjlock);
- pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
- simple_unlock(&pmap->pm_obj[lidx].vmobjlock);
- }
+
+ simple_lock(&pmap->pm_obj[lidx].vmobjlock);
+ pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
+ simple_unlock(&pmap->pm_obj[lidx].vmobjlock);
+
return pg;
}
@@ -1723,14 +1724,13 @@
obj = &pmap->pm_obj[lidx];
pmap->pm_stats.resident_count--;
- if (lidx != 0)
- simple_lock(&obj->vmobjlock);
+
+ simple_lock(&obj->vmobjlock);
if (pmap->pm_ptphint[lidx] == ptp)
pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
ptp->wire_count = 0;
uvm_pagefree(ptp);
- if (lidx != 0)
- simple_unlock(&obj->vmobjlock);
+ simple_unlock(&obj->vmobjlock);
}
static void
@@ -2456,7 +2456,8 @@
pmap->pm_stats.wired_count--;
pmap->pm_stats.resident_count--;
- pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
if (ptp)
ptp->wire_count--; /* dropping a PTE */
@@ -2491,7 +2492,7 @@
simple_unlock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
if (pve) {
- pve->pv_next = pv_tofree;
+ SPLAY_RIGHT(pve, pv_node) = pv_tofree;
pv_tofree = pve;
}
@@ -2541,7 +2542,8 @@
if (ptp)
ptp->wire_count--; /* dropping a PTE */
- pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
/*
* if we are not on a pv_head list we are done.
@@ -2733,7 +2735,7 @@
{
int bank, off;
struct pv_head *pvh;
- struct pv_entry *pve, *npve, **prevptr, *killlist = NULL;
+ struct pv_entry *pve, *npve, *killlist = NULL;
pt_entry_t *ptes, opte;
pd_entry_t **pdes;
#ifdef DIAGNOSTIC
@@ -2743,13 +2745,11 @@
/* XXX: vm_page should either contain pv_head or have a pointer to it */
bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
- if (bank == -1) {
- printf("pmap_page_remove: unmanaged page?\n");
- return;
- }
+ if (bank == -1)
+ panic("pmap_page_remove: unmanaged page?");
pvh = &vm_physmem[bank].pmseg.pvhead[off];
- if (pvh->pvh_list == NULL) {
+ if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
return;
}
@@ -2759,9 +2759,8 @@
/* XXX: needed if we hold head->map lock? */
simple_lock(&pvh->pvh_lock);
- for (prevptr = &pvh->pvh_list, pve = pvh->pvh_list;
- pve != NULL; pve = npve) {
- npve = pve->pv_next;
+ for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
+ npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
pmap_map_ptes(pve->pv_pmap, &ptes, &pdes); /* locks pmap */
#ifdef DIAGNOSTIC
@@ -2785,7 +2784,9 @@
pve->pv_pmap->pm_stats.wired_count--;
pve->pv_pmap->pm_stats.resident_count--;
- pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, &cpumask);
+ if (opte & PG_U)
+ pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+ &cpumask);
/* sync R/M bits */
vm_physmem[bank].pmseg.attrs[off] |= (opte & (PG_U|PG_M));
@@ -2799,12 +2800,11 @@
}
}
pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */
- *prevptr = npve; /* remove it */
- pve->pv_next = killlist; /* mark it for death */
+ SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
+ SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */
killlist = pve;
}
pmap_free_pvs(NULL, killlist);
- pvh->pvh_list = NULL;
simple_unlock(&pvh->pvh_lock);
PMAP_HEAD_TO_MAP_UNLOCK();
pmap_tlb_shootnow(cpumask);
@@ -2837,10 +2837,8 @@
/* XXX: vm_page should either contain pv_head or have a pointer to it */
bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
- if (bank == -1) {
- printf("pmap_test_attrs: unmanaged page?\n");
- return(FALSE);
- }
+ if (bank == -1)
+ panic("pmap_test_attrs: unmanaged page?");
/*
* before locking: see if attributes are already set and if so,
@@ -2853,7 +2851,7 @@
/* test to see if there is a list before bothering to lock */
pvh = &vm_physmem[bank].pmseg.pvhead[off];
- if (pvh->pvh_list == NULL) {
+ if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
return(FALSE);
}
@@ -2862,8 +2860,9 @@
/* XXX: needed if we hold head->map lock? */
simple_lock(&pvh->pvh_lock);
- for (pve = pvh->pvh_list; pve != NULL && (*myattrs & testbits) == 0;
- pve = pve->pv_next) {
+ for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
+ pve != NULL && (*myattrs & testbits) == 0;
+ pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
pte = ptes[pl1_i(pve->pv_va)];
pmap_unmap_ptes(pve->pv_pmap);
@@ -2903,10 +2902,8 @@
/* XXX: vm_page should either contain pv_head or have a pointer to it */
bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
- if (bank == -1) {
- printf("pmap_change_attrs: unmanaged page?\n");
- return(FALSE);
- }
+ if (bank == -1)
+ panic("pmap_change_attrs: unmanaged page?");
PMAP_HEAD_TO_MAP_LOCK();
pvh = &vm_physmem[bank].pmseg.pvhead[off];
@@ -2917,7 +2914,7 @@
result = *myattrs & clearbits;
*myattrs &= ~clearbits;
- for (pve = pvh->pvh_list; pve != NULL; pve = pve->pv_next) {
+ SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
pmap_map_ptes(pve->pv_pmap, &ptes, &pdes); /* locks pmap */
#ifdef DIAGNOSTIC
if (!pmap_pdes_valid(pve->pv_va, pdes, NULL))
@@ -3648,9 +3645,7 @@
if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
continue;
pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
-#if defined(MULTIPROCESSOR)
__cpu_simple_lock(&pq->pq_slock);
-#endif
/*
* If there's a global flush already queued, or a
@@ -3659,28 +3654,9 @@
*/
if (pq->pq_flushg > 0 ||
(pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
-#if defined(MULTIPROCESSOR)
__cpu_simple_unlock(&pq->pq_slock);
-#endif
- continue;
- }
-
-#ifdef I386_CPU
- /*
- * i386 CPUs can't invalidate a single VA, only
- * flush the entire TLB, so don't bother allocating
- * jobs for them -- just queue a `flushu'.
- *
- * XXX note that this can be executed for non-i386
- * when called * early (before identifycpu() has set
- * cpu_class)
- */
- if (cpu_class == CPUCLASS_386) {
- pq->pq_flushu++;
- *cpumaskp |= 1U << ci->ci_cpuid;
continue;
}
-#endif
pj = pmap_tlb_shootdown_job_get(pq);
pq->pq_pte |= pte;
@@ -3693,9 +3669,7 @@
*/
if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
pmap_update_pg(va);
-#if defined(MULTIPROCESSOR)
__cpu_simple_unlock(&pq->pq_slock);
-#endif
continue;
} else {
if (pq->pq_pte & pmap_pg_g)
@@ -3717,9 +3691,7 @@
TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
*cpumaskp |= 1U << ci->ci_cpuid;
}
-#if defined(MULTIPROCESSOR)
__cpu_simple_unlock(&pq->pq_slock);
-#endif
}
splx(s);
}
@@ -3743,9 +3715,7 @@
s = splipi();
-#ifdef MULTIPROCESSOR
__cpu_simple_lock(&pq->pq_slock);
-#endif
if (pq->pq_flushg) {
COUNT(flushg);
@@ -3779,8 +3749,8 @@
for (CPU_INFO_FOREACH(cii, ci))
x86_atomic_clearbits_ul(&ci->ci_tlb_ipi_mask,
(1U << cpu_id));
- __cpu_simple_unlock(&pq->pq_slock);
#endif
+ __cpu_simple_unlock(&pq->pq_slock);
splx(s);
}
@@ -3825,9 +3795,7 @@
if (pq->pq_count >= PMAP_TLB_MAXJOBS)
return (NULL);
-#ifdef MULTIPROCESSOR
__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
if (pj_free == NULL) {
#ifdef MULTIPROCESSOR
__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
@@ -3837,9 +3805,7 @@
pj = &pj_free->pja_job;
pj_free =
(union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
-#ifdef MULTIPROCESSOR
__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
pq->pq_count++;
return (pj);
@@ -3862,14 +3828,10 @@
if (pq->pq_count == 0)
panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
#endif
-#ifdef MULTIPROCESSOR
__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
pj->pj_nextfree = &pj_free->pja_job;
pj_free = (union pmap_tlb_shootdown_job_al *)pj;
-#ifdef MULTIPROCESSOR
__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
pq->pq_count--;
}
Index: include/pmap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/pmap.h,v
retrieving revision 1.1
diff -u -r1.1 pmap.h
--- include/pmap.h 26 Apr 2003 18:39:46 -0000 1.1
+++ include/pmap.h 2 Jun 2004 08:00:37 -0000
@@ -334,19 +334,12 @@
* describes one mapping).
*/
-struct pv_entry;
-
-struct pv_head {
- struct simplelock pvh_lock; /* locks every pv on this list */
- struct pv_entry *pvh_list; /* head of list (locked by pvh_lock) */
-};
-
-struct pv_entry { /* locked by its list's pvh_lock */
- struct pv_entry *pv_next; /* next entry */
- struct pmap *pv_pmap; /* the pmap */
- vaddr_t pv_va; /* the virtual address */
- struct vm_page *pv_ptp; /* the vm_page of the PTP */
-};
+struct pv_entry { /* locked by its tree's pvh_lock */
+ SPLAY_ENTRY(pv_entry) pv_node; /* splay-tree node */
+ struct pmap *pv_pmap; /* the pmap */
+ vaddr_t pv_va; /* the virtual address */
+ struct vm_page *pv_ptp; /* the vm_page of the PTP */
+};
/*
* pv_entrys are dynamically allocated in chunks from a single page.
Index: include/vmparam.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/vmparam.h,v
retrieving revision 1.4
diff -u -r1.4 vmparam.h
--- include/vmparam.h 23 Mar 2004 18:54:32 -0000 1.4
+++ include/vmparam.h 2 Jun 2004 08:00:37 -0000
@@ -37,6 +37,8 @@
#ifndef _VMPARAM_H_
#define _VMPARAM_H_
+#include <sys/tree.h>
+
/*
* Machine dependent constants for 386.
*/
@@ -123,6 +125,25 @@
#define __HAVE_PMAP_PHYSSEG
+#define __HAVE_VM_PAGE_MD
+#define VM_MDPAGE_INIT(pg) \
+ memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage)); \
+ simple_lock_init(&(pg)->mdpage.mp_pvhead.pvh_lock); \
+ SPLAY_INIT(&(pg)->mdpage.mp_pvhead.pvh_root);
+
+struct pv_entry;
+
+struct pv_head {
+ struct simplelock pvh_lock; /* locks every pv in this tree */
+ SPLAY_HEAD(pvtree, pv_entry) pvh_root;
+ /* head of tree (locked by pvh_lock) */
+};
+
+struct vm_page_md {
+ struct pv_head mp_pvhead;
+ int mp_attrs;
+};
+
/*
* pmap specific data stored in the vm_physmem[] array
*/