Index: arch/amd64/amd64/locore.S =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/amd64/locore.S,v retrieving revision 1.46 diff -u -p -r1.46 locore.S --- arch/amd64/amd64/locore.S 21 May 2008 03:50:42 -0000 1.46 +++ arch/amd64/amd64/locore.S 2 Jun 2008 14:09:50 -0000 @@ -1249,15 +1249,20 @@ iret_return: jmp .Losyscall_checkast /* re-check ASTs */ /* - * void sse2_zero_page(void *pg) + * void sse2_idlezero_page(void *pg) * - * Zero a page without polluting the cache. + * Zero a page without polluting the cache. Preemption must be + * disabled by the caller. Abort if a preemption is pending. */ -ENTRY(sse2_zero_page) - movl $PAGE_SIZE, %ecx +ENTRY(sse2_idlezero_page) + pushq %rbp + movq %rsp,%rbp + movl $(PAGE_SIZE/64), %ecx xorq %rax, %rax .align 16 1: + cmpl $0, CPUVAR(RESCHED) + jnz 2f movnti %rax, 0(%rdi) movnti %rax, 8(%rdi) movnti %rax, 16(%rdi) @@ -1266,32 +1271,14 @@ ENTRY(sse2_zero_page) movnti %rax, 40(%rdi) movnti %rax, 48(%rdi) movnti %rax, 56(%rdi) - subl $64, %ecx - leaq 64(%rdi), %rdi + addq $64, %rdi + decl %ecx jnz 1b sfence + incl %eax + popq %rbp ret - -/* - * void sse2_copy_page(void *src, void *dst) - * - * Copy a page without polluting the cache. 
- */ -ENTRY(sse2_copy_page) - movl $PAGE_SIZE, %ecx - .align 16 -1: - movq 0(%rdi), %rax - movq 8(%rdi), %rdx - movq 16(%rdi), %r8 - movq 24(%rdi), %r9 - movnti %rax, 0(%rsi) - movnti %rdx, 8(%rsi) - movnti %r8, 16(%rsi) - movnti %r9, 24(%rsi) - subl $32, %ecx - leaq 32(%rdi), %rdi - leaq 32(%rsi), %rsi - jnz 1b +2: sfence + popq %rbp ret Index: arch/i386/i386/locore.S =================================================================== RCS file: /cvsroot/src/sys/arch/i386/i386/locore.S,v retrieving revision 1.75 diff -u -p -r1.75 locore.S --- arch/i386/i386/locore.S 1 Jun 2008 18:37:12 -0000 1.75 +++ arch/i386/i386/locore.S 2 Jun 2008 14:09:57 -0000 @@ -1227,18 +1227,21 @@ END(npx586bug1) #endif /* NNPX > 0 */ /* - * void sse2_zero_page(void *pg) + * void sse2_idlezero_page(void *pg) * - * Zero a page without polluting the cache. + * Zero a page without polluting the cache. Preemption must be + * disabled by the caller. Abort if a preemption is pending. */ -ENTRY(sse2_zero_page) +ENTRY(sse2_idlezero_page) pushl %ebp movl %esp,%ebp movl 8(%esp), %edx - movl $(PAGE_SIZE/64), %ecx + movl $(PAGE_SIZE/32), %ecx xorl %eax, %eax .align 16 1: + cmpl $0, CPUVAR(RESCHED) + jnz 2f movnti %eax, 0(%edx) movnti %eax, 4(%edx) movnti %eax, 8(%edx) @@ -1247,60 +1250,15 @@ ENTRY(sse2_zero_page) movnti %eax, 20(%edx) movnti %eax, 24(%edx) movnti %eax, 28(%edx) - movnti %eax, 32(%edx) - movnti %eax, 36(%edx) - movnti %eax, 40(%edx) - movnti %eax, 44(%edx) - movnti %eax, 48(%edx) - movnti %eax, 52(%edx) - movnti %eax, 56(%edx) - movnti %eax, 60(%edx) - addl $64, %edx + addl $32, %edx decl %ecx jnz 1b sfence + incl %eax pop %ebp ret -END(sse2_zero_page) - -/* - * void sse2_copy_page(void *src, void *dst) - * - * Copy a page without polluting the cache. 
- */ -ENTRY(sse2_copy_page) - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp), %esi - movl 24(%esp), %edi - xorl %ebp, %ebp - .align 16 -1: - movl 0(%esi,%ebp), %eax - movl 4(%esi,%ebp), %ebx - movl 8(%esi,%ebp), %ecx - movl 12(%esi,%ebp), %edx - movnti %eax, 0(%edi,%ebp) - movnti %ebx, 4(%edi,%ebp) - movnti %ecx, 8(%edi,%ebp) - movnti %edx, 12(%edi,%ebp) - movl 16(%esi,%ebp), %eax - movl 20(%esi,%ebp), %ebx - movl 24(%esi,%ebp), %ecx - movl 28(%esi,%ebp), %edx - movnti %eax, 16(%edi,%ebp) - movnti %ebx, 20(%edi,%ebp) - movnti %ecx, 24(%edi,%ebp) - movnti %edx, 28(%edi,%ebp) - addl $32, %ebp - cmpl $PAGE_SIZE, %ebp - jne 1b +2: sfence - popl %edi - popl %esi - popl %ebx popl %ebp ret -END(sse2_copy_page) +END(sse2_idlezero_page) Index: arch/x86/include/pmap.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/pmap.h,v retrieving revision 1.14 diff -u -p -r1.14 pmap.h --- arch/x86/include/pmap.h 3 May 2008 02:56:13 -0000 1.14 +++ arch/x86/include/pmap.h 2 Jun 2008 14:09:59 -0000 @@ -350,8 +350,7 @@ paddr_t vtophys(vaddr_t); vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t); void pmap_cpu_init_early(struct cpu_info *); void pmap_cpu_init_late(struct cpu_info *); -void sse2_zero_page(void *); -void sse2_copy_page(void *, void *); +bool sse2_idlezero_page(void *); #ifdef XEN Index: arch/x86/x86/bus_dma.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/bus_dma.c,v retrieving revision 1.42 diff -u -p -r1.42 bus_dma.c --- arch/x86/x86/bus_dma.c 28 Apr 2008 20:23:40 -0000 1.42 +++ arch/x86/x86/bus_dma.c 2 Jun 2008 14:09:59 -0000 @@ -171,13 +171,13 @@ _bus_dmamem_alloc_range(bus_dma_tag_t t, * Compute the location, size, and number of segments actually * returned by the VM code. 
*/ - m = mlist.tqh_first; + m = TAILQ_FIRST(&mlist); curseg = 0; lastaddr = segs[curseg].ds_addr = VM_PAGE_TO_PHYS(m); segs[curseg].ds_len = PAGE_SIZE; - m = m->pageq.tqe_next; + m = m->pageq.queue.tqe_next; - for (; m != NULL; m = m->pageq.tqe_next) { + for (; m != NULL; m = m->pageq.queue.tqe_next) { curaddr = VM_PAGE_TO_PHYS(m); #ifdef DIAGNOSTIC if (curaddr < low || curaddr >= high) { @@ -986,7 +986,7 @@ _bus_dmamem_free(bus_dma_tag_t t, bus_dm addr < (segs[curseg].ds_addr + segs[curseg].ds_len); addr += PAGE_SIZE) { m = _BUS_BUS_TO_VM_PAGE(addr); - TAILQ_INSERT_TAIL(&mlist, m, pageq); + TAILQ_INSERT_TAIL(&mlist, m, pageq.queue); } } Index: arch/x86/x86/cpu.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/cpu.c,v retrieving revision 1.54 diff -u -p -r1.54 cpu.c --- arch/x86/x86/cpu.c 28 May 2008 11:50:01 -0000 1.54 +++ arch/x86/x86/cpu.c 2 Jun 2008 14:10:00 -0000 @@ -322,6 +322,9 @@ cpu_attach(device_t parent, device_t sel ci->ci_cpuid = caa->cpu_number; ci->ci_func = caa->cpu_func; + /* Must be before mi_cpu_attach(). */ + cpu_vm_init(ci); + if (caa->cpu_role == CPU_ROLE_AP) { int error; @@ -404,7 +407,6 @@ cpu_attach(device_t parent, device_t sel panic("unknown processor type??\n"); } - cpu_vm_init(ci); atomic_or_32(&cpus_attached, ci->ci_cpumask); if (!pmf_device_register(self, cpu_suspend, cpu_resume)) @@ -522,6 +524,9 @@ cpu_boot_secondary_processors(void) /* Now that we know about the TSC, attach the timecounter. */ tsc_tc_init(); + + /* Enable zeroing of pages in the idle loop if we have SSE2. 
*/ + vm_page_zero_enable = ((cpu_feature & CPUID_SSE2) != 0); } static void Index: arch/x86/x86/identcpu.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/identcpu.c,v retrieving revision 1.8 diff -u -p -r1.8 identcpu.c --- arch/x86/x86/identcpu.c 30 May 2008 18:49:03 -0000 1.8 +++ arch/x86/x86/identcpu.c 2 Jun 2008 14:10:00 -0000 @@ -444,12 +444,6 @@ cpu_probe_cyrix_cmn(struct cpu_info *ci) cyrix_write_reg(0x3c, cyrix_read_reg(0x3c) | 0x87); /* disable access to ccr4/ccr5 */ cyrix_write_reg(0xC3, c3); - - /* - * XXX disable page zero in the idle loop, it seems to - * cause panics on these CPUs. - */ - vm_page_zero_enable = FALSE; } static void Index: arch/x86/x86/pmap.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/pmap.c,v retrieving revision 1.64 diff -u -p -r1.64 pmap.c --- arch/x86/x86/pmap.c 28 May 2008 11:50:01 -0000 1.64 +++ arch/x86/x86/pmap.c 2 Jun 2008 14:10:01 -0000 @@ -2963,12 +2963,9 @@ pmap_zero_page(paddr_t pa) pmap_pte_flush(); pmap_update_pg((vaddr_t)zerova); /* flush TLB */ - if (cpu_feature & CPUID_SSE2) - sse2_zero_page(zerova); - else - memset(zerova, 0, PAGE_SIZE); + memset(zerova, 0, PAGE_SIZE); #if defined(DIAGNOSTIC) || defined(XEN) pmap_pte_set(zpte, 0); /* zap ! */ pmap_pte_flush(); #endif @@ -2984,9 +2981,30 @@ pmap_zero_page(paddr_t pa) bool pmap_pageidlezero(paddr_t pa) { + pt_entry_t *zpte; + void *zerova; + bool rv; + int id; - pmap_zero_page(pa); - return true; + id = cpu_number(); + zpte = PTESLEW(zero_pte, id); + zerova = VASLEW(zerop, id); + + KASSERT(cpu_feature & CPUID_SSE2); + KASSERT(*zpte == 0); + + pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); + pmap_pte_flush(); + pmap_update_pg((vaddr_t)zerova); /* flush TLB */ + + rv = sse2_idlezero_page(zerova); + +#if defined(DIAGNOSTIC) || defined(XEN) + pmap_pte_set(zpte, 0); /* zap ! 
*/ + pmap_pte_flush(); +#endif + + return rv; } /* @@ -3009,21 +3027,17 @@ pmap_copy_page(paddr_t srcpa, paddr_t ds csrcva = VASLEW(csrcp, id); cdstva = VASLEW(cdstp, id); -#ifdef DIAGNOSTIC - if (*spte || *dpte) - panic("pmap_copy_page: lock botch"); -#endif + KASSERT(*spte == 0 && *dpte == 0); pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); pmap_pte_set(dpte, pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); pmap_pte_flush(); pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); - if (cpu_feature & CPUID_SSE2) - sse2_copy_page(csrcva, cdstva); - else - memcpy(cdstva, csrcva, PAGE_SIZE); -#if defined(DIAGNOSTIC) || defined(XEN) + + memcpy(cdstva, csrcva, PAGE_SIZE); + +#if defined(DIAGNOSTIC) || defined(XEN) pmap_pte_set(spte, 0); pmap_pte_set(dpte, 0); pmap_pte_flush(); Index: miscfs/genfs/genfs_io.c =================================================================== RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_io.c,v retrieving revision 1.7 diff -u -p -r1.7 genfs_io.c --- miscfs/genfs/genfs_io.c 14 May 2008 16:49:47 -0000 1.7 +++ miscfs/genfs/genfs_io.c 2 Jun 2008 14:10:14 -0000 @@ -857,7 +857,7 @@ retry: endmp.offset = (voff_t)-1; endmp.flags = PG_BUSY; pg = TAILQ_FIRST(&uobj->memq); - TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq); + TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq.queue); uvm_lwp_hold(l); } else { pg = uvm_pagelookup(uobj, off); @@ -882,7 +882,7 @@ retry: if (pg->flags & (PG_RELEASED|PG_PAGEOUT)) { wasclean = false; } - pg = TAILQ_NEXT(pg, listq); + pg = TAILQ_NEXT(pg, listq.queue); continue; } off = pg->offset; @@ -921,9 +921,9 @@ retry: break; } if (by_list) { - TAILQ_INSERT_BEFORE(pg, &curmp, listq); + TAILQ_INSERT_BEFORE(pg, &curmp, listq.queue); UVMHIST_LOG(ubchist, "curmp next %p", - TAILQ_NEXT(&curmp, listq), 0,0,0); + TAILQ_NEXT(&curmp, listq.queue), 0,0,0); } if (yld) { mutex_exit(slock); @@ -936,9 +936,9 @@ retry: } if (by_list) { UVMHIST_LOG(ubchist, "after next %p", - TAILQ_NEXT(&curmp, listq), 0,0,0); - pg = TAILQ_NEXT(&curmp, listq); - 
TAILQ_REMOVE(&uobj->memq, &curmp, listq); + TAILQ_NEXT(&curmp, listq.queue), 0,0,0); + pg = TAILQ_NEXT(&curmp, listq.queue); + TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); } else { pg = uvm_pagelookup(uobj, off); } @@ -1049,7 +1049,7 @@ retry: for (i = 0; i < npages; i++) { tpg = pgs[i]; KASSERT(tpg->uobject == uobj); - if (by_list && tpg == TAILQ_NEXT(pg, listq)) + if (by_list && tpg == TAILQ_NEXT(pg, listq.queue)) pg = tpg; if (tpg->offset < startoff || tpg->offset >= endoff) continue; @@ -1071,7 +1071,7 @@ retry: * and needs_clean is false. */ - nextpg = TAILQ_NEXT(tpg, listq); + nextpg = TAILQ_NEXT(tpg, listq.queue); uvm_pagefree(tpg); if (pagedaemon) uvmexp.pdfreed++; @@ -1091,14 +1091,14 @@ retry: if (by_list) { TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp, - listq); + listq.queue); } mutex_exit(slock); error = GOP_WRITE(vp, pgs, npages, flags); mutex_enter(slock); if (by_list) { - pg = TAILQ_NEXT(&curmp, listq); - TAILQ_REMOVE(&uobj->memq, &curmp, listq); + pg = TAILQ_NEXT(&curmp, listq.queue); + TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); } if (error) { break; @@ -1117,7 +1117,7 @@ retry: pg = nextpg; nextpg = NULL; } else { - pg = TAILQ_NEXT(pg, listq); + pg = TAILQ_NEXT(pg, listq.queue); } } else { off += (npages - nback) << PAGE_SHIFT; @@ -1127,7 +1127,7 @@ retry: } } if (by_list) { - TAILQ_REMOVE(&uobj->memq, &endmp, listq); + TAILQ_REMOVE(&uobj->memq, &endmp, listq.queue); uvm_lwp_rele(l); } @@ -1146,7 +1146,7 @@ retry: if (cleanall && wasclean && gp->g_dirtygen == dirtygen && (vp->v_iflag & VI_ONWORKLST) != 0) { #if defined(DEBUG) - TAILQ_FOREACH(pg, &uobj->memq, listq) { + TAILQ_FOREACH(pg, &uobj->memq, listq.queue) { if ((pg->flags & PG_CLEAN) == 0) { printf("%s: %p: !CLEAN\n", __func__, pg); } Index: nfs/nfs_subs.c =================================================================== RCS file: /cvsroot/src/sys/nfs/nfs_subs.c,v retrieving revision 1.203 diff -u -p -r1.203 nfs_subs.c --- nfs/nfs_subs.c 10 May 2008 02:26:10 -0000 1.203 +++ 
nfs/nfs_subs.c 2 Jun 2008 14:10:17 -0000 @@ -2670,7 +2670,7 @@ nfs_clearcommit(mp) np->n_commitflags &= ~(NFS_COMMIT_PUSH_VALID | NFS_COMMIT_PUSHED_VALID); mutex_enter(&vp->v_uobj.vmobjlock); - TAILQ_FOREACH(pg, &vp->v_uobj.memq, listq) { + TAILQ_FOREACH(pg, &vp->v_uobj.memq, listq.queue) { pg->flags &= ~PG_NEEDCOMMIT; } mutex_exit(&vp->v_uobj.vmobjlock); Index: sys/cpu_data.h =================================================================== RCS file: /cvsroot/src/sys/sys/cpu_data.h,v retrieving revision 1.26 diff -u -p -r1.26 cpu_data.h --- sys/cpu_data.h 1 Jun 2008 21:24:15 -0000 1.26 +++ sys/cpu_data.h 2 Jun 2008 14:10:17 -0000 @@ -86,6 +86,7 @@ struct cpu_data { u_int cpu_nsyscall; /* syscall counter */ u_int cpu_ntrap; /* trap counter */ u_int cpu_nswtch; /* context switch counter */ + void *cpu_uvm; /* uvm per-cpu data */ void *cpu_softcpu; /* soft interrupt table */ TAILQ_HEAD(,buf) cpu_biodone; /* finished block xfers */ percpu_cpu_t cpu_percpu; /* per-cpu data */ Index: ufs/lfs/lfs_vnops.c =================================================================== RCS file: /cvsroot/src/sys/ufs/lfs/lfs_vnops.c,v retrieving revision 1.216 diff -u -p -r1.216 lfs_vnops.c --- ufs/lfs/lfs_vnops.c 28 Apr 2008 20:24:11 -0000 1.216 +++ ufs/lfs/lfs_vnops.c 2 Jun 2008 14:10:18 -0000 @@ -1839,7 +1839,7 @@ check_dirty(struct lfs *fs, struct vnode ((curpg->offset & fs->lfs_bmask) || curpg->offset >= vp->v_size || curpg->offset >= endoffset)) - curpg = TAILQ_NEXT(curpg, listq); + curpg = TAILQ_NEXT(curpg, listq.queue); } if (curpg == NULL) break; @@ -1896,7 +1896,7 @@ check_dirty(struct lfs *fs, struct vnode } if (pages_per_block > 0 && nonexistent >= pages_per_block) { if (by_list) { - curpg = TAILQ_NEXT(curpg, listq); + curpg = TAILQ_NEXT(curpg, listq.queue); } else { soff += fs->lfs_bsize; } @@ -1940,7 +1940,7 @@ check_dirty(struct lfs *fs, struct vnode break; if (by_list) { - curpg = TAILQ_NEXT(curpg, listq); + curpg = TAILQ_NEXT(curpg, listq.queue); } else { soff += 
MAX(PAGE_SIZE, fs->lfs_bsize); } Index: uvm/uvm.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm.h,v retrieving revision 1.53 diff -u -p -r1.53 uvm.h --- uvm/uvm.h 2 Jan 2008 11:49:15 -0000 1.53 +++ uvm/uvm.h 2 Jun 2008 14:10:18 -0000 @@ -74,6 +74,16 @@ struct workqueue; /* + * per-cpu data + */ + +struct uvm_cpu { + struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ + int page_free_nextcolor; /* next color to allocate from */ + int page_idlezero_next; /* which color to zero next */ +}; + +/* * uvm structure (vm global state: collected in one structure for ease * of reference...) */ @@ -83,7 +93,6 @@ struct uvm { /* vm_page queues */ struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ - int page_free_nextcolor; /* next color to allocate from */ bool page_init_done; /* TRUE if uvm_page_init() finished */ bool page_idle_zero; /* TRUE if we should try to zero pages in the idle loop */ @@ -95,11 +104,6 @@ struct uvm { /* aiodone daemon */ struct workqueue *aiodone_queue; - /* page hash */ - struct pglist *page_hash; /* page hash table (vp/off->page) */ - int page_nhash; /* number of buckets */ - int page_hashmask; /* hash mask */ - /* aio_done is locked by uvm.pagedaemon_lock and splbio! 
*/ TAILQ_HEAD(, buf) aio_done; /* done async i/o reqs */ @@ -108,6 +112,9 @@ struct uvm { kcondvar_t scheduler_cv; bool scheduler_kicked; int swapout_enabled; + + /* per-cpu data */ + struct uvm_cpu cpus[MAXCPUS]; }; /* Index: uvm/uvm_aobj.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_aobj.c,v retrieving revision 1.100 diff -u -p -r1.100 uvm_aobj.c --- uvm/uvm_aobj.c 5 May 2008 17:11:17 -0000 1.100 +++ uvm/uvm_aobj.c 2 Jun 2008 14:10:19 -0000 @@ -808,7 +808,7 @@ uao_put(struct uvm_object *uobj, voff_t */ if (by_list) { - TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq); + TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq.queue); nextpg = TAILQ_FIRST(&uobj->memq); uvm_lwp_hold(curlwp); } else { @@ -822,7 +822,7 @@ uao_put(struct uvm_object *uobj, voff_t pg = nextpg; if (pg == &endmp) break; - nextpg = TAILQ_NEXT(pg, listq); + nextpg = TAILQ_NEXT(pg, listq.queue); if (pg->offset < start || pg->offset >= stop) continue; } else { @@ -841,16 +841,16 @@ uao_put(struct uvm_object *uobj, voff_t if (pg->flags & PG_BUSY) { if (by_list) { - TAILQ_INSERT_BEFORE(pg, &curmp, listq); + TAILQ_INSERT_BEFORE(pg, &curmp, listq.queue); } pg->flags |= PG_WANTED; UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, "uao_put", 0); mutex_enter(&uobj->vmobjlock); if (by_list) { - nextpg = TAILQ_NEXT(&curmp, listq); + nextpg = TAILQ_NEXT(&curmp, listq.queue); TAILQ_REMOVE(&uobj->memq, &curmp, - listq); + listq.queue); } else curoff -= PAGE_SIZE; continue; @@ -909,7 +909,7 @@ uao_put(struct uvm_object *uobj, voff_t } } if (by_list) { - TAILQ_REMOVE(&uobj->memq, &endmp, listq); + TAILQ_REMOVE(&uobj->memq, &endmp, listq.queue); uvm_lwp_rele(curlwp); } mutex_exit(&uobj->vmobjlock); Index: uvm/uvm_extern.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_extern.h,v retrieving revision 1.145 diff -u -p -r1.145 uvm_extern.h --- uvm/uvm_extern.h 29 Feb 2008 20:35:23 -0000 1.145 +++ uvm/uvm_extern.h 2 
Jun 2008 14:10:20 -0000 @@ -326,6 +326,8 @@ struct uvmexp { aborted */ int colorhit; /* pagealloc where we got optimal color */ int colormiss; /* pagealloc where we didn't */ + int cpuhit; /* pagealloc where we allocated locally */ + int cpumiss; /* pagealloc where we didn't */ /* fault subcounters. XXX: should be 64-bit counters */ int fltnoram; /* number of times fault was out of ram */ @@ -448,6 +450,8 @@ struct uvmexp_sysctl { int64_t colorhit; int64_t colormiss; int64_t ncolors; + int64_t cpuhit; + int64_t cpumiss; }; #ifdef _KERNEL Index: uvm/uvm_glue.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_glue.c,v retrieving revision 1.127 diff -u -p -r1.127 uvm_glue.c --- uvm/uvm_glue.c 31 May 2008 21:26:01 -0000 1.127 +++ uvm/uvm_glue.c 2 Jun 2008 14:10:21 -0000 @@ -276,16 +276,6 @@ uvm_lwp_fork(struct lwp *l1, struct lwp cpu_lwp_fork(l1, l2, stack, stacksize, func, arg); } -/* - * uvm_cpu_attach: initialize per-CPU data structures. - */ - -void -uvm_cpu_attach(struct cpu_info *ci) -{ - -} - static int uarea_swapin(vaddr_t addr) { Index: uvm/uvm_init.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_init.c,v retrieving revision 1.32 diff -u -p -r1.32 uvm_init.c --- uvm/uvm_init.c 28 Jan 2008 12:22:47 -0000 1.32 +++ uvm/uvm_init.c 2 Jun 2008 14:10:21 -0000 @@ -158,7 +158,6 @@ uvm_init(void) * of kernel objects. 
*/ - uvm_page_rehash(); uao_create(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNSWAP); Index: uvm/uvm_map.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_map.c,v retrieving revision 1.255 diff -u -p -r1.255 uvm_map.c --- uvm/uvm_map.c 31 May 2008 13:00:03 -0000 1.255 +++ uvm/uvm_map.c 2 Jun 2008 14:10:24 -0000 @@ -4874,7 +4874,7 @@ uvm_object_printit(struct uvm_object *uo return; } (*pr)(" PAGES :\n "); - TAILQ_FOREACH(pg, &uobj->memq, listq) { + TAILQ_FOREACH(pg, &uobj->memq, listq.queue) { cnt++; (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); if ((cnt % 3) == 0) { @@ -4899,7 +4899,7 @@ uvm_page_printit(struct vm_page *pg, boo { struct vm_page *tpg; struct uvm_object *uobj; - struct pglist *pgl; + struct pgflist *pgl; char pgbuf[128]; char pqbuf[128]; @@ -4935,7 +4935,7 @@ uvm_page_printit(struct vm_page *pg, boo uobj = pg->uobject; if (uobj) { (*pr)(" checking object list\n"); - TAILQ_FOREACH(tpg, &uobj->memq, listq) { + TAILQ_FOREACH(tpg, &uobj->memq, listq.queue) { if (tpg == pg) { break; } @@ -4960,7 +4960,7 @@ uvm_page_printit(struct vm_page *pg, boo if (pgl) { (*pr)(" checking pageq list\n"); - TAILQ_FOREACH(tpg, pgl, pageq) { + LIST_FOREACH(tpg, pgl, pageq.list) { if (tpg == pg) { break; } Index: uvm/uvm_meter.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_meter.c,v retrieving revision 1.48 diff -u -p -r1.48 uvm_meter.c --- uvm/uvm_meter.c 24 Apr 2008 15:35:31 -0000 1.48 +++ uvm/uvm_meter.c 2 Jun 2008 14:10:24 -0000 @@ -243,6 +243,8 @@ sysctl_vm_uvmexp2(SYSCTLFN_ARGS) u.execpages = uvmexp.execpages; u.colorhit = uvmexp.colorhit; u.colormiss = uvmexp.colormiss; + u.cpuhit = uvmexp.cpuhit; + u.cpumiss = uvmexp.cpumiss; node = *rnode; node.sysctl_data = &u; Index: uvm/uvm_object.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_object.h,v retrieving revision 1.24 diff -u -p 
-r1.24 uvm_object.h --- uvm/uvm_object.h 2 Jan 2008 11:49:18 -0000 1.24 +++ uvm/uvm_object.h 2 Jun 2008 14:10:24 -0000 @@ -41,6 +41,10 @@ * uvm_object.h */ +#ifdef _KERNEL +#include <sys/rb.h> +#endif + /* * uvm_object: all that is left of mach objects. */ @@ -51,6 +55,9 @@ struct uvm_object { struct pglist memq; /* pages in this object */ int uo_npages; /* # of pages in memq */ int uo_refs; /* reference count */ +#ifdef _KERNEL + struct rb_tree rb_tree; /* tree of pages */ +#endif }; /* @@ -102,6 +109,8 @@ extern const struct uvm_pagerops aobj_pa #define UVM_OBJ_IS_AOBJ(uobj) \ ((uobj)->pgops == &aobj_pager) +extern const struct rb_tree_ops uvm_page_tree_ops; + #define UVM_OBJ_INIT(uobj, ops, refs) \ do { \ mutex_init(&(uobj)->vmobjlock, MUTEX_DEFAULT, IPL_NONE);\ @@ -109,6 +118,7 @@ extern const struct uvm_pagerops aobj_pa TAILQ_INIT(&(uobj)->memq); \ (uobj)->uo_npages = 0; \ (uobj)->uo_refs = (refs); \ + rb_tree_init(&(uobj)->rb_tree, &uvm_page_tree_ops); \ } while (/* CONSTCOND */ 0) #define UVM_OBJ_DESTROY(uobj) \ Index: uvm/uvm_page.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_page.c,v retrieving revision 1.132 diff -u -p -r1.132 uvm_page.c --- uvm/uvm_page.c 2 Jun 2008 11:11:14 -0000 1.132 +++ uvm/uvm_page.c 2 Jun 2008 14:10:25 -0000 @@ -84,6 +84,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v #include #include #include +#include #include #include @@ -124,14 +125,6 @@ static vaddr_t virtual_space_start; static vaddr_t virtual_space_end; /* - * we use a hash table with only one bucket during bootup. we will - * later rehash (resize) the hash table once the allocator is ready. - * we static allocate the one bootstrap bucket below... - */ - -static struct pglist uvm_bootbucket; - -/* * we allocate an initial number of page colors in uvm_page_init(), * and remember them. We may re-color pages as cache sizes are * discovered during the autoconfiguration phase. 
But we can never @@ -148,20 +141,6 @@ vaddr_t uvm_zerocheckkva; #endif /* DEBUG */ /* - * locks on the hash table. allocated in 32 byte chunks to try - * and reduce cache traffic between CPUs. - */ - -#define UVM_HASHLOCK_CNT 32 -#define uvm_hashlock(hash) \ - (&uvm_hashlocks[(hash) & (UVM_HASHLOCK_CNT - 1)].lock) - -static union { - kmutex_t lock; - uint8_t pad[32]; -} uvm_hashlocks[UVM_HASHLOCK_CNT] __aligned(32); - -/* * local prototypes */ @@ -170,11 +149,49 @@ static void uvm_pageinsert_after(struct static void uvm_pageremove(struct vm_page *); /* + * per-object tree of pages + */ + +static signed int +uvm_page_compare_nodes(const struct rb_node *n1, const struct rb_node *n2) +{ + const struct vm_page *pg1 = (const void *)n1; + const struct vm_page *pg2 = (const void *)n2; + const voff_t a = pg1->offset; + const voff_t b = pg2->offset; + + if (a < b) + return 1; + if (a > b) + return -1; + return 0; +} + +static signed int +uvm_page_compare_key(const struct rb_node *n, const void *key) +{ + const struct vm_page *pg = (const void *)n; + const voff_t a = pg->offset; + const voff_t b = *(const voff_t *)key; + + if (a < b) + return 1; + if (a > b) + return -1; + return 0; +} + +const struct rb_tree_ops uvm_page_tree_ops = { + .rb_compare_nodes = uvm_page_compare_nodes, + .rb_compare_key = uvm_page_compare_key, +}; + +/* * inline functions */ /* - * uvm_pageinsert: insert a page in the object and the hash table + * uvm_pageinsert: insert a page in the object. 
* uvm_pageinsert_after: insert a page into the specified place in listq * * => caller must lock object @@ -186,22 +203,14 @@ static void uvm_pageremove(struct vm_pag inline static void uvm_pageinsert_after(struct vm_page *pg, struct vm_page *where) { - struct pglist *buck; struct uvm_object *uobj = pg->uobject; - kmutex_t *lock; - u_int hash; KASSERT(mutex_owned(&uobj->vmobjlock)); KASSERT((pg->flags & PG_TABLED) == 0); KASSERT(where == NULL || (where->flags & PG_TABLED)); KASSERT(where == NULL || (where->uobject == uobj)); - hash = uvm_pagehash(uobj, pg->offset); - buck = &uvm.page_hash[hash]; - lock = uvm_hashlock(hash); - mutex_spin_enter(lock); - TAILQ_INSERT_TAIL(buck, pg, hashq); - mutex_spin_exit(lock); + rb_tree_insert_node(&uobj->rb_tree, &pg->rb_node); if (UVM_OBJ_IS_VNODE(uobj)) { if (uobj->uo_npages == 0) { @@ -219,9 +228,9 @@ uvm_pageinsert_after(struct vm_page *pg, } if (where) - TAILQ_INSERT_AFTER(&uobj->memq, where, pg, listq); + TAILQ_INSERT_AFTER(&uobj->memq, where, pg, listq.queue); else - TAILQ_INSERT_TAIL(&uobj->memq, pg, listq); + TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue); pg->flags |= PG_TABLED; uobj->uo_npages++; } @@ -234,7 +243,7 @@ uvm_pageinsert(struct vm_page *pg) } /* - * uvm_page_remove: remove page from object and hash + * uvm_page_remove: remove page from object. 
* * => caller must lock object * => caller must lock page queues @@ -243,20 +252,12 @@ uvm_pageinsert(struct vm_page *pg) static inline void uvm_pageremove(struct vm_page *pg) { - struct pglist *buck; struct uvm_object *uobj = pg->uobject; - kmutex_t *lock; - u_int hash; KASSERT(mutex_owned(&uobj->vmobjlock)); KASSERT(pg->flags & PG_TABLED); - hash = uvm_pagehash(uobj, pg->offset); - buck = &uvm.page_hash[hash]; - lock = uvm_hashlock(hash); - mutex_spin_enter(lock); - TAILQ_REMOVE(buck, pg, hashq); - mutex_spin_exit(lock); + rb_tree_remove_node(&uobj->rb_tree, &pg->rb_node); if (UVM_OBJ_IS_VNODE(uobj)) { if (uobj->uo_npages == 1) { @@ -275,7 +276,7 @@ uvm_pageremove(struct vm_page *pg) /* object should be locked */ uobj->uo_npages--; - TAILQ_REMOVE(&uobj->memq, pg, listq); + TAILQ_REMOVE(&uobj->memq, pg, listq.queue); pg->flags &= ~PG_TABLED; pg->uobject = NULL; } @@ -287,7 +288,7 @@ uvm_page_init_buckets(struct pgfreelist for (color = 0; color < uvmexp.ncolors; color++) { for (i = 0; i < PGFL_NQUEUES; i++) { - TAILQ_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]); + LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]); } } } @@ -302,44 +303,26 @@ void uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) { vsize_t freepages, pagecount, bucketcount, n; - struct pgflbucket *bucketarray; + struct pgflbucket *bucketarray, *cpuarray; struct vm_page *pagearray; int lcv; u_int i; paddr_t paddr; + KASSERT(ncpus <= 1); + /* * init the page queues and page queue locks, except the free * list; we allocate that later (with the initial vm_page * structures). */ + curcpu()->ci_data.cpu_uvm = &uvm.cpus[0]; uvmpdpol_init(); mutex_init(&uvm_pageqlock, MUTEX_DRIVER, IPL_NONE); mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM); /* - * init the => hash table. for now - * we just have one bucket (the bootstrap bucket). later on we - * will allocate new buckets as we dynamically resize the hash table. 
- */ - - uvm.page_nhash = 1; /* 1 bucket */ - uvm.page_hashmask = 0; /* mask for hash function */ - uvm.page_hash = &uvm_bootbucket; /* install bootstrap bucket */ - TAILQ_INIT(uvm.page_hash); /* init hash table */ - - /* - * init hashtable locks. these must be spinlocks, as they are - * called from sites in the pmap modules where we cannot block. - * if taking multiple locks, the order is: low numbered first, - * high numbered second. - */ - - for (i = 0; i < UVM_HASHLOCK_CNT; i++) - mutex_init(&uvm_hashlocks[i].lock, MUTEX_SPIN, IPL_VM); - - /* * allocate vm_page structures. */ @@ -388,14 +371,18 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr (PAGE_SIZE + sizeof(struct vm_page)); bucketarray = (void *)uvm_pageboot_alloc((bucketcount * - sizeof(struct pgflbucket)) + (pagecount * + sizeof(struct pgflbucket) * 2) + (pagecount * sizeof(struct vm_page))); - pagearray = (struct vm_page *)(bucketarray + bucketcount); + cpuarray = bucketarray + bucketcount; + pagearray = (struct vm_page *)(bucketarray + bucketcount * 2); for (lcv = 0; lcv < VM_NFREELIST; lcv++) { uvm.page_free[lcv].pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors)); uvm_page_init_buckets(&uvm.page_free[lcv]); + uvm.cpus[0].page_free[lcv].pgfl_buckets = + (cpuarray + (lcv * uvmexp.ncolors)); + uvm_page_init_buckets(&uvm.cpus[0].page_free[lcv]); } memset(pagearray, 0, pagecount * sizeof(struct vm_page)); @@ -835,97 +822,11 @@ uvm_page_physload(paddr_t start, paddr_t vm_nphysseg++; if (!preload) { - uvm_page_rehash(); uvmpdpol_reinit(); } } /* - * uvm_page_rehash: reallocate hash table based on number of free pages. 
- */ - -void -uvm_page_rehash(void) -{ - int freepages, lcv, bucketcount, oldcount, i; - struct pglist *newbuckets, *oldbuckets; - struct vm_page *pg; - size_t newsize, oldsize; - - /* - * compute number of pages that can go in the free pool - */ - - freepages = 0; - for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) - freepages += - (vm_physmem[lcv].avail_end - vm_physmem[lcv].avail_start); - - /* - * compute number of buckets needed for this number of pages - */ - - bucketcount = 1; - while (bucketcount < freepages) - bucketcount = bucketcount * 2; - - /* - * compute the size of the current table and new table. - */ - - oldbuckets = uvm.page_hash; - oldcount = uvm.page_nhash; - oldsize = round_page(sizeof(struct pglist) * oldcount); - newsize = round_page(sizeof(struct pglist) * bucketcount); - - /* - * allocate the new buckets - */ - - newbuckets = (struct pglist *) uvm_km_alloc(kernel_map, newsize, - 0, UVM_KMF_WIRED); - if (newbuckets == NULL) { - printf("uvm_page_physrehash: WARNING: could not grow page " - "hash table\n"); - return; - } - for (lcv = 0 ; lcv < bucketcount ; lcv++) - TAILQ_INIT(&newbuckets[lcv]); - - /* - * now replace the old buckets with the new ones and rehash everything - */ - - for (i = 0; i < UVM_HASHLOCK_CNT; i++) - mutex_spin_enter(&uvm_hashlocks[i].lock); - - uvm.page_hash = newbuckets; - uvm.page_nhash = bucketcount; - uvm.page_hashmask = bucketcount - 1; /* power of 2 */ - - /* ... 
and rehash */ - for (lcv = 0 ; lcv < oldcount ; lcv++) { - while ((pg = oldbuckets[lcv].tqh_first) != NULL) { - TAILQ_REMOVE(&oldbuckets[lcv], pg, hashq); - TAILQ_INSERT_TAIL( - &uvm.page_hash[uvm_pagehash(pg->uobject, pg->offset)], - pg, hashq); - } - } - - for (i = 0; i < UVM_HASHLOCK_CNT; i++) - mutex_spin_exit(&uvm_hashlocks[i].lock); - - /* - * free old bucket array if is not the boot-time table - */ - - if (oldbuckets != &uvm_bootbucket) - uvm_km_free(kernel_map, (vaddr_t) oldbuckets, oldsize, - UVM_KMF_WIRED); -} - -/* * uvm_page_recolor: Recolor the pages if the new bucket count is * larger than the old one. */ @@ -933,11 +834,15 @@ uvm_page_rehash(void) void uvm_page_recolor(int newncolors) { - struct pgflbucket *bucketarray, *oldbucketarray; - struct pgfreelist pgfl; + struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray; + struct pgfreelist gpgfl, pgfl; struct vm_page *pg; vsize_t bucketcount; int lcv, color, i, ocolors; + struct uvm_cpu *ucpu; + + KASSERT(CPU_IS_PRIMARY(curcpu())); + KASSERT(ncpus <= 1); if (newncolors <= uvmexp.ncolors) return; @@ -948,8 +853,9 @@ uvm_page_recolor(int newncolors) } bucketcount = newncolors * VM_NFREELIST; - bucketarray = malloc(bucketcount * sizeof(struct pgflbucket), + bucketarray = malloc(bucketcount * sizeof(struct pgflbucket) * 2, M_VMPAGE, M_NOWAIT); + cpuarray = bucketarray + bucketcount; if (bucketarray == NULL) { printf("WARNING: unable to allocate %ld page color buckets\n", (long) bucketcount); @@ -971,24 +877,30 @@ uvm_page_recolor(int newncolors) uvmexp.ncolors = newncolors; uvmexp.colormask = uvmexp.ncolors - 1; + ucpu = curcpu()->ci_data.cpu_uvm; for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pgfl.pgfl_buckets = (bucketarray + (lcv * newncolors)); + gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors)); + pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors)); + uvm_page_init_buckets(&gpgfl); uvm_page_init_buckets(&pgfl); for (color = 0; color < ocolors; color++) { for (i = 0; i < PGFL_NQUEUES; i++) 
{ - while ((pg = TAILQ_FIRST(&uvm.page_free[ + while ((pg = LIST_FIRST(&uvm.page_free[ lcv].pgfl_buckets[color].pgfl_queues[i])) != NULL) { - TAILQ_REMOVE(&uvm.page_free[ - lcv].pgfl_buckets[ - color].pgfl_queues[i], pg, pageq); - TAILQ_INSERT_TAIL(&pgfl.pgfl_buckets[ + LIST_REMOVE(pg, pageq.list); /* global */ + LIST_REMOVE(pg, listq.list); /* cpu */ + LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[ + VM_PGCOLOR_BUCKET(pg)].pgfl_queues[ + i], pg, pageq.list); + LIST_INSERT_HEAD(&pgfl.pgfl_buckets[ VM_PGCOLOR_BUCKET(pg)].pgfl_queues[ - i], pg, pageq); + i], pg, listq.list); } } } - uvm.page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets; + uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets; + ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets; } if (have_recolored_pages) { @@ -1002,34 +914,86 @@ uvm_page_recolor(int newncolors) } /* + * uvm_cpu_attach: initialize per-CPU data structures. + */ + +void +uvm_cpu_attach(struct cpu_info *ci) +{ + struct pgflbucket *bucketarray; + struct pgfreelist pgfl; + struct uvm_cpu *ucpu; + vsize_t bucketcount; + int lcv; + + if (CPU_IS_PRIMARY(ci)) { + /* Already done in uvm_page_init(). 
*/ + return; + } + + bucketcount = uvmexp.ncolors * VM_NFREELIST; + bucketarray = malloc(bucketcount * sizeof(struct pgflbucket), + M_VMPAGE, M_WAITOK); + ucpu = &uvm.cpus[cpu_index(ci)]; + ci->ci_data.cpu_uvm = ucpu; + for (lcv = 0; lcv < VM_NFREELIST; lcv++) { + pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors)); + uvm_page_init_buckets(&pgfl); + ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets; + } +} + +/* * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat */ static struct vm_page * -uvm_pagealloc_pgfl(struct pgfreelist *pgfl, int try1, int try2, +uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2, int *trycolorp) { - struct pglist *freeq; + struct pgflist *freeq; struct vm_page *pg; int color, trycolor = *trycolorp; + struct pgfreelist *gpgfl, *pgfl; KASSERT(mutex_owned(&uvm_fpageqlock)); color = trycolor; + pgfl = &ucpu->page_free[flist]; + gpgfl = &uvm.page_free[flist]; do { - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) + /* cpu, try1 */ + if ((pg = LIST_FIRST((freeq = + &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) { + uvmexp.cpuhit++; goto gotit; - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) + } + /* global, try1 */ + if ((pg = LIST_FIRST((freeq = + &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) { + uvmexp.cpumiss++; goto gotit; + } + /* cpu, try2 */ + if ((pg = LIST_FIRST((freeq = + &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) { + uvmexp.cpuhit++; + goto gotit; + } + /* global, try2 */ + if ((pg = LIST_FIRST((freeq = + &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) { + uvmexp.cpumiss++; + goto gotit; + } color = (color + 1) & uvmexp.colormask; } while (color != trycolor); return (NULL); gotit: - TAILQ_REMOVE(freeq, pg, pageq); + LIST_REMOVE(pg, pageq.list); /* global list */ + LIST_REMOVE(pg, listq.list); /* per-cpu list */ uvmexp.free--; /* update zero'd page count */ @@ 
-1051,7 +1015,7 @@ uvm_pagealloc_pgfl(struct pgfreelist *pg * * => return null if no pages free * => wake up pagedaemon if number of free pages drops below low water mark - * => if obj != NULL, obj must be locked (to put in hash) + * => if obj != NULL, obj must be locked (to put in obj's tree) * => if anon != NULL, anon must be locked (to put in anon) * => only one of obj or anon can be non-null * => caller must activate/deactivate page if it is not wired. @@ -1069,6 +1033,7 @@ uvm_pagealloc_strat(struct uvm_object *o int flags, int strat, int free_list) { int lcv, try1, try2, zeroit = 0, color; + struct uvm_cpu *ucpu; struct vm_page *pg; bool use_reserve; @@ -1084,11 +1049,11 @@ uvm_pagealloc_strat(struct uvm_object *o * This implements a global round-robin page coloring * algorithm. * - * XXXJRT: Should we make the `nextcolor' per-CPU? * XXXJRT: What about virtually-indexed caches? */ - color = uvm.page_free_nextcolor; + ucpu = curcpu()->ci_data.cpu_uvm; + color = ucpu->page_free_nextcolor; /* * check to see if we need to generate some free pages waking @@ -1134,7 +1099,7 @@ uvm_pagealloc_strat(struct uvm_object *o case UVM_PGA_STRAT_NORMAL: /* Check all freelists in descending priority order. */ for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pg = uvm_pagealloc_pgfl(&uvm.page_free[lcv], + pg = uvm_pagealloc_pgfl(ucpu, lcv, try1, try2, &color); if (pg != NULL) goto gotit; @@ -1147,7 +1112,7 @@ uvm_pagealloc_strat(struct uvm_object *o case UVM_PGA_STRAT_FALLBACK: /* Attempt to allocate from the specified free list. */ KASSERT(free_list >= 0 && free_list < VM_NFREELIST); - pg = uvm_pagealloc_pgfl(&uvm.page_free[free_list], + pg = uvm_pagealloc_pgfl(ucpu, free_list, try1, try2, &color); if (pg != NULL) goto gotit; @@ -1172,7 +1137,7 @@ uvm_pagealloc_strat(struct uvm_object *o * the next color accordingly. 
*/ - uvm.page_free_nextcolor = (color + 1) & uvmexp.colormask; + ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask; /* * update allocation statistics and remember if we have to @@ -1317,7 +1282,7 @@ uvm_pagezerocheck(struct vm_page *pg) /* * uvm_pagefree: free page * - * => erase page's identity (i.e. remove from hash/object) + * => erase page's identity (i.e. remove from object) * => put page on free list * => caller must lock owning object (either anon or uvm_object) * => caller must lock page queues @@ -1327,7 +1292,9 @@ uvm_pagezerocheck(struct vm_page *pg) void uvm_pagefree(struct vm_page *pg) { - struct pglist *pgfl; + struct pgflist *pgfl; + struct uvm_cpu *ucpu; + int index, color, queue; bool iszero; #ifdef DEBUG @@ -1421,9 +1388,9 @@ uvm_pagefree(struct vm_page *pg) */ iszero = (pg->flags & PG_ZERO); - pgfl = &uvm.page_free[uvm_page_lookup_freelist(pg)]. - pgfl_buckets[VM_PGCOLOR_BUCKET(pg)]. - pgfl_queues[iszero ? PGFL_ZEROS : PGFL_UNKNOWN]; + index = uvm_page_lookup_freelist(pg); + color = VM_PGCOLOR_BUCKET(pg); + queue = (iszero ? 
PGFL_ZEROS : PGFL_UNKNOWN); pg->pqflags = PQ_FREE; #ifdef DEBUG @@ -1439,7 +1406,15 @@ uvm_pagefree(struct vm_page *pg) uvm_pagezerocheck(pg); #endif /* DEBUG */ - TAILQ_INSERT_HEAD(pgfl, pg, pageq); + /* global list */ + pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue]; + LIST_INSERT_HEAD(pgfl, pg, pageq.list); + + /* per-cpu list */ + ucpu = curcpu()->ci_data.cpu_uvm; + pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue]; + LIST_INSERT_HEAD(pgfl, pg, listq.list); + uvmexp.free++; if (iszero) uvmexp.zeropages++; @@ -1571,19 +1546,16 @@ void uvm_pageidlezero(void) { struct vm_page *pg; - struct pgfreelist *pgfl; - int free_list, firstbucket; - static int nextbucket; - static __cpu_simple_lock_t idlezero_lock = __SIMPLELOCK_UNLOCKED; + struct pgfreelist *pgfl, *gpgfl; + struct uvm_cpu *ucpu; + int free_list, firstbucket, nextbucket; - if (!__cpu_simple_lock_try(&idlezero_lock)) { - return; - } if (!mutex_tryenter(&uvm_fpageqlock)) { - __cpu_simple_unlock(&idlezero_lock); return; } - firstbucket = nextbucket; + ucpu = curcpu()->ci_data.cpu_uvm; + firstbucket = ucpu->page_idlezero_next; + nextbucket = firstbucket; do { if (sched_curcpu_runnable_p()) { goto quit; @@ -1593,15 +1565,15 @@ uvm_pageidlezero(void) goto quit; } for (free_list = 0; free_list < VM_NFREELIST; free_list++) { - pgfl = &uvm.page_free[free_list]; - while ((pg = TAILQ_FIRST(&pgfl->pgfl_buckets[ + pgfl = &ucpu->page_free[free_list]; + gpgfl = &uvm.page_free[free_list]; + while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[ nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) { if (sched_curcpu_runnable_p()) { goto quit; } - TAILQ_REMOVE(&pgfl->pgfl_buckets[ - nextbucket].pgfl_queues[PGFL_UNKNOWN], - pg, pageq); + LIST_REMOVE(pg, pageq.list); /* global list */ + LIST_REMOVE(pg, listq.list); /* per-cpu list */ uvmexp.free--; mutex_spin_exit(&uvm_fpageqlock); #ifdef PMAP_PAGEIDLEZERO @@ -1615,9 +1587,12 @@ uvm_pageidlezero(void) */ mutex_spin_enter(&uvm_fpageqlock); - 
TAILQ_INSERT_HEAD(&pgfl->pgfl_buckets[ + LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[ nextbucket].pgfl_queues[ - PGFL_UNKNOWN], pg, pageq); + PGFL_UNKNOWN], pg, pageq.list); + LIST_INSERT_HEAD(&pgfl->pgfl_buckets[ + nextbucket].pgfl_queues[ + PGFL_UNKNOWN], pg, listq.list); uvmexp.free++; uvmexp.zeroaborts++; goto quit; @@ -1628,9 +1603,12 @@ uvm_pageidlezero(void) pg->flags |= PG_ZERO; mutex_spin_enter(&uvm_fpageqlock); - TAILQ_INSERT_HEAD(&pgfl->pgfl_buckets[ + LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[ + nextbucket].pgfl_queues[PGFL_ZEROS], + pg, pageq.list); + LIST_INSERT_HEAD(&pgfl->pgfl_buckets[ nextbucket].pgfl_queues[PGFL_ZEROS], - pg, pageq); + pg, listq.list); uvmexp.free++; uvmexp.zeropages++; } @@ -1638,8 +1616,8 @@ uvm_pageidlezero(void) nextbucket = (nextbucket + 1) & uvmexp.colormask; } while (nextbucket != firstbucket); quit: + ucpu->page_idlezero_next = nextbucket; mutex_spin_exit(&uvm_fpageqlock); - __cpu_simple_unlock(&idlezero_lock); } /* @@ -1653,22 +1631,11 @@ struct vm_page * uvm_pagelookup(struct uvm_object *obj, voff_t off) { struct vm_page *pg; - struct pglist *buck; - kmutex_t *lock; - u_int hash; KASSERT(mutex_owned(&obj->vmobjlock)); - hash = uvm_pagehash(obj, off); - buck = &uvm.page_hash[hash]; - lock = uvm_hashlock(hash); - mutex_spin_enter(lock); - TAILQ_FOREACH(pg, buck, hashq) { - if (pg->uobject == obj && pg->offset == off) { - break; - } - } - mutex_spin_exit(lock); + pg = (struct vm_page *)rb_tree_find_node(&obj->rb_tree, &off); + KASSERT(pg == NULL || obj->uo_npages != 0); KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || (pg->flags & PG_BUSY) != 0); Index: uvm/uvm_page.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_page.h,v retrieving revision 1.53 diff -u -p -r1.53 uvm_page.h --- uvm/uvm_page.h 2 Jun 2008 11:11:14 -0000 1.53 +++ uvm/uvm_page.h 2 Jun 2008 14:10:25 -0000 @@ -84,8 +84,9 @@ * page, indexed by page number. 
Each structure * is an element of several lists: * - * A hash table bucket used to quickly - * perform object/offset lookups + * A red-black tree rooted with the containing + * object is used to quickly perform object+ + * offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at @@ -117,11 +118,23 @@ #include #include +#ifdef _KERNEL +#include +#endif + struct vm_page { - TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO +#ifdef _KERNEL + struct rb_node rb_node; /* tree of pages in obj (O) */ +#endif + union { + TAILQ_ENTRY(vm_page) queue; + LIST_ENTRY(vm_page) list; + } pageq; /* queue info for FIFO * queue or free list (P) */ - TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ - TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ + union { + TAILQ_ENTRY(vm_page) queue; + LIST_ENTRY(vm_page) list; + } listq; /* pages in same object (O)*/ struct vm_anon *uanon; /* anon (O,P) */ struct uvm_object *uobject; /* object (O,P) */ Index: uvm/uvm_pdpolicy_clock.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pdpolicy_clock.c,v retrieving revision 1.11 diff -u -p -r1.11 uvm_pdpolicy_clock.c --- uvm/uvm_pdpolicy_clock.c 7 Mar 2008 08:44:51 -0000 1.11 +++ uvm/uvm_pdpolicy_clock.c 2 Jun 2008 14:10:25 -0000 @@ -193,7 +193,7 @@ uvmpdpol_selectvictim(void) if (pg == NULL) { break; } - ss->ss_nextpg = TAILQ_NEXT(pg, pageq); + ss->ss_nextpg = TAILQ_NEXT(pg, pageq.queue); uvmexp.pdscans++; @@ -257,7 +257,7 @@ uvmpdpol_balancequeue(int swap_shortage) for (p = TAILQ_FIRST(&pdpol_state.s_activeq); p != NULL && (inactive_shortage > 0 || swap_shortage > 0); p = nextpg) { - nextpg = TAILQ_NEXT(p, pageq); + nextpg = TAILQ_NEXT(p, pageq.queue); /* * if there's a shortage of swap slots, try to free it. 
@@ -288,7 +288,7 @@ uvmpdpol_pagedeactivate(struct vm_page * KASSERT(mutex_owned(&uvm_pageqlock)); if (pg->pqflags & PQ_ACTIVE) { - TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pageq); + TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pageq.queue); pg->pqflags &= ~PQ_ACTIVE; KASSERT(pdpol_state.s_active > 0); pdpol_state.s_active--; @@ -296,7 +296,7 @@ uvmpdpol_pagedeactivate(struct vm_page * if ((pg->pqflags & PQ_INACTIVE) == 0) { KASSERT(pg->wire_count == 0); pmap_clear_reference(pg); - TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pageq); + TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pageq.queue); pg->pqflags |= PQ_INACTIVE; pdpol_state.s_inactive++; } @@ -307,7 +307,7 @@ uvmpdpol_pageactivate(struct vm_page *pg { uvmpdpol_pagedequeue(pg); - TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pageq); + TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pageq.queue); pg->pqflags |= PQ_ACTIVE; pdpol_state.s_active++; } @@ -318,13 +318,13 @@ uvmpdpol_pagedequeue(struct vm_page *pg) if (pg->pqflags & PQ_ACTIVE) { KASSERT(mutex_owned(&uvm_pageqlock)); - TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pageq); + TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pageq.queue); pg->pqflags &= ~PQ_ACTIVE; KASSERT(pdpol_state.s_active > 0); pdpol_state.s_active--; } else if (pg->pqflags & PQ_INACTIVE) { KASSERT(mutex_owned(&uvm_pageqlock)); - TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pageq); + TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pageq.queue); pg->pqflags &= ~PQ_INACTIVE; KASSERT(pdpol_state.s_inactive > 0); pdpol_state.s_inactive--; Index: uvm/uvm_pdpolicy_clockpro.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pdpolicy_clockpro.c,v retrieving revision 1.14 diff -u -p -r1.14 uvm_pdpolicy_clockpro.c --- uvm/uvm_pdpolicy_clockpro.c 22 Mar 2008 05:50:42 -0000 1.14 +++ uvm/uvm_pdpolicy_clockpro.c 2 Jun 2008 14:10:25 -0000 @@ -232,7 +232,7 @@ static void pageq_insert_tail(pageq_t *q, struct vm_page *pg) { - TAILQ_INSERT_TAIL(&q->q_q, pg, pageq); 
+ TAILQ_INSERT_TAIL(&q->q_q, pg, pageq.queue); q->q_len++; } @@ -241,7 +241,7 @@ static void pageq_insert_head(pageq_t *q, struct vm_page *pg) { - TAILQ_INSERT_HEAD(&q->q_q, pg, pageq); + TAILQ_INSERT_HEAD(&q->q_q, pg, pageq.queue); q->q_len++; } #endif @@ -254,7 +254,7 @@ pageq_remove(pageq_t *q, struct vm_page KASSERT(clockpro_queue(&clockpro, clockpro_getq(pg)) == q); #endif KASSERT(q->q_len > 0); - TAILQ_REMOVE(&q->q_q, pg, pageq); + TAILQ_REMOVE(&q->q_q, pg, pageq.queue); q->q_len--; } @@ -1193,7 +1193,7 @@ clockpro_dropswap(pageq_t *q, int *todo) { struct vm_page *pg; - TAILQ_FOREACH_REVERSE(pg, &q->q_q, pglist, pageq) { + TAILQ_FOREACH_REVERSE(pg, &q->q_q, pglist, pageq.queue) { if (*todo <= 0) { break; } @@ -1312,7 +1312,7 @@ clockpro_dump(void) (name), nhot, ncold, ntest, nspeculative, ninitialref, nref) INITCOUNT(); - TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_NEWQ)->q_q, pageq) { + TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_NEWQ)->q_q, pageq.queue) { if (clockpro_getq(pg) != CLOCKPRO_NEWQ) { printf("newq corrupt %p\n", pg); } @@ -1322,7 +1322,7 @@ clockpro_dump(void) PRINTCOUNT("newq"); INITCOUNT(); - TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_COLDQ)->q_q, pageq) { + TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_COLDQ)->q_q, pageq.queue) { if (clockpro_getq(pg) != CLOCKPRO_COLDQ) { printf("coldq corrupt %p\n", pg); } @@ -1332,7 +1332,7 @@ clockpro_dump(void) PRINTCOUNT("coldq"); INITCOUNT(); - TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_HOTQ)->q_q, pageq) { + TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_HOTQ)->q_q, pageq.queue) { if (clockpro_getq(pg) != CLOCKPRO_HOTQ) { printf("hotq corrupt %p\n", pg); } @@ -1347,7 +1347,7 @@ clockpro_dump(void) PRINTCOUNT("hotq"); INITCOUNT(); - TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_LISTQ)->q_q, pageq) { + TAILQ_FOREACH(pg, &clockpro_queue(s, CLOCKPRO_LISTQ)->q_q, pageq.queue) { #if !defined(LISTQ) printf("listq %p\n", pg); #endif /* !defined(LISTQ) */ @@ -1377,7 +1377,7 @@ pdsim_dumpq(int qidx) 
pageq_t *q = clockpro_queue(s, qidx); struct vm_page *pg; - TAILQ_FOREACH(pg, &q->q_q, pageq) { + TAILQ_FOREACH(pg, &q->q_q, pageq.queue) { DPRINTF(" %" PRIu64 "%s%s%s%s%s%s", pg->offset >> PAGE_SHIFT, (pg->pqflags & PQ_HOT) ? "H" : "", Index: uvm/uvm_pglist.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pglist.c,v retrieving revision 1.41 diff -u -p -r1.41 uvm_pglist.c --- uvm/uvm_pglist.c 2 Jun 2008 12:24:16 -0000 1.41 +++ uvm/uvm_pglist.c 2 Jun 2008 14:10:25 -0000 @@ -96,18 +96,18 @@ uvm_pglist_add(struct vm_page *pg, struc color = VM_PGCOLOR_BUCKET(pg); pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN; #ifdef DEBUG - for (tp = TAILQ_FIRST(&uvm.page_free[ + for (tp = LIST_FIRST(&uvm.page_free[ free_list].pgfl_buckets[color].pgfl_queues[pgflidx]); tp != NULL; - tp = TAILQ_NEXT(tp, pageq)) { + tp = LIST_NEXT(tp, pageq.list)) { if (tp == pg) break; } if (tp == NULL) panic("uvm_pglistalloc: page not on freelist"); #endif - TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_buckets[ - color].pgfl_queues[pgflidx], pg, pageq); + LIST_REMOVE(pg, pageq.list); /* global */ + LIST_REMOVE(pg, listq.list); /* cpu */ uvmexp.free--; if (pg->flags & PG_ZERO) uvmexp.zeropages--; @@ -115,7 +115,7 @@ uvm_pglist_add(struct vm_page *pg, struc pg->pqflags = 0; pg->uobject = NULL; pg->uanon = NULL; - TAILQ_INSERT_TAIL(rlist, pg, pageq); + TAILQ_INSERT_TAIL(rlist, pg, pageq.queue); STAT_INCR(uvm_pglistalloc_npages); } @@ -433,18 +433,21 @@ uvm_pglistalloc(psize_t size, paddr_t lo void uvm_pglistfree(struct pglist *list) { + struct uvm_cpu *ucpu; struct vm_page *pg; + int index, color, queue; /* * Lock the free list and free each page. 
*/ mutex_spin_enter(&uvm_fpageqlock); + ucpu = curcpu()->ci_data.cpu_uvm; while ((pg = TAILQ_FIRST(list)) != NULL) { bool iszero; KASSERT(!uvmpdpol_pageisqueued_p(pg)); - TAILQ_REMOVE(list, pg, pageq); + TAILQ_REMOVE(list, pg, pageq.queue); iszero = (pg->flags & PG_ZERO); pg->pqflags = PQ_FREE; #ifdef DEBUG @@ -456,9 +459,13 @@ uvm_pglistfree(struct pglist *list) if (iszero) uvm_pagezerocheck(pg); #endif /* DEBUG */ - TAILQ_INSERT_HEAD(&uvm.page_free[uvm_page_lookup_freelist(pg)]. - pgfl_buckets[VM_PGCOLOR_BUCKET(pg)]. - pgfl_queues[iszero ? PGFL_ZEROS : PGFL_UNKNOWN], pg, pageq); + index = uvm_page_lookup_freelist(pg); + color = VM_PGCOLOR_BUCKET(pg); + queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN; + LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color]. + pgfl_queues[queue], pg, pageq.list); + LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color]. + pgfl_queues[queue], pg, listq.list); uvmexp.free++; if (iszero) uvmexp.zeropages++; Index: uvm/uvm_pglist.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pglist.h,v retrieving revision 1.6 diff -u -p -r1.6 uvm_pglist.h --- uvm/uvm_pglist.h 28 Apr 2008 20:24:12 -0000 1.6 +++ uvm/uvm_pglist.h 2 Jun 2008 14:10:25 -0000 @@ -1,7 +1,7 @@ /* $NetBSD: uvm_pglist.h,v 1.6 2008/04/28 20:24:12 martin Exp $ */ /*- - * Copyright (c) 2000, 2001 The NetBSD Foundation, Inc. + * Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -37,6 +37,7 @@ * list, etc. */ TAILQ_HEAD(pglist, vm_page); +LIST_HEAD(pgflist, vm_page); /* * A page free list consists of free pages of unknown contents and free @@ -47,7 +48,7 @@ TAILQ_HEAD(pglist, vm_page); #define PGFL_NQUEUES 2 struct pgflbucket { - struct pglist pgfl_queues[PGFL_NQUEUES]; + struct pgflist pgfl_queues[PGFL_NQUEUES]; }; struct pgfreelist {