Index: arch/i386/i386/genassym.cf
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/genassym.cf,v
retrieving revision 1.76.4.1
diff -u -p -r1.76.4.1 genassym.cf
--- arch/i386/i386/genassym.cf	17 Nov 2008 18:53:54 -0000	1.76.4.1
+++ arch/i386/i386/genassym.cf	11 Jan 2009 13:20:51 -0000
@@ -170,6 +170,7 @@ define	PG_RW			PG_RW
 define	PG_V			PG_V
 define	PG_KW			PG_KW
 define	PG_KR			PG_KR
+define	PG_G			PG_G
 define	PGEX_U			PGEX_U
 
 define	L2_SLOT_KERNBASE	pl2_pi(KERNBASE)
@@ -294,7 +295,6 @@ endif
 define	CPU_INFO_SELF		offsetof(struct cpu_info, ci_self)
 define	CPU_INFO_RESCHED	offsetof(struct cpu_info, ci_want_resched)
 define	CPU_INFO_WANT_PMAPLOAD	offsetof(struct cpu_info, ci_want_pmapload)
-define	CPU_INFO_PMAP_CPU	offsetof(struct cpu_info, ci_pmap_cpu)
 define	CPU_INFO_TLBSTATE	offsetof(struct cpu_info, ci_tlbstate)
 define	TLBSTATE_VALID		TLBSTATE_VALID
 define	TLBSTATE_LAZY		TLBSTATE_LAZY
@@ -433,12 +433,11 @@ define	RW_THREAD		RW_THREAD
 define	RW_READER		RW_READER
 define	RW_WRITER		RW_WRITER
 
-define	MB_POINTER		offsetof(struct pmap_mbox, mb_pointer)
-define	MB_GLOBAL		offsetof(struct pmap_mbox, mb_global)
-define	MB_ADDR1		offsetof(struct pmap_mbox, mb_addr1)
-define	MB_ADDR2		offsetof(struct pmap_mbox, mb_addr2)
-define	MB_HEAD			offsetof(struct pmap_mbox, mb_head)
-define	MB_TAIL			offsetof(struct pmap_mbox, mb_tail)
+define	TM_PENDING		offsetof(struct pmap_tlb_mailbox, tm_pending)
+define	TP_COUNT		offsetof(struct pmap_tlb_packet, tp_count)
+define	TP_VA			offsetof(struct pmap_tlb_packet, tp_va)
+define	TP_USERMASK		offsetof(struct pmap_tlb_packet, tp_usermask)
+define	TP_PTE			offsetof(struct pmap_tlb_packet, tp_pte)
 
 define	PM_CPUS			offsetof(struct pmap, pm_cpus)
Index: arch/i386/i386/vector.S
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/vector.S,v
retrieving revision 1.42
diff -u -p -r1.42 vector.S
--- arch/i386/i386/vector.S	7 Jul 2008 13:01:16 -0000	1.42
+++ arch/i386/i386/vector.S	11 Jan 2009 13:20:51 -0000
@@ -168,10 +168,10 @@ IDTVEC(resume_lapic_ipi)
 IDTVEC_END(resume_lapic_ipi)
 
 /*
- * Multicast TLB shootdown handler for !kernel_pmap.
+ * TLB shootdown handler.
  */
-IDTVEC(intr_lapic_tlb_mcast)
-	/* Save state. */
+IDTVEC(intr_lapic_tlb)
+	/* Save state and ack the interrupt. */
 	pushl	%eax
 	pushl	%ebx
 	pushl	%ecx
@@ -180,45 +180,44 @@ IDTVEC(intr_lapic_tlb_mcast)
 	pushl	%fs
 	movl	$GSEL(GDATA_SEL, SEL_KPL), %eax
 	movl	$GSEL(GCPU_SEL, SEL_KPL), %edx
-	movl	%eax, %ds
-	movl	%edx, %fs
-	/* Count it. */
-	addl	$1, CPUVAR(TLB_EVCNT)+EV_COUNT
-	adcl	$0, CPUVAR(TLB_EVCNT)+EV_COUNT+4
-	/* Find out what we need to invalidate. */
-	movl	CPUVAR(PMAP_CPU), %ecx
-	movl	MB_ADDR1(%ecx), %eax
-	movl	MB_ADDR2(%ecx), %edx
-	xorl	%ebx, %ebx
-	xchgl	MB_POINTER(%ecx), %ebx
+	mov	%ax, %ds
+	mov	%dx, %fs
 	movl	$0, _C_LABEL(local_apic)+LAPIC_EOI
-	cmpl	$-1, %eax
+
+	/* Find out what we need to invalidate. */
+	leal	_C_LABEL(pmap_tlb_packet), %ebx
+	movswl	TP_COUNT(%ebx), %ecx
+	cmpl	$-1, %ecx
 	je	4f
+	leal	TP_VA(%ebx), %edx
 1:
 	/* Invalidate a single page or a range of pages. */
+	movl	(%edx), %eax
 	invlpg	(%eax)
-	addl	$PAGE_SIZE, %eax
-	cmpl	%edx, %eax
-	jb	1b
+	addl	$4, %edx
+	decl	%ecx
+	jg	1b
 2:
-	/* Ack the request. */
-	lock
-	incl	(%ebx)
 	/*
 	 * Check the current TLB state.  If we don't want further
 	 * invalidations for this pmap, then take the CPU out of
 	 * the pmap's bitmask.
*/ + movl CPUVAR(CPUMASK), %eax cmpl $TLBSTATE_LAZY, CPUVAR(TLBSTATE) jne 3f + testl %eax, TP_USERMASK(%ebx) + jz 3f movl CPUVAR(PMAP), %edx - movl CPUVAR(CPUMASK), %ecx + movl %eax, %ecx notl %ecx lock andl %ecx, PM_CPUS(%edx) movl $TLBSTATE_STALE, CPUVAR(TLBSTATE) 3: - /* Restore state and return. */ + /* Ack the request, restore state & return. */ + lock + xorl %eax, _C_LABEL(pmap_tlb_mailbox)+TM_PENDING popl %fs popl %ds popl %edx @@ -227,60 +226,23 @@ IDTVEC(intr_lapic_tlb_mcast) popl %eax iret 4: - /* Invalidate all user pages. */ + /* Invalidate whole address space: */ + testw $PG_G, TP_PTE(%ebx) + jnz 5f + /* -> user TLB entries only */ movl %cr3, %eax movl %eax, %cr3 jmp 2b -IDTVEC_END(intr_lapic_tlb_mcast) - -/* - * Broadcast TLB shootdown handler for kernel_pmap. - */ -IDTVEC(intr_lapic_tlb_bcast) - /* Save state and ack the interrupt. */ - pushl %eax - pushl %ebx - pushl %edx - /* Find out what we need to invalidate. */ - movl %ss:_C_LABEL(pmap_mbox)+MB_ADDR1, %eax - movl %ss:_C_LABEL(pmap_mbox)+MB_ADDR2, %edx - movl %ss:_C_LABEL(pmap_mbox)+MB_GLOBAL, %ebx - movl $0, %ss:_C_LABEL(local_apic)+LAPIC_EOI - cmpl $-1, %eax - je,pn 3f -1: - /* Invalidate a single page or a range of pages. */ - invlpg %ss:(%eax) - addl $PAGE_SIZE, %eax - cmpl %edx, %eax - jb 1b -2: - /* Ack the request, restore state & return. */ - lock - incl %ss:_C_LABEL(pmap_mbox)+MB_TAIL - popl %edx - popl %ebx - popl %eax - iret -3: - testl %ebx, %ebx - jz 4f - /* - * If the CPU understands global pages and we have been asked - * to invalidate the entire TLB we arrive here. - */ +5: + /* -> user and kernel TLB entries */ movl %cr4, %eax movl %eax, %edx andl $~CR4_PGE, %edx movl %edx, %cr4 movl %eax, %cr4 jmp 2b -4: - /* Invalidate user TLB entries. */ - movl %cr3, %eax - movl %eax, %cr3 - jmp 2b -IDTVEC_END(intr_lapic_tlb_bcast) + +IDTVEC_END(intr_lapic_tlb) #if defined(DDB) IDTVEC(intrddbipi) Index: arch/x86/include/cpu.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/cpu.h,v retrieving revision 1.9 diff -u -p -r1.9 cpu.h --- arch/x86/include/cpu.h 25 Oct 2008 19:13:40 -0000 1.9 +++ arch/x86/include/cpu.h 11 Jan 2009 13:20:53 -0000 @@ -89,7 +89,6 @@ struct cpu_info { */ struct cpu_info *ci_next; /* next cpu */ struct lwp *ci_curlwp; /* current owner of the processor */ - struct pmap_cpu *ci_pmap_cpu; /* per-CPU pmap data */ struct lwp *ci_fpcurlwp; /* current owner of the FPU */ int ci_fpsaving; /* save in progress */ int ci_fpused; /* XEN: FPU was used by curlwp */ @@ -114,6 +113,7 @@ struct cpu_info { #define TLBSTATE_STALE 2 /* we might have stale user tlbs */ int ci_curldt; /* current LDT descriptor */ uint64_t ci_scratch; + uintptr_t ci_pmap_data[128 / sizeof(uintptr_t)]; #ifdef XEN struct iplsource *ci_isources[NIPL]; Index: arch/x86/include/i82489var.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/i82489var.h,v retrieving revision 1.12 diff -u -p -r1.12 i82489var.h --- arch/x86/include/i82489var.h 28 Apr 2008 20:23:40 -0000 1.12 +++ arch/x86/include/i82489var.h 11 Jan 2009 13:20:53 -0000 @@ -76,11 +76,8 @@ extern void Xrecurse_lapic_ipi(void); extern void Xresume_lapic_ipi(void); #define LAPIC_IPI_VECTOR 0xe0 -extern void Xintr_lapic_tlb_bcast(void); -#define LAPIC_TLB_BCAST_VECTOR 0xe1 - -extern void Xintr_lapic_tlb_mcast(void); -#define LAPIC_TLB_MCAST_VECTOR 0xe2 +extern void Xintr_lapic_tlb(void); +#define LAPIC_TLB_VECTOR 0xe1 /* * Vector used for local apic timer 
interrupts. Index: arch/x86/include/pmap.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/pmap.h,v retrieving revision 1.20 diff -u -p -r1.20 pmap.h --- arch/x86/include/pmap.h 16 Sep 2008 19:55:31 -0000 1.20 +++ arch/x86/include/pmap.h 11 Jan 2009 13:20:53 -0000 @@ -167,6 +167,8 @@ struct pmap { uint32_t pm_cpus; /* mask of CPUs using pmap */ uint32_t pm_kernel_cpus; /* mask of CPUs using kernel part of pmap */ + uint64_t pm_ncsw; /* for assertions */ + struct vm_page *pm_gc_ptp; /* pages from pmap g/c */ }; /* pm_flags */ @@ -228,8 +230,26 @@ void pmap_remove_all(struct pmap *); vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ -void pmap_tlb_shootdown(pmap_t, vaddr_t, vaddr_t, pt_entry_t); -void pmap_tlb_shootwait(void); +typedef enum tlbwhy { + TLBSHOOT_APTE, + TLBSHOOT_KENTER, + TLBSHOOT_KREMOVE, + TLBSHOOT_FREE_PTP1, + TLBSHOOT_FREE_PTP2, + TLBSHOOT_REMOVE_PTE, + TLBSHOOT_REMOVE_PTES, + TLBSHOOT_SYNC_PV1, + TLBSHOOT_SYNC_PV2, + TLBSHOOT_WRITE_PROTECT, + TLBSHOOT_ENTER, + TLBSHOOT_UPDATE, + TLBSHOOT_BUS_DMA, + TLBSHOOT_BUS_SPACE, + TLBSHOOT__MAX, +} tlbwhy_t; + +void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t); +void pmap_tlb_shootnow(void); #define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ #define PMAP_FORK /* turn on pmap_fork interface */ @@ -343,8 +363,6 @@ kvtopte(vaddr_t va) paddr_t vtophys(vaddr_t); vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t); -void pmap_cpu_init_early(struct cpu_info *); -void pmap_cpu_init_late(struct cpu_info *); bool sse2_idlezero_page(void *); @@ -414,16 +432,28 @@ paddr_t vtomach(vaddr_t); #define POOL_VTOPHYS(va) vtophys((vaddr_t) (va)) /* - * TLB shootdown mailbox. + * TLB shootdown structures. */ -struct pmap_mbox { - volatile void *mb_pointer; - volatile uintptr_t mb_addr1; - volatile uintptr_t mb_addr2; - volatile uintptr_t mb_head; - volatile uintptr_t mb_tail; - volatile uintptr_t mb_global; +struct pmap_tlb_packet { +#ifdef _LP64 + uintptr_t tp_va[14]; /* whole struct: 128 bytes */ +#else + uintptr_t tp_va[13]; /* whole struct: 64 bytes */ +#endif + uint16_t tp_count; + uint16_t tp_pte; + uint32_t tp_cpumask; + uint32_t tp_usermask; +}; +#define TP_MAXVA 6 /* no more than N seperate invlpg */ + +struct pmap_tlb_mailbox { + uintptr_t tm_pending; + uintptr_t tm_gen; + uintptr_t tm_usergen; + uintptr_t tm_globalgen; + char tm_pad[64 - sizeof(uintptr_t) * 4]; }; #endif /* _KERNEL */ Index: arch/x86/x86/bus_dma.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/bus_dma.c,v retrieving revision 1.45 diff -u -p -r1.45 bus_dma.c --- arch/x86/x86/bus_dma.c 28 Jun 2008 17:23:01 -0000 1.45 +++ arch/x86/x86/bus_dma.c 11 Jan 2009 13:20:53 -0000 @@ -1,7 +1,7 @@ /* $NetBSD: bus_dma.c,v 1.45 2008/06/28 17:23:01 bouyer Exp $ */ /*- - * Copyright (c) 1996, 1997, 1998, 2007 The NetBSD Foundation, Inc. + * Copyright (c) 1996, 1997, 1998, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -1021,7 +1021,7 @@ _bus_dmamem_map(bus_dma_tag_t t, bus_dma bus_addr_t addr; int curseg; int nocache; - pt_entry_t *pte, opte, xpte; + pt_entry_t *pte, opte; const uvm_flag_t kmflags = (flags & BUS_DMA_NOWAIT) != 0 ? 
UVM_KMF_NOWAIT : 0; @@ -1036,8 +1036,8 @@ _bus_dmamem_map(bus_dma_tag_t t, bus_dma *kvap = (void *)va; sva = va; eva = sva + size; - xpte = 0; + kpreempt_disable(); for (curseg = 0; curseg < nsegs; curseg++) { for (addr = segs[curseg].ds_addr; addr < (segs[curseg].ds_addr + segs[curseg].ds_len); @@ -1050,24 +1050,25 @@ _bus_dmamem_map(bus_dma_tag_t t, bus_dma /* * mark page as non-cacheable */ - if (nocache) { - pte = kvtopte(va); - opte = *pte; - if ((opte & PG_N) == 0) { - pmap_pte_setbits(pte, PG_N); - xpte |= opte; - } + if (!nocache) { + continue; } - } - } + pte = kvtopte(va); + opte = *pte; + if ((opte & PG_N) != 0) { + continue; + } + pmap_pte_setbits(pte, PG_N); + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { #ifndef XEN /* XXX */ - if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { - kpreempt_disable(); - pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); - kpreempt_enable(); + pmap_tlb_shootdown(pmap_kernel(), va, opte, + TLBSHOOT_BUS_DMA); +#endif + } + } } + kpreempt_enable(); pmap_update(pmap_kernel()); -#endif return (0); } Index: arch/x86/x86/bus_space.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/bus_space.c,v retrieving revision 1.20 diff -u -p -r1.20 bus_space.c --- arch/x86/x86/bus_space.c 21 Oct 2008 15:46:32 -0000 1.20 +++ arch/x86/x86/bus_space.c 11 Jan 2009 13:20:54 -0000 @@ -1,7 +1,7 @@ /* $NetBSD: bus_space.c,v 1.20 2008/10/21 15:46:32 cegger Exp $ */ /*- - * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -303,7 +303,7 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_ { u_long pa, endpa; vaddr_t va, sva; - pt_entry_t *pte, xpte; + pt_entry_t *pte; pa = x86_trunc_page(bpa); endpa = x86_round_page(bpa + size); @@ -327,8 +327,8 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_ *bshp = (bus_space_handle_t)(sva + (bpa & PGOFSET)); va = sva; - xpte = 0; + kpreempt_disable(); for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) { /* * PG_N doesn't exist on 386's, so we assume that @@ -349,12 +349,11 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_ pmap_pte_clearbits(pte, PG_N); else pmap_pte_setbits(pte, PG_N); - xpte |= *pte; + pmap_tlb_shootdown(pmap_kernel(), va, *pte, + TLBSHOOT_BUS_SPACE); } - kpreempt_disable(); - pmap_tlb_shootdown(pmap_kernel(), sva, sva + (endpa - pa), xpte); - pmap_tlb_shootwait(); kpreempt_enable(); + pmap_update(pmap_kernel()); return 0; } Index: arch/x86/x86/cpu.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/cpu.c,v retrieving revision 1.57.4.1 diff -u -p -r1.57.4.1 cpu.c --- arch/x86/x86/cpu.c 13 Nov 2008 00:04:07 -0000 1.57.4.1 +++ arch/x86/x86/cpu.c 11 Jan 2009 13:20:54 -0000 @@ -149,6 +149,7 @@ struct cpu_info cpu_info_primary __align .ci_idepth = -1, .ci_curlwp = &lwp0, .ci_curldt = -1, + .ci_cpumask = 1, #ifdef TRAPLOG .ci_tlog_base = &tlog_primary, #endif /* !TRAPLOG */ @@ -165,7 +166,7 @@ static void tss_init(struct i386tss *, v static void cpu_init_idle_lwp(struct cpu_info *); uint32_t cpus_attached = 0; -uint32_t cpus_running = 0; +uint32_t cpus_running = 1; extern char x86_64_doubleflt_stack[]; @@ -341,6 +342,7 @@ cpu_attach(device_t parent, device_t sel cpu_init_tss(ci); } else { KASSERT(ci->ci_data.cpu_idlelwp != NULL); + cpus_running = (1 << cpu_index(ci)); } ci->ci_cpumask = (1 << cpu_index(ci)); @@ -359,7 +361,6 @@ cpu_attach(device_t 
parent, device_t sel cpu_get_tsc_freq(ci); cpu_init(ci); cpu_set_tss_gates(ci); - pmap_cpu_init_late(ci); if (caa->cpu_role != CPU_ROLE_SP) { /* Enable lapic. */ lapic_enable(); @@ -395,8 +396,6 @@ cpu_attach(device_t parent, device_t sel cpu_intr_init(ci); gdt_alloc_cpu(ci); cpu_set_tss_gates(ci); - pmap_cpu_init_early(ci); - pmap_cpu_init_late(ci); cpu_start_secondary(ci); if (ci->ci_flags & CPUF_PRESENT) { cpu_identify(ci); @@ -704,6 +704,7 @@ cpu_hatch(void *v) /* Because the text may have been patched in x86_patch(). */ wbinvd(); x86_flush(); + tlbflushg(); KASSERT((ci->ci_flags & CPUF_RUNNING) == 0); Index: arch/x86/x86/lapic.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/lapic.c,v retrieving revision 1.42 diff -u -p -r1.42 lapic.c --- arch/x86/x86/lapic.c 3 Jul 2008 14:02:25 -0000 1.42 +++ arch/x86/x86/lapic.c 11 Jan 2009 13:20:54 -0000 @@ -227,10 +227,8 @@ lapic_boot_init(paddr_t lapic_base) #ifdef MULTIPROCESSOR idt_vec_reserve(LAPIC_IPI_VECTOR); idt_vec_set(LAPIC_IPI_VECTOR, Xintr_lapic_ipi); - idt_vec_reserve(LAPIC_TLB_MCAST_VECTOR); - idt_vec_set(LAPIC_TLB_MCAST_VECTOR, Xintr_lapic_tlb_mcast); - idt_vec_reserve(LAPIC_TLB_BCAST_VECTOR); - idt_vec_set(LAPIC_TLB_BCAST_VECTOR, Xintr_lapic_tlb_bcast); + idt_vec_reserve(LAPIC_TLB_VECTOR); + idt_vec_set(LAPIC_TLB_VECTOR, Xintr_lapic_tlb); #endif idt_vec_reserve(LAPIC_SPURIOUS_VECTOR); idt_vec_set(LAPIC_SPURIOUS_VECTOR, Xintrspurious); Index: arch/x86/x86/pmap.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/pmap.c,v retrieving revision 1.74 diff -u -p -r1.74 pmap.c --- arch/x86/x86/pmap.c 25 Oct 2008 14:16:35 -0000 1.74 +++ arch/x86/x86/pmap.c 11 Jan 2009 13:20:55 -0000 @@ -1,5 +1,34 @@ /* $NetBSD: pmap.c,v 1.74 2008/10/25 14:16:35 yamt Exp $ */ +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + /* * Copyright (c) 2007 Manuel Bouyer. 
* @@ -308,21 +337,10 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.7 * * tlb shootdowns are hard interrupts that operate outside the spl * framework: they don't need to be blocked provided that the pmap module - * gets the order of events correct. the calls are made by talking directly - * to the lapic. the stubs to handle the interrupts are quite short and do - * one of the following: invalidate a single page, a range of pages, all - * user tlb entries or the entire tlb. - * - * the cpus synchronize with each other using pmap_mbox structures which are - * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap - * use a global mailbox and are generated using a broadcast ipi (broadcast - * to all but the sending cpu). shootdowns against regular pmaps use - * per-cpu mailboxes and are multicast. kernel and user shootdowns can - * execute simultaneously, as can shootdowns within different multithreaded - * processes. TODO: - * - * 1. figure out which waitpoints can be deferered to pmap_update(). - * 2. see if there is a cheap way to batch some updates. + * gets the order of events correct. the calls are made by poking the + * lapic directly. the stub to handle the interrupts is short and does + * one of the following: invalidate a set of pages, all user tlb entries + * or the entire tlb. */ const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; @@ -359,24 +377,39 @@ static vaddr_t pmap_maxkvaddr; #endif /* defined(DIAGNOSTIC) */ /* - * Global TLB shootdown mailbox. + * tlb shootdown state. */ struct evcnt pmap_tlb_evcnt __aligned(64); -struct pmap_mbox pmap_mbox __aligned(64); +struct pmap_tlb_packet pmap_tlb_packet __aligned(64); +struct pmap_tlb_mailbox pmap_tlb_mailbox __aligned(64); /* - * Per-CPU data. The pmap mailbox is cache intensive so gets its - * own line. Note that the mailbox must be the first item. + * tlb shootdown statistics. */ -struct pmap_cpu { - /* TLB shootdown */ - struct pmap_mbox pc_mbox; -}; -union { - struct pmap_cpu pc; - uint8_t padding[64]; -} pmap_cpu[MAXCPUS] __aligned(64); +#ifdef TLBSTATS +static struct evcnt tlbstat_local[TLBSHOOT__MAX]; +static struct evcnt tlbstat_remote[TLBSHOOT__MAX]; +static struct evcnt tlbstat_kernel[TLBSHOOT__MAX]; +static struct evcnt tlbstat_single_req; +static struct evcnt tlbstat_single_issue; +static const char *tlbstat_name[] = { + "APTE", + "KENTER", + "KREMOVE", + "FREE_PTP1", + "FREE_PTP2", + "REMOVE_PTE", + "REMOVE_PTES", + "SYNC_PV1", + "SYNC_PV2", + "WRITE_PROTECT", + "ENTER", + "UPDATE", + "BUS_DMA", + "BUS_SPACE", +}; +#endif /* * global data structures @@ -591,7 +624,7 @@ static void pmap_do_remove(struct pmap static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, vaddr_t, int, struct pv_entry **); -static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, +static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, vaddr_t, int, struct pv_entry **); #define PMAP_REMOVE_ALL 0 /* remove all mappings */ @@ -736,6 +769,7 @@ pmap_is_active(struct pmap *pmap, struct (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); } +#ifdef XEN static void pmap_apte_flush(struct pmap *pmap) { @@ -749,9 +783,10 @@ pmap_apte_flush(struct pmap *pmap) * * XXXthorpej -- find a way to defer the IPI. */ - pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); - pmap_tlb_shootwait(); + pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_APTE); + pmap_tlb_shootnow(); } +#endif /* * Add a reference to the specified pmap. 
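The scheme described in the rewritten comment above can be summarised in C. What follows is a simplified sketch of the queueing step that the new pmap_tlb_shootdown() (added further down in this patch) performs on the calling CPU; the helper name is illustrative, and the TLBSTATS counting, PG_PS frame masking, the deferred-teardown early exit and the kernel-reserved-range case of a user pmap are all omitted.

static void
tlb_shootdown_queue_sketch(struct pmap *pm, vaddr_t va, pt_entry_t pte)
{
	struct pmap_tlb_packet *tp;
	int s;

	/* Block the shootdown IPI while the local packet is updated. */
	s = splvm();
	tp = (struct pmap_tlb_packet *)curcpu()->ci_pmap_data;

	/* Remember whether any global (PG_G) mapping is involved. */
	tp->tp_pte |= (uint16_t)pte;

	if (tp->tp_count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Room left: queue a single-page invalidation. */
		tp->tp_va[tp->tp_count++] = va;
	} else {
		/* Too many pages, or a full flush was requested. */
		tp->tp_count = (uint16_t)-1;
	}

	if (pm == pmap_kernel()) {
		/* Kernel mappings: every running CPU must act. */
		tp->tp_cpumask = cpus_running;
	} else {
		/* User mappings: only CPUs that have used this pmap. */
		tp->tp_cpumask |= pm->pm_cpus;
		tp->tp_usermask |= pm->pm_cpus;
	}
	splx(s);
}

Nothing is sent at this point; the accumulated packet is pushed to the other CPUs by pmap_tlb_shootnow(), typically from pmap_update().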
@@ -775,15 +810,14 @@ static void pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, pd_entry_t * const **pdeppp) { +#ifdef XEN pd_entry_t opde, npde; struct pmap *ourpmap; struct cpu_info *ci; struct lwp *l; bool iscurrent; uint64_t ncsw; -#ifdef XEN int s; -#endif /* the kernel's pmap is always accessible */ if (pmap == pmap_kernel()) { @@ -799,14 +833,14 @@ pmap_map_ptes(struct pmap *pmap, struct ncsw = l->l_ncsw; ourpmap = NULL; ci = curcpu(); -#if defined(XEN) && defined(__x86_64__) +#if defined(__x86_64__) /* * curmap can only be pmap_kernel so at this point * pmap_is_curpmap is always false */ iscurrent = 0; ourpmap = pmap_kernel(); -#else /* XEN && __x86_64__*/ +#else /* __x86_64__*/ if (ci->ci_want_pmapload && vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { pmap_load(); @@ -823,7 +857,7 @@ pmap_map_ptes(struct pmap *pmap, struct goto out; } ourpmap = ci->ci_pmap; -#endif /* XEN && __x86_64__ */ +#endif /* __x86_64__ */ /* need to lock both curpmap and pmap: use ordered locking */ pmap_reference(ourpmap); @@ -841,7 +875,6 @@ pmap_map_ptes(struct pmap *pmap, struct /* need to load a new alternate pt space into curpmap? */ COUNT(apdp_pde_map); opde = *APDP_PDE; -#ifdef XEN if (!pmap_valid_entry(opde) || pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { int i; @@ -868,21 +901,11 @@ pmap_map_ptes(struct pmap *pmap, struct pmap_apte_flush(ourpmap); splx(s); } -#else /* XEN */ - npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V; - if (!pmap_valid_entry(opde) || - pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { - pmap_pte_set(APDP_PDE, npde); - pmap_pte_flush(); - if (pmap_valid_entry(opde)) - pmap_apte_flush(ourpmap); - } -#endif /* XEN */ *pmap2 = ourpmap; *ptepp = APTE_BASE; *pdeppp = alternate_pdes; KASSERT(l->l_ncsw == ncsw); -#if !defined(XEN) || !defined(__x86_64__) +#if !defined(__x86_64__) out: #endif /* @@ -897,8 +920,62 @@ pmap_map_ptes(struct pmap *pmap, struct mutex_exit(&pmap->pm_lock); goto retry; } +#else /* XEN */ + struct pmap *curpmap; + struct cpu_info *ci; + uint32_t cpumask; + lwp_t *l; + + /* the kernel's pmap is always accessible */ + if (pmap == pmap_kernel()) { + *pmap2 = NULL; + *ptepp = PTE_BASE; + *pdeppp = normal_pdes; + return; + } + KASSERT(kpreempt_disabled()); - return; + l = curlwp; + retry: + ci = curcpu(); + mutex_enter(&pmap->pm_lock); + curpmap = ci->ci_pmap; + if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { + /* our own pmap so just load it: easy. */ + if (ci->ci_want_pmapload) { + mutex_exit(&pmap->pm_lock); + pmap_load(); + goto retry; + } + KASSERT(pmap == curpmap); + } else if (pmap == curpmap) { + /* + * already on the CPU: make it valid. this is very + * often the case during exit(), when we have switched + * to the kernel pmap in order to destroy a user pmap. + */ + if (!pmap_reactivate(pmap)) { + tlbflush(); + } + } else { + /* + * toss current pmap from CPU, but keep ref to it. + * can happen if we block during exit(). 
+ */ + cpumask = ci->ci_cpumask; + atomic_and_32(&curpmap->pm_cpus, ~cpumask); + atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask); + ci->ci_pmap = pmap; + ci->ci_tlbstate = TLBSTATE_VALID; + atomic_or_32(&pmap->pm_cpus, cpumask); + atomic_or_32(&pmap->pm_kernel_cpus, cpumask); + lcr3(pmap->pm_pdirpa); + } + pmap->pm_ncsw = l->l_ncsw; + *pmap2 = curpmap; + *ptepp = PTE_BASE; + *pdeppp = normal_pdes; +#endif /* XEN */ } /* @@ -908,6 +985,7 @@ pmap_map_ptes(struct pmap *pmap, struct static void pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) { +#ifdef XEN if (pmap == pmap_kernel()) { return; @@ -916,21 +994,62 @@ pmap_unmap_ptes(struct pmap *pmap, struc if (pmap2 == NULL) { mutex_exit(&pmap->pm_lock); } else { -#if defined(XEN) && defined(__x86_64__) +#if defined(__x86_64__) KASSERT(pmap2 == pmap_kernel()); #else KASSERT(curcpu()->ci_pmap == pmap2); -#endif +#endif /* __x86_64__ */ #if defined(MULTIPROCESSOR) pmap_pte_set(APDP_PDE, 0); pmap_pte_flush(); pmap_apte_flush(pmap2); -#endif +#endif /* MULTIPROCESSOR */ COUNT(apdp_pde_unmap); mutex_exit(&pmap->pm_lock); mutex_exit(&pmap2->pm_lock); pmap_destroy(pmap2); } +#else /* XEN */ + struct cpu_info *ci; + struct pmap *mypmap; + + KASSERT(kpreempt_disabled()); + + /* the kernel's pmap is always accessible. */ + if (pmap == pmap_kernel()) { + return; + } + + /* + * we can't tolerate context switches while mapped in. + * if it's our own pmap all we have to do is unlock. + */ + KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); + mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); + if (pmap == mypmap) { + mutex_exit(&pmap->pm_lock); + return; + } + + /* + * mark whatever's on the cpu now as lazy and unlock. + * if the pmap was already installed, we are done. + */ + ci = curcpu(); + ci->ci_tlbstate = TLBSTATE_LAZY; + ci->ci_want_pmapload = (mypmap != pmap_kernel()); + mutex_exit(&pmap->pm_lock); + if (pmap == pmap2) { + return; + } + + /* + * we installed another pmap on the CPU. grab a reference to + * it and leave in place. toss the evicted pmap (can block). + */ + pmap_reference(pmap); + pmap_destroy(pmap2); +#endif /* XEN */ } inline static void @@ -1044,9 +1163,12 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v panic("pmap_kenter_pa: PG_PS"); #endif if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { - /* This should not happen, so no need to batch updates. */ +#if defined(DIAGNOSTIC) + printf("pmap_kenter_pa: mapping already present\n"); +#endif + /* This should not happen. */ kpreempt_disable(); - pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); + pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); kpreempt_enable(); } } @@ -1081,7 +1203,7 @@ pmap_kenter_ma(vaddr_t va, paddr_t ma, v if (pmap_valid_entry(opte)) { #if defined(MULTIPROCESSOR) kpreempt_disable(); - pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); + pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); kpreempt_enable(); #else /* Don't bother deferring in the single CPU case. */ @@ -1140,32 +1262,26 @@ pmap_changeprot_local(vaddr_t va, vm_pro void pmap_kremove(vaddr_t sva, vsize_t len) { - pt_entry_t *pte, xpte; + pt_entry_t *pte, opte; vaddr_t va, eva; eva = sva + len; - xpte = 0; + kpreempt_disable(); for (va = sva; va < eva; va += PAGE_SIZE) { if (va < VM_MIN_KERNEL_ADDRESS) pte = vtopte(va); else pte = kvtopte(va); - xpte |= pmap_pte_testset(pte, 0); /* zap! */ -#if defined(DIAGNOSTIC) - /* XXX For now... 
*/ - if (xpte & PG_PS) - panic("pmap_kremove: PG_PS"); - if (xpte & PG_PVLIST) - panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", - va); -#endif - } - if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { - kpreempt_disable(); - pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); - kpreempt_enable(); + opte = pmap_pte_testset(pte, 0); /* zap! */ + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { + pmap_tlb_shootdown(pmap_kernel(), va, opte, + TLBSHOOT_KREMOVE); + } + KASSERT((opte & PG_PS) == 0); + KASSERT((opte & PG_PVLIST) == 0); } + kpreempt_enable(); } /* @@ -1306,7 +1422,7 @@ pmap_bootstrap(vaddr_t kva_start) * "Intel Architecture Software Developer's Manual, * Volume 3: System Programming". */ - tlbflush(); + tlbflushg(); /* * now, remap the kernel text using large pages. we @@ -1319,7 +1435,7 @@ pmap_bootstrap(vaddr_t kva_start) pde = &L2_BASE[pl2_i(kva)]; *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; /* zap! */ - tlbflush(); + tlbflushg(); } #if defined(DEBUG) printf("kernel text is mapped with " @@ -1471,7 +1587,6 @@ pmap_bootstrap(vaddr_t kva_start) mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&pmaps); - pmap_cpu_init_early(curcpu()); /* * initialize caches. @@ -1495,7 +1610,7 @@ pmap_bootstrap(vaddr_t kva_start) * ensure the TLB is sync'd with reality by flushing it... */ - tlbflush(); + tlbflushg(); /* * calculate pmap_maxkvaddr from nkptp[]. @@ -1584,6 +1699,28 @@ pmap_init(void) mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); } +#ifdef TLBSTATS + for (i = 0; i < TLBSHOOT__MAX; i++) { + evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC, + NULL, "tlbshoot local", tlbstat_name[i]); + } + for (i = 0; i < TLBSHOOT__MAX; i++) { + evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC, + NULL, "tlbshoot remote", tlbstat_name[i]); + } + for (i = 0; i < TLBSHOOT__MAX; i++) { + evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC, + NULL, "tlbshoot kernel", tlbstat_name[i]); + } + evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC, + NULL, "tlbshoot single page", "requests"); + evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC, + NULL, "tlbshoot single page", "issues"); +#endif + + evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, + NULL, "TLB", "shootdown"); + /* * done: pmap module is up (and ready for business) */ @@ -1592,35 +1729,6 @@ pmap_init(void) } /* - * pmap_cpu_init_early: perform early per-CPU initialization. - */ - -void -pmap_cpu_init_early(struct cpu_info *ci) -{ - struct pmap_cpu *pc; - static uint8_t pmap_cpu_alloc; - - pc = &pmap_cpu[pmap_cpu_alloc++].pc; - ci->ci_pmap_cpu = pc; -} - -/* - * pmap_cpu_init_late: perform late per-CPU initialization. 
- */ - -void -pmap_cpu_init_late(struct cpu_info *ci) -{ - - if (ci == &cpu_info_primary) - evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, - NULL, "global", "TLB IPI"); - evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, - NULL, device_xname(ci->ci_dev), "TLB IPI"); -} - -/* * p v _ e n t r y f u n c t i o n s */ @@ -1815,11 +1923,13 @@ pmap_free_ptp(struct pmap *pmap, struct unsigned long index; int level; vaddr_t invaladdr; + pd_entry_t opde; +#ifdef XEN + struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); #ifdef MULTIPROCESSOR vaddr_t invaladdr2; #endif - pd_entry_t opde; - struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); +#endif KASSERT(pmap != pmap_kernel()); KASSERT(mutex_owned(&pmap->pm_lock)); @@ -1829,7 +1939,8 @@ pmap_free_ptp(struct pmap *pmap, struct do { index = pl_i(va, level + 1); opde = pmap_pte_testset(&pdes[level - 1][index], 0); -#if defined(XEN) && defined(__x86_64__) +#if defined(XEN) +# if defined(__x86_64__) /* * If ptp is a L3 currently mapped in kernel space, * clear it before freeing @@ -1837,20 +1948,26 @@ pmap_free_ptp(struct pmap *pmap, struct if (pmap->pm_pdirpa == xen_current_user_pgd && level == PTP_LEVELS - 1) pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); -#endif /* XEN && __x86_64__ */ - pmap_freepage(pmap, ptp, level); +# endif /*__x86_64__ */ invaladdr = level == 1 ? (vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, - 0, opde); -#if defined(MULTIPROCESSOR) + opde, TLBSHOOT_FREE_PTP1); +# if defined(MULTIPROCESSOR) invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE : (vaddr_t)normal_pdes[level - 2]; if (pmap != curpmap || invaladdr != invaladdr2) { pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, - 0, opde); + opde, TLBSHOOT_FREE_PTP2); } -#endif +# endif /* MULTIPROCESSOR */ +#else /* XEN */ + invaladdr = level == 1 ? (vaddr_t)ptes : + (vaddr_t)pdes[level - 2]; + pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, + opde, TLBSHOOT_FREE_PTP1); +#endif /* XEN */ + pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); ptp->wire_count--; @@ -2160,6 +2277,7 @@ pmap_create(void) pmap->pm_flags = 0; pmap->pm_cpus = 0; pmap->pm_kernel_cpus = 0; + pmap->pm_gc_ptp = NULL; /* init the LDT */ pmap->pm_ldt = NULL; @@ -2194,6 +2312,24 @@ pmap_create(void) } /* + * pmap_free_ptps: put a list of ptps back to the freelist. + */ + +static void +pmap_free_ptps(struct vm_page *empty_ptps) +{ + struct vm_page *ptp; + struct pmap_page *pp; + + while ((ptp = empty_ptps) != NULL) { + pp = VM_PAGE_TO_PP(ptp); + empty_ptps = pp->pp_link; + LIST_INIT(&pp->pp_head.pvh_list); + uvm_pagefree(ptp); + } +} + +/* * pmap_destroy: drop reference count on pmap. free pmap if * reference count goes to zero. */ @@ -2206,13 +2342,24 @@ pmap_destroy(struct pmap *pmap) struct cpu_info *ci; CPU_INFO_ITERATOR cii; #endif /* DIAGNOSTIC */ + lwp_t *l; /* * if we have torn down this pmap, process deferred frees and - * invalidations now. + * invalidations. free now if the system is low on memory. + * otherwise, free when the pmap is destroyed thus avoiding a + * TLB shootdown. 
*/ - if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { - pmap_update(pmap); + l = curlwp; + if (__predict_false(l->l_md.md_gc_pmap == pmap)) { + if (uvmexp.free < uvmexp.freetarg) { + pmap_update(pmap); + } else { + KASSERT(pmap->pm_gc_ptp == NULL); + pmap->pm_gc_ptp = l->l_md.md_gc_ptp; + l->l_md.md_gc_ptp = NULL; + l->l_md.md_gc_pmap = NULL; + } } /* @@ -2261,6 +2408,13 @@ pmap_destroy(struct pmap *pmap) mutex_exit(&pmaps_lock); /* + * process deferred PTP frees. no TLB shootdown required, as the + * PTP pages are no longer visible to any CPU. + */ + + pmap_free_ptps(pmap->pm_gc_ptp); + + /* * destroyed pmap shouldn't have remaining PTPs */ @@ -2739,7 +2893,7 @@ pmap_deactivate(struct lwp *l) * be coming off the CPU before it has a chance to call * pmap_update(). */ - pmap_tlb_shootwait(); + pmap_tlb_shootnow(); ci = curcpu(); @@ -2830,17 +2984,7 @@ pmap_extract(struct pmap *pmap, vaddr_t KPREEMPT_DISABLE(l); ci = l->l_cpu; - if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || - pmap == pmap_kernel()) { - /* - * no need to lock, because it's pmap_kernel() or our - * own pmap and is active. if a user pmap, the caller - * will hold the vm_map write/read locked and so prevent - * entries from disappearing while we are here. ptps - * can disappear via pmap_remove(), pmap_protect() and - * pmap_collect(), but they are called with the vm_map - * write locked. - */ + if (pmap == pmap_kernel()) { hard = false; ptes = PTE_BASE; pdes = normal_pdes; @@ -3158,14 +3302,14 @@ pmap_unmap_pte(void) * => returns composite pte if at least one page should be shot down */ -static pt_entry_t +static void pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **pv_tofree) { struct pv_entry *pve; pt_entry_t *pte = (pt_entry_t *) ptpva; - pt_entry_t opte, xpte = 0; + pt_entry_t opte; KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); KASSERT(kpreempt_disabled()); @@ -3198,13 +3342,17 @@ pmap_remove_ptes(struct pmap *pmap, stru pmap_exec_account(pmap, startva, opte, 0); pmap_stats_update_bypte(pmap, 0, opte); - xpte |= opte; if (ptp) { ptp->wire_count--; /* dropping a PTE */ /* Make sure that the PDE is flushed */ if (ptp->wire_count <= 1) - xpte |= PG_U; + opte |= PG_U; + } + + if ((opte & PG_U) != 0) { + pmap_tlb_shootdown(pmap, startva, opte, + TLBSHOOT_REMOVE_PTES); } /* @@ -3242,8 +3390,6 @@ pmap_remove_ptes(struct pmap *pmap, stru /* end of "for" loop: time for next pte */ } - - return xpte; } @@ -3285,14 +3431,15 @@ pmap_remove_pte(struct pmap *pmap, struc pmap_exec_account(pmap, va, opte, 0); pmap_stats_update_bypte(pmap, 0, opte); - if (opte & PG_U) - pmap_tlb_shootdown(pmap, va, 0, opte); - if (ptp) { ptp->wire_count--; /* dropping a PTE */ /* Make sure that the PDE is flushed */ - if ((ptp->wire_count <= 1) && !(opte & PG_U)) - pmap_tlb_shootdown(pmap, va, 0, opte); + if (ptp->wire_count <= 1) + opte |= PG_U; + } + + if ((opte & PG_U) != 0) { + pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); } /* @@ -3340,6 +3487,7 @@ pmap_remove_pte(struct pmap *pmap, struc void pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) { + pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); } @@ -3352,7 +3500,7 @@ pmap_remove(struct pmap *pmap, vaddr_t s static void pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) { - pt_entry_t *ptes, xpte = 0; + pt_entry_t *ptes; pd_entry_t pde; pd_entry_t * const *pdes; struct pv_entry *pv_tofree = NULL; @@ -3450,7 +3598,7 @@ 
pmap_do_remove(struct pmap *pmap, vaddr_ "detected"); #endif } - xpte |= pmap_remove_ptes(pmap, ptp, + pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, blkendva, flags, &pv_tofree); @@ -3458,8 +3606,6 @@ pmap_do_remove(struct pmap *pmap, vaddr_ if (ptp && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } - if ((xpte & PG_U) != 0) - pmap_tlb_shootdown(pmap, sva, eva, xpte); } pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ kpreempt_enable(); @@ -3518,8 +3664,9 @@ pmap_sync_pv(struct pv_pte *pvpte, pt_en pmap_unmap_pte(); if (clearbits != 0) { - pmap_tlb_shootdown(pmap, va, 0, - (pmap == pmap_kernel() ? PG_G : 0)); + pmap_tlb_shootdown(pmap, va, + (pmap == pmap_kernel() ? PG_G : 0), + TLBSHOOT_SYNC_PV1); } return EAGAIN; } @@ -3558,7 +3705,7 @@ pmap_sync_pv(struct pv_pte *pvpte, pt_en } while (pmap_pte_cas(ptep, opte, npte) != opte); if (need_shootdown) { - pmap_tlb_shootdown(pmap, va, 0, opte); + pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); } pmap_unmap_pte(); @@ -3642,7 +3789,7 @@ startover: KASSERT(pmap != pmap_kernel()); - pmap_tlb_shootwait(); + pmap_tlb_shootnow(); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); pmap_stats_update_bypte(pmap, 0, opte); ptp->wire_count--; @@ -3688,7 +3835,7 @@ pmap_test_attrs(struct vm_page *pg, unsi pt_entry_t expect; u_int result; -#if DIAGNOSTIC +#ifdef DIAGNOSTIC int bank, off; bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); @@ -3867,7 +4014,8 @@ pmap_write_protect(struct pmap *pmap, va vaddr_t tva; tva = x86_ptob(spte - ptes); - pmap_tlb_shootdown(pmap, tva, 0, opte); + pmap_tlb_shootdown(pmap, tva, opte, + TLBSHOOT_WRITE_PROTECT); } next:; } @@ -4164,7 +4312,7 @@ same_pa: if ((~opte & (PG_V | PG_U)) == 0 && ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { - pmap_tlb_shootdown(pmap, va, 0, opte); + pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); } error = 0; @@ -4461,45 +4609,45 @@ pmap_dump(struct pmap *pmap, vaddr_t sva } #endif +static inline void +pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why) +{ +#ifdef TLBSTATS + uint32_t mask; + + if (va != (vaddr_t)-1LL) { + atomic_inc_64(&tlbstat_single_req.ev_count); + } + if (pm == pmap_kernel()) { + atomic_inc_64(&tlbstat_kernel[why].ev_count); + return; + + } + if (va >= VM_MAXUSER_ADDRESS) { + mask = pm->pm_cpus | pm->pm_kernel_cpus; + } else { + mask = pm->pm_cpus; + } + if ((mask & curcpu()->ci_cpumask) != 0) { + atomic_inc_64(&tlbstat_local[why].ev_count); + } + if ((mask & ~curcpu()->ci_cpumask) != 0) { + atomic_inc_64(&tlbstat_remote[why].ev_count); + } +#endif +} + /* - * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' - * - * => always invalidates locally before returning - * => returns before remote CPUs have invalidated - * => must be called with preemption disabled + * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm' */ -void -pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) +__noinline void +pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why) { -#ifdef MULTIPROCESSOR - extern bool x86_mp_online; - struct cpu_info *ci; - struct pmap_mbox *mb, *selfmb; - CPU_INFO_ITERATOR cii; - uintptr_t head; - u_int count; + struct pmap_tlb_packet *tp; int s; -#endif /* MULTIPROCESSOR */ - struct cpu_info *self; - bool kernel; - - KASSERT(eva == 0 || eva >= sva); - KASSERT(kpreempt_disabled()); - if (pte & PG_PS) - sva &= PG_LGFRAME; - pte &= PG_G; - self = curcpu(); - - if (sva == (vaddr_t)-1LL) { - kernel = true; - } else { - if (eva == 0) - eva = sva + 
PAGE_SIZE; - kernel = sva >= VM_MAXUSER_ADDRESS; - KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); - } + KASSERT((pte & PG_G) == 0 || pm == pmap_kernel()); /* * if tearing down the pmap, do nothing. we'll flush later @@ -4509,167 +4657,171 @@ pmap_tlb_shootdown(struct pmap *pm, vadd return; } - /* - * If the range is larger than 32 pages, then invalidate - * everything. - */ - if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { - sva = (vaddr_t)-1LL; - eva = sva; + if ((pte & PG_PS) != 0) { + va &= PG_LGFRAME; } -#ifdef MULTIPROCESSOR - if (ncpu > 1 && x86_mp_online) { - selfmb = &self->ci_pmap_cpu->pc_mbox; - - /* - * If the CPUs have no notion of global pages then - * reload of %cr3 is sufficient. - */ - if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) - pte = 0; - - if (pm == pmap_kernel()) { - /* - * Mapped on all CPUs: use the broadcast mechanism. - * Once we have the lock, increment the counter. - */ - s = splvm(); - mb = &pmap_mbox; - count = SPINLOCK_BACKOFF_MIN; - do { - if ((head = mb->mb_head) != mb->mb_tail) { - splx(s); - while ((head = mb->mb_head) != - mb->mb_tail) - SPINLOCK_BACKOFF(count); - s = splvm(); - } - } while (atomic_cas_ulong( - (volatile u_long *)&mb->mb_head, - head, head + ncpu - 1) != head); - - /* - * Once underway we must stay at IPL_VM until the - * IPI is dispatched. Otherwise interrupt handlers - * on this CPU can deadlock against us. - */ - pmap_tlb_evcnt.ev_count++; - mb->mb_pointer = self; - mb->mb_addr1 = sva; - mb->mb_addr2 = eva; - mb->mb_global = pte; - x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, - LAPIC_DLMODE_FIXED); - self->ci_need_tlbwait = 1; - splx(s); - } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || - (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { - /* - * We don't bother traversing the CPU list if only - * used by this CPU. - * - * We can't do global flushes with the multicast - * mechanism. - */ - KASSERT(pte == 0); - - /* - * Take ownership of the shootdown mailbox on each - * CPU, fill the details and fire it off. - */ - s = splvm(); - for (CPU_INFO_FOREACH(cii, ci)) { - if (ci == self || - !pmap_is_active(pm, ci, kernel) || - !(ci->ci_flags & CPUF_RUNNING)) - continue; - selfmb->mb_head++; - mb = &ci->ci_pmap_cpu->pc_mbox; - count = SPINLOCK_BACKOFF_MIN; - while (atomic_cas_ulong( - (u_long *)&mb->mb_pointer, - 0, (u_long)&selfmb->mb_tail) != 0) { - splx(s); - while (mb->mb_pointer != 0) - SPINLOCK_BACKOFF(count); - s = splvm(); - } - mb->mb_addr1 = sva; - mb->mb_addr2 = eva; - mb->mb_global = pte; - if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, - ci->ci_cpuid, LAPIC_DLMODE_FIXED)) - panic("pmap_tlb_shootdown: ipi failed"); - } - self->ci_need_tlbwait = 1; - splx(s); - } + /* + * add the shootdown operation to our pending set. + */ + s = splvm(); + tp = (struct pmap_tlb_packet *)curcpu()->ci_pmap_data; + tp->tp_pte |= (uint16_t)pte; + if (tp->tp_count < TP_MAXVA && va != (vaddr_t)-1LL) { + /* flush a single page. */ + tp->tp_va[tp->tp_count++] = va; + } else { + /* flush everything. */ + tp->tp_count = (uint16_t)-1; } -#endif /* MULTIPROCESSOR */ - - /* Update the current CPU before waiting for others. 
*/ - if (!pmap_is_active(pm, self, kernel)) - return; - - if (sva == (vaddr_t)-1LL) { - if (pte != 0) - tlbflushg(); - else - tlbflush(); + if (pm == pmap_kernel()) { + tp->tp_cpumask = cpus_running; + } else if (va >= VM_MAXUSER_ADDRESS) { + tp->tp_cpumask |= (pm->pm_cpus | pm->pm_kernel_cpus); + tp->tp_usermask |= (pm->pm_cpus | pm->pm_kernel_cpus);; } else { - do { - pmap_update_pg(sva); - sva += PAGE_SIZE; - } while (sva < eva); + tp->tp_cpumask |= pm->pm_cpus; + tp->tp_usermask |= pm->pm_cpus; } + pmap_tlbstat_count(pm, va, why); + splx(s); } /* - * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete + * pmap_tlb_shootnow: process pending TLB shootdowns queued on curcpu * - * => only waits for operations generated by the current CPU * => must be called with preemption disabled */ -void -pmap_tlb_shootwait(void) +__noinline void +pmap_tlb_shootnow(void) { - struct cpu_info *self; - struct pmap_mbox *mb; + struct pmap_tlb_packet *tp; + struct pmap_tlb_mailbox *tm; + struct cpu_info *ci, *lci; + CPU_INFO_ITERATOR cii; + uint32_t remote; + uintptr_t gen; + int s, err, i, count; KASSERT(kpreempt_disabled()); + s = splvm(); + ci = curcpu(); + tp = (struct pmap_tlb_packet *)ci->ci_pmap_data; + if (tp->tp_count == 0) { + splx(s); + return; + } + tm = &pmap_tlb_mailbox; + remote = tp->tp_cpumask & ~ci->ci_cpumask; + gen = 0; /* XXXgcc */ + if (remote != 0) { + /* + * gain ownership of the shootdown mailbox. we must stay + * at splvm once we own it or could deadlock against an + * interrupt on this cpu trying to do the same. + */ + while (atomic_cas_32(&tm->tm_pending, 0, remote) != 0) { + splx(s); + count = SPINLOCK_BACKOFF_MIN; + while (tm->tm_pending != 0) { + SPINLOCK_BACKOFF(count); + } + s = splvm(); + /* an interrupt might have done it for us. */ + if (tp->tp_count == 0) { + splx(s); + return; + } + } + + /* + * start a new generation of updates. copy our shootdown + * requests into the global buffer. + */ + gen = ++tm->tm_gen; + memcpy(&pmap_tlb_packet, tp, sizeof(*tp)); + pmap_tlb_evcnt.ev_count++; + + /* + * initiate shootdowns on remote CPUs. + */ + if (tp->tp_cpumask == cpus_running) { + err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL, + LAPIC_DLMODE_FIXED); + } else { + err = 0; + for (CPU_INFO_FOREACH(cii, lci)) { + if ((lci->ci_cpumask & remote) == 0) { + continue; + } + if ((lci->ci_flags & CPUF_RUNNING) == 0) { + remote &= ~lci->ci_cpumask; + atomic_and_32(&tm->tm_pending, remote); + continue; + } + err |= x86_ipi(LAPIC_TLB_VECTOR, + lci->ci_cpuid, LAPIC_DLMODE_FIXED); + } + } + if (__predict_false(err != 0)) { + panic("pmap_tlb_shootdown: ipi failed"); + } + } + /* - * Anything to do? XXX Really we want to avoid touching the cache - * lines of the two mailboxes, but the processor may read ahead. + * shootdowns on remote CPUs are now in flight. in the meantime, + * perform local shootdowns. */ - self = curcpu(); - if (!self->ci_need_tlbwait) - return; - self->ci_need_tlbwait = 0; + if ((tp->tp_cpumask & ci->ci_cpumask) != 0) { + if (tp->tp_count == (uint16_t)-1) { + if ((tp->tp_pte & PG_G) != 0) { + tlbflushg(); + } else { + tlbflush(); + } + } else { + for (i = tp->tp_count - 1; i >= 0; i--) { + pmap_update_pg(tp->tp_va[i]); + } + } + } - /* If we own the global mailbox, wait for it to drain. */ - mb = &pmap_mbox; - while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) - x86_pause(); + /* + * clear out our local buffer. 
+ */ +#ifdef TLBSTATS + if (tp->tp_count != (uint16_t)-1) { + atomic_add_64(&tlbstat_single_issue.ev_count, tp->tp_count); + } +#endif + tp->tp_count = 0; + tp->tp_pte = 0; + tp->tp_cpumask = 0; + tp->tp_usermask = 0; + splx(s); - /* If we own other CPU's mailboxes, wait for them to drain. */ - mb = &self->ci_pmap_cpu->pc_mbox; - KASSERT(mb->mb_pointer != &mb->mb_tail); - while (mb->mb_head != mb->mb_tail) - x86_pause(); + /* + * now wait for the current generation of updates to be + * processed by remote CPUs. + */ + if (remote != 0 && tm->tm_pending != 0) { + count = SPINLOCK_BACKOFF_MIN; + while (tm->tm_pending != 0 && tm->tm_gen == gen) { + SPINLOCK_BACKOFF(count); + } + } } /* - * pmap_update: process deferred invalidations + * pmap_update: process deferred invalidations and frees. */ -void +__noinline void pmap_update(struct pmap *pmap) { - struct vm_page *ptp, *empty_ptps; - struct pmap_page *pp; + struct vm_page *empty_ptps; lwp_t *l; /* @@ -4680,17 +4832,19 @@ pmap_update(struct pmap *pmap) if (__predict_false(l->l_md.md_gc_pmap == pmap)) { l->l_md.md_gc_pmap = NULL; KPREEMPT_DISABLE(l); - pmap_tlb_shootdown(pmap, -1, -1, 0); + pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); KPREEMPT_ENABLE(l); } /* - * wait for tlb shootdowns to complete before returning control - * to the caller. + * initiate any pending tlb shootdowns. wait for them to + * complete before returning control to the caller. */ - kpreempt_disable(); - pmap_tlb_shootwait(); - kpreempt_enable(); + if (((struct pmap_tlb_packet *)curcpu()->ci_pmap_data)->tp_count) { + KPREEMPT_DISABLE(l); + pmap_tlb_shootnow(); + KPREEMPT_ENABLE(l); + } /* * now that shootdowns are complete, process deferred frees, @@ -4703,14 +4857,7 @@ pmap_update(struct pmap *pmap) empty_ptps = l->l_md.md_gc_ptp; l->l_md.md_gc_ptp = NULL; - - while ((ptp = empty_ptps) != NULL) { - ptp->flags |= PG_ZERO; - pp = VM_PAGE_TO_PP(ptp); - empty_ptps = pp->pp_link; - LIST_INIT(&pp->pp_head.pvh_list); - uvm_pagefree(ptp); - } + pmap_free_ptps(empty_ptps); } }
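For reference, the receiving side of the protocol, the new Xintr_lapic_tlb stub in vector.S, amounts to roughly the following C. This is a sketch only: register save/restore and the APIC EOI write are elided, atomic_xor_32() stands in for the stub's lock xorl on tm_pending (fine on i386, where tm_pending is 32 bits wide), and the helper name is illustrative.

static void
tlb_ipi_sketch(struct cpu_info *ci)
{
	struct pmap_tlb_packet *tp = &pmap_tlb_packet;
	int i;

	if (tp->tp_count != (uint16_t)-1) {
		/* Invalidate only the pages listed in the packet. */
		for (i = 0; i < tp->tp_count; i++)
			pmap_update_pg(tp->tp_va[i]);
	} else if ((tp->tp_pte & PG_G) != 0) {
		/* Kernel mappings involved: flush global entries too. */
		tlbflushg();
	} else {
		/* User mappings only: reloading %cr3 is enough. */
		tlbflush();
	}

	/*
	 * If we hold the victim pmap only lazily, drop out of its
	 * CPU mask so it needs no further shootdowns from us.
	 */
	if (ci->ci_tlbstate == TLBSTATE_LAZY &&
	    (tp->tp_usermask & ci->ci_cpumask) != 0) {
		atomic_and_32(&ci->ci_pmap->pm_cpus, ~ci->ci_cpumask);
		ci->ci_tlbstate = TLBSTATE_STALE;
	}

	/* Acknowledge: clear our bit in the pending mask. */
	atomic_xor_32((volatile uint32_t *)&pmap_tlb_mailbox.tm_pending,
	    ci->ci_cpumask);
}

The initiator in pmap_tlb_shootnow() keeps ownership of the mailbox until tm_pending drains back to zero, so clearing our bit here is what eventually releases it and lets the next shootdown generation start.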