Index: arch/amd64/amd64/autoconf.c =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/amd64/autoconf.c,v retrieving revision 1.28 diff -u -p -r1.28 autoconf.c --- arch/amd64/amd64/autoconf.c 22 Oct 2017 00:59:28 -0000 1.28 +++ arch/amd64/amd64/autoconf.c 22 Dec 2019 19:48:09 -0000 @@ -60,9 +60,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v #include #include +#include "acpica.h" #include "ioapic.h" #include "lapic.h" +#if NACPICA > 0 +#include +#endif + #if NIOAPIC > 0 #include #endif @@ -112,6 +117,11 @@ cpu_configure(void) cpu_init_idle_lwps(); #endif +#if NACPICA > 0 + /* Load NUMA memory regions into UVM. */ + acpisrat_load_uvm(); +#endif + spl0(); lcr8(0); } Index: ddb/db_command.c =================================================================== RCS file: /cvsroot/src/sys/ddb/db_command.c,v retrieving revision 1.165 diff -u -p -r1.165 db_command.c --- ddb/db_command.c 15 Dec 2019 20:29:08 -0000 1.165 +++ ddb/db_command.c 22 Dec 2019 19:48:13 -0000 @@ -1,7 +1,8 @@ /* $NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $ */ /* - * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009 The NetBSD Foundation, Inc. + * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019 + * The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -193,6 +194,7 @@ static void db_help_print_cmd(db_exp static void db_lock_print_cmd(db_expr_t, bool, db_expr_t, const char *); static void db_show_all_locks(db_expr_t, bool, db_expr_t, const char *); static void db_show_lockstats(db_expr_t, bool, db_expr_t, const char *); +static void db_show_all_freelists(db_expr_t, bool, db_expr_t, const char *); static void db_mount_print_cmd(db_expr_t, bool, db_expr_t, const char *); static void db_show_all_mount(db_expr_t, bool, db_expr_t, const char *); static void db_mbuf_print_cmd(db_expr_t, bool, db_expr_t, const char *); @@ -234,6 +236,8 @@ static const struct db_command db_show_c 0 ,"Show all held locks", "[/t]", NULL) }, { DDB_ADD_CMD("mount", db_show_all_mount, 0, "Print all mount structures.", "[/f]", NULL) }, + { DDB_ADD_CMD("freelists", db_show_all_freelists, + 0 ,"Show all freelists", NULL, NULL) }, #ifdef AIO /*added from all sub cmds*/ { DDB_ADD_CMD("aio_jobs", db_show_aio_jobs, 0, @@ -1285,6 +1289,16 @@ db_show_all_locks(db_expr_t addr, bool h } static void +db_show_all_freelists(db_expr_t addr, bool have_addr, + db_expr_t count, const char *modif) +{ + +#ifdef _KERNEL /* XXX CRASH(8) */ + uvm_page_print_freelists(db_printf); +#endif +} + +static void db_show_lockstats(db_expr_t addr, bool have_addr, db_expr_t count, const char *modif) { Index: dev/acpi/acpi_srat.c =================================================================== RCS file: /cvsroot/src/sys/dev/acpi/acpi_srat.c,v retrieving revision 1.6 diff -u -p -r1.6 acpi_srat.c --- dev/acpi/acpi_srat.c 1 Oct 2019 18:00:08 -0000 1.6 +++ dev/acpi/acpi_srat.c 22 Dec 2019 19:48:13 -0000 @@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_srat.c, #include #include +#include + static ACPI_TABLE_SRAT *srat; static uint32_t nnodes; /* Number of NUMA nodes */ @@ -460,6 +462,28 @@ acpisrat_dump(void) } } +void +acpisrat_load_uvm(void) +{ + uint32_t i, j, nn, nm; + struct acpisrat_mem m; + + nn = acpisrat_nodes(); + aprint_debug("SRAT: %u NUMA nodes\n", nn); + for (i = 0; i < nn; i++) { + nm = acpisrat_node_memoryranges(i); + for (j = 0; j < nm; j++) { + acpisrat_mem(i, j, &m); + aprint_debug("SRAT: node %u memory range %u (0x%" + 
PRIx64" - 0x%"PRIx64" flags %u)\n", + m.nodeid, j, m.baseaddress, + m.baseaddress + m.length, m.flags); + uvm_page_numa_load(trunc_page(m.baseaddress), + trunc_page(m.length), m.nodeid); + } + } +} + /* * Get number of NUMA nodes. */ Index: dev/acpi/acpi_srat.h =================================================================== RCS file: /cvsroot/src/sys/dev/acpi/acpi_srat.h,v retrieving revision 1.4 diff -u -p -r1.4 acpi_srat.h --- dev/acpi/acpi_srat.h 28 Dec 2017 08:49:28 -0000 1.4 +++ dev/acpi/acpi_srat.h 22 Dec 2019 19:48:13 -0000 @@ -68,6 +68,7 @@ int acpisrat_init(void); int acpisrat_refresh(void); int acpisrat_exit(void); void acpisrat_dump(void); +void acpisrat_load_uvm(void); uint32_t acpisrat_nodes(void); uint32_t acpisrat_node_cpus(acpisrat_nodeid_t); uint32_t acpisrat_node_memoryranges(acpisrat_nodeid_t); Index: kern/init_main.c =================================================================== RCS file: /cvsroot/src/sys/kern/init_main.c,v retrieving revision 1.512 diff -u -p -r1.512 init_main.c --- kern/init_main.c 22 Dec 2019 15:00:42 -0000 1.512 +++ kern/init_main.c 22 Dec 2019 19:48:17 -0000 @@ -814,6 +814,8 @@ configure2(void) for (CPU_INFO_FOREACH(cii, ci)) { uvm_cpu_attach(ci); } + uvm_page_rebucket(); + mp_online = true; #if defined(MULTIPROCESSOR) cpu_boot_secondary_processors(); /* Index: uvm/files.uvm =================================================================== RCS file: /cvsroot/src/sys/uvm/files.uvm,v retrieving revision 1.31 diff -u -p -r1.31 files.uvm --- uvm/files.uvm 15 Dec 2019 21:11:35 -0000 1.31 +++ uvm/files.uvm 22 Dec 2019 19:48:19 -0000 @@ -42,6 +42,7 @@ file uvm/uvm_pager.c uvm file uvm/uvm_pdaemon.c uvm file uvm/uvm_pdpolicy_clock.c !pdpolicy_clockpro file uvm/uvm_pdpolicy_clockpro.c pdpolicy_clockpro +file uvm/uvm_pgflcache.c uvm file uvm/uvm_pglist.c uvm file uvm/uvm_physseg.c uvm file uvm/uvm_readahead.c uvm Index: uvm/uvm.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm.h,v retrieving revision 1.70 diff -u -p -r1.70 uvm.h --- uvm/uvm.h 13 Dec 2019 20:10:22 -0000 1.70 +++ uvm/uvm.h 22 Dec 2019 19:48:19 -0000 @@ -71,21 +71,19 @@ #include struct workqueue; +struct pgflcache; /* * per-cpu data */ struct uvm_cpu { - struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ - int page_free_nextcolor; /* next color to allocate from */ - int page_idlezero_next; /* which color to zero next */ - bool page_idle_zero; /* TRUE if we should try to zero - pages in the idle loop */ - int pages[PGFL_NQUEUES]; /* total of pages in page_free */ - u_int emap_gen; /* emap generation number */ - - krndsource_t rs; /* entropy source */ + struct pgflcache *pgflcache[VM_NFREELIST];/* cpu-local cached pages */ + void *pgflcachemem; /* pointer to allocated mem */ + size_t pgflcachememsz; /* size of allocated memory */ + u_int pgflcolor; /* next color to allocate */ + u_int pgflbucket; /* where to send our pages */ + krndsource_t rs; /* entropy source */ }; /* @@ -98,7 +96,9 @@ struct uvm { /* vm_page queues */ struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ - bool page_init_done; /* TRUE if uvm_page_init() finished */ + u_int bucketcount; + bool page_init_done; /* true if uvm_page_init() finished */ + bool numa_alloc; /* use NUMA page allocation strategy */ /* page daemon trigger */ int pagedaemon; /* daemon sleeps on this */ @@ -123,7 +123,6 @@ extern struct uvm_object *uvm_kernel_obj * locks (made globals for lockstat). 
*/ -extern kmutex_t uvm_fpageqlock; /* lock for free page q */ extern kmutex_t uvm_kentry_lock; #endif /* _KERNEL */ Index: uvm/uvm_ddb.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_ddb.h,v retrieving revision 1.15 diff -u -p -r1.15 uvm_ddb.h --- uvm/uvm_ddb.h 17 May 2011 04:18:07 -0000 1.15 +++ uvm/uvm_ddb.h 22 Dec 2019 19:48:19 -0000 @@ -40,6 +40,7 @@ void uvm_object_printit(struct uvm_objec void uvm_page_printit(struct vm_page *, bool, void (*)(const char *, ...)); void uvm_page_printall(void (*)(const char *, ...)); +void uvm_page_print_freelists(void (*)(const char *, ...)); void uvmexp_print(void (*)(const char *, ...)); #endif /* DDB || DEBUGPRINT */ Index: uvm/uvm_extern.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_extern.h,v retrieving revision 1.215 diff -u -p -r1.215 uvm_extern.h --- uvm/uvm_extern.h 21 Dec 2019 12:58:26 -0000 1.215 +++ uvm/uvm_extern.h 22 Dec 2019 19:48:19 -0000 @@ -210,6 +210,7 @@ b\32UNMAP\0\ #define UVM_PGA_STRAT_NORMAL 0 /* priority (low id to high) walk */ #define UVM_PGA_STRAT_ONLY 1 /* only specified free list */ #define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */ +#define UVM_PGA_STRAT_NUMA 3 /* strongly prefer ideal bucket */ /* * flags for uvm_pagealloc_strat() @@ -736,6 +737,7 @@ void uvm_obj_unwirepages(struct uvm_ob /* uvm_page.c */ int uvm_free(void); +void uvm_page_numa_load(paddr_t, paddr_t, u_int); struct vm_page *uvm_pagealloc_strat(struct uvm_object *, voff_t, struct vm_anon *, int, int, int); #define uvm_pagealloc(obj, off, anon, flags) \ Index: uvm/uvm_glue.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_glue.c,v retrieving revision 1.172 diff -u -p -r1.172 uvm_glue.c --- uvm/uvm_glue.c 21 Dec 2019 13:00:25 -0000 1.172 +++ uvm/uvm_glue.c 22 Dec 2019 19:48:19 -0000 @@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v #include #include +#include /* * uvm_kernacc: test if kernel can access a memory region. @@ -500,9 +501,17 @@ uvm_scheduler(void) lwp_changepri(l, PRI_VM); lwp_unlock(l); + /* Start the freelist cache. */ + uvm_pgflcache_start(); + for (;;) { /* Update legacy stats for post-mortem debugging. */ uvm_update_uvmexp(); + + /* See if the pagedaemon needs to generate some free pages. */ + uvm_kick_pdaemon(); + + /* Calculate process statistics. */ sched_pstats(); (void)kpause("uvm", false, hz, NULL); } Index: uvm/uvm_init.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_init.c,v retrieving revision 1.51 diff -u -p -r1.51 uvm_init.c --- uvm/uvm_init.c 13 Dec 2019 20:10:22 -0000 1.51 +++ uvm/uvm_init.c 22 Dec 2019 19:48:19 -0000 @@ -64,7 +64,6 @@ const int * const uvmexp_pagemask = &uvm const int * const uvmexp_pageshift = &uvmexp.pageshift; #endif -kmutex_t uvm_fpageqlock __cacheline_aligned; kmutex_t uvm_kentry_lock __cacheline_aligned; /* Index: uvm/uvm_page.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_page.c,v retrieving revision 1.212 diff -u -p -r1.212 uvm_page.c --- uvm/uvm_page.c 22 Dec 2019 16:37:36 -0000 1.212 +++ uvm/uvm_page.c 22 Dec 2019 19:48:19 -0000 @@ -1,5 +1,34 @@ /* $NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $ */ +/*- + * Copyright (c) 2019 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. @@ -87,6 +116,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v #include #include #include +#include /* * Some supported CPUs in a given architecture don't support all @@ -130,6 +160,25 @@ static vaddr_t virtual_space_end; */ static size_t recolored_pages_memsize /* = 0 */; +static char *recolored_pages_mem; + +/* + * freelist locks - one per bucket. + */ + +union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS] + __cacheline_aligned; + +/* + * basic NUMA information. + */ + +static struct uvm_page_numa_region { + struct uvm_page_numa_region *next; + paddr_t start; + paddr_t size; + u_int numa_id; +} *uvm_page_numa_region; #ifdef DEBUG vaddr_t uvm_zerocheckkva; @@ -243,15 +292,15 @@ uvm_pageremove_tree(struct uvm_object *u } static void -uvm_page_init_buckets(struct pgfreelist *pgfl) +uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num) { - int color, i; + int i; - for (color = 0; color < uvmexp.ncolors; color++) { - for (i = 0; i < PGFL_NQUEUES; i++) { - LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]); - } + pgb->pgb_nfree = 0; + for (i = 0; i < uvmexp.ncolors; i++) { + LIST_INIT(&pgb->pgb_colors[i]); } + pgfl->pgfl_buckets[num] = pgb; } /* @@ -263,18 +312,18 @@ uvm_page_init_buckets(struct pgfreelist void uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) { - static struct uvm_cpu boot_cpu; - psize_t freepages, pagecount, bucketcount, n; - struct pgflbucket *bucketarray, *cpuarray; + static struct uvm_cpu boot_cpu __cacheline_aligned; + psize_t freepages, pagecount, bucketsize, n; + struct pgflbucket *pgb; struct vm_page *pagearray; + char *bucketarray; uvm_physseg_t bank; - int lcv; + int fl, b; KASSERT(ncpu <= 1); - CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *)); /* - * init the page queues and free page queue lock, except the + * init the page queues and free page queue locks, except the * free list; we allocate that later (with the initial vm_page * structures). 
*/ @@ -282,7 +331,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr uvm.cpus[0] = &boot_cpu; curcpu()->ci_data.cpu_uvm = &boot_cpu; uvmpdpol_init(); - mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM); + for (b = 0; b < __arraycount(uvm_freelist_locks); b++) { + mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM); + } /* * allocate vm_page structures. @@ -323,6 +374,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr uvmexp.colormask = uvmexp.ncolors - 1; KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0); + /* We always start with only 1 bucket. */ + uvm.bucketcount = 1; + /* * we now know we have (PAGE_SIZE * freepages) bytes of memory we can * use. for each page of memory we use we need a vm_page structure. @@ -332,28 +386,28 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr * truncation errors (since we can only allocate in terms of whole * pages). */ - - bucketcount = uvmexp.ncolors * VM_NFREELIST; pagecount = ((freepages + 1) << PAGE_SHIFT) / (PAGE_SIZE + sizeof(struct vm_page)); - - bucketarray = (void *)uvm_pageboot_alloc((bucketcount * - sizeof(struct pgflbucket) * 2) + (pagecount * - sizeof(struct vm_page))); - cpuarray = bucketarray + bucketcount; - pagearray = (struct vm_page *)(bucketarray + bucketcount * 2); - - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - uvm.page_free[lcv].pgfl_buckets = - (bucketarray + (lcv * uvmexp.ncolors)); - uvm_page_init_buckets(&uvm.page_free[lcv]); - uvm.cpus[0]->page_free[lcv].pgfl_buckets = - (cpuarray + (lcv * uvmexp.ncolors)); - uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]); + bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]); + bucketsize = roundup2(bucketsize, coherency_unit); + bucketarray = (void *)uvm_pageboot_alloc( + bucketsize * VM_NFREELIST + + pagecount * sizeof(struct vm_page)); + pagearray = (struct vm_page *) + (bucketarray + bucketsize * VM_NFREELIST); + + for (fl = 0; fl < VM_NFREELIST; fl++) { + pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl); + uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0); } memset(pagearray, 0, pagecount * sizeof(struct vm_page)); /* + * init the freelist cache in the disabled state. + */ + uvm_pgflcache_init(); + + /* * init the vm_page structures and put them in the correct place. */ /* First init the extent */ @@ -396,12 +450,6 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr uvmexp.reserve_kernel = vm_page_reserve_kernel; /* - * determine if we should zero pages in the idle loop. - */ - - uvm.cpus[0]->page_idle_zero = vm_page_zero_enable; - - /* * done! */ @@ -409,6 +457,34 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr } /* + * uvm_pgfl_lock: lock all freelist buckets + */ + +void +uvm_pgfl_lock(void) +{ + int i; + + for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { + mutex_spin_enter(&uvm_freelist_locks[i].lock); + } +} + +/* + * uvm_pgfl_unlock: unlock all freelist buckets + */ + +void +uvm_pgfl_unlock(void) +{ + int i; + + for (i = 0; i < __arraycount(uvm_freelist_locks); i++) { + mutex_spin_exit(&uvm_freelist_locks[i].lock); + } +} + +/* * uvm_setpagesize: set the page size * * => sets page_shift and page_mask from uvmexp.pagesize. @@ -612,129 +688,294 @@ uvm_vm_page_to_phys(const struct vm_page } /* - * uvm_page_recolor: Recolor the pages if the new bucket count is - * larger than the old one. + * uvm_page_numa_load: load NUMA range description. 
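+ *
+ * => called from MD/ACPI attach code (e.g. acpisrat_load_uvm() via
+ *    cpu_configure() on amd64) once per physical memory range, before
+ *    uvm_page_rebucket() settles on the final bucket layout.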
*/ - void -uvm_page_recolor(int newncolors) +uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id) { - struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray; - struct pgfreelist gpgfl, pgfl; - struct vm_page *pg; - vsize_t bucketcount; - size_t bucketmemsize, oldbucketmemsize; - int color, i, ocolors; - int lcv; - struct uvm_cpu *ucpu; + struct uvm_page_numa_region *d; + + KASSERT(numa_id < PGFL_MAX_BUCKETS); + + d = kmem_alloc(sizeof(*d), KM_SLEEP); + d->start = start; + d->size = size; + d->numa_id = numa_id; + d->next = uvm_page_numa_region; + uvm_page_numa_region = d; +} + +/* + * uvm_page_numa_lookup: lookup NUMA node for the given page. + */ +static u_int +uvm_page_numa_lookup(struct vm_page *pg) +{ + struct uvm_page_numa_region *d; + paddr_t pa; + + KASSERT(uvm.numa_alloc); + KASSERT(uvm_page_numa_region != NULL); + + pa = VM_PAGE_TO_PHYS(pg); + for (d = uvm_page_numa_region; d != NULL; d = d->next) { + if (pa >= d->start && pa < d->start + d->size) { + return d->numa_id; + } + } + + return 0; +} + +/* + * uvm_page_redim: adjust freelist dimensions if they have changed. + */ +static void +uvm_page_redim(int newncolors, int newnbuckets) +{ + struct pgfreelist npgfl; + struct pgflbucket *opgb, *npgb; + struct pgflist *ohead, *nhead; + struct vm_page *pg; + size_t bucketsize, bucketmemsize, oldbucketmemsize; + int fl, ob, oc, nb, nc, obuckets, ocolors; + char *bucketarray, *oldbucketmem, *bucketmem; + KASSERT(((newncolors - 1) & newncolors) == 0); - if (newncolors <= uvmexp.ncolors) + /* Anything to do? */ + if (newncolors <= uvmexp.ncolors && + newnbuckets == uvm.bucketcount) { return; - + } if (uvm.page_init_done == false) { uvmexp.ncolors = newncolors; return; } - bucketcount = newncolors * VM_NFREELIST; - bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2; - bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP); - cpuarray = bucketarray + bucketcount; + bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]); + bucketsize = roundup2(bucketsize, coherency_unit); + bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST + + coherency_unit - 1; + bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP); + bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit); + + ocolors = uvmexp.ncolors; + obuckets = uvm.bucketcount; - mutex_spin_enter(&uvm_fpageqlock); + /* Freelist cache musn't be enabled. */ + uvm_pgflcache_pause(); /* Make sure we should still do this. 
*/ - if (newncolors <= uvmexp.ncolors) { - mutex_spin_exit(&uvm_fpageqlock); - kmem_free(bucketarray, bucketmemsize); + uvm_pgfl_lock(); + if (newncolors <= uvmexp.ncolors && + newnbuckets == uvm.bucketcount) { + uvm_pgfl_unlock(); + kmem_free(bucketmem, bucketmemsize); return; } - oldbucketarray = uvm.page_free[0].pgfl_buckets; - ocolors = uvmexp.ncolors; - uvmexp.ncolors = newncolors; uvmexp.colormask = uvmexp.ncolors - 1; + uvm.bucketcount = newnbuckets; - ucpu = curcpu()->ci_data.cpu_uvm; - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors)); - pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors)); - uvm_page_init_buckets(&gpgfl); - uvm_page_init_buckets(&pgfl); - for (color = 0; color < ocolors; color++) { - for (i = 0; i < PGFL_NQUEUES; i++) { - while ((pg = LIST_FIRST(&uvm.page_free[ - lcv].pgfl_buckets[color].pgfl_queues[i])) - != NULL) { - LIST_REMOVE(pg, pageq.list); /* global */ - LIST_REMOVE(pg, listq.list); /* cpu */ - LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[ - VM_PGCOLOR(pg)].pgfl_queues[ - i], pg, pageq.list); - LIST_INSERT_HEAD(&pgfl.pgfl_buckets[ - VM_PGCOLOR(pg)].pgfl_queues[ - i], pg, listq.list); + for (fl = 0; fl < VM_NFREELIST; fl++) { + /* Init new buckets in new freelist. */ + memset(&npgfl, 0, sizeof(npgfl)); + for (nb = 0; nb < newnbuckets; nb++) { + npgb = (struct pgflbucket *)bucketarray; + uvm_page_init_bucket(&npgfl, npgb, nb); + bucketarray += bucketsize; + } + /* Now transfer pages from the old freelist. */ + for (nb = ob = 0; ob < obuckets; ob++) { + opgb = uvm.page_free[fl].pgfl_buckets[ob]; + for (oc = 0; oc < ocolors; oc++) { + ohead = &opgb->pgb_colors[oc]; + while ((pg = LIST_FIRST(ohead)) != NULL) { + LIST_REMOVE(pg, pageq.list); + /* + * Here we decide on the NEW color & + * bucket for the page. For NUMA + * we'll use the info that the + * hardware gave us. Otherwise we + * just do a round-robin among the + * buckets. + */ + KASSERT( + uvm_page_get_bucket(pg) == ob); + KASSERT(fl == + uvm_page_get_freelist(pg)); + if (uvm.numa_alloc) { + nb = uvm_page_numa_lookup(pg); + } else if (nb + 1 < newnbuckets) { + nb = nb + 1; + } else { + nb = 0; + } + uvm_page_set_bucket(pg, nb); + npgb = npgfl.pgfl_buckets[nb]; + npgb->pgb_nfree++; + nc = VM_PGCOLOR(pg); + nhead = &npgb->pgb_colors[nc]; + LIST_INSERT_HEAD(nhead, pg, pageq.list); } } } - uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets; - ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets; + /* Install the new freelist. */ + memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl)); } + /* Unlock and free the old memory. */ oldbucketmemsize = recolored_pages_memsize; - + oldbucketmem = recolored_pages_mem; recolored_pages_memsize = bucketmemsize; - mutex_spin_exit(&uvm_fpageqlock); + recolored_pages_mem = bucketmem; + uvm_pgfl_unlock(); if (oldbucketmemsize) { - kmem_free(oldbucketarray, oldbucketmemsize); + kmem_free(oldbucketmem, oldbucketmemsize); } + uvm_pgflcache_resume(); + /* * this calls uvm_km_alloc() which may want to hold - * uvm_fpageqlock. + * uvm_freelist_lock. */ uvm_pager_realloc_emerg(); } /* + * uvm_page_recolor: Recolor the pages if the new color count is + * larger than the old one. + */ + +void +uvm_page_recolor(int newncolors) +{ + + uvm_page_redim(newncolors, uvm.bucketcount); +} + +/* + * uvm_page_rebucket: Determine a bucket structure and redim the free + * lists to match. 
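+ *
+ * For example, with PGFL_MAX_BUCKETS == 8 and 16 CPU packages, the
+ * loop below settles on shift == 2 and 4 buckets, so packages 0-3
+ * share bucket 0, packages 4-7 share bucket 1, and so on ("4
+ * package(s) per bucket").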
+ */ + +void +uvm_page_rebucket(void) +{ + u_int min_numa, max_numa, npackage, shift; + struct cpu_info *ci, *ci2, *ci3; + CPU_INFO_ITERATOR cii; + + /* + * If we have more than one NUMA node, and the maximum NUMA node ID + * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution + * for free pages. uvm_pagefree() will not reassign pages to a + * different bucket on free. + */ + min_numa = (u_int)-1; + max_numa = 0; + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci->ci_numa_id < min_numa) { + min_numa = ci->ci_numa_id; + } + if (ci->ci_numa_id > max_numa) { + max_numa = ci->ci_numa_id; + } + } + if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) { +#ifdef NUMA + /* + * We can do this, and it seems to work well, but until + * further experiments are done we'll stick with the cache + * locality strategy. + */ + aprint_debug("UVM: using NUMA allocation scheme\n"); + for (CPU_INFO_FOREACH(cii, ci)) { + ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id; + } + uvm.numa_alloc = true; + uvm_page_redim(uvmexp.ncolors, max_numa + 1); + return; +#endif + } + + /* + * Otherwise we'll go with a scheme to maximise L2/L3 cache locality + * and minimise lock contention. Count the total number of CPU + * packages, and then try to distribute the buckets among CPU + * packages evenly. uvm_pagefree() will reassign pages to the + * freeing CPU's preferred bucket on free. + */ + npackage = 0; + ci = curcpu(); + ci2 = ci; + do { + npackage++; + ci2 = ci2->ci_sibling[CPUREL_PEER]; + } while (ci2 != ci); + + /* + * Figure out how to arrange the packages & buckets, and the total + * number of buckets we need. XXX 2 may not be the best factor. + */ + for (shift = 0; npackage >= PGFL_MAX_BUCKETS; shift++) { + npackage >>= 1; + } + uvm_page_redim(uvmexp.ncolors, npackage); + + /* + * Now tell each CPU which bucket to use. In the outer loop, scroll + * through all CPU pacakges. + */ + npackage = 0; + ci = curcpu(); + ci2 = ci; + do { + /* + * In the inner loop, scroll through all CPUs in the package + * and assign the same bucket ID. + */ + ci3 = ci2; + do { + ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift; + ci3 = ci3->ci_sibling[CPUREL_PACKAGE]; + } while (ci3 != ci2); + npackage++; + ci2 = ci2->ci_sibling[CPUREL_PEER]; + } while (ci2 != ci); + + aprint_debug("UVM: using package allocation scheme, " + "%d package(s) per bucket\n", 1 << shift); +} + +/* * uvm_cpu_attach: initialize per-CPU data structures. */ void uvm_cpu_attach(struct cpu_info *ci) { - struct pgflbucket *bucketarray; - struct pgfreelist pgfl; struct uvm_cpu *ucpu; - vsize_t bucketcount; - int lcv; - if (CPU_IS_PRIMARY(ci)) { - /* Already done in uvm_page_init(). */ - goto attachrnd; - } - - /* Add more reserve pages for this CPU. */ - uvmexp.reserve_kernel += vm_page_reserve_kernel; - - /* Configure this CPU's free lists. */ - bucketcount = uvmexp.ncolors * VM_NFREELIST; - bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket), - KM_SLEEP); - ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP); - uvm.cpus[cpu_index(ci)] = ucpu; - ci->ci_data.cpu_uvm = ucpu; - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors)); - uvm_page_init_buckets(&pgfl); - ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets; + /* Already done in uvm_page_init(). */ + if (!CPU_IS_PRIMARY(ci)) { + /* Add more reserve pages for this CPU. */ + uvmexp.reserve_kernel += vm_page_reserve_kernel; + + /* Allocate per-CPU data structures. 
*/ + ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1, + KM_SLEEP); + ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu, + coherency_unit); + uvm.cpus[cpu_index(ci)] = ucpu; + ci->ci_data.cpu_uvm = ucpu; } -attachrnd: /* * Attach RNG source for this CPU's VM events */ @@ -742,101 +984,140 @@ attachrnd: ci->ci_data.cpu_name, RND_TYPE_VM, RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE| RND_FLAG_ESTIMATE_VALUE); - } /* - * uvm_free: return total number of free pages in system. + * uvm_free: fetch the total amount of free memory in pages. This can have a + * detrimental effect on performance due to false sharing; don't call unless + * needed. */ int uvm_free(void) { + struct pgfreelist *pgfl; + int fl, b, fpages; - return uvmexp.free; + fpages = 0; + for (fl = 0; fl < VM_NFREELIST; fl++) { + pgfl = &uvm.page_free[fl]; + for (b = 0; b < uvm.bucketcount; b++) { + fpages += pgfl->pgfl_buckets[b]->pgb_nfree; + } + } + return fpages; } /* - * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat + * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a + * specific freelist and specific bucket only. + * + * => must be at IPL_VM or higher to protect per-CPU data structures. */ static struct vm_page * -uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2, - int *trycolorp) +uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags) { - struct pgflist *freeq; + int c, trycolor, colormask; + struct pgflbucket *pgb; struct vm_page *pg; - int color, trycolor = *trycolorp; - struct pgfreelist *gpgfl, *pgfl; + kmutex_t *lock; + + /* + * Skip the bucket if empty, no lock needed. There could be many + * empty freelists/buckets. + */ + pgb = uvm.page_free[f].pgfl_buckets[b]; + if (pgb->pgb_nfree == 0) { + return NULL; + } - KASSERT(mutex_owned(&uvm_fpageqlock)); + /* Skip bucket if low on memory. */ + lock = &uvm_freelist_locks[b].lock; + mutex_spin_enter(lock); + if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) { + if ((flags & UVM_PGA_USERESERVE) == 0 || + (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon && + curlwp != uvm.pagedaemon_lwp)) { + /* Have pagedaemon free some memory. */ + mutex_spin_exit(lock); + uvm_kick_pdaemon(); + return NULL; + } + } - color = trycolor; - pgfl = &ucpu->page_free[flist]; - gpgfl = &uvm.page_free[flist]; + /* Try all page colors as needed. */ + c = trycolor = *trycolorp; + colormask = uvmexp.colormask; do { - /* cpu, try1 */ - if ((pg = LIST_FIRST((freeq = - &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) { - KASSERT(pg->flags & PG_FREE); - KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO)); - KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO)); - KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg)); - VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--; - CPU_COUNT(CPU_COUNT_CPUHIT, 1); - goto gotit; - } - /* global, try1 */ - if ((pg = LIST_FIRST((freeq = - &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) { + pg = LIST_FIRST(&pgb->pgb_colors[c]); + if (__predict_true(pg != NULL)) { + /* + * Got a free page! PG_FREE must be cleared under + * lock because of uvm_pglistalloc(). 
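+ * uvm_pglistalloc() identifies free pages by the PG_FREE flag while
+ * holding every bucket lock (uvm_pgfl_lock()), so the flag may only
+ * change with the owning bucket's lock held.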
+ */ + LIST_REMOVE(pg, pageq.list); KASSERT(pg->flags & PG_FREE); - KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO)); - KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO)); - KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg)); - VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--; - CPU_COUNT(CPU_COUNT_CPUMISS, 1); - goto gotit; - } - /* cpu, try2 */ - if ((pg = LIST_FIRST((freeq = - &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) { - KASSERT(pg->flags & PG_FREE); - KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO)); - KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO)); - KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg)); - VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--; - CPU_COUNT(CPU_COUNT_CPUHIT, 1); - goto gotit; - } - /* global, try2 */ - if ((pg = LIST_FIRST((freeq = - &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) { - KASSERT(pg->flags & PG_FREE); - KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO)); - KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO)); - KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg)); - VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--; - CPU_COUNT(CPU_COUNT_CPUMISS, 1); - goto gotit; + pg->flags &= PG_ZERO; + pgb->pgb_nfree--; + + /* + * While we have the bucket locked and our data + * structures fresh in L1 cache, we have an ideal + * opportunity to grab some pages for the freelist + * cache without causing extra contention. Only do + * so if we found pages in this CPU's preferred + * bucket. + */ + if (__predict_true(b == ucpu->pgflbucket)) { + uvm_pgflcache_fill(ucpu, f, b, c); + } + mutex_spin_exit(lock); + KASSERT(uvm_page_get_bucket(pg) == b); + CPU_COUNT(c == trycolor ? + CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1); + CPU_COUNT(CPU_COUNT_CPUMISS, 1); + *trycolorp = c; + return pg; } - color = (color + 1) & uvmexp.colormask; - } while (color != trycolor); + c = (c + 1) & colormask; + } while (c != trycolor); + mutex_spin_exit(lock); - return (NULL); + return NULL; +} - gotit: - LIST_REMOVE(pg, pageq.list); /* global list */ - LIST_REMOVE(pg, listq.list); /* per-cpu list */ - uvmexp.free--; - - if (color == trycolor) - CPU_COUNT(CPU_COUNT_COLORHIT, 1); - else { - CPU_COUNT(CPU_COUNT_COLORMISS, 1); - *trycolorp = color; +/* + * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates + * any color from any bucket, in a specific freelist. + * + * => must be at IPL_VM or higher to protect per-CPU data structures. + */ + +static struct vm_page * +uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags) +{ + int b, trybucket, bucketcount; + struct vm_page *pg; + + /* Try for the exact thing in the per-CPU cache. */ + if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) { + return pg; } - return (pg); + /* Walk through all buckets, trying our preferred bucket first. */ + trybucket = ucpu->pgflbucket; + b = trybucket; + bucketcount = uvm.bucketcount; + do { + pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags); + if (pg != NULL) { + return pg; + } + b = (b + 1 == bucketcount ? 0 : b + 1); + } while (b != trybucket); + + return NULL; } /* @@ -861,8 +1142,8 @@ struct vm_page * uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, int flags, int strat, int free_list) { - int try1, try2, zeroit = 0, color; - int lcv, error; + int zeroit = 0, color; + int lcv, error, s; struct uvm_cpu *ucpu; struct vm_page *pg; lwp_t *l; @@ -879,21 +1160,15 @@ uvm_pagealloc_strat(struct uvm_object *o * algorithm. 
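+ *
+ * splvm() raises to IPL_VM so that this CPU's uvm_cpu and its
+ * freelist cache can be used without taking a lock;
+ * uvm_pgflcache_alloc() and uvm_pagealloc_pgb() rely on being
+ * entered at IPL_VM or higher.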
*/ + s = splvm(); ucpu = curcpu()->ci_data.cpu_uvm; if (flags & UVM_FLAG_COLORMATCH) { color = atop(off) & uvmexp.colormask; } else { - color = ucpu->page_free_nextcolor; + color = ucpu->pgflcolor; } /* - * check to see if we need to generate some free pages waking - * the pagedaemon. - */ - - uvm_kick_pdaemon(); - - /* * fail if any of these conditions is true: * [1] there really are no free pages, or * [2] only kernel "reserved" pages remain and @@ -903,55 +1178,39 @@ uvm_pagealloc_strat(struct uvm_object *o * we make kernel reserve pages available if called by a * kernel thread or a realtime thread. */ - mutex_spin_enter(&uvm_fpageqlock); l = curlwp; if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) { flags |= UVM_PGA_USERESERVE; } - if ((uvmexp.free <= uvmexp.reserve_kernel && - (flags & UVM_PGA_USERESERVE) == 0) || - (uvmexp.free <= uvmexp.reserve_pagedaemon && - curlwp != uvm.pagedaemon_lwp)) - goto fail; - -#if PGFL_NQUEUES != 2 -#error uvm_pagealloc_strat needs to be updated -#endif - /* - * If we want a zero'd page, try the ZEROS queue first, otherwise - * we try the UNKNOWN queue first. - */ - if (flags & UVM_PGA_ZERO) { - try1 = PGFL_ZEROS; - try2 = PGFL_UNKNOWN; - } else { - try1 = PGFL_UNKNOWN; - try2 = PGFL_ZEROS; + /* If the allocator's running in NUMA mode, go with NUMA strategy. */ + if (uvm.numa_alloc && strat == UVM_PGA_STRAT_NORMAL) { + strat = UVM_PGA_STRAT_NUMA; } again: switch (strat) { case UVM_PGA_STRAT_NORMAL: - /* Check freelists: descending priority (ascending id) order */ + /* Check freelists: descending priority (ascending id) order. */ for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pg = uvm_pagealloc_pgfl(ucpu, lcv, - try1, try2, &color); - if (pg != NULL) + pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags); + if (pg != NULL) { goto gotit; + } } /* No pages free! */ - goto fail; + splx(s); + return NULL; case UVM_PGA_STRAT_ONLY: case UVM_PGA_STRAT_FALLBACK: /* Attempt to allocate from the specified free list. */ KASSERT(free_list >= 0 && free_list < VM_NFREELIST); - pg = uvm_pagealloc_pgfl(ucpu, free_list, - try1, try2, &color); - if (pg != NULL) + pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags); + if (pg != NULL) { goto gotit; + } /* Fall back, if possible. */ if (strat == UVM_PGA_STRAT_FALLBACK) { @@ -960,7 +1219,29 @@ uvm_pagealloc_strat(struct uvm_object *o } /* No pages free! */ - goto fail; + splx(s); + return NULL; + + case UVM_PGA_STRAT_NUMA: + /* + * NUMA strategy: allocating from the correct bucket is more + * important than observing freelist priority. Look only to + * the current NUMA node; if that fails, we need to look to + * other NUMA nodes, so retry with the normal strategy. + */ + for (lcv = 0; lcv < VM_NFREELIST; lcv++) { + pg = uvm_pgflcache_alloc(ucpu, lcv, color); + if (pg != NULL) { + goto gotit; + } + pg = uvm_pagealloc_pgb(ucpu, lcv, + ucpu->pgflbucket, &color, flags); + if (pg != NULL) { + goto gotit; + } + } + strat = UVM_PGA_STRAT_NORMAL; + goto again; default: panic("uvm_pagealloc_strat: bad strat %d", strat); @@ -973,11 +1254,11 @@ uvm_pagealloc_strat(struct uvm_object *o * the next color accordingly. 
*/ - ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask; + ucpu->pgflcolor = (color + 1) & uvmexp.colormask; /* - * update allocation statistics and remember if we have to - * zero the page + * while still at IPL_VM, update allocation statistics and remember + * if we have to zero the page */ if (flags & UVM_PGA_ZERO) { @@ -988,9 +1269,6 @@ uvm_pagealloc_strat(struct uvm_object *o CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1); zeroit = 1; } - if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) { - ucpu->page_idle_zero = vm_page_zero_enable; - } } if (pg->flags & PG_ZERO) { CPU_COUNT(CPU_COUNT_ZEROPAGES, -1); @@ -998,12 +1276,9 @@ uvm_pagealloc_strat(struct uvm_object *o if (anon) { CPU_COUNT(CPU_COUNT_ANONPAGES, 1); } + splx(s); KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0); - /* mark the page as allocated and then drop uvm_fpageqlock. */ - pg->flags &= ~PG_FREE; - mutex_spin_exit(&uvm_fpageqlock); - /* * assign the page to the object. as the page was free, we know * that pg->uobject and pg->uanon are NULL. we only need to take @@ -1050,10 +1325,6 @@ uvm_pagealloc_strat(struct uvm_object *o } return(pg); - - fail: - mutex_spin_exit(&uvm_fpageqlock); - return (NULL); } /* @@ -1133,7 +1404,6 @@ uvm_pagezerocheck(struct vm_page *pg) int *p, *ep; KASSERT(uvm_zerocheckkva != 0); - KASSERT(mutex_owned(&uvm_fpageqlock)); /* * XXX assuming pmap_kenter_pa and pmap_kremove never call @@ -1170,10 +1440,12 @@ uvm_pagezerocheck(struct vm_page *pg) void uvm_pagefree(struct vm_page *pg) { - struct pgflist *pgfl; + struct pgfreelist *pgfl; + struct pgflbucket *pgb; struct uvm_cpu *ucpu; - int index, color, queue; - bool iszero, locked; + kmutex_t *lock; + int bucket, s; + bool locked; #ifdef DEBUG if (pg->uobject == (void *)0xdeadbeef && @@ -1184,7 +1456,6 @@ uvm_pagefree(struct vm_page *pg) KASSERT((pg->flags & PG_PAGEOUT) == 0); KASSERT(!(pg->flags & PG_FREE)); - //KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg)); KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock)); KASSERT(pg->uobject != NULL || pg->uanon == NULL || mutex_owned(pg->uanon->an_lock)); @@ -1285,44 +1556,46 @@ uvm_pagefree(struct vm_page *pg) * and put on free queue */ - iszero = (pg->flags & PG_ZERO); - index = uvm_page_get_freelist(pg); - color = VM_PGCOLOR(pg); - queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN); - #ifdef DEBUG pg->uobject = (void *)0xdeadbeef; pg->uanon = (void *)0xdeadbeef; -#endif - - mutex_spin_enter(&uvm_fpageqlock); - pg->flags = PG_FREE; - -#ifdef DEBUG - if (iszero) + if (pg->flags & PG_ZERO) uvm_pagezerocheck(pg); #endif /* DEBUG */ + s = splvm(); + ucpu = curcpu()->ci_data.cpu_uvm; - /* global list */ - pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue]; - LIST_INSERT_HEAD(pgfl, pg, pageq.list); - uvmexp.free++; - if (iszero) { - CPU_COUNT(CPU_COUNT_ZEROPAGES, 1); + /* + * If we're using the NUMA strategy, we'll only cache this page if + * it came from the local CPU's NUMA node. Otherwise we're using + * the L2/L3 cache locality strategy and we'll cache anything. 
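+ *
+ * In the NUMA case the page keeps the bucket assigned from the
+ * hardware's NUMA information; in the locality case it is re-tagged
+ * with the freeing CPU's preferred bucket via uvm_page_set_bucket().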
+ */ + if (uvm.numa_alloc) { + bucket = uvm_page_get_bucket(pg); + } else { + bucket = ucpu->pgflbucket; + uvm_page_set_bucket(pg, bucket); } - /* per-cpu list */ - ucpu = curcpu()->ci_data.cpu_uvm; - pg->offset = (uintptr_t)ucpu; - pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue]; - LIST_INSERT_HEAD(pgfl, pg, listq.list); - ucpu->pages[queue]++; - if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) { - ucpu->page_idle_zero = vm_page_zero_enable; + /* Try to send the page to the per-CPU cache. */ + if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) { + splx(s); + return; } - mutex_spin_exit(&uvm_fpageqlock); + /* Didn't work. Never mind, send it to a global bucket. */ + pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; + pgb = pgfl->pgfl_buckets[bucket]; + lock = &uvm_freelist_locks[bucket].lock; + + mutex_spin_enter(lock); + /* PG_FREE must be set under lock because of uvm_pglistalloc(). */ + pg->flags = (pg->flags & PG_ZERO) | PG_FREE; + LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list); + pgb->pgb_nfree++; + mutex_spin_exit(lock); + splx(s); } /* @@ -1411,116 +1684,22 @@ uvm_page_own(struct vm_page *pg, const c "page (%p)\n", pg); panic("uvm_page_own"); } - if (!uvmpdpol_pageisqueued_p(pg)) { - KASSERT((pg->uanon == NULL && pg->uobject == NULL) || - pg->wire_count > 0); - } else { - KASSERT(pg->wire_count == 0); - } pg->owner_tag = NULL; } #endif /* * uvm_pageidlezero: zero free pages while the system is idle. - * - * => try to complete one color bucket at a time, to reduce our impact - * on the CPU cache. - * => we loop until we either reach the target or there is a lwp ready - * to run, or MD code detects a reason to break early. */ void uvm_pageidlezero(void) { - struct vm_page *pg; - struct pgfreelist *pgfl, *gpgfl; - struct uvm_cpu *ucpu; - int free_list, firstbucket, nextbucket; - bool lcont = false; - - ucpu = curcpu()->ci_data.cpu_uvm; - if (!ucpu->page_idle_zero || - ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) { - ucpu->page_idle_zero = false; - return; - } - if (!mutex_tryenter(&uvm_fpageqlock)) { - /* Contention: let other CPUs to use the lock. */ - return; - } - firstbucket = ucpu->page_free_nextcolor; - nextbucket = firstbucket; - do { - for (free_list = 0; free_list < VM_NFREELIST; free_list++) { - if (sched_curcpu_runnable_p()) { - goto quit; - } - pgfl = &ucpu->page_free[free_list]; - gpgfl = &uvm.page_free[free_list]; - while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[ - nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) { - if (lcont || sched_curcpu_runnable_p()) { - goto quit; - } - LIST_REMOVE(pg, pageq.list); /* global list */ - LIST_REMOVE(pg, listq.list); /* per-cpu list */ - ucpu->pages[PGFL_UNKNOWN]--; - uvmexp.free--; - KASSERT(pg->flags == PG_FREE); - pg->flags = 0; - mutex_spin_exit(&uvm_fpageqlock); -#ifdef PMAP_PAGEIDLEZERO - if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) { - /* - * The machine-dependent code detected - * some reason for us to abort zeroing - * pages, probably because there is a - * process now ready to run. 
- */ - - mutex_spin_enter(&uvm_fpageqlock); - pg->flags = PG_FREE; - LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[ - nextbucket].pgfl_queues[ - PGFL_UNKNOWN], pg, pageq.list); - LIST_INSERT_HEAD(&pgfl->pgfl_buckets[ - nextbucket].pgfl_queues[ - PGFL_UNKNOWN], pg, listq.list); - ucpu->pages[PGFL_UNKNOWN]++; - uvmexp.free++; - uvmexp.zeroaborts++; - goto quit; - } -#else - pmap_zero_page(VM_PAGE_TO_PHYS(pg)); -#endif /* PMAP_PAGEIDLEZERO */ - if (!mutex_tryenter(&uvm_fpageqlock)) { - lcont = true; - mutex_spin_enter(&uvm_fpageqlock); - } else { - lcont = false; - } - pg->flags = PG_FREE | PG_ZERO; - LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[ - nextbucket].pgfl_queues[PGFL_ZEROS], - pg, pageq.list); - LIST_INSERT_HEAD(&pgfl->pgfl_buckets[ - nextbucket].pgfl_queues[PGFL_ZEROS], - pg, listq.list); - ucpu->pages[PGFL_ZEROS]++; - uvmexp.free++; - CPU_COUNT(CPU_COUNT_ZEROPAGES, 1); - } - } - if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) { - break; - } - nextbucket = (nextbucket + 1) & uvmexp.colormask; - } while (nextbucket != firstbucket); - ucpu->page_idle_zero = false; - quit: - mutex_spin_exit(&uvm_fpageqlock); + /* + * Disabled for the moment. Previous strategy too cache heavy. In + * the future we may experiment with zeroing the pages held in the + * per-CPU cache (uvm_pgflcache). + */ } /* @@ -1585,7 +1764,6 @@ uvm_pageunwire(struct vm_page *pg) KASSERT(uvm_page_locked_p(pg)); KASSERT(pg->wire_count != 0); - KASSERT(!uvmpdpol_pageisqueued_p(pg)); mutex_enter(&pg->interlock); pg->wire_count--; mutex_exit(&pg->interlock); @@ -1611,7 +1789,6 @@ uvm_pagedeactivate(struct vm_page *pg) KASSERT(uvm_page_locked_p(pg)); if (pg->wire_count == 0) { - KASSERT(uvmpdpol_pageisqueued_p(pg)); uvmpdpol_pagedeactivate(pg); } } @@ -1800,6 +1977,7 @@ uvm_page_printit(struct vm_page *pg, boo { struct vm_page *tpg; struct uvm_object *uobj; + struct pgflbucket *pgb; struct pgflist *pgl; char pgbuf[128]; @@ -1848,14 +2026,9 @@ uvm_page_printit(struct vm_page *pg, boo /* cross-verify page queue */ if (pg->flags & PG_FREE) { int fl = uvm_page_get_freelist(pg); - int color = VM_PGCOLOR(pg); - pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[ - ((pg)->flags & PG_ZERO) ? 
PGFL_ZEROS : PGFL_UNKNOWN]; - } else { - pgl = NULL; - } - - if (pgl) { + int b = uvm_page_get_bucket(pg); + pgb = uvm.page_free[fl].pgfl_buckets[b]; + pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)]; (*pr)(" checking pageq list\n"); LIST_FOREACH(tpg, pgl, pageq.list) { if (tpg == pg) { @@ -1905,4 +2078,36 @@ uvm_page_printall(void (*pr)(const char } } +/* + * uvm_page_print_freelists - print a summary freelists + */ + +void +uvm_page_print_freelists(void (*pr)(const char *, ...)) +{ + struct pgfreelist *pgfl; + struct pgflbucket *pgb; + int fl, b, c; + + (*pr)("There are %d freelists with %d buckets of %d colors.\n\n", + VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors); + + for (fl = 0; fl < VM_NFREELIST; fl++) { + pgfl = &uvm.page_free[fl]; + (*pr)("freelist(%d) @ %p\n", fl, pgfl); + for (b = 0; b < uvm.bucketcount; b++) { + pgb = uvm.page_free[fl].pgfl_buckets[b]; + (*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n", + b, pgb, pgb->pgb_nfree, + &uvm_freelist_locks[b].lock); + for (c = 0; c < uvmexp.ncolors; c++) { + (*pr)(" color(%d) @ %p, ", c, + &pgb->pgb_colors[c]); + (*pr)("first page = %p\n", + LIST_FIRST(&pgb->pgb_colors[c])); + } + } + } +} + #endif /* DDB || DEBUGPRINT */ Index: uvm/uvm_page.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_page.h,v retrieving revision 1.88 diff -u -p -r1.88 uvm_page.h --- uvm/uvm_page.h 21 Dec 2019 14:41:44 -0000 1.88 +++ uvm/uvm_page.h 22 Dec 2019 19:48:19 -0000 @@ -119,7 +119,6 @@ * * o free * => pageq.list is entry on global free page queue - * => listq.list is entry on per-CPU free page queue * => uanon is unused (or (void *)0xdeadbeef for DEBUG) * => uobject is unused (or (void *)0xdeadbeef for DEBUG) * => PG_FREE is set in flags @@ -129,13 +128,11 @@ * => uobject is owner * o owned by a vm_anon * => pageq is unused (XXX correct?) - * => listq is unused (XXX correct?) * => uanon is owner * => uobject is NULL * => PG_ANON is set in flags * o allocated by uvm_pglistalloc * => pageq.queue is entry on resulting pglist, owned by caller - * => listq is unused (XXX correct?) * => uanon is unused * => uobject is unused * @@ -153,11 +150,6 @@ struct vm_page { * or uvm_pglistalloc output */ LIST_ENTRY(vm_page) list; /* f: global free page queue */ } pageq; - - union { - LIST_ENTRY(vm_page) list; /* f: CPU free page queue */ - } listq; - struct vm_anon *uanon; /* o,i: anon */ struct uvm_object *uobject; /* o,i: object */ voff_t offset; /* o: offset into object */ @@ -302,6 +295,7 @@ void uvm_page_own(struct vm_page *, cons bool uvm_page_physget(paddr_t *); #endif void uvm_page_recolor(int); +void uvm_page_rebucket(void); void uvm_pageidlezero(void); void uvm_pageactivate(struct vm_page *); @@ -318,6 +312,8 @@ void uvm_pagewire(struct vm_page *); void uvm_pagezero(struct vm_page *); bool uvm_pageismanaged(paddr_t); bool uvm_page_locked_p(struct vm_page *); +void uvm_pgfl_lock(void); +void uvm_pgfl_unlock(void); int uvm_page_lookup_freelist(struct vm_page *); @@ -348,8 +344,12 @@ int uvm_direct_process(struct vm_page ** #define VM_PGCOLOR(pg) \ (atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask) #define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa) + +/* + * VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a + * per-CPU cache. If you need to be certain, pause caching. 
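+ * (uvm_pglistalloc() does exactly that: it calls uvm_pgflcache_pause()
+ * so that every free page is back on the global buckets, and calls
+ * uvm_pgflcache_resume() when it is done.)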
+ */ #define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE) -#define VM_FREE_PAGE_TO_CPU(pg) ((struct uvm_cpu *)((uintptr_t)pg->offset)) /* * Use the lower 10 bits of pg->phys_addr to cache some some locators for Index: uvm/uvm_pglist.c =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pglist.c,v retrieving revision 1.77 diff -u -p -r1.77 uvm_pglist.c --- uvm/uvm_pglist.c 21 Dec 2019 14:50:34 -0000 1.77 +++ uvm/uvm_pglist.c 22 Dec 2019 19:48:19 -0000 @@ -1,12 +1,12 @@ /* $NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $ */ /*- - * Copyright (c) 1997 The NetBSD Foundation, Inc. + * Copyright (c) 1997, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, - * NASA Ames Research Center. + * NASA Ames Research Center, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -42,6 +42,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c #include #include +#include #ifdef VM_PAGE_ALLOC_MEMORY_STATS #define STAT_INCR(v) (v)++ @@ -79,34 +80,25 @@ u_long uvm_pglistalloc_npages; static void uvm_pglist_add(struct vm_page *pg, struct pglist *rlist) { - int free_list __unused, color __unused, pgflidx; + struct pgfreelist *pgfl; + struct pgflbucket *pgb; - KASSERT(mutex_owned(&uvm_fpageqlock)); + pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; + pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)]; -#if PGFL_NQUEUES != 2 -#error uvm_pglistalloc needs to be updated -#endif - - free_list = uvm_page_get_freelist(pg); - color = VM_PGCOLOR(pg); - pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN; #ifdef UVMDEBUG struct vm_page *tp; - LIST_FOREACH(tp, - &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx], - pageq.list) { + LIST_FOREACH(tp, &pgb->pgb_colors[VM_PGCOLOR(pg)], pageq.list) { if (tp == pg) break; } if (tp == NULL) panic("uvm_pglistalloc: page not on freelist"); #endif - LIST_REMOVE(pg, pageq.list); /* global */ - LIST_REMOVE(pg, listq.list); /* cpu */ - uvmexp.free--; + LIST_REMOVE(pg, pageq.list); + pgb->pgb_nfree--; if (pg->flags & PG_ZERO) CPU_COUNT(CPU_COUNT_ZEROPAGES, -1); - VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--; pg->flags = PG_CLEAN; pg->uobject = NULL; pg->uanon = NULL; @@ -129,8 +121,6 @@ uvm_pglistalloc_c_ps(uvm_physseg_t psi, printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem); #endif - KASSERT(mutex_owned(&uvm_fpageqlock)); - low = atop(low); high = atop(high); alignment = atop(alignment); @@ -316,7 +306,7 @@ uvm_pglistalloc_contig(int num, paddr_t /* * Block all memory allocation and lock the free list. */ - mutex_spin_enter(&uvm_fpageqlock); + uvm_pgfl_lock(); /* Are there even any free pages? */ if (uvm_free() <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel)) @@ -352,7 +342,7 @@ out: * the pagedaemon. 
*/ - mutex_spin_exit(&uvm_fpageqlock); + uvm_pgfl_unlock(); uvm_kick_pdaemon(); return (error); } @@ -368,7 +358,6 @@ uvm_pglistalloc_s_ps(uvm_physseg_t psi, printf("pgalloc: simple %d pgs from psi %zd\n", num, psi); #endif - KASSERT(mutex_owned(&uvm_fpageqlock)); KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi)); KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi)); KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi)); @@ -461,7 +450,7 @@ again: /* * Block all memory allocation and lock the free list. */ - mutex_spin_enter(&uvm_fpageqlock); + uvm_pgfl_lock(); count++; /* Are there even any free pages? */ @@ -493,7 +482,7 @@ out: * the pagedaemon. */ - mutex_spin_exit(&uvm_fpageqlock); + uvm_pgfl_unlock(); uvm_kick_pdaemon(); if (error) { @@ -539,6 +528,12 @@ uvm_pglistalloc(psize_t size, paddr_t lo TAILQ_INIT(rlist); + /* + * Turn off the caching of free pages - we need everything to be on + * the global freelists. + */ + uvm_pgflcache_pause(); + if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) || (boundary != 0)) res = uvm_pglistalloc_contig(num, low, high, alignment, @@ -546,6 +541,8 @@ uvm_pglistalloc(psize_t size, paddr_t lo else res = uvm_pglistalloc_simple(num, low, high, rlist, waitok); + uvm_pgflcache_resume(); + return (res); } @@ -558,45 +555,34 @@ uvm_pglistalloc(psize_t size, paddr_t lo void uvm_pglistfree(struct pglist *list) { - struct uvm_cpu *ucpu; + struct pgfreelist *pgfl; + struct pgflbucket *pgb; struct vm_page *pg; - int index, color, queue; - bool iszero; + int c, b; /* * Lock the free list and free each page. */ - mutex_spin_enter(&uvm_fpageqlock); - ucpu = curcpu()->ci_data.cpu_uvm; + uvm_pgfl_lock(); while ((pg = TAILQ_FIRST(list)) != NULL) { - KASSERT(!uvmpdpol_pageisqueued_p(pg)); TAILQ_REMOVE(list, pg, pageq.queue); - iszero = (pg->flags & PG_ZERO); pg->flags = (pg->flags & PG_ZERO) | PG_FREE; #ifdef DEBUG pg->uobject = (void *)0xdeadbeef; pg->uanon = (void *)0xdeadbeef; -#endif /* DEBUG */ -#ifdef DEBUG - if (iszero) + if (pg->flags & PG_ZERO) uvm_pagezerocheck(pg); #endif /* DEBUG */ - index = uvm_page_get_freelist(pg); - color = VM_PGCOLOR(pg); - queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN; - pg->offset = (uintptr_t)ucpu; - LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color]. - pgfl_queues[queue], pg, pageq.list); - LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color]. - pgfl_queues[queue], pg, listq.list); - uvmexp.free++; - if (iszero) + c = VM_PGCOLOR(pg); + b = uvm_page_get_bucket(pg); + pgfl = &uvm.page_free[uvm_page_get_freelist(pg)]; + pgb = pgfl->pgfl_buckets[b]; + if (pg->flags & PG_ZERO) CPU_COUNT(CPU_COUNT_ZEROPAGES, 1); - ucpu->pages[queue]++; + pgb->pgb_nfree++; + LIST_INSERT_HEAD(&pgb->pgb_colors[c], pg, pageq.list); STAT_DECR(uvm_pglistalloc_npages); } - if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) - ucpu->page_idle_zero = vm_page_zero_enable; - mutex_spin_exit(&uvm_fpageqlock); + uvm_pgfl_unlock(); } Index: uvm/uvm_pglist.h =================================================================== RCS file: /cvsroot/src/sys/uvm/uvm_pglist.h,v retrieving revision 1.8 diff -u -p -r1.8 uvm_pglist.h --- uvm/uvm_pglist.h 6 Nov 2010 15:48:00 -0000 1.8 +++ uvm/uvm_pglist.h 22 Dec 2019 19:48:19 -0000 @@ -1,11 +1,11 @@ /* $NetBSD: uvm_pglist.h,v 1.8 2010/11/06 15:48:00 uebayasi Exp $ */ /*- - * Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2000, 2001, 2008, 2019 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation - * by Jason R. Thorpe. + * by Jason R. Thorpe, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,19 +41,51 @@ TAILQ_HEAD(pglist, vm_page); LIST_HEAD(pgflist, vm_page); /* - * A page free list consists of free pages of unknown contents and free - * pages of all zeros. + * The global uvm.page_free list (uvm_page.c, uvm_pglist.c). Free pages are + * stored according to freelist, bucket, and cache colour. + * + * pglist = &uvm.page_free[freelist].pgfl_buckets[bucket].pgb_color[color]; + * + * Freelists provide a priority ordering of pages for allocation, based upon + * how valuable they are for special uses (e.g. device driver DMA). + * + * Pages are then grouped in buckets according to some common factor, for + * example L2/L3 cache locality. Each bucket has its own lock, and the + * locks are shared among freelists for the same numbered buckets. + * + * Inside each bucket, pages are further distributed by cache color. + * + * We want these data structures to occupy as few cache lines as possible, + * as they will be highly contended. */ -#define PGFL_UNKNOWN 0 -#define PGFL_ZEROS 1 -#define PGFL_NQUEUES 2 - struct pgflbucket { - struct pgflist pgfl_queues[PGFL_NQUEUES]; + uintptr_t pgb_nfree; /* total # free pages, all colors */ + struct pgflist pgb_colors[1]; /* variable size array */ }; +/* + * At the root, the freelists. MD code decides the number and structure of + * these. They are always arranged in descending order of allocation + * priority. + * + * 8 buckets should be enough to cover most all current x86 systems (2019), + * given the way package/core/smt IDs are structured on x86. For systems + * that report high package counts despite having a single physical CPU + * package (e.g. Ampere eMAG) a little bit of sharing isn't going to hurt + * in the least. + */ +#define PGFL_MAX_BUCKETS 8 struct pgfreelist { - struct pgflbucket *pgfl_buckets; + struct pgflbucket *pgfl_buckets[PGFL_MAX_BUCKETS]; +}; + +/* + * Lock for each bucket. + */ +union uvm_freelist_lock { + kmutex_t lock; + uint8_t padding[COHERENCY_UNIT]; }; +extern union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]; #endif /* _UVM_UVM_PGLIST_H_ */ --- /dev/null 2019-12-22 19:45:34.503176590 +0000 +++ uvm/uvm_pgflcache.h 2019-12-13 11:24:49.000000000 +0000 @@ -0,0 +1,43 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2019 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if !defined(_UVM_PGFLCACHE_H_) +#define _UVM_PGFLCACHE_H_ + +struct vm_page *uvm_pgflcache_alloc(struct uvm_cpu *, int, int); +void uvm_pgflcache_fill(struct uvm_cpu *, int, int, int); +bool uvm_pgflcache_free(struct uvm_cpu *, struct vm_page *); +void uvm_pgflcache_init(void); +void uvm_pgflcache_pause(void); +void uvm_pgflcache_resume(void); +void uvm_pgflcache_start(void); + +#endif /* !_UVM_PGFLCACHE_H_ */ --- /dev/null 2019-12-22 19:45:34.503176590 +0000 +++ uvm/uvm_pgflcache.c 2019-12-22 15:58:33.966575290 +0000 @@ -0,0 +1,474 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2019 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * uvm_pgflcache.c: page freelist cache. + * + * This implements a tiny per-CPU cache of pages that sits between the main + * page allocator and the freelists. By allocating and freeing pages in + * batch, it reduces freelist contention by an order of magnitude. + * + * The cache can be paused & resumed at runtime so that UVM_HOTPLUG, + * uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the + * world. On system with one CPU per physical package (e.g. a uniprocessor) + * the cache is not enabled. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#include "opt_uvm.h" +#include "opt_multiprocessor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* There is no point doing any of this on a uniprocessor. */ +#ifdef MULTIPROCESSOR + +/* + * MAXPGS - maximum pages per color, per bucket. 
+ * FILLPGS - number of pages to allocate at once, per color, per bucket. + * + * Why the chosen values: + * + * (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache + * colors. We make the assumption that most of the time allocation activity + * will be centered around one UVM freelist, so most of the time there will + * be no more than 224kB worth of cached pages per-CPU. That's tiny, but + * enough to hugely reduce contention on the freelist locks, and give us a + * small pool of pages which if we're very lucky may have some L1/L2 cache + * locality, and do so without subtracting too much from the L2/L3 cache + * benefits of having per-package free lists in the page allocator. + * + * (2) With the chosen values on _LP64, the data structure for each color + * takes up a single cache (64 bytes) giving this very low overhead even + * in the "miss" case. + * + * (3) We don't want to cause too much pressure by hiding away memory that + * could otherwise be put to good use. + */ +#define MAXPGS 7 +#define FILLPGS 6 + +/* Variable size, according to # colors. */ +struct pgflcache { + struct pccolor { + intptr_t count; + struct vm_page *pages[MAXPGS]; + } color[1]; +}; + +static kmutex_t uvm_pgflcache_lock; +static kcondvar_t uvm_pgflcache_cv; +static int uvm_pgflcache_sem; +static bool uvm_pgflcache_draining; + +/* + * uvm_pgflcache_fill: fill specified freelist/color from global list + * + * => must be called at IPL_VM + * => must be called with given bucket lock held + * => must only fill from the correct bucket for this CPU + */ + +void +uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c) +{ + struct pgflbucket *pgb; + struct pgflcache *pc; + struct pccolor *pcc; + struct pgflist *head; + struct vm_page *pg; + int count; + + KASSERT(mutex_owned(&uvm_freelist_locks[b].lock)); + KASSERT(ucpu->pgflbucket == b); + + /* If caching is off, then bail out. */ + if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { + return; + } + + /* Fill only to the limit. */ + pcc = &pc->color[c]; + pgb = uvm.page_free[fl].pgfl_buckets[b]; + head = &pgb->pgb_colors[c]; + if (pcc->count >= FILLPGS) { + return; + } + + /* Pull pages from the bucket until it's empty, or we are full. */ + count = pcc->count; + pg = LIST_FIRST(head); + while (__predict_true(pg != NULL && count < FILLPGS)) { + KASSERT(pg->flags & PG_FREE); + KASSERT(uvm_page_get_bucket(pg) == b); + pcc->pages[count++] = pg; + pg = LIST_NEXT(pg, pageq.list); + } + + /* Violate LIST abstraction to remove all pages at once. 
*/ + head->lh_first = pg; + if (__predict_true(pg != NULL)) { + pg->pageq.list.le_prev = &head->lh_first; + } + pgb->pgb_nfree -= (count - pcc->count); + pcc->count = count; +} + +/* + * uvm_pgflcache_spill: spill specified freelist/color to global list + * + * => must be called at IPL_VM + * => mark __noinline so we don't pull it into uvm_pgflcache_free() + */ + +static void __noinline +uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c) +{ + struct pgflbucket *pgb; + struct pgfreelist *pgfl; + struct pgflcache *pc; + struct pccolor *pcc; + struct pgflist *head; + kmutex_t *lock; + int b, adj; + + pc = ucpu->pgflcache[fl]; + pcc = &pc->color[c]; + pgfl = &uvm.page_free[fl]; + b = ucpu->pgflbucket; + pgb = pgfl->pgfl_buckets[b]; + head = &pgb->pgb_colors[c]; + lock = &uvm_freelist_locks[b].lock; + + mutex_spin_enter(lock); + for (adj = pcc->count; pcc->count != 0;) { + pcc->count--; + KASSERT(pcc->pages[pcc->count] != NULL); + KASSERT(pcc->pages[pcc->count]->flags & PG_FREE); + LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list); + } + pgb->pgb_nfree += adj; + mutex_spin_exit(lock); +} + +/* + * uvm_pgflcache_alloc: try to allocate a cached page. + * + * => must be called at IPL_VM + * => allocate only from the given freelist and given page color + */ + +struct vm_page * +uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c) +{ + struct pgflcache *pc; + struct pccolor *pcc; + struct vm_page *pg; + + /* If caching is off, then bail out. */ + if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { + return NULL; + } + + /* Very simple: if we have a page then return it. */ + pcc = &pc->color[c]; + if (__predict_false(pcc->count == 0)) { + return NULL; + } + pg = pcc->pages[--(pcc->count)]; + KASSERT(pg != NULL); + KASSERT(pg->flags & PG_FREE); + KASSERT(uvm_page_get_freelist(pg) == fl); + KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket); + pg->flags &= PG_ZERO; + CPU_COUNT(CPU_COUNT_CPUHIT, 1); + CPU_COUNT(CPU_COUNT_COLORHIT, 1); + return pg; +} + +/* + * uvm_pgflcache_free: cache a page, if possible. + * + * => must be called at IPL_VM + * => must only send pages for the correct bucket for this CPU + */ + +bool +uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg) +{ + struct pgflcache *pc; + struct pccolor *pcc; + int fl, c; + + KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket); + + /* If caching is off, then bail out. */ + fl = uvm_page_get_freelist(pg); + if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) { + return false; + } + + /* If the array is full spill it first, then add page to array. */ + c = VM_PGCOLOR(pg); + pcc = &pc->color[c]; + KASSERT((pg->flags & PG_FREE) == 0); + if (__predict_false(pcc->count == MAXPGS)) { + uvm_pgflcache_spill(ucpu, fl, c); + } + pg->flags = (pg->flags & PG_ZERO) | PG_FREE; + pcc->pages[pcc->count] = pg; + pcc->count++; + return true; +} + +/* + * uvm_pgflcache_init: allocate and initialize per-CPU data structures for + * the free page cache. Don't set anything in motion - that's taken care + * of by uvm_pgflcache_resume(). 
+ */ + +static void +uvm_pgflcache_init_cpu(struct cpu_info *ci) +{ + struct uvm_cpu *ucpu; + size_t sz; + + ucpu = ci->ci_data.cpu_uvm; + KASSERT(ucpu->pgflcachemem == NULL); + KASSERT(ucpu->pgflcache[0] == NULL); + + sz = offsetof(struct pgflcache, color[uvmexp.ncolors]); + ucpu->pgflcachememsz = + (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1); + ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP); +} + +/* + * uvm_pgflcache_fini_cpu: dump all cached pages back to global free list + * and shut down caching on the CPU. Called on each CPU in the system via + * xcall. + */ + +static void +uvm_pgflcache_fini_cpu(void) +{ + struct uvm_cpu *ucpu; + int fl, color, s; + + ucpu = curcpu()->ci_data.cpu_uvm; + for (fl = 0; fl < VM_NFREELIST; fl++) { + s = splvm(); + for (color = 0; color < uvmexp.ncolors; color++) { + uvm_pgflcache_spill(ucpu, fl, color); + } + ucpu->pgflcache[fl] = NULL; + splx(s); + } +} + +/* + * uvm_pgflcache_pause: pause operation of the caches + */ + +void +uvm_pgflcache_pause(void) +{ + uint64_t where; + + /* First one in starts draining. Everyone else waits. */ + mutex_enter(&uvm_pgflcache_lock); + if (uvm_pgflcache_sem++ == 0) { + uvm_pgflcache_draining = true; + mutex_exit(&uvm_pgflcache_lock); + where = xc_broadcast(0, (xcfunc_t)uvm_pgflcache_fini_cpu, + NULL, NULL); + xc_wait(where); + mutex_enter(&uvm_pgflcache_lock); + uvm_pgflcache_draining = false; + cv_broadcast(&uvm_pgflcache_cv); + } else { + while (uvm_pgflcache_draining) { + cv_wait(&uvm_pgflcache_cv, &uvm_pgflcache_lock); + } + } + mutex_exit(&uvm_pgflcache_lock); +} + +/* + * uvm_pgflcache_resume: resume operation of the caches + */ + +void +uvm_pgflcache_resume(void) +{ + CPU_INFO_ITERATOR cii; + struct cpu_info *ci; + struct uvm_cpu *ucpu; + uintptr_t addr; + size_t sz; + int fl; + + /* Last guy out takes care of business. */ + mutex_enter(&uvm_pgflcache_lock); + KASSERT(!uvm_pgflcache_draining); + KASSERT(uvm_pgflcache_sem > 0); + if (uvm_pgflcache_sem-- > 1) { + mutex_exit(&uvm_pgflcache_lock); + return; + } + + /* + * Make sure dependant data structure updates are remotely visible. + * Essentially this functions as a global memory barrier. + */ + xc_barrier(XC_HIGHPRI); + + /* + * Then set all of the pointers in place on each CPU. As soon as + * each pointer is set, caching is operational in that dimension. + */ + sz = offsetof(struct pgflcache, color[uvmexp.ncolors]); + for (CPU_INFO_FOREACH(cii, ci)) { + ucpu = ci->ci_data.cpu_uvm; + addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit); + for (fl = 0; fl < VM_NFREELIST; fl++) { + ucpu->pgflcache[fl] = (struct pgflcache *)addr; + addr += sz; + } + } + mutex_exit(&uvm_pgflcache_lock); +} + +/* + * uvm_pgflcache_start: start operation of the cache. + * + * => called once only, when init(8) is about to be started + */ + +void +uvm_pgflcache_start(void) +{ + CPU_INFO_ITERATOR cii; + struct cpu_info *ci; + + KASSERT(uvm_pgflcache_sem > 0); + + /* + * There's not much point doing this if every CPU has its own + * bucket (and that includes the uniprocessor case). + */ + if (ncpu == uvm.bucketcount) { + return; + } + + /* Create each CPU's buckets. */ + for (CPU_INFO_FOREACH(cii, ci)) { + uvm_pgflcache_init_cpu(ci); + } + + /* Kick it into action. */ + uvm_pgflcache_resume(); +} + +/* + * uvm_pgflcache_init: set up data structures for the free page cache. 
+ */ + +void +uvm_pgflcache_init(void) +{ + + uvm_pgflcache_sem = 1; + mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&uvm_pgflcache_cv, "flcache"); +} + +#else /* MULTIPROCESSOR */ + +struct vm_page * +uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c) +{ + + return NULL; +} + +bool +uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg) +{ + + return false; +} + +void +uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c) +{ + +} + +void +uvm_pgflcache_pause(void) +{ + +} + +void +uvm_pgflcache_resume(void) +{ + +} + +void +uvm_pgflcache_start(void) +{ + +} + +void +uvm_pgflcache_init(void) +{ + +} + +#endif /* MULTIPROCESSOR */
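The uvm_pgflcache.h header only declares the cache entry points; the consumer is the uvm_page.c part of this patch series, which is not included here.  The sketch below shows how the calls are intended to compose on the allocation and free hot paths.  example_alloc() and example_free() are names invented for illustration, the caller is assumed to be at IPL_VM, and the fall-back paths through the locked global buckets are elided.

/* Sketch only: intended composition of the pgflcache entry points. */
static struct vm_page *
example_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
        kmutex_t *lock = &uvm_freelist_locks[ucpu->pgflbucket].lock;
        struct vm_page *pg;

        /* Fast path: per-CPU cache, no lock taken. */
        pg = uvm_pgflcache_alloc(ucpu, fl, c);
        if (__predict_true(pg != NULL))
                return pg;

        /* Miss: lock our own bucket, refill the cache, and retry. */
        mutex_spin_enter(lock);
        uvm_pgflcache_fill(ucpu, fl, ucpu->pgflbucket, c);
        mutex_spin_exit(lock);

        /*
         * Still NULL if the bucket had no page of this colour; the real
         * allocator then falls back to other colours/buckets/freelists.
         */
        return uvm_pgflcache_alloc(ucpu, fl, c);
}

static void
example_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{

        /* Only pages belonging to this CPU's bucket may be cached. */
        if (uvm_page_get_bucket(pg) == ucpu->pgflbucket &&
            uvm_pgflcache_free(ucpu, pg))
                return;
        /* Otherwise free to the locked global bucket (not shown). */
}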
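The "violate LIST abstraction" step in uvm_pgflcache_fill() is the one non-obvious piece of list surgery in the new file: once the loop has copied the first batch of pages into the per-CPU array, the remainder of the colour list is re-attached in O(1) rather than by one LIST_REMOVE() per page.  Annotated excerpt (comments added here for illustration only):

        head->lh_first = pg;                    /* pg = first page NOT taken */
        if (__predict_true(pg != NULL)) {
                pg->pageq.list.le_prev = &head->lh_first; /* fix back pointer */
        }
        pgb->pgb_nfree -= (count - pcc->count); /* pages moved into the cache */
        pcc->count = count;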
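For the record, the arithmetic behind the MAXPGS/FILLPGS commentary and the allocation in uvm_pgflcache_init_cpu(), under the assumptions stated in that comment (_LP64, 4kB pages, 8 L2 cache colours):

/*
 * sizeof(struct pccolor)  = sizeof(intptr_t) + MAXPGS * sizeof(void *)
 *                         = 8 + 7 * 8 = 64 bytes, i.e. one cache line
 * per-freelist cache size = offsetof(struct pgflcache, color[uvmexp.ncolors])
 *                         = 8 * 64 = 512 bytes
 * worst-case hidden pages = uvmexp.ncolors * MAXPGS * PAGE_SIZE
 *                         = 8 * 7 * 4096 = 224kB per CPU
 * pgflcachememsz          = roundup2(512 * VM_NFREELIST, coherency_unit)
 *                           + coherency_unit - 1
 */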