Index: sys/arch/amd64/amd64/mainbus.c =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/amd64/mainbus.c,v retrieving revision 1.26 diff -u -p -r1.26 mainbus.c --- sys/arch/amd64/amd64/mainbus.c 10 Nov 2008 14:36:59 -0000 1.26 +++ sys/arch/amd64/amd64/mainbus.c 15 Dec 2008 23:03:22 -0000 @@ -54,11 +54,13 @@ __KERNEL_RCSID(0, "$NetBSD: mainbus.c,v #include "opt_mpbios.h" #include "opt_pcifixup.h" -#include +#include #include #include #include +#include + #if NACPI > 0 #include #endif @@ -92,7 +94,6 @@ union mainbus_attach_args { const char *mba_busname; /* first elem of all */ struct pcibus_attach_args mba_pba; struct isabus_attach_args mba_iba; - struct cpu_attach_args mba_caa; #if NACPI > 0 struct acpibus_attach_args mba_acpi; #endif @@ -133,6 +134,8 @@ int mp_verbose = 0; #endif #endif +int acpi_present = 0; +uint32_t numa_bootflags = 0; /* * Probe for the mainbus; always succeeds. @@ -150,17 +153,15 @@ mainbus_match(device_t parent, cfdata_t void mainbus_attach(device_t parent, device_t self, void *aux) { + struct numa_info *node = NULL; #if NPCI > 0 union mainbus_attach_args mba; #endif -#if NACPI > 0 - int acpi_present = 0; -#endif #ifdef MPBIOS int mpbios_present = 0; #endif + bool numa_faketopology = false; int mpacpi_active = 0; - int numcpus = 0; #if defined(PCI_BUS_FIXUP) int pci_maxbus = 0; #endif @@ -194,33 +195,44 @@ mainbus_attach(device_t parent, device_t #if NACPI > 0 if ((boothowto & RB_MD2) == 0 && acpi_check(self, "acpibus")) acpi_present = acpi_probe(); +#endif + + numa_bootflags = mi_numa_init(); + + if (numa_bootflags & NUMAF_FAKETOPOLOGY) + numa_faketopology = true; + + if (numa_faketopology) { + aprint_normal_dev(self, "fake a one node NUMA system\n"); + + /* Fake a one node NUMA system */ + node = numanode_getbyid_alloc(0, true); + if (!node) + panic("NUMA: can't allocate memory for NUMA node\n"); + } + +#if NACPI > 0 /* * First, see if the MADT contains CPUs, and possibly I/O APICs. 
* Building the interrupt routing structures can only * be done later (via a callback). */ if (acpi_present) - mpacpi_active = mpacpi_scan_apics(self, &numcpus); + mpacpi_active = mpacpi_scan_apics(self); #endif if (!mpacpi_active) { #ifdef MPBIOS if (mpbios_present) - mpbios_scan(self, &numcpus); - else + mpbios_scan(self); #endif - if (numcpus == 0) { - struct cpu_attach_args caa; - - memset(&caa, 0, sizeof(caa)); - caa.cpu_number = 0; - caa.cpu_role = CPU_ROLE_SP; - caa.cpu_func = 0; - - config_found_ia(self, "cpubus", &caa, mainbus_print); - } } + if (numa_faketopology) + numa_fakememory(node); + + md_numa_init(self); + #if NISADMA > 0 && NACPI > 0 /* * ACPI needs ISA DMA initialized before they start probing. Index: sys/arch/amd64/conf/GENERIC =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/conf/GENERIC,v retrieving revision 1.235 diff -u -p -r1.235 GENERIC --- sys/arch/amd64/conf/GENERIC 14 Dec 2008 00:45:03 -0000 1.235 +++ sys/arch/amd64/conf/GENERIC 15 Dec 2008 23:03:23 -0000 @@ -87,9 +87,12 @@ options POWERNOW_K8 #options BUFQ_PRIOCSCAN # Diagnostic/debugging support options -#options DIAGNOSTIC # expensive kernel consistency checks -#options DEBUG # expensive debugging checks/support -#options KMEMSTATS # kernel memory statistics (vmstat -m) +options DIAGNOSTIC # expensive kernel consistency checks +options DEBUG # expensive debugging checks/support +options LOCKDEBUG +options KMEMSTATS # kernel memory statistics (vmstat -m) +options MALLOC_DEBUG +options UVMHIST # # Because gcc omits the frame pointer for any -O level, the line below @@ -156,6 +159,7 @@ file-system TMPFS # Efficient memory f # File system options options QUOTA # UFS quotas #options FFS_EI # FFS Endian Independent support +options SOFTDEP # FFS soft updates support. options WAPBL # File system journaling support - Experimental # Note that UFS_DIRHASH is suspected of causing kernel memory corruption. 
# It is not recommended for general use. Index: sys/arch/amd64/conf/XEN3_DOM0 =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/conf/XEN3_DOM0,v retrieving revision 1.35 diff -u -p -r1.35 XEN3_DOM0 --- sys/arch/amd64/conf/XEN3_DOM0 24 Nov 2008 11:41:07 -0000 1.35 +++ sys/arch/amd64/conf/XEN3_DOM0 15 Dec 2008 23:03:23 -0000 @@ -176,7 +176,8 @@ config netbsd root on ? type ? mainbus0 at root -cpu* at mainbus? +numa* at mainbus? +cpu* at numa? # IPMI support ipmi0 at mainbus? Index: sys/arch/amd64/conf/files.amd64 =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/conf/files.amd64,v retrieving revision 1.61 diff -u -p -r1.61 files.amd64 --- sys/arch/amd64/conf/files.amd64 19 Nov 2008 18:35:58 -0000 1.61 +++ sys/arch/amd64/conf/files.amd64 15 Dec 2008 23:03:23 -0000 @@ -88,7 +88,7 @@ file arch/amd64/amd64/bios32.c bios32 n # # XXX BIOS32 only if something that uses it is configured! -device mainbus: isabus, pcibus, bios32, acpibus, cpubus, ioapicbus, ipmibus +device mainbus: isabus, pcibus, bios32, acpibus, ioapicbus, ipmibus, numabus attach mainbus at root file arch/amd64/amd64/mainbus.c mainbus Index: sys/arch/amd64/conf/std.amd64 =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/conf/std.amd64,v retrieving revision 1.7 diff -u -p -r1.7 std.amd64 --- sys/arch/amd64/conf/std.amd64 11 Dec 2008 05:42:18 -0000 1.7 +++ sys/arch/amd64/conf/std.amd64 15 Dec 2008 23:03:23 -0000 @@ -12,7 +12,8 @@ options MTRR options MULTIPROCESSOR mainbus0 at root -cpu* at mainbus? +numa* at mainbus? +cpu* at numa? ioapic* at mainbus? apid ? 
# Atheros HAL options Index: sys/arch/amd64/include/numa.h =================================================================== RCS file: sys/arch/amd64/include/numa.h diff -N sys/arch/amd64/include/numa.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/amd64/include/numa.h 15 Dec 2008 23:03:23 -0000 @@ -0,0 +1,3 @@ +/* $NetBSD: $ */ + +#include Index: sys/arch/amd64/include/vmparam.h =================================================================== RCS file: /cvsroot/src/sys/arch/amd64/include/vmparam.h,v retrieving revision 1.20 diff -u -p -r1.20 vmparam.h --- sys/arch/amd64/include/vmparam.h 13 Dec 2008 14:07:10 -0000 1.20 +++ sys/arch/amd64/include/vmparam.h 15 Dec 2008 23:03:23 -0000 @@ -155,7 +155,7 @@ /* virtual sizes (bytes) for various kernel submaps */ #define VM_PHYS_SIZE (USRIOSIZE*PAGE_SIZE) -#define VM_PHYSSEG_MAX 10 /* 1 "hole" + 9 free lists */ +#define VM_PHYSSEG_MAX 20 /* raised from 10 (1 "hole" + 9 free lists); TODO document the new breakdown */ #define VM_PHYSSEG_STRAT VM_PSTRAT_BIGFIRST #define VM_PHYSSEG_NOADD /* can't add RAM after vm_mem_init */ Index: sys/arch/i386/conf/XEN2_DOM0 =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/XEN2_DOM0,v retrieving revision 1.55 diff -u -p -r1.55 XEN2_DOM0 --- sys/arch/i386/conf/XEN2_DOM0 24 Nov 2008 11:41:12 -0000 1.55 +++ sys/arch/i386/conf/XEN2_DOM0 15 Dec 2008 23:03:24 -0000 @@ -185,7 +185,8 @@ config netbsd root on ? type ? mainbus0 at root -cpu* at mainbus? +numa* at mainbus? +cpu* at numa? # IPMI support ipmi0 at mainbus? Index: sys/arch/i386/conf/XEN2_DOMU =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/XEN2_DOMU,v retrieving revision 1.21 diff -u -p -r1.21 XEN2_DOMU --- sys/arch/i386/conf/XEN2_DOMU 24 Nov 2008 11:41:12 -0000 1.21 +++ sys/arch/i386/conf/XEN2_DOMU 15 Dec 2008 23:03:24 -0000 @@ -164,7 +164,8 @@ config netbsd root on ? type ? mainbus0 at root -cpu* at mainbus? +numa* at mainbus? +cpu* at numa? 
hypervisor* at mainbus? # Xen hypervisor Index: sys/arch/i386/conf/XEN3_DOMU =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/XEN3_DOMU,v retrieving revision 1.11 diff -u -p -r1.11 XEN3_DOMU --- sys/arch/i386/conf/XEN3_DOMU 13 Nov 2008 01:45:48 -0000 1.11 +++ sys/arch/i386/conf/XEN3_DOMU 15 Dec 2008 23:03:24 -0000 @@ -7,7 +7,8 @@ options XEN3 #Xen 3.x support options XEN_COMPAT_030001 #compatible with Xen3 before 3.0.2 options MAXPHYS=32768 #xbd doesn't handle 64k transfers -no cpu* at mainbus? +no numa* at mainbus? +no cpu* at numa? no xennet* at hypervisor? no xbd* at hypervisor? Index: sys/arch/i386/conf/files.i386 =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/files.i386,v retrieving revision 1.340 diff -u -p -r1.340 files.i386 --- sys/arch/i386/conf/files.i386 20 Nov 2008 10:53:08 -0000 1.340 +++ sys/arch/i386/conf/files.i386 15 Dec 2008 23:03:24 -0000 @@ -145,7 +145,7 @@ define vesabiosbus {} # XXX BIOS32 only if something that uses it is configured! device mainbus: isabus, eisabus, mcabus, pcibus, bios32, acpibus, - cpubus, ioapicbus, apmbus, pnpbiosbus, vesabiosbus, ipmibus, + ioapicbus, apmbus, pnpbiosbus, vesabiosbus, ipmibus, numabus, bioscall attach mainbus at root file arch/i386/i386/mainbus.c mainbus Index: sys/arch/i386/conf/std.i386 =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/std.i386,v retrieving revision 1.29 diff -u -p -r1.29 std.i386 --- sys/arch/i386/conf/std.i386 11 Dec 2008 05:42:18 -0000 1.29 +++ sys/arch/i386/conf/std.i386 15 Dec 2008 23:03:24 -0000 @@ -17,7 +17,8 @@ options MULTIPROCESSOR # multiprocesso options MPBIOS # configure CPUs and APICs using MPBIOS mainbus0 at root -cpu* at mainbus? +numa* at mainbus? +cpu* at numa? ioapic* at mainbus? 
# Atheros HAL options Index: sys/arch/i386/i386/mainbus.c =================================================================== RCS file: /cvsroot/src/sys/arch/i386/i386/mainbus.c,v retrieving revision 1.79 diff -u -p -r1.79 mainbus.c --- sys/arch/i386/i386/mainbus.c 10 Nov 2008 14:36:59 -0000 1.79 +++ sys/arch/i386/i386/mainbus.c 15 Dec 2008 23:03:24 -0000 @@ -60,7 +60,7 @@ __KERNEL_RCSID(0, "$NetBSD: mainbus.c,v #include "opt_mpbios.h" #include "opt_pcifixup.h" -#include +#include #include #include #include @@ -119,7 +119,6 @@ union mainbus_attach_args { #if NPNPBIOS > 0 struct pnpbios_attach_args mba_paa; #endif - struct cpu_attach_args mba_caa; struct apic_attach_args aaa_caa; #if NACPI > 0 struct acpibus_attach_args mba_acpi; @@ -181,24 +180,25 @@ mainbus_match(struct device *parent, str return 1; } +int acpi_present = 0; +uint32_t numa_bootflags = 0; + /* * Attach the mainbus. */ void mainbus_attach(struct device *parent, struct device *self, void *aux) { + struct numa_info *node = NULL; union mainbus_attach_args mba; -#if NACPI > 0 - int acpi_present = 0; -#endif #ifdef MPBIOS int mpbios_present = 0; #endif #if defined(PCI_BUS_FIXUP) int pci_maxbus = 0; #endif + bool numa_faketopology = false; int mpacpi_active = 0; - int numcpus = 0; aprint_naive("\n"); aprint_normal("\n"); @@ -229,32 +229,43 @@ mainbus_attach(struct device *parent, st #if NACPI > 0 if ((boothowto & RB_MD2) == 0 && acpi_check(self, "acpibus")) acpi_present = acpi_probe(); +#endif + + numa_bootflags = mi_numa_init(); + + if (numa_bootflags & NUMAF_FAKETOPOLOGY) + numa_faketopology = true; + + if (numa_faketopology) { + aprint_normal_dev(self, "fake a one node NUMA system\n"); + + /* Fake a one node NUMA system */ + node = numanode_getbyid_alloc(0, true); + if (!node) + panic("NUMA: can't allocate memory for NUMA node\n"); + } + +#if NACPI > 0 /* * First, see if the MADT contains CPUs, and possibly I/O APICs. * Building the interrupt routing structures can only * be done later (via a callback). 
 */ if (acpi_present) - mpacpi_active = mpacpi_scan_apics(self, &numcpus); + mpacpi_active = mpacpi_scan_apics(self); #endif if (!mpacpi_active) { #ifdef MPBIOS if (mpbios_present) - mpbios_scan(self, &numcpus); - else + mpbios_scan(self); #endif - if (numcpus == 0) { - struct cpu_attach_args caa; + } - memset(&caa, 0, sizeof(caa)); - caa.cpu_number = 0; - caa.cpu_role = CPU_ROLE_SP; - caa.cpu_func = 0; + if (numa_faketopology) + numa_fakememory(node); - config_found_ia(self, "cpubus", &caa, mainbus_print); - } - } + md_numa_init(self); #if NVESABIOS > 0 if (vbeprobe()) Index: sys/arch/i386/include/numa.h =================================================================== RCS file: sys/arch/i386/include/numa.h diff -N sys/arch/i386/include/numa.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/i386/include/numa.h 15 Dec 2008 23:03:25 -0000 @@ -0,0 +1,3 @@ +/* $NetBSD: $ */ + +#include Index: sys/arch/i386/include/vmparam.h =================================================================== RCS file: /cvsroot/src/sys/arch/i386/include/vmparam.h,v retrieving revision 1.69 diff -u -p -r1.69 vmparam.h --- sys/arch/i386/include/vmparam.h 13 Dec 2008 14:07:10 -0000 1.69 +++ sys/arch/i386/include/vmparam.h 15 Dec 2008 23:03:25 -0000 @@ -135,7 +135,7 @@ #define VM_PHYSSEG_MAX 1 #define VM_NFREELIST 1 #else -#define VM_PHYSSEG_MAX 10 /* 1 "hole" + 9 free lists */ +#define VM_PHYSSEG_MAX 20 /* raised from 10 (1 "hole" + 9 free lists); TODO document the new breakdown */ #define VM_NFREELIST 2 #define VM_FREELIST_FIRST16 1 #endif /* XEN */ Index: sys/arch/x86/conf/files.x86 =================================================================== RCS file: /cvsroot/src/sys/arch/x86/conf/files.x86,v retrieving revision 1.44 diff -u -p -r1.44 files.x86 --- sys/arch/x86/conf/files.x86 3 Aug 2008 19:32:03 -0000 1.44 +++ sys/arch/x86/conf/files.x86 15 Dec 2008 23:03:26 -0000 @@ -35,6 +35,15 @@ define ioapicbus { [apid = -1] } define ipmibus {} # +# NUMA +# +device numabus { } +device numa: cpubus +attach numa at numabus +file 
arch/x86/x86/x86_numa.c numa +file arch/x86/x86/x86_numa_dev.c numa + +# # CPUs # device cpu Index: sys/arch/x86/include/cpu.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/cpu.h,v retrieving revision 1.9 diff -u -p -r1.9 cpu.h --- sys/arch/x86/include/cpu.h 25 Oct 2008 19:13:40 -0000 1.9 +++ sys/arch/x86/include/cpu.h 15 Dec 2008 23:03:26 -0000 @@ -57,6 +57,7 @@ #include #include +#include #include #include @@ -80,6 +81,7 @@ struct device; struct cpu_info { struct device *ci_dev; /* pointer to our device */ struct cpu_info *ci_self; /* self-pointer */ + struct numa_cpu_info *ci_nci; /* back-pointer to numa data */ volatile struct vcpu_info *ci_vcpu; /* for XEN */ void *ci_tlog_base; /* Trap log base */ int32_t ci_tlog_offset; /* Trap log current offset */ Index: sys/arch/x86/include/cpuvar.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/cpuvar.h,v retrieving revision 1.27 diff -u -p -r1.27 cpuvar.h --- sys/arch/x86/include/cpuvar.h 13 May 2008 22:39:17 -0000 1.27 +++ sys/arch/x86/include/cpuvar.h 15 Dec 2008 23:03:26 -0000 @@ -78,10 +78,14 @@ extern const struct cpu_functions mp_cpu #define CPU_ROLE_BP 1 #define CPU_ROLE_AP 2 +struct numa_cpu_info; + struct cpu_attach_args { + const char *cpu_busname; int cpu_number; int cpu_role; const struct cpu_functions *cpu_func; + struct numa_cpu_info *cpu_nci; }; #ifdef _KERNEL Index: sys/arch/x86/include/mpacpi.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/mpacpi.h,v retrieving revision 1.8 diff -u -p -r1.8 mpacpi.h --- sys/arch/x86/include/mpacpi.h 9 Nov 2008 15:34:14 -0000 1.8 +++ sys/arch/x86/include/mpacpi.h 15 Dec 2008 23:03:26 -0000 @@ -5,7 +5,7 @@ struct pcibus_attach_args; -int mpacpi_scan_apics(device_t, int *); +int mpacpi_scan_apics(device_t); int mpacpi_find_interrupts(void *); int mpacpi_pci_attach_hook(device_t, 
device_t, struct pcibus_attach_args *); Index: sys/arch/x86/include/mpbiosvar.h =================================================================== RCS file: /cvsroot/src/sys/arch/x86/include/mpbiosvar.h,v retrieving revision 1.7 diff -u -p -r1.7 mpbiosvar.h --- sys/arch/x86/include/mpbiosvar.h 9 Nov 2008 15:34:14 -0000 1.7 +++ sys/arch/x86/include/mpbiosvar.h 15 Dec 2008 23:03:26 -0000 @@ -45,7 +45,7 @@ struct pcibus_attach_args; #if defined(_KERNEL) -void mpbios_scan(device_t, int *); +void mpbios_scan(device_t); int mpbios_probe(device_t); int mpbios_pci_attach_hook(device_t, device_t, struct pcibus_attach_args *); Index: sys/arch/x86/include/numa.h =================================================================== RCS file: sys/arch/x86/include/numa.h diff -N sys/arch/x86/include/numa.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/x86/include/numa.h 15 Dec 2008 23:03:26 -0000 @@ -0,0 +1,109 @@ +/* $NetBSD: $ */ +/* + * Copyright (c) 2008 Christoph Egger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _X86_NUMA_H +#define _X86_NUMA_H + +#include + +#define __HAVE_NUMAINFO 1 + +struct pmem_arena; +struct device; +struct numa_cpu_info; +struct numa_mem_info; + +/* per numa-node info */ +struct numa_info { + struct device *ni_dev; /* pointer to our device */ + struct numa_info *ni_self; /* self-pointer */ + uint32_t ni_nodeid; + + /* Will be accessed by other NUMA nodes */ + struct numa_data ni_data; /* MI numa node data */ + struct numa_info *ni_next; /* next numa node */ + + uint32_t ni_ncpus; /* number of cpus in this node, + * 0 means cpu-less node */ + struct numa_cpu_info *ni_cpuinfo; /* list of cpus */ + + struct pmem_arena *ni_arena; + + /* + * Private members. 
+ */ +}; + +/* per-cpu numa info */ +struct numa_cpu_info { + struct numa_info *nci_numainfo; /* back-pointer to the node */ + struct cpu_info *nci_cpuinfo; /* pointer to cpu_info */ + + struct numa_cpu_data nci_data; /* MI per-cpu numa node data */ + struct numa_cpu_info *nci_next; /* next cpu in this node */ + + uint32_t nci_flags; + + uint8_t nci_acpiid; /* ACPI processor id */ + uint8_t nci_apicid; /* Processor's local apic id */ + uint8_t nci_sapicid; /* SAPIC id */ + uint8_t nci_sapiceid; /* SAPIC eid */ +}; + +extern uint32_t cpu_bsp_number; +extern uint32_t numa_bootflags; +extern struct numa_info *numa_info_primary; +extern struct numa_info *numa_info_list; + +#define NUMA_INFO_ITERATOR int +#define NUMA_INFO_FOREACH(nii, ni) nii = 0, ni = numa_info_list; \ + ni != NULL; ni = ni->ni_next + + +#define NUMACPU_ITERATOR int +#define NUMACPU_FOREACH(node, ncii, nci) \ + ncii = 0, nci = node->ni_cpuinfo; nci != NULL; nci = nci->nci_next + +#define NUMAINFO_IS_PRIMARY(ni) ((ni) == numa_info_primary) +#define NUMACPU_IS_PRIMARY(ni) ((ni)->nci_flags & NUMACPU_FLAG_PRIMARY) + +#define curnode() (curcpu()->ci_nci->nci_numainfo) + +uint32_t numa_get_topology(void); +uint32_t numa_get_affinity(void); + +int numa_fakememory(struct numa_info *); + +#define numanode_getbyid(id) numanode_getbyid_alloc((id), false) +struct numa_info *numanode_getbyid_alloc(uint32_t, bool); + +struct numa_cpu_info *numa_cpuinfo_alloc(struct numa_info *); + +struct numa_cpu_info *numa_cpuinfo_getby_apicid(struct numa_info *, uint8_t); +struct numa_cpu_info *numa_cpuinfo_getby_sapiceid(struct numa_info *, uint8_t); + +#endif Index: sys/arch/x86/include/numavar.h =================================================================== RCS file: sys/arch/x86/include/numavar.h diff -N sys/arch/x86/include/numavar.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/x86/include/numavar.h 15 Dec 2008 23:03:26 -0000 @@ -0,0 +1,36 @@ +/* $NetBSD: $ */ +/* + * Copyright (c) 2008 Christoph Egger + * All 
rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + + +#include +__KERNEL_RCSID(0, "$NetBSD: $"); + +struct numa_attach_args { + const char *naa_busname; /* first elem of all */ +}; + +int md_numa_init(device_t); Index: sys/arch/x86/x86/cpu.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/cpu.c,v retrieving revision 1.59 diff -u -p -r1.59 cpu.c --- sys/arch/x86/x86/cpu.c 6 Nov 2008 19:29:46 -0000 1.59 +++ sys/arch/x86/x86/cpu.c 15 Dec 2008 23:03:26 -0000 @@ -324,6 +324,10 @@ cpu_attach(device_t parent, device_t sel ci->ci_dev = self; ci->ci_cpuid = caa->cpu_number; ci->ci_func = caa->cpu_func; + ci->ci_nci = caa->cpu_nci; + + KASSERT(ci->ci_nci != NULL); + //KASSERT(ci->ci_nci->nci_apicid == lapic_cpu_number()); /* Must be before mi_cpu_attach(). */ cpu_vm_init(ci); Index: sys/arch/x86/x86/mpacpi.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/mpacpi.c,v retrieving revision 1.71 diff -u -p -r1.71 mpacpi.c --- sys/arch/x86/x86/mpacpi.c 9 Nov 2008 15:34:14 -0000 1.71 +++ sys/arch/x86/x86/mpacpi.c 15 Dec 2008 23:03:26 -0000 @@ -98,7 +98,6 @@ static TAILQ_HEAD(, mpacpi_pcibus) mpacp #endif -static int mpacpi_cpuprint(void *, const char *); static int mpacpi_ioapicprint(void *, const char *); /* acpi_madt_walk callbacks */ @@ -145,17 +144,6 @@ int mpacpi_step; int mpacpi_force; static int -mpacpi_cpuprint(void *aux, const char *pnp) -{ - struct cpu_attach_args *caa = aux; - - if (pnp) - aprint_normal("cpu at %s", pnp); - aprint_normal(" apid %d", caa->cpu_number); - return (UNCONF); -} - -static int mpacpi_ioapicprint(void *aux, const char *pnp) { struct apic_attach_args *aaa = aux; @@ -343,31 +331,53 @@ mpacpi_count(ACPI_SUBTABLE_HEADER *hdrp, static ACPI_STATUS mpacpi_config_cpu(ACPI_SUBTABLE_HEADER *hdrp, void *aux) { +#if 0 device_t parent = aux; - ACPI_MADT_LOCAL_APIC *p; - struct cpu_attach_args caa; - int cpunum = 0; - int locs[CPUBUSCF_NLOCS]; - -#if defined(MULTIPROCESSOR) || 
defined(IOAPIC) - if (mpacpi_ncpu > 1) - cpunum = lapic_cpu_number(); #endif + ACPI_MADT_LOCAL_APIC *apic; + ACPI_MADT_LOCAL_SAPIC *sapic; + struct numa_cpu_info *nci; + bool fake = false; - if (hdrp->Type == ACPI_MADT_TYPE_LOCAL_APIC) { - p = (ACPI_MADT_LOCAL_APIC *)hdrp; - if (p->LapicFlags & ACPI_MADT_ENABLED) { - if (p->Id != cpunum) - caa.cpu_role = CPU_ROLE_AP; - else - caa.cpu_role = CPU_ROLE_BP; - caa.cpu_number = p->Id; - caa.cpu_func = &mp_cpu_funcs; - locs[CPUBUSCF_APID] = caa.cpu_number; - config_found_sm_loc(parent, "cpubus", locs, - &caa, mpacpi_cpuprint, config_stdsubmatch); - } + if (numa_bootflags & NUMAF_FAKETOPOLOGY) + fake = true; + + switch (hdrp->Type) { + case ACPI_MADT_TYPE_LOCAL_APIC: + apic = (ACPI_MADT_LOCAL_APIC *)hdrp; + if (!(apic->LapicFlags & ACPI_MADT_ENABLED)) /* skip entries without the enabled bit; '!' binds tighter than '&' */ + break; + + if (fake) + nci = numa_cpuinfo_getby_apicid(NULL, 0xff); + else + nci = numa_cpuinfo_getby_apicid(NULL, apic->Id); + KASSERT(nci != NULL); + if (nci->nci_apicid == 0xff) + nci->nci_apicid = apic->Id; + if (nci->nci_acpiid == 0xff) + nci->nci_acpiid = apic->ProcessorId; + break; + + case ACPI_MADT_TYPE_LOCAL_SAPIC: + sapic = (ACPI_MADT_LOCAL_SAPIC *)hdrp; + if (!(sapic->LapicFlags & ACPI_MADT_ENABLED)) /* skip entries without the enabled bit; '!' binds tighter than '&' */ + break; + + if (fake) + nci = numa_cpuinfo_getby_sapiceid(NULL, 0xff); + else + nci = numa_cpuinfo_getby_sapiceid(NULL, sapic->Eid); + KASSERT(nci != NULL); + if (nci->nci_sapiceid == 0xff) + nci->nci_sapiceid = sapic->Eid; + if (nci->nci_sapicid == 0xff) + nci->nci_sapicid = sapic->Id; + if (nci->nci_acpiid == 0xff) + nci->nci_acpiid = sapic->ProcessorId; + break; } + return AE_OK; } @@ -376,31 +386,56 @@ mpacpi_config_ioapic(ACPI_SUBTABLE_HEADE { device_t parent = aux; struct apic_attach_args aaa; - ACPI_MADT_IO_APIC *p; + ACPI_MADT_IO_APIC *ioapic; + ACPI_MADT_IO_SAPIC *iosapic; int locs[IOAPICBUSCF_NLOCS]; - if (hdrp->Type == ACPI_MADT_TYPE_IO_APIC) { - p = (ACPI_MADT_IO_APIC *)hdrp; - aaa.apic_id = p->Id; - aaa.apic_address = p->Address; + switch (hdrp->Type) { + 
case ACPI_MADT_TYPE_IO_APIC: + ioapic = (ACPI_MADT_IO_APIC *)hdrp; + aaa.apic_id = ioapic->Id; + aaa.apic_address = ioapic->Address; aaa.apic_version = -1; aaa.flags = IOAPIC_VWIRE; - aaa.apic_vecbase = p->GlobalIrqBase; + aaa.apic_vecbase = ioapic->GlobalIrqBase; locs[IOAPICBUSCF_APID] = aaa.apic_id; config_found_sm_loc(parent, "ioapicbus", locs, &aaa, mpacpi_ioapicprint, config_stdsubmatch); + break; + + case ACPI_MADT_TYPE_IO_SAPIC: + aprint_normal_dev(parent, "configuring sioapic\n"); + iosapic = (ACPI_MADT_IO_SAPIC *)hdrp; + aaa.apic_id = iosapic->Id; + aaa.apic_address = iosapic->Address; + aaa.apic_version = -1; + aaa.flags = IOAPIC_VWIRE; + aaa.apic_vecbase = iosapic->GlobalIrqBase; + locs[IOAPICBUSCF_APID] = aaa.apic_id; +#ifdef notyet + config_found_sm_loc(parent, "ioapicbus", locs, &aaa, + mpacpi_ioapicprint, config_stdsubmatch); +#else + aprint_normal_dev(parent, "implement support for io sapic\n"); +#endif + break; } return AE_OK; } int -mpacpi_scan_apics(device_t self, int *ncpup) +mpacpi_scan_apics(device_t self) { int rv = 0; + struct numa_info *node; if (acpi_madt_map() != AE_OK) return 0; + /* If faked or not, a numa node id 0 must always exist */ + node = numanode_getbyid(0); + KASSERT(node != NULL); + mpacpi_ncpu = mpacpi_nintsrc = mpacpi_nioapic = 0; acpi_madt_walk(mpacpi_count, self); @@ -408,6 +443,18 @@ mpacpi_scan_apics(device_t self, int *nc lapic_boot_init(mpacpi_lapic_base); #endif + if (numa_bootflags & NUMAF_FAKETOPOLOGY) { + struct numa_cpu_info *nci; + uint32_t i; + + for (i = 0; i < mpacpi_ncpu; i++) { + nci = numa_cpuinfo_alloc(node); + if (!nci) + panic("%s: can't allocate memory for %i cpu\n", + device_xname(self), i); + } + } + acpi_madt_walk(mpacpi_config_cpu, self); if (mpacpi_ncpu == 0) @@ -426,7 +473,6 @@ mpacpi_scan_apics(device_t self, int *nc #endif rv = 1; done: - *ncpup = mpacpi_ncpu; acpi_madt_unmap(); return rv; } Index: sys/arch/x86/x86/mpbios.c =================================================================== RCS 
file: /cvsroot/src/sys/arch/x86/x86/mpbios.c,v retrieving revision 1.49 diff -u -p -r1.49 mpbios.c --- sys/arch/x86/x86/mpbios.c 9 Nov 2008 15:34:14 -0000 1.49 +++ sys/arch/x86/x86/mpbios.c 15 Dec 2008 23:03:26 -0000 @@ -492,7 +492,7 @@ static struct mp_bus nmi_bus = { * nintrs */ void -mpbios_scan(device_t self, int *ncpup) +mpbios_scan(device_t self) { const uint8_t *position, *end; int count; @@ -691,16 +691,18 @@ mpbios_scan(device_t self, int *ncpup) mpbios_unmap (&mp_cfg_table_map); } mpbios_scanned = 1; - - *ncpup = mpbios_ncpu; } static void mpbios_cpu(const uint8_t *ent, device_t self) { const struct mpbios_proc *entry = (const struct mpbios_proc *)ent; - struct cpu_attach_args caa; - int locs[CPUBUSCF_NLOCS]; + bool fake = false; + struct numa_info *node; + struct numa_cpu_info *nci; + + if (numa_bootflags & NUMAF_FAKETOPOLOGY) + fake = true; /* XXX move this into the CPU attachment goo. */ /* check for usability */ @@ -709,18 +711,36 @@ mpbios_cpu(const uint8_t *ent, device_t mpbios_ncpu++; + /* If faked or not, a numa node id 0 must always exist */ + node = numanode_getbyid(0); + KASSERT(node != NULL); + + if (fake) { +#if NACPI > 0 + if (mpacpi_ncpu == 0) { +#endif + nci = numa_cpuinfo_alloc(node); + if (!nci) + panic("%s: can't allocate memory for %i cpu\n", + device_xname(self), mpbios_ncpu); +#if NACPI > 0 + } else { + nci = numa_cpuinfo_getby_apicid(NULL, entry->apic_id); + } +#endif + } else { + nci = numa_cpuinfo_getby_apicid(NULL, entry->apic_id); + } + + KASSERT(nci != NULL); + /* check for BSP flag */ if (entry->cpu_flags & PROCENTRY_FLAG_BP) - caa.cpu_role = CPU_ROLE_BP; - else - caa.cpu_role = CPU_ROLE_AP; - - caa.cpu_number = entry->apic_id; - caa.cpu_func = &mp_cpu_funcs; - locs[CPUBUSCF_APID] = caa.cpu_number; + if (cpu_bsp_number == 0xff) + cpu_bsp_number = entry->apic_id; - config_found_sm_loc(self, "cpubus", locs, &caa, mp_cpuprint, - config_stdsubmatch); + if (nci->nci_apicid == 0xff) + nci->nci_apicid = entry->apic_id; } static void 
Index: sys/arch/x86/x86/x86_machdep.c =================================================================== RCS file: /cvsroot/src/sys/arch/x86/x86/x86_machdep.c,v retrieving revision 1.27 diff -u -p -r1.27 x86_machdep.c --- sys/arch/x86/x86/x86_machdep.c 15 Dec 2008 22:20:52 -0000 1.27 +++ sys/arch/x86/x86/x86_machdep.c 15 Dec 2008 23:03:26 -0000 @@ -46,6 +46,7 @@ __KERNEL_RCSID(0, "$NetBSD: x86_machdep. #include #include #include +#include #include #include @@ -111,6 +112,9 @@ check_pa_acc(paddr_t pa, vm_prot_t prot) const phys_ram_seg_t *seg = &mem_clusters[i]; paddr_t lstart = seg->start; + if (seg->type != PMEM_U_RAM) + continue; + if (lstart <= pa && pa - lstart <= seg->size) { return 0; } @@ -120,6 +124,9 @@ check_pa_acc(paddr_t pa, vm_prot_t prot) KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL); } + +#define DEBUG_MEMLOAD + /* * This function is to initialize the mutex used by x86/msr_ipifuncs.c. */ @@ -411,7 +418,7 @@ add_mem_cluster(phys_ram_seg_t *seg_clus * the addresses are page rounded just to make * sure we get them all. */ - if (seg_start < 0x100000000ULL) { + if (type != PMEM_U_UNKNOWN && seg_start < 0x100000000ULL) { uint64_t io_end; if (seg_end > 0x100000000ULL) @@ -430,12 +437,6 @@ add_mem_cluster(phys_ram_seg_t *seg_clus } } - /* - * If it's not free memory, skip it. 
- */ - if (type != BIM_Memory) - return seg_cluster_cnt; - /* XXX XXX XXX */ if (seg_cluster_cnt >= VM_PHYSSEG_MAX) panic("%s: too many memory segments (increase VM_PHYSSEG_MAX)", @@ -456,23 +457,25 @@ add_mem_cluster(phys_ram_seg_t *seg_clus cluster = &seg_clusters[seg_cluster_cnt]; cluster->start = seg_start; - if (iomem_ex != NULL) + if (type == PMEM_U_RAM && iomem_ex != NULL) new_physmem = physmem + atop(seg_end - seg_start); #ifdef PHYSMEM_MAX_SIZE - if (iomem_ex != NULL) { + if (type == PMEM_U_RAM && iomem_ex != NULL) { if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE))) return seg_cluster_cnt; if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) { - seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem); + seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) + - ptoa(physmem); new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE)); } } #endif cluster->size = seg_end - seg_start; + cluster->type = type; - if (iomem_ex != NULL) { + if (type == PMEM_U_RAM && iomem_ex != NULL) { if (avail_end < seg_end) avail_end = seg_end; physmem = new_physmem; @@ -505,13 +508,16 @@ initx86_parse_memmap(struct btinfo_memma addr, size, type); #endif - /* - * If the segment is not memory, skip it. 
- */ switch (type) { case BIM_Memory: + type = PMEM_U_RAM; + break; + case BIM_Reserved: + type = PMEM_U_UNKNOWN; + break; case BIM_ACPI: case BIM_NVS: + type = PMEM_U_FIRMWARE; break; default: continue; @@ -546,6 +552,9 @@ initx86_parse_memmap(struct btinfo_memma mem_clusters, mem_cluster_cnt, iomem_ex, seg_start, 0xa0000, type); mem_cluster_cnt = add_mem_cluster( + mem_clusters, mem_cluster_cnt, NULL, + 0xa0000, 0x100000, PMEM_U_ROM); + mem_cluster_cnt = add_mem_cluster( mem_clusters, mem_cluster_cnt, iomem_ex, 0x100000, seg_end, type); } else @@ -579,6 +588,7 @@ initx86_fake_memmap(struct extent *iomem cluster = &mem_clusters[0]; cluster->start = 0; cluster->size = trunc_page(KBTOB(biosbasemem)); + cluster->type = PMEM_U_RAM; physmem += atop(cluster->size); if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), @@ -614,6 +624,7 @@ initx86_fake_memmap(struct extent *iomem cluster = &mem_clusters[1]; cluster->start = IOM_END; cluster->size = trunc_page(KBTOB(biosextmem)); + cluster->type = PMEM_U_RAM; physmem += atop(cluster->size); mem_cluster_cnt = 2; @@ -633,6 +644,7 @@ initx86_load_memmap(paddr_t first_avail) { uint64_t seg_start, seg_end; uint64_t seg_start1, seg_end1; + uint32_t seg_type; int first16q, x; /* @@ -666,6 +678,7 @@ initx86_load_memmap(paddr_t first_avail) for (x = 0; x < mem_cluster_cnt; x++) { const phys_ram_seg_t *cluster = &mem_clusters[x]; + seg_type = cluster->type; seg_start = cluster->start; seg_end = cluster->start + cluster->size; seg_start1 = 0; @@ -694,6 +707,9 @@ initx86_load_memmap(paddr_t first_avail) seg_end1 = seg_end; seg_end = IOM_END; KASSERT(seg_end < seg_end1); + + pmem_region_create(IOM_END, first_avail, + PMEM_U_TEXT, PMEM_PROT_UNKNOWN, PMEM_P_UNKNOWN); } /* First hunk */ @@ -708,29 +724,46 @@ initx86_load_memmap(paddr_t first_avail) tmp = seg_end; if (tmp != seg_start) { + pmem_region_create(seg_start, tmp, + seg_type, + PMEM_PROT_UNKNOWN, + PMEM_P_UNKNOWN); + + if (seg_type == PMEM_U_RAM) { #ifdef DEBUG_MEMLOAD - 
printf("loading 0x%"PRIx64"-0x%"PRIx64 - " (0x%lx-0x%lx)\n", - seg_start, tmp, - atop(seg_start), atop(tmp)); -#endif - uvm_page_physload(atop(seg_start), - atop(tmp), atop(seg_start), - atop(tmp), first16q); + printf("loading 0x%"PRIx64"-0x%"PRIx64 + " (0x%lx-0x%lx), 0x%x\n", + seg_start, tmp, + atop(seg_start), atop(tmp), + seg_type); +#endif + uvm_page_physload( + atop(seg_start), atop(tmp), + atop(seg_start), atop(tmp), + first16q); + } } seg_start = tmp; } if (seg_start != seg_end) { + pmem_region_create(seg_start, seg_end, + seg_type, + PMEM_PROT_UNKNOWN, + PMEM_P_UNKNOWN); + + if (seg_type == PMEM_U_RAM) { #ifdef DEBUG_MEMLOAD - printf("loading 0x%"PRIx64"-0x%"PRIx64 - " (0x%lx-0x%lx)\n", - seg_start, seg_end, - atop(seg_start), atop(seg_end)); -#endif - uvm_page_physload(atop(seg_start), - atop(seg_end), atop(seg_start), - atop(seg_end), VM_FREELIST_DEFAULT); + printf("loading 0x%"PRIx64"-0x%"PRIx64 + " (0x%lx-0x%lx), 0x%x\n", + seg_start, seg_end, + atop(seg_start), atop(seg_end), + seg_type); +#endif + uvm_page_physload(atop(seg_start), + atop(seg_end), atop(seg_start), + atop(seg_end), VM_FREELIST_DEFAULT); + } } } @@ -746,29 +779,46 @@ initx86_load_memmap(paddr_t first_avail) tmp = seg_end1; if (tmp != seg_start1) { + pmem_region_create(seg_start1, tmp, + seg_type, + PMEM_PROT_UNKNOWN, + PMEM_P_UNKNOWN); + + if (seg_type == PMEM_U_RAM) { #ifdef DEBUG_MEMLOAD - printf("loading 0x%"PRIx64"-0x%"PRIx64 - " (0x%lx-0x%lx)\n", - seg_start1, tmp, - atop(seg_start1), atop(tmp)); -#endif - uvm_page_physload(atop(seg_start1), - atop(tmp), atop(seg_start1), - atop(tmp), first16q); + printf("loading 0x%"PRIx64"-0x%"PRIx64 + " (0x%lx-0x%lx), 0x%x\n", + seg_start1, tmp, + atop(seg_start1), atop(tmp), + seg_type); +#endif + uvm_page_physload( + atop(seg_start1), atop(tmp), + atop(seg_start1), atop(tmp), + first16q); + } } seg_start1 = tmp; } if (seg_start1 != seg_end1) { + pmem_region_create(seg_start1, seg_end1, + seg_type, + PMEM_PROT_UNKNOWN, + PMEM_P_UNKNOWN); + + 
if (seg_type == PMEM_U_RAM) { #ifdef DEBUG_MEMLOAD - printf("loading 0x%"PRIx64"-0x%"PRIx64 - " (0x%lx-0x%lx)\n", - seg_start1, seg_end1, - atop(seg_start1), atop(seg_end1)); -#endif - uvm_page_physload(atop(seg_start1), - atop(seg_end1), atop(seg_start1), - atop(seg_end1), VM_FREELIST_DEFAULT); + printf("loading 0x%"PRIx64"-0x%"PRIx64 + " (0x%lx-0x%lx), 0x%x\n", + seg_start1, seg_end1, + atop(seg_start1), atop(seg_end1), + seg_type); +#endif + uvm_page_physload(atop(seg_start1), + atop(seg_end1), atop(seg_start1), + atop(seg_end1), VM_FREELIST_DEFAULT); + } } } } Index: sys/arch/x86/x86/x86_numa.c =================================================================== RCS file: sys/arch/x86/x86/x86_numa.c diff -N sys/arch/x86/x86/x86_numa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/x86/x86/x86_numa.c 15 Dec 2008 23:03:27 -0000 @@ -0,0 +1,493 @@ +/* $NetBSD: $ */ +/* + * Copyright (c) 2008 Christoph Egger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: $"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include /* for ptoa */ + +uint32_t cpu_bsp_number = 0xff; +struct numa_info *numa_info_primary = NULL; +struct numa_info *numa_info_list = NULL; + +#ifdef NUMA_ACPI_DUMP +static void +acpi_srat_dump(ACPI_TABLE_SRAT *srat) +{ + uint32_t srat_pos; + + printf("dumping SRAT table:\n"); + printf("Header: length: 0x%x, revision: 0x%x\n", + srat->Header.Length, srat->Header.Revision); + printf("TableRevision: 0x%x\n", srat->TableRevision); + + /* Content starts right after the header */ + srat_pos = sizeof(ACPI_TABLE_SRAT); + + while (srat_pos < srat->Header.Length) { + ACPI_SUBTABLE_HEADER *subtable = (ACPI_SUBTABLE_HEADER *)((char *)srat + srat_pos); + ACPI_SRAT_CPU_AFFINITY *srat_cpu; + ACPI_SRAT_MEM_AFFINITY *srat_mem; + + srat_pos += subtable->Length; + + switch (subtable->Type) { + case ACPI_SRAT_TYPE_CPU_AFFINITY: + srat_cpu = (ACPI_SRAT_CPU_AFFINITY *)subtable; + printf("SRAT subtable cpu, length: 0x%x\n", + subtable->Length); + printf("SRAT CPU: Node 0x%x\n", + (srat_cpu->ProximityDomainHi[2] << 24) | + (srat_cpu->ProximityDomainHi[1] << 16) | + (srat_cpu->ProximityDomainHi[0] << 8) | + (srat_cpu->ProximityDomainLo)); + printf("SRAT CPU: ProximityDomainLo %x\n", + srat_cpu->ProximityDomainLo); + printf("SRAT CPU: ApicId %x\n", + srat_cpu->ApicId); + printf("SRAT CPU: Flags 
%x\n", + srat_cpu->Flags); + printf("SRAT CPU: LocalSapicEid %x\n", + srat_cpu->LocalSapicEid); + printf("SRAT CPU: ProximityDomainHi %x, %x, %x\n", + srat_cpu->ProximityDomainHi[0], + srat_cpu->ProximityDomainHi[1], + srat_cpu->ProximityDomainHi[2]); + break; + case ACPI_SRAT_TYPE_MEMORY_AFFINITY: + srat_mem = (ACPI_SRAT_MEM_AFFINITY *)subtable; + printf("SRAT subtable mem, length: 0x%x\n", + subtable->Length); + printf("SRAT MEM: Node 0x%x\n", + srat_mem->ProximityDomain); + printf("SRAT MEM: BaseAddress 0x%"PRIx64"\n", + srat_mem->BaseAddress); + printf("SRAT MEM: Length 0x%"PRIx64"\n", + srat_mem->Length); + printf("SRAT MEM: MemoryType %x\n", + srat_mem->MemoryType); + printf("SRAT MEM: Flags %x\n", + srat_mem->Flags); + break; + case ACPI_SRAT_TYPE_RESERVED: + printf("SRAT subtable reserved, length: 0x%x\n", + subtable->Length); + break; + } + } + + return; +} +#endif /* NUMA_ACPI_DUMP */ + +static struct numa_info * +numanode_alloc(uint32_t nodeid) +{ + struct numa_info *node, *tmp; + + node = kmem_zalloc(sizeof(*node), KM_NOSLEEP); + if (!node) + return node; + + node->ni_self = node; + node->ni_nodeid = nodeid; + node->ni_next = NULL; + + if (numa_info_list == NULL) { + numa_info_list = node; + } else { + tmp = numa_info_list; + while (tmp->ni_next) + tmp = tmp->ni_next; + + tmp->ni_next = node; + } + + return node; +} + +struct numa_info * +numanode_getbyid_alloc(uint32_t nodeid, bool alloc) +{ + NUMA_INFO_ITERATOR nii; + struct numa_info *ni; + + for (NUMA_INFO_FOREACH(nii, ni)) { + if (ni->ni_nodeid == nodeid) + return ni; + } + + if (!alloc) + return NULL; + + return numanode_alloc(nodeid); +} + + +struct numa_cpu_info * +numa_cpuinfo_alloc(struct numa_info *ni) +{ + struct numa_cpu_info *nci, *tmp; + + nci = kmem_zalloc(sizeof(*nci), KM_NOSLEEP); + if (!nci) + return NULL; + + nci->nci_numainfo = ni; + nci->nci_acpiid = nci->nci_apicid = 0xff; + nci->nci_sapicid = nci->nci_sapiceid = 0xff; + if (ni->ni_cpuinfo == NULL) { + ni->ni_cpuinfo = nci; + 
ni->ni_ncpus = 1;
+	} else {
+		tmp = ni->ni_cpuinfo;
+		while (tmp->nci_next)
+			tmp = tmp->nci_next;
+
+		tmp->nci_next = nci;
+		ni->ni_ncpus++;
+	}
+
+	return nci;
+}
+
+/*
+ * Find the CPU record whose nci_apicid equals apicid.
+ *
+ * If ni is NULL every node on the NUMA node list is searched; otherwise
+ * only the CPUs of ni are searched.  Returns NULL when nothing matches.
+ */
+struct numa_cpu_info *
+numa_cpuinfo_getby_apicid(struct numa_info *ni, uint8_t apicid)
+{
+	NUMA_INFO_ITERATOR nii;
+	struct numa_cpu_info *nci;
+	NUMACPU_ITERATOR ncii;
+
+	if (ni == NULL) {
+		for (NUMA_INFO_FOREACH(nii, ni)) {
+			for (NUMACPU_FOREACH(ni, ncii, nci)) {
+				if (nci->nci_apicid == apicid)
+					return nci;
+			}
+		}
+		return NULL;
+	}
+
+
+	for (NUMACPU_FOREACH(ni, ncii, nci)) {
+		if (nci->nci_apicid == apicid)
+			return nci;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the CPU record whose nci_sapiceid equals sapiceid.
+ *
+ * If ni is NULL every node on the NUMA node list is searched; otherwise
+ * only the CPUs of ni are searched.  Returns NULL when nothing matches.
+ */
+struct numa_cpu_info *
+numa_cpuinfo_getby_sapiceid(struct numa_info *ni, uint8_t sapiceid)
+{
+	NUMA_INFO_ITERATOR nii;
+	struct numa_cpu_info *nci;
+	NUMACPU_ITERATOR ncii;
+
+	if (ni == NULL) {
+		for (NUMA_INFO_FOREACH(nii, ni)) {
+			for (NUMACPU_FOREACH(ni, ncii, nci)) {
+				if (nci->nci_sapiceid == sapiceid)
+					return nci;
+			}
+		}
+		return NULL;
+	}
+
+
+	for (NUMACPU_FOREACH(ni, ncii, nci)) {
+		if (nci->nci_sapiceid == sapiceid)
+			return nci;
+	}
+
+	return NULL;
+}
+
+/*
+ * Walk the subtables of the ACPI SRAT and build the NUMA topology.
+ *
+ * Each CPU affinity entry adds a CPU record to the node named by its
+ * proximity domain; each memory affinity entry loads its address range
+ * into that node's physical memory arena.  Nodes are created on first
+ * reference.  Allocation failures panic.  Parsing stops early at the
+ * first subtable whose length field is malformed.
+ */
+static bool
+numa_acpi_srat_parse(ACPI_TABLE_SRAT *srat)
+{
+	ACPI_SRAT_CPU_AFFINITY *srat_cpu;
+	ACPI_SRAT_MEM_AFFINITY *srat_mem;
+	ACPI_SUBTABLE_HEADER *subtable;
+	struct numa_info *node = NULL;
+	struct numa_cpu_info *nci_node = NULL;
+	uint32_t numaid;
+	uint32_t srat_pos;
+
+	KASSERT(srat != NULL);
+
+#ifdef NUMA_ACPI_DUMP
+	acpi_srat_dump(srat);
+#endif
+
+	/* Content starts right after the header */
+	srat_pos = sizeof(ACPI_TABLE_SRAT);
+
+	while (srat_pos < srat->Header.Length) {
+		subtable = (ACPI_SUBTABLE_HEADER *)((char *)srat + srat_pos);
+
+		/*
+		 * A subtable with a zero length would never advance
+		 * srat_pos and spin here forever; anything shorter than
+		 * the subtable header itself is malformed as well.
+		 * Stop parsing and keep what has been collected so far.
+		 */
+		if (subtable->Length < sizeof(*subtable)) {
+			printf("NUMA: SRAT subtable at offset 0x%x has "
+			    "invalid length 0x%x\n", srat_pos,
+			    subtable->Length);
+			break;
+		}
+		srat_pos += subtable->Length;
+
+		switch (subtable->Type) {
+		case ACPI_SRAT_TYPE_CPU_AFFINITY:
+			srat_cpu = (ACPI_SRAT_CPU_AFFINITY *)subtable;
+			/*
+			 * Widen each byte before shifting: the bytes are
+			 * promoted to (signed) int, and shifting a value
+			 * >= 0x80 left by 24 would overflow it.
+			 */
+			numaid =
+			    ((uint32_t)srat_cpu->ProximityDomainHi[2] << 24) |
+			    ((uint32_t)srat_cpu->ProximityDomainHi[1] << 16) |
+			    ((uint32_t)srat_cpu->ProximityDomainHi[0] << 8) |
+			    (srat_cpu->ProximityDomainLo);
+
+			node = 
numanode_getbyid_alloc(numaid, true); + if (!node) + panic("acpi_srat_parse: no memory for NUMA (cpu node)\n"); + nci_node = numa_cpuinfo_alloc(node); + if (!nci_node) + panic("acpi_srat_parse: no memory for NUMA (cpu node2)\n"); + + nci_node->nci_apicid = srat_cpu->ApicId; + nci_node->nci_sapiceid = srat_cpu->LocalSapicEid; + + break; + case ACPI_SRAT_TYPE_MEMORY_AFFINITY: + srat_mem = (ACPI_SRAT_MEM_AFFINITY *)subtable; + pmem_type_t memtype; + + numaid = srat_mem->ProximityDomain; + node = numanode_getbyid_alloc(numaid, true); + if (!node) + panic("acpi_srat_parse: no memory for NUMA (memory node)\n"); + + switch (srat_mem->MemoryType) { + case ACPI_ADDRESS_RANGE_MEMORY: + memtype = PMEM_U_RAM; + break; + case ACPI_ADDRESS_RANGE_RESERVED: + default: + memtype = PMEM_U_UNKNOWN; + break; + case ACPI_ADDRESS_RANGE_ACPI: + case ACPI_ADDRESS_RANGE_NVS: + memtype = PMEM_U_FIRMWARE; + break; + } + + if (node->ni_arena == NULL) { + node->ni_arena = pmem_arena_create(node, + srat_mem->BaseAddress, + srat_mem->BaseAddress + srat_mem->Length); + if (node->ni_arena == NULL) + panic("acpi_srat_parse: no memory for NUMA arena\n"); + } + + pmem_arena_loadrange(node->ni_arena, + srat_mem->BaseAddress, + srat_mem->BaseAddress + srat_mem->Length, + memtype); + + break; + case ACPI_SRAT_TYPE_RESERVED: + printf("SRAT subtable reserved, length: 0x%x\n", + subtable->Length); + break; + } + } + + return true; +} + +#ifdef NUMA_ACPI_DUMP +static void +acpi_slit_dump(ACPI_TABLE_SLIT *slit) +{ + uint32_t i; + + printf("dumping SLIT table:\n"); + printf("Header: length: 0x%x, revision: 0x%x\n", + slit->Header.Length, slit->Header.Revision); + printf("LocalityCount: %"PRIx64"\n", slit->LocalityCount); + + i = 0; + while (i < (slit->LocalityCount * slit->LocalityCount)) { + printf("%2u ", slit->Entry[i]); + i++; + if ((i % slit->LocalityCount) == 0) + printf("\n"); + } + + return; +} +#endif /* NUMA_ACPI_DUMP */ + +static bool +numa_acpi_slit_parse(ACPI_TABLE_SLIT *slit) +{ + + KASSERT(slit 
!= NULL); + +#ifdef NUMA_ACPI_DUMP + acpi_slit_dump(slit); +#endif + + return false; +} + +static ACPI_TABLE_HEADER * +numa_acpi_find_table(ACPI_CONST_STRING signature) +{ + ACPI_TABLE_HEADER *table; + ACPI_STATUS rv; + + rv = AcpiGetTable(signature, 1, (ACPI_TABLE_HEADER **)&table); + if (ACPI_FAILURE(rv)) + return NULL; + + /* Check if header is valid */ + if (table == NULL) + return NULL; + + if (table->Length == 0xffffffff) + return NULL; + + return table; +} + +extern int acpi_present; + +uint32_t +numa_get_topology(void) +{ + bool srat_present, srat_parsed; + ACPI_TABLE_SRAT *srat; + uint32_t flag = 0; + + if (!acpi_present) { + flag |= NUMAF_FAKETOPOLOGY; + flag |= NUMAF_SCANTOPOLOGY; + goto out; + } + + /* find SRAT table */ + srat = (ACPI_TABLE_SRAT *)numa_acpi_find_table(ACPI_SIG_SRAT); + srat_present = (srat) ? true : false; + + printf("NUMA: SRAT table %s\n", srat ? "found" : "not found"); + + if (!srat_present) { + flag |= NUMAF_FAKETOPOLOGY; + flag |= NUMAF_SCANTOPOLOGY; + goto out; + } + + srat_parsed = numa_acpi_srat_parse(srat); + if (!srat_parsed) { + printf("NUMA: couldn't parse SRAT table\n"); + flag |= NUMAF_FAKETOPOLOGY; + flag |= NUMAF_SCANTOPOLOGY; + goto out; + } + +out: + return flag; +} + +uint32_t +numa_get_affinity(void) +{ + bool slit_present, slit_parsed; + ACPI_TABLE_SLIT *slit; + uint32_t flag = 0; + + if (!acpi_present) { + flag |= NUMAF_PROBEAFFINITY; + goto out; + } + + /* find SLIT table */ + slit = (ACPI_TABLE_SLIT *)numa_acpi_find_table(ACPI_SIG_SLIT); + slit_present = (slit) ? true : false; + + printf("NUMA: SLIT table %s\n", slit ? 
"found" : "not found"); + + if (!slit_present) { + flag |= NUMAF_PROBEAFFINITY; + goto out; + } + + slit_parsed = numa_acpi_slit_parse(slit); + if (!slit_parsed) { + printf("NUMA: couldn't parse SLIT table\n"); + flag |= NUMAF_PROBEAFFINITY; + goto out; + } + +out: + return flag; +} + +int +numa_fakememory(struct numa_info *node) +{ + if (node->ni_arena == NULL) { + node->ni_arena = pmem_arena_create(node, 0, ptoa(physmem)); + if (node->ni_arena == NULL) + panic("numa_fakememory: no memory for faked NUMA node\n"); + } + + pmem_arena_add_regions(node->ni_arena); + printf("NUMA: faked memory has no PCI hole. Take care!\n"); + + return 0; +} + +int +md_numa_init(device_t self) +{ + NUMA_INFO_ITERATOR nii; + struct numa_info *ni; + + if (cpu_bsp_number == 0xff) + cpu_bsp_number = lapic_cpu_number(); + + for (NUMA_INFO_FOREACH(nii, ni)) { + config_found_ia(self, "numabus", NULL, NULL); + } + + return 0; +} Index: sys/arch/x86/x86/x86_numa_dev.c =================================================================== RCS file: sys/arch/x86/x86/x86_numa_dev.c diff -N sys/arch/x86/x86/x86_numa_dev.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/arch/x86/x86/x86_numa_dev.c 15 Dec 2008 23:03:27 -0000 @@ -0,0 +1,144 @@ +/* $NetBSD: $ */ +/* + * Copyright (c) 2008 Christoph Egger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: $"); + +#include +#include +#include +#include +#include +#include + +#include + +static int numa_match(device_t, cfdata_t, void *); +static void numa_attach(device_t, device_t, void *); +static int numabus_print(void *, const char *); + +CFATTACH_DECL_NEW(numa, 0, numa_match, numa_attach, NULL, NULL); + + +/* + * Probe for the numa device always succeeds. + */ +static int +numa_match(device_t parent, cfdata_t match, void *aux) +{ + return 1; +} + +/* + * Attach the numa. + */ +static void +numa_attach(device_t parent, device_t self, void *aux) +{ + struct numa_info *node; + uint32_t nodeid; + NUMACPU_ITERATOR ncii; + struct numa_cpu_info *nci; + + aprint_naive("\n"); + aprint_normal("\n"); + + nodeid = device_unit(self); + node = numanode_getbyid(nodeid); + + KASSERT(node != NULL); + + node->ni_dev = self; + + mi_numa_attach(node); + + /* A node w/o cpus is valid. Large NUMA machines + * have (dedicated) cpu-less nodes. 
+ */ + for (NUMACPU_FOREACH(node, ncii, nci)) { + struct cpu_attach_args caa; + + memset(&caa, 0, sizeof(caa)); + caa.cpu_number = nci->nci_apicid; + caa.cpu_func = &mp_cpu_funcs; + caa.cpu_nci = nci; + if (caa.cpu_number == cpu_bsp_number) { + caa.cpu_role = CPU_ROLE_SP; + if (((boothowto & RB_MD1) != RB_MD1) + && (node->ni_ncpus > 1)) + caa.cpu_role = CPU_ROLE_BP; + + KASSERT(numa_info_primary == NULL); + numa_info_primary = node; + } else { + caa.cpu_role = CPU_ROLE_AP; + } + + config_found_ia(self, "cpubus", &caa, numabus_print); + } + + pmem_arena_add_regions(node->ni_arena); +#ifdef DEBUG + pmem_regions_dump(node->ni_arena); +#endif + + if (!pmf_device_register(self, NULL, NULL)) + aprint_error_dev(self, "couldn't establish power handler\n"); + +} + +static int +numabus_print(void *aux, const char *pnp) +{ + struct cpu_attach_args *caa = aux; + + if (pnp) + aprint_normal("cpuX at %s", pnp); + aprint_normal(" apic %u", caa->cpu_nci->nci_apicid); + + aprint_debug(" acpi %u sapicid %u sapiceid %u", + caa->cpu_nci->nci_acpiid, + caa->cpu_nci->nci_sapicid, + caa->cpu_nci->nci_sapiceid); + + switch (caa->cpu_role) { + case CPU_ROLE_BP: + aprint_normal(" (BP)"); + break; + case CPU_ROLE_SP: + aprint_normal(" (SP)"); + break; + case CPU_ROLE_AP: + aprint_normal(" (AP)"); + break; + default: + aprint_normal(" (unknown)"); + break; + } + + return UNCONF; +} Index: sys/arch/xen/conf/files.xen =================================================================== RCS file: /cvsroot/src/sys/arch/xen/conf/files.xen,v retrieving revision 1.90 diff -u -p -r1.90 files.xen --- sys/arch/xen/conf/files.xen 20 Nov 2008 10:53:09 -0000 1.90 +++ sys/arch/xen/conf/files.xen 15 Dec 2008 23:03:27 -0000 @@ -160,15 +160,24 @@ define vesabiosbus {} define hypervisorbus {} define xendevbus {} define ipmibus {} +device numabus { } # # System bus types # -device mainbus: cpubus, ioapicbus, hypervisorbus, bios32, ipmibus +device mainbus: ioapicbus, hypervisorbus, bios32, ipmibus, numabus attach 
mainbus at root file arch/xen/x86/mainbus.c mainbus +# +# NUMA +# +device numa: cpubus +attach numa at numabus +file arch/x86/x86/x86_numa.c numa +file arch/x86/x86/x86_numa_dev.c numa + # Xen hypervisor device hypervisor { [apid = -1]}: isabus, pcibus, sysmon_power, xendevbus, acpibus attach hypervisor at hypervisorbus Index: sys/arch/xen/x86/mainbus.c =================================================================== RCS file: /cvsroot/src/sys/arch/xen/x86/mainbus.c,v retrieving revision 1.6 diff -u -p -r1.6 mainbus.c --- sys/arch/xen/x86/mainbus.c 9 Nov 2008 14:24:14 -0000 1.6 +++ sys/arch/xen/x86/mainbus.c 15 Dec 2008 23:03:27 -0000 @@ -51,7 +51,7 @@ __KERNEL_RCSID(0, "$NetBSD: mainbus.c,v #include "ipmi.h" -#include +#include #include #include @@ -91,6 +91,7 @@ int mp_verbose = 0; #endif /* defined(MPBIOS) || NACPI > 0 */ #endif /* defined(XEN3) && NPCI > 0 */ +uint32_t numa_bootflags = 0; int mainbus_match(device_t, cfdata_t, void *); void mainbus_attach(device_t, device_t, void *); @@ -102,7 +103,6 @@ int mainbus_print(void *, const char *); union mainbus_attach_args { const char *mba_busname; /* first elem of all */ - struct cpu_attach_args mba_caa; #if NHYPERVISOR > 0 struct hypervisor_attach_args mba_haa; #endif @@ -129,12 +129,14 @@ mainbus_attach(device_t parent, device_t { union mainbus_attach_args mba; #if defined(DOM0OPS) && defined(XEN3) - int numcpus = 0; + bool numa_faketopology = false; #ifdef MPBIOS + struct numa_info *node = NULL; int mpbios_present = 0; #endif #endif /* defined(DOM0OPS) && defined(XEN3) */ + aprint_naive("\n"); aprint_normal("\n"); @@ -165,25 +167,38 @@ mainbus_attach(device_t parent, device_t #endif /* PCI_BUS_FIXUP */ #if NACPI > 0 acpi_present = acpi_probe(); +#endif + numa_bootflags = mi_numa_init(); + + if (numa_bootflags & NUMAF_FAKETOPOLOGY) + numa_faketopology = true; + + if (numa_faketopology) { + aprint_normal_dev(self, "fake a one node NUMA system\n"); + + /* Fake a one node NUMA system */ + node = 
numanode_getbyid_alloc(0, true); + if (!node) + panic("NUMA: can't allocate memory for NUMA node\n"); + } + +#if NACPI > 0 if (acpi_present) - mpacpi_active = mpacpi_scan_apics(self, &numcpus); + mpacpi_active = mpacpi_scan_apics(self); if (!mpacpi_active) #endif { #ifdef MPBIOS if (mpbios_present) - mpbios_scan(self, &numcpus); - else + mpbios_scan(self); #endif - if (numcpus == 0) { - memset(&mba.mba_caa, 0, sizeof(mba.mba_caa)); - mba.mba_caa.cpu_number = 0; - mba.mba_caa.cpu_role = CPU_ROLE_SP; - mba.mba_caa.cpu_func = 0; - config_found_ia(self, "cpubus", - &mba.mba_caa, mainbus_print); - } } + + if (numa_faketopology) + numa_fakememory(node); + + md_numa_init(self); + #if NIOAPIC > 0 ioapic_enable(); #endif Index: sys/conf/files =================================================================== RCS file: /cvsroot/src/sys/conf/files,v retrieving revision 1.930 diff -u -p -r1.930 files --- sys/conf/files 11 Dec 2008 05:42:18 -0000 1.930 +++ sys/conf/files 15 Dec 2008 23:03:27 -0000 @@ -1396,6 +1396,7 @@ file kern/kern_malloc_stdtype.c file kern/kern_malloc_debug.c malloc_debug file kern/kern_module.c file kern/kern_mutex.c +file kern/kern_numa.c numa file kern/kern_fileassoc.c fileassoc file kern/kern_ntptime.c file kern/kern_pax.c pax_mprotect | pax_segvguard @@ -1443,6 +1444,7 @@ file kern/subr_kobj.c file kern/subr_lockdebug.c file kern/subr_log.c file kern/subr_percpu.c +file kern/subr_pmem.c file kern/subr_pool.c file kern/subr_prf.c file kern/subr_prf2.c Index: sys/kern/kern_numa.c =================================================================== RCS file: sys/kern/kern_numa.c diff -N sys/kern/kern_numa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/kern_numa.c 15 Dec 2008 23:03:28 -0000 @@ -0,0 +1,136 @@ +/* $NetBSD: $ */ + +/* + * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Andrew Doran. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +void numactlattach(int); + +dev_type_ioctl(numactl_ioctl); + +const struct cdevsw numactl_cdevsw = { + nullopen, nullclose, nullread, nullwrite, numactl_ioctl, + nullstop, notty, nopoll, nommap, nokqfilter, + D_OTHER | D_MPSAFE +}; + +kmutex_t numa_lock; +int nnuma; +struct numaqueue numa_queue = CIRCLEQ_HEAD_INITIALIZER(numa_queue); + +static struct numa_info *numa_infos[MAX_NUMA_NODES]; + +uint32_t +mi_numa_init(void) +{ + uint32_t flag = 0; + + /* Try to get the topology from the firmware first. 
+	 * If that fails, we fake a NUMA system with one
+	 * node with all CPUs and all memory on it.
+	 */
+	flag |= numa_get_topology();
+
+	/* Try to get the cpu/memory affinity from the firmware first.
+	 * If that fails, we must attach NUMA devices (i.e. CPUs)
+	 * first and then do some probing/measuring.
+	 */
+	flag |= numa_get_affinity();
+
+	return flag;
+}
+
+/*
+ * Register a NUMA node with the machine-independent layer.
+ *
+ * The current node count (nnuma) becomes the node's index, the node is
+ * stored in numa_infos[] at numa_index(ni) and appended to numa_queue,
+ * and nnuma is incremented.  Always returns 0.
+ *
+ * NOTE(review): nothing here checks the slot against the size of
+ * numa_infos[]; presumably at most MAX_NUMA_NODES nodes are ever
+ * attached -- confirm against the callers.
+ */
+int
+mi_numa_attach(struct numa_info *ni)
+{
+	struct numa_data *nd;
+
+	nd = &ni->ni_data;
+	nd->ni_index = nnuma;
+
+	numa_infos[numa_index(ni)] = ni;
+	CIRCLEQ_INSERT_TAIL(&numa_queue, ni, ni_data.numa_qchain);
+#if 0
+	TAILQ_INIT(&ni->ni_data.numa_ld_locks);
+	__numa_simple_lock_init(&ni->ni_data.numa_ld_lock);
+#endif
+
+#if notyet
+	sched_numaattach(ni);
+#endif
+
+	nnuma++;
+
+	return 0;
+}
+
+/*
+ * Attach routine of the numactl device; currently a no-op.
+ */
+void
+numactlattach(int dummy)
+{
+
+}
+
+/*
+ * ioctl entry point of the numactl device.  No command is implemented
+ * yet: every request returns 0.
+ */
+int
+numactl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
+{
+#if 0
+	NUMA_INFO_ITERATOR nii;
+	struct numa_info *ni;
+	int error, i;
+	u_int id;
+
+	error = 0;
+#endif
+
+	int error = 0;
+
+	return error;
+}
+
+/*
+ * Return the node stored at index idx of numa_infos[], or NULL if that
+ * slot is empty.  idx must be smaller than the size of numa_infos[].
+ */
+struct numa_info *
+numa_lookup(uint32_t idx)
+{
+	struct numa_info *ni;
+
+	/* Validate idx before it is used to index the array. */
+	KASSERT(idx < __arraycount(numa_infos));
+	ni = numa_infos[idx];
+	KASSERT(ni == NULL || numa_index(ni) == idx);
+
+	return ni;
+}
Index: sys/kern/subr_pmem.c
===================================================================
RCS file: sys/kern/subr_pmem.c
diff -N sys/kern/subr_pmem.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ sys/kern/subr_pmem.c 15 Dec 2008 23:03:28 -0000
@@ -0,0 +1,733 @@
+/* $NetBSD: $ */
+
+/*
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Christoph Egger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD: $");
+
+#include
+#include
+#include
+#include
+#include
+#include /* for "cold" */
+
+#include /* for PAGE_SIZE */
+
+/* structs */
+
+/* Where a descriptor lives: kmem(9) or one of the static boot pools. */
+#define PMEM_STORAGE 0
+#define PMEM_STORAGE_STATIC 1
+
+/* One memory module; linked on the global DIMM list and on its arena. */
+struct pmem_dimm {
+	struct pmem_arena *d_pmarena; /* pointer to pmem_arena */
+	vmem_t *d_vmem;
+	vmem_addr_t d_vmemaddr;
+	TAILQ_ENTRY(pmem_dimm) d_entry;		/* global DIMM list */
+	TAILQ_ENTRY(pmem_dimm) d_arena_entry;	/* per-arena DIMM list */
+
+	struct pmem_dimm_spec d_spec;
+	uint32_t d_storagetype;		/* PMEM_STORAGE* */
+};
+
+/* One physical address range; r_vmem manages allocations inside it. */
+struct pmem_phys_region {
+	struct pmem_arena *r_pmarena; /* pointer to pmem_arena */
+	vmem_t *r_vmem;
+	vmem_addr_t r_vmemaddr;
+	TAILQ_ENTRY(pmem_phys_region) r_entry;		/* global region list */
+	TAILQ_ENTRY(pmem_phys_region) r_arena_entry;	/* per-arena list */
+
+	struct pmem_region_spec r_spec;
+	uint32_t r_storagetype;		/* PMEM_STORAGE* */
+};
+
+/* Address range [pa_start, pa_end) plus the DIMMs and regions inside it. */
+struct pmem_arena {
+	paddr_t pa_start;
+	paddr_t pa_end;
+
+	struct numa_info *pa_numainfo;
+
+	TAILQ_ENTRY(pmem_arena) pa_entry;
+
+	TAILQ_HEAD(, pmem_dimm) pa_dimms;
+	TAILQ_HEAD(, pmem_phys_region) pa_regions;
+	uint32_t pa_storagetype;	/* PMEM_STORAGE* */
+};
+
+/* static storage for early bootstrapping */
+
+#ifndef PMEM_DIMM_STORAGE
+#define PMEM_DIMM_STORAGE 4
+#endif
+static struct pmem_dimm pmem_dimm_storage[PMEM_DIMM_STORAGE];
+static int pmem_ndimms = 0;
+
+static struct pmem_phys_region pmem_phys_region_storage[VM_PHYSSEG_MAX];
+static int pmem_nphys_regions = 0;
+
+#ifndef PMEM_ARENA_STORAGE
+#define PMEM_ARENA_STORAGE 1
+#endif
+static struct pmem_arena pmem_arena_storage[PMEM_ARENA_STORAGE];
+static int pmem_narenas = 0;
+
+/* lists */
+
+static TAILQ_HEAD(pmem_dimm_head, pmem_dimm) pmem_dimm_head =
+	TAILQ_HEAD_INITIALIZER(pmem_dimm_head);
+
+static TAILQ_HEAD(pmem_region_head, pmem_phys_region) pmem_region_head =
+	TAILQ_HEAD_INITIALIZER(pmem_region_head);
+
+static TAILQ_HEAD(pmem_arena_head, pmem_arena) pmem_arena_head =
+	TAILQ_HEAD_INITIALIZER(pmem_arena_head);
+
+/* macros */
+
+#define DIMM_ADD(dimm) \
+	TAILQ_INSERT_TAIL(&(pmem_dimm_head), (dimm), d_entry)
+#define DIMM_REMOVE(dimm) \
+	TAILQ_REMOVE(&(pmem_dimm_head), (dimm), d_entry)
+#define DIMM_FOREACH(dimm) \
+	TAILQ_FOREACH((dimm), &(pmem_dimm_head), d_entry)
+
+#define REGION_ADD(region) \
+	TAILQ_INSERT_TAIL(&(pmem_region_head), (region), r_entry)
+#define REGION_REMOVE(region) \
+	TAILQ_REMOVE(&(pmem_region_head), (region), r_entry)
+#define REGION_FOREACH(idx) \
+	TAILQ_FOREACH((idx), &(pmem_region_head), r_entry)
+#define REGION_FIRST \
+	TAILQ_FIRST(&(pmem_region_head))
+#define REGION_NEXT(region) \
+	TAILQ_NEXT((region), r_entry)
+
+
+#define ARENA_ADD(arena) \
+	TAILQ_INSERT_TAIL(&(pmem_arena_head), (arena), pa_entry)
+#define ARENA_REMOVE(arena) \
+	TAILQ_REMOVE(&(pmem_arena_head), (arena), pa_entry)
+#define ARENA_FOREACH(arena) \
+	TAILQ_FOREACH((arena), &(pmem_arena_head), pa_entry)
+
+
+#define ARENA_DIMM_INIT(arena) \
+	TAILQ_INIT(&(arena)->pa_dimms)
+#define ARENA_DIMM_ADD(arena, dimm) \
+	TAILQ_INSERT_TAIL(&(arena)->pa_dimms, (dimm), d_arena_entry)
+#define ARENA_DIMM_REMOVE(arena, dimm) \
+	TAILQ_REMOVE(&(arena)->pa_dimms, (dimm), d_arena_entry)
+#define ARENA_DIMM_EMPTY(arena) \
+	TAILQ_EMPTY(&(arena)->pa_dimms)
+#define ARENA_DIMM_FOREACH(arena, dimm) \
+	TAILQ_FOREACH((dimm), &(arena)->pa_dimms, d_arena_entry)
+
+#define ARENA_REGION_INIT(arena) \
+	TAILQ_INIT(&(arena)->pa_regions)
+#define ARENA_REGION_ADD(arena, region) \
+	TAILQ_INSERT_TAIL(&(arena)->pa_regions, (region), r_arena_entry)
+#define ARENA_REGION_REMOVE(arena, region) \
+	TAILQ_REMOVE(&(arena)->pa_regions, (region), r_arena_entry)
+#define ARENA_REGION_EMPTY(arena) \
+	TAILQ_EMPTY(&(arena)->pa_regions)
+#define ARENA_REGION_FOREACH(arena, region) \
+	TAILQ_FOREACH((region), &(arena)->pa_regions, r_arena_entry)
+
+
+/* internal */
+
+/*
+ * Allocate and fill in a DIMM descriptor for [start, end).
+ *
+ * While the kernel is cold and the static pool still has room, the
+ * descriptor comes from pmem_dimm_storage[]; otherwise it comes from
+ * kmem(9).  d_storagetype records which of the two was used.
+ *
+ * Returns NULL if the kmem allocation fails.
+ */
+static struct pmem_dimm *
+pmem_dimm_alloc(paddr_t start, paddr_t end,
+    pmem_type_t type, uint32_t serial)
+{
+	struct pmem_dimm *dimm;
+
+	KASSERT(start < end);
+
+	if (cold && pmem_ndimms < PMEM_DIMM_STORAGE) {
+		/* preload */
+		dimm = &pmem_dimm_storage[pmem_ndimms];
+		dimm->d_storagetype = PMEM_STORAGE_STATIC;
+		pmem_ndimms++;
+	} else {
+		dimm = kmem_zalloc(sizeof(struct pmem_dimm), KM_NOSLEEP);
+		/* KM_NOSLEEP may fail: check before touching the object. */
+		if (dimm == NULL)
+			return NULL;
+		dimm->d_storagetype = PMEM_STORAGE;
+	}
+
+	KASSERT(dimm != NULL);
+	dimm->d_spec.d_start = start;
+	dimm->d_spec.d_end = end;
+	dimm->d_spec.d_type = type;
+	dimm->d_spec.d_serial = serial;
+
+	return dimm;
+}
+
+/*
+ * Allocate and fill in a physical region descriptor for [start, end).
+ *
+ * Uses pmem_phys_region_storage[] while the kernel is cold and the pool
+ * has room, kmem(9) otherwise; r_storagetype records the origin.
+ *
+ * Returns NULL if the kmem allocation fails.
+ */
+static struct pmem_phys_region *
+pmem_region_alloc(paddr_t start, paddr_t end,
+    pmem_type_t type, pmem_prot_t prot, pmem_props_t props)
+{
+	struct pmem_phys_region *region;
+
+	KASSERT(start < end);
+
+	if (cold && pmem_nphys_regions < VM_PHYSSEG_MAX) {
+		/* preload */
+		region = &pmem_phys_region_storage[pmem_nphys_regions];
+		region->r_storagetype = PMEM_STORAGE_STATIC;
+		pmem_nphys_regions++;
+	} else {
+		region = kmem_zalloc(sizeof(struct pmem_phys_region),
+		    KM_NOSLEEP);
+		/* KM_NOSLEEP may fail: check before touching the object. */
+		if (region == NULL)
+			return NULL;
+		region->r_storagetype = PMEM_STORAGE;
+	}
+
+	KASSERT(region != NULL);
+	region->r_spec.r_start = start;
+	region->r_spec.r_end = end;
+	region->r_spec.r_type = type;
+	region->r_spec.r_prot = prot;
+	region->r_spec.r_props = props;
+
+	return region;
+}
+
+/*
+ * Return true iff [start, end) overlaps the region's [r_start, r_end).
+ */
+static bool
+pmem_region_match(const struct pmem_phys_region *region,
+    paddr_t start, paddr_t end)
+{
+	KASSERT(region != NULL);
+	KASSERT(start < end);
+
+	if (region->r_spec.r_end <= start)
+		return false;
+	if (region->r_spec.r_start >= end)
+		return false;
+
+	return true;
+}
+
+/*
+ * Find the next region on the global list that overlaps [start, end).
+ * The search starts at the list head when `region' is NULL, otherwise
+ * at the entry following `region'.  Returns NULL when nothing matches.
+ */
+static struct pmem_phys_region *
+pmem_region_search(const struct pmem_phys_region *region,
+    paddr_t start, paddr_t end)
+{
+	struct pmem_phys_region *reg;
+
+	KASSERT(start < end);
+
+	if (region == NULL)
+		reg = REGION_FIRST;
+	else
+		reg = REGION_NEXT(region);
+
+	while (reg != NULL) {
+		if (pmem_region_match(reg, start, end))
+			return reg;
+
+		reg = REGION_NEXT(reg);
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate an arena descriptor covering [start, end) and initialise its
+ * DIMM and region lists.  The empty range (0, 0) is accepted as well.
+ *
+ * The static pool is used first (regardless of `cold'), kmem(9) after
+ * that; pa_storagetype records the origin.
+ *
+ * Returns NULL if the kmem allocation fails.
+ */
+static struct pmem_arena *
+pmem_arena_alloc(paddr_t start, paddr_t end)
+{
+	struct pmem_arena *arena;
+
+	KASSERT(start < end || (start == 0 && end == 0));
+
+	if (pmem_narenas < PMEM_ARENA_STORAGE) {
+		/* preload */
+		arena = &pmem_arena_storage[pmem_narenas];
+		arena->pa_storagetype = PMEM_STORAGE_STATIC;
+		pmem_narenas++;
+	} else {
+		arena = kmem_zalloc(sizeof(struct pmem_arena),
+		    KM_NOSLEEP);
+		/* KM_NOSLEEP may fail: check before touching the object. */
+		if (arena == NULL)
+			return NULL;
+		arena->pa_storagetype = PMEM_STORAGE;
+	}
+
+	KASSERT(arena != NULL);
+	arena->pa_start = start;
+	arena->pa_end = end;
+	ARENA_DIMM_INIT(arena);
+	ARENA_REGION_INIT(arena);
+
+	return arena;
+}
+
+/*
+ * Attach `region' to `arena': create the vmem arena that hands out
+ * addresses inside the region, then link the region on the arena's
+ * region list.
+ *
+ * Returns 0 on success, ENOMEM if vmem_create() fails (the region is
+ * left unattached in that case).
+ */
+static int
+pmem_arena_add_region(struct pmem_arena *arena,
+    struct pmem_phys_region *region)
+{
+	int error;
+
+	KASSERT(arena != NULL);
+	KASSERT(region != NULL);
+
+	region->r_vmem = vmem_create("pmem_region",
+	    region->r_spec.r_start,
+	    region->r_spec.r_end - region->r_spec.r_start,
+	    PAGE_SIZE,
+	    NULL, NULL,
+	    NULL, /* vmem backend */
+	    0, /* qcache_max */
+	    VM_NOSLEEP, IPL_NONE);
+	if (region->r_vmem == NULL) {
+		error = ENOMEM;
+		goto err0;
+	}
+
+	region->r_pmarena = arena;
+	ARENA_REGION_ADD(arena, region);
+
+	/* XXX register callback handler */
+
+	return 0;
+
+err0:
+	return error;
+}
+
+/* API */
+
+/* Load physical addresses [start, end) having the given default properties.
+ * The region is only put on the global list; it is attached to an arena
+ * later by pmem_arena_add_regions().  Returns 0 or ENOMEM.
+ */
+int
+pmem_region_create(paddr_t start, paddr_t end,
+    pmem_type_t type, pmem_prot_t prot, pmem_props_t props)
+{
+	struct pmem_phys_region *region;
+
+	KASSERT(start < end);
+
+	region = pmem_region_alloc(start, end, type, prot, props);
+	if (region == NULL)
+		return ENOMEM;
+
+	REGION_ADD(region);
+
+	return 0;
+}
+
+/* Connect loaded physical addresses with this arena.
+ * Every still unassigned region lying completely inside
+ * [pa_start, pa_end] is attached.  Stops at the first failure and
+ * returns its error; regions attached before that stay attached.
+ */
+int
+pmem_arena_add_regions(struct pmem_arena *arena)
+{
+	int error = 0;
+	struct pmem_phys_region *region;
+
+	KASSERT(arena != NULL);
+
+	REGION_FOREACH(region) {
+		/* already assigned? */
+		if (region->r_pmarena != NULL)
+			continue;
+		KASSERT(region->r_vmem == NULL);
+		if (region->r_spec.r_start < arena->pa_start)
+			continue;
+		if (region->r_spec.r_end > arena->pa_end)
+			continue;
+
+		error = pmem_arena_add_region(arena, region);
+		if (error)
+			goto out;
+	}
+
+out:
+	return error;
+}
+
+/*
+ * Describe a memory module covering [start, end) and link it on both
+ * the global DIMM list and the arena's DIMM list.
+ * Returns 0 on success or ENOMEM.
+ */
+int
+pmem_arena_add_dimm(struct pmem_arena *arena,
+    paddr_t start, paddr_t end, pmem_type_t type, uint32_t serial)
+{
+	int error;
+	struct pmem_dimm *dimm;
+
+	KASSERT(arena != NULL);
+	KASSERT(end > start);
+
+	dimm = pmem_dimm_alloc(start, end, type, serial);
+	if (dimm == NULL) {
+		error = ENOMEM;
+		goto err0;
+	}
+
+	KASSERT(dimm != NULL);
+	dimm->d_pmarena = arena;
+
+	DIMM_ADD(dimm);
+	ARENA_DIMM_ADD(arena, dimm);
+
+	/* XXX register callback handler */
+
+	return 0;
+
+err0:
+	return error;
+}
+
+/*
+ * Create an arena for NUMA node `ni' covering [start, end) and put it
+ * on the global arena list.  Returns NULL if no memory is available.
+ *
+ * NOTE(review): pmem_arena_alloc() also accepts the empty range (0, 0),
+ * but the assertion below rejects it -- confirm which one is intended.
+ */
+struct pmem_arena *
+pmem_arena_create(struct numa_info *ni, paddr_t start, paddr_t end)
+{
+	struct pmem_arena *arena;
+	KASSERT(start < end);
+	KASSERT(ni != NULL);
+
+	arena = pmem_arena_alloc(start, end);
+	if (arena == NULL)
+		return NULL;
+
+	arena->pa_numainfo = ni;
+	ARENA_ADD(arena);
+
+	return arena;
+}
+
+/*
+ * Set arena size indirectly.
+ * Only for bootstrapping code.
+ *
+ * Widens [pa_start, pa_end) so that it includes [start, end).  Unless
+ * `type' is PMEM_U_UNKNOWN, every loaded region overlapping
+ * [start, end) whose type is still PMEM_U_UNKNOWN is set to `type'.
+ * Always returns 0.
+ */
+int
+pmem_arena_loadrange(struct pmem_arena *arena, paddr_t start, paddr_t end,
+    pmem_type_t type)
+{
+	struct pmem_phys_region *region = NULL;
+
+	KASSERT(arena != NULL);
+	KASSERT(start < end);
+	KASSERT(cold); /* this is only for bootstrapping */
+
+	if (start < arena->pa_start)
+		arena->pa_start = start;
+	if (end > arena->pa_end)
+		arena->pa_end = end;
+
+	if (type == PMEM_U_UNKNOWN)
+		return 0;
+
+	while ((region = pmem_region_search(region, start, end)) != NULL) {
+		KASSERT(region != NULL);
+		/* never override a type somebody already decided on */
+		if (region->r_spec.r_type != PMEM_U_UNKNOWN)
+			continue;
+
+		region->r_spec.r_type = type;
+	}
+
+	return 0;
+}
+
+/*
+ * Create a region for [start, end) and attach it to `arena' in one
+ * step; on success the region is also put on the global region list.
+ *
+ * Returns 0 on success, ENOMEM if the descriptor cannot be allocated,
+ * or the error from pmem_arena_add_region().
+ */
+int
+pmem_arena_prime(struct pmem_arena *arena, paddr_t start, paddr_t end,
+    pmem_type_t type, pmem_prot_t prot, pmem_props_t props)
+{
+	int error;
+	struct pmem_phys_region *region;
+
+	KASSERT(arena != NULL);
+	KASSERT(start < end);
+
+	region = pmem_region_alloc(start, end, type, prot, props);
+	if (region == NULL) {
+		error = ENOMEM;
+		goto err0;
+	}
+
+	error = pmem_arena_add_region(arena, region);
+	if (error)
+		goto err1;
+
+	REGION_ADD(region);
+	return 0;
+
+err1:
+	/*
+	 * Only descriptors obtained from kmem(9) may be handed back, and
+	 * the size must be the one they were allocated with.  A slot taken
+	 * from the static boot pool simply stays consumed.
+	 */
+	if (region->r_storagetype == PMEM_STORAGE)
+		kmem_free(region, sizeof(struct pmem_phys_region));
+err0:
+	return error;
+}
+
+/* Connect two arenas.
+ * Not implemented yet: `m' and `metric' are ignored, 0 is returned.
+ */
+int
+pmem_arena_connect(struct pmem_arena *left, struct pmem_arena *right,
+    struct pmem_mapping *m, pmem_metric_t metric)
+{
+	KASSERT(left != NULL);
+	KASSERT(right != NULL);
+
+	return 0;
+}
+
+/* Reserve a region in arena `a' that meets the given criteria.
+ * The region is returned with a reference count of at least 1.
+ *
+ * [minaddr, maxaddr) is clipped to the arena first.  The address range
+ * is taken from the first region of the arena that overlaps the clipped
+ * range, has exactly the requested prot/props/type and whose vmem arena
+ * can satisfy the request.  `maxmetric' is not used yet.
+ *
+ * Returns NULL if nothing fits or no memory is available.
+ */
+struct pmem_region *
+pmem_alloc(struct pmem_arena *arena, paddr_t minaddr, paddr_t maxaddr,
+    pmem_prot_t prot, pmem_props_t props, pmem_type_t type,
+    size_t align, size_t phase, size_t size, size_t nocross,
+    vm_flag_t flags, pmem_metric_t maxmetric)
+{
+	struct pmem_region *r = NULL;
+	struct pmem_phys_region *tmp;
+	vmem_addr_t addr;
+
+	KASSERT(arena != NULL);
+	KASSERT(maxaddr > minaddr);
+	KASSERT(size > 0);
+	KASSERT((maxaddr - minaddr) >= size);
+	/* exactly one of VM_BESTFIT and VM_INSTANTFIT must be given */
+	KASSERT((flags & (VM_BESTFIT|VM_INSTANTFIT)) != 0);
+	KASSERT((~flags & (VM_BESTFIT|VM_INSTANTFIT)) != 0);
+
+	/* Clip search area to the arena. */
+	if (arena->pa_start > minaddr)
+		minaddr = arena->pa_start;
+	if (arena->pa_end < maxaddr)
+		maxaddr = arena->pa_end;
+
+	/* Check if this arena is large enough */
+	if ((maxaddr - minaddr) < size)
+		return NULL;
+
+	/* Check bounds */
+	if (arena->pa_start >= maxaddr)
+		return NULL;
+	if (arena->pa_end <= minaddr)
+		return NULL;
+
+	r = kmem_zalloc(sizeof(struct pmem_region), KM_NOSLEEP);
+	if (r == NULL)
+		return NULL;
+
+	ARENA_REGION_FOREACH(arena, tmp) {
+		if (!pmem_region_match(tmp, minaddr, maxaddr))
+			continue;
+
+		if (tmp->r_spec.r_prot != prot)
+			continue;
+		if (tmp->r_spec.r_props != props)
+			continue;
+		if (tmp->r_spec.r_type != type)
+			continue;
+
+		addr = vmem_xalloc(tmp->r_vmem, size, align, phase,
+		    nocross, minaddr, maxaddr, flags | VM_NOSLEEP);
+		if (addr == VMEM_ADDR_NULL)
+			continue;
+
+		goto found;
+	}
+
+	kmem_free(r, sizeof(struct pmem_region));
+	return NULL;
+
+found:
+	r->r_spec.r_start = addr;
+	r->r_spec.r_end = addr + size;
+	r->r_spec.r_prot = tmp->r_spec.r_prot;
+	r->r_spec.r_props = tmp->r_spec.r_props;
+	r->r_spec.r_type = tmp->r_spec.r_type;
+	r->r_refcount = 1;
+	r->r_physregion = tmp;
+
+	return r;
+}
+
+/*
+ * Give the range of `*r' back to its physical region and free the
+ * descriptor; `*r' is set to NULL.
+ * Returns EBUSY, leaving `*r' untouched, while more than one reference
+ * is held; 0 otherwise.
+ */
+int
+pmem_free(struct pmem_region **r)
+{
+	struct pmem_region *r1;
+	KASSERT(r != NULL);
+	KASSERT(*r != NULL);
+
+	r1 = *r;
+
+	/* still referenced by others */
+	if (r1->r_refcount > 1)
+		return EBUSY;
+
+	vmem_xfree(r1->r_physregion->r_vmem, r1->r_spec.r_start,
+	    r1->r_spec.r_end - r1->r_spec.r_start);
+	kmem_free(r1, sizeof(struct pmem_region));
+
+	*r = NULL;
+	return 0;
+}
+
+/* Get/set properties on the region `r'.
+ * Not implemented yet: pmem_get() stores nothing through its pointers.
+ */
+int
+pmem_get(struct pmem_region *r, pmem_prot_t *prot, pmem_props_t *props,
+    pmem_type_t *type)
+{
+	KASSERT(r != NULL);
+	KASSERT(prot != NULL);
+	KASSERT(props != NULL);
+	KASSERT(type != NULL);
+
+	return 0;
+}
+
+/* Not implemented yet: the region is left unchanged. */
+int
+pmem_set(struct pmem_region *r, pmem_prot_t prot, pmem_props_t props,
+    pmem_type_t type)
+{
+	KASSERT(r != NULL);
+
+	return 0;
+}
+
+/* Count another reference to region `r'.
+ * Not implemented yet: r_refcount is not touched.
+ */
+void
+pmem_incref(struct pmem_region *r)
+{
+	KASSERT(r != NULL);
+
+}
+
+/* Not implemented yet: r_refcount is not touched. */
+void
+pmem_decref(struct pmem_region *r)
+{
+	KASSERT(r != NULL);
+
+}
+
+/* Not implemented yet: always returns NULL, `*paddr' is not written. */
+struct pmem_region *
+pmem_map(struct pmem_arena *arena, struct pmem_region *r, paddr_t *paddr)
+{
+	KASSERT(arena != NULL);
+	KASSERT(r != NULL);
+	KASSERT(paddr != NULL);
+
+	return NULL;
+}
+
+/* Not implemented yet. */
+void
+pmem_unmap(struct pmem_region *r)
+{
+	KASSERT(r != NULL);
+
+}
+
+/* dumps */
+
+/* Print one region; `i' is only used as the running number shown. */
+static void
+pmem_region_print(int i, struct pmem_phys_region *region)
+{
+	/* paddr_t need not be 64 bits wide: widen explicitly for PRIx64 */
+	printf( " region %i: 0x%"PRIx64" - 0x%"PRIx64
+	    " protection: 0x%x, properties: 0x%x, type: 0x%x\n",
+	    i,
+	    (uint64_t)region->r_spec.r_start,
+	    (uint64_t)region->r_spec.r_end,
+	    region->r_spec.r_prot,
+	    region->r_spec.r_props,
+	    region->r_spec.r_type);
+}
+
+/* Print the regions of `arena', or all loaded regions if it is NULL. */
+void
+pmem_regions_dump(struct pmem_arena *arena)
+{
+	int i = 0;
+	struct pmem_phys_region *region;
+
+	if (arena) {
+		ARENA_REGION_FOREACH(arena, region) {
+			pmem_region_print(i, region);
+			i++;
+		}
+		return;
+	}
+
+	REGION_FOREACH(region) {
+		pmem_region_print(i, region);
+		i++;
+	}
+}
+
+/* Print one DIMM; `i' is only used as the running number shown. */
+static void
+pmem_dimm_print(int i, struct pmem_dimm *dimm)
+{
+	/* paddr_t need not be 64 bits wide: widen explicitly for PRIx64 */
+	printf( " dimm %i: 0x%"PRIx64" - 0x%"PRIx64
+	    " type: 0x%x, serial: 0x%x\n",
+	    i,
+	    (uint64_t)dimm->d_spec.d_start,
+	    (uint64_t)dimm->d_spec.d_end,
+	    dimm->d_spec.d_type,
+	    dimm->d_spec.d_serial);
+}
+
+/* Print the DIMMs of `arena', or all known DIMMs if it is NULL. */
+void
+pmem_dimms_dump(struct pmem_arena *arena)
+{
+	int i = 0;
+	struct pmem_dimm *dimm;
+
+	if (arena) {
+		ARENA_DIMM_FOREACH(arena, dimm) {
+			pmem_dimm_print(i, dimm);
+			i++;
+		}
+		return;
+	}
+
+	DIMM_FOREACH(dimm) {
+		pmem_dimm_print(i, dimm);
+		i++;
+	}
+}
+
+/* Print every arena, optionally followed by its regions and DIMMs. */
+void
+pmem_arenas_dump(bool dump_regions, bool dump_dimms)
+{
+	int i = 0;
+	struct pmem_arena *arena;
+
+	ARENA_FOREACH(arena) {
+		/* paddr_t need not be 64 bits wide: widen for PRIx64 */
+		printf("arena %i: 0x%"PRIx64" - 0x%"PRIx64"\n",
+		    i,
+		    (uint64_t)arena->pa_start,
+		    (uint64_t)arena->pa_end);
+
+		if (dump_regions) {
+			pmem_regions_dump(arena);
+		}
+
+		if (dump_dimms) {
+			pmem_dimms_dump(arena);
+		}
+		i++;
+	}
+}
Index: sys/sys/kcore.h
===================================================================
RCS file: /cvsroot/src/sys/sys/kcore.h,v
retrieving revision 1.2
diff -u -p -r1.2 kcore.h
--- sys/sys/kcore.h	26 Dec 2005 18:41:36 -0000	1.2
+++ sys/sys/kcore.h	15 Dec 2008 23:03:28 -0000
@@ -47,6 +47,7 @@
 typedef struct {
 	u_quad_t start; /* Physical start address */
 	u_quad_t size; /* Size in bytes */
+	uint32_t type; /* pmem(9) type of space */
 } phys_ram_seg_t;
 
 typedef struct kcore_hdr {
Index: sys/sys/numa.h
===================================================================
RCS file: sys/sys/numa.h
diff -N sys/sys/numa.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sys/numa.h	15 Dec 2008 23:03:28 -0000
@@ -0,0 +1,72 @@
+/* $NetBSD: $ */
+
+/*
+ * Copyright (c) 2008 Christoph Egger
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_NUMA_H_
+#define _SYS_NUMA_H_
+
+#include
+
+
+#ifndef NUMA_INFO_ITERATOR
+#define NUMA_INFO_ITERATOR int
+#define NUMA_INFO_FOREACH(nii, ni) \
+	(void)(nii), (ni) = curnode(); (ni) != NULL; (ni) = NULL
+#endif
+
+#ifndef NUMAINFO_IS_PRIMARY
+#define NUMAINFO_IS_PRIMARY(ni) ((void)(ni), 1)
+#endif
+
+/* MI NUMA flags */
+#define NUMAF_FAKETOPOLOGY 0x1 /* fake a NUMA topology */
+#define NUMAF_SCANTOPOLOGY 0x2 /* scan for cpu/memory devices
+				 * and build topology based on
+				 * the findings.
+				 */
+#define NUMAF_PROBEAFFINITY 0x4 /* probe affinity between nodes */
+
+/* MI NUMA cpu types */
+
+/* MI NUMA cpu flags */
+#define NUMACPU_FLAG_PRIMARY 0x00000001
+
+struct numa_info *numa_lookup(uint32_t);
+
+CIRCLEQ_HEAD(numaqueue, numa_info);
+
+extern kmutex_t numa_lock;
+extern uint32_t maxnuma;
+extern struct numaqueue numa_queue;
+
+static inline uint32_t
+numa_index(struct numa_info *ni)
+{
+	return ni->ni_data.ni_index;
+}
+
+#endif /* !_SYS_NUMA_H_ */
Index: sys/sys/numa_data.h
===================================================================
RCS file: sys/sys/numa_data.h
diff -N sys/sys/numa_data.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sys/numa_data.h	15 Dec 2008 23:03:28 -0000
@@ -0,0 +1,84 @@
+/* $NetBSD: $ */
+
+/*
+ * Copyright (c) 2008 Christoph Egger
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_NUMA_DATA_H_
+#define _SYS_NUMA_DATA_H_
+
+#include
+
+/*
+ * MI per numa-node data
+ *
+ * this structure is intended to be included in MD numa_info structure.
+ * struct numa_info {
+ *	struct numa_data ni_data;
+ * }
+ *
+ * note that numa_data is not expected to contain much data,
+ * as numa_info is size-limited on most ports.
+ */
+
+struct numa_data {
+	/*
+	 * The first section is likely to be touched by other NUMAs
+	 */
+	CIRCLEQ_ENTRY(numa_info) numa_qchain; /* circleq of all NUMAs */
+
+	/*
+	 * This section is mostly NUMA-private.
+	 */
+	uint32_t ni_index; /* NUMA node index */
+};
+
+/*
+ * MI per cpu numa-node data
+ *
+ * this structure is intended to be included in MD numa_cpu_info structure.
+ * struct numa_cpu_info {
+ *	struct numa_cpu_data nci_data;
+ * }
+ *
+ * note that numa_cpu_data is not expected to contain much data,
+ * as numa_cpu_info is size-limited on most ports.
+ */
+
+struct numa_cpu_data {
+	uint32_t dummy;
+};
+
+int mi_numa_attach(struct numa_info *);
+
+#define NUMAF_FAKETOPOLOGY 0x1 /* fake a NUMA topology */
+#define NUMAF_SCANTOPOLOGY 0x2 /* scan for cpu/memory devices
+				 * and build topology based on
+				 * the findings.
+				 */
+#define NUMAF_PROBEAFFINITY 0x4 /* probe affinity between nodes */
+uint32_t mi_numa_init(void);
+
+#endif /* _SYS_NUMA_DATA_H_ */
Index: sys/sys/param.h
===================================================================
RCS file: /cvsroot/src/sys/sys/param.h,v
retrieving revision 1.335
diff -u -p -r1.335 param.h
--- sys/sys/param.h	9 Dec 2008 20:48:52 -0000	1.335
+++ sys/sys/param.h	15 Dec 2008 23:03:28 -0000
@@ -174,6 +174,9 @@
 #ifndef MAXCPUS
 #define MAXCPUS 32
 #endif
+#ifndef MAX_NUMA_NODES
+#define MAX_NUMA_NODES 32
+#endif
 #ifndef MAX_LWP_PER_PROC
 #define MAX_LWP_PER_PROC 8000
 #endif
Index: sys/sys/pmem.h
===================================================================
RCS file: sys/sys/pmem.h
diff -N sys/sys/pmem.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sys/pmem.h	15 Dec 2008 23:03:28 -0000
@@ -0,0 +1,247 @@
+/* $NetBSD: $ */
+/*
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Christoph Egger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_PMEM_H_
+#define _SYS_PMEM_H_
+
+#include
+#include
+
+enum pmem_type {
+	/* physical RAM types */
+	PMEM_T_NORMAL = 0x0000, /* normal */
+	PMEM_T_HOTSPARE = 0x0001, /* reserved to replace RAM in
+				   * critical or defective state
+				   */
+	PMEM_T_CRITICAL = 0x0002, /* usable, but may become unusable
+				   * if too frequently accessed or
+				   * not run at lower power,
+				   * for example.
+				   */
+	PMEM_T_DEFECT = 0x0004, /* really broken, unusable */
+	PMEM_T_OFFLINE = 0x0008, /* unusable, but not necessarily
+				  * defective
+				  */
+	PMEM_T_SERIALNR = 0x0010, /* serial number is available */
+
+	PMEM_T_MASK = 0xffff, /* physical type mask */
+
+	/* logical *use* RAM types */
+	PMEM_U_UNKNOWN = 0x00000000, /* Unknown/reserved region found by
+				      * bootstrapping code; bus scanning
+				      * code/drivers need to figure out
+				      * what it is for and may change this type.
+				      */
+	PMEM_U_TEXT = 0x00010000, /* Code */
+	PMEM_U_DMABUF = 0x00020000, /* DMA buffer */
+	PMEM_U_FIRMWARE = 0x00040000, /* Firmware data (e.g. ACPI) */
+	PMEM_U_RAM = 0x00080000, /* normal usable RAM */
+	PMEM_U_ROM = 0x00100000, /* any ROM */
+	PMEM_U_MMIO = 0x00200000, /* any MMIO (e.g. PCI memory) */
+
+	PMEM_U_MIRROR = 0x00400000, /* mirrors another range to provide
+				     * a valid copy in case of memory
+				     * errors during access.
+				     * Allows turning uncorrectable
+				     * machine-check errors into
+				     * correctable errors, for example.
+				     */
+	PMEM_U_PTP = 0x00800000, /* Pagetable Pages (e.g. MMU, IOMMU) */
+	PMEM_U_MASK = 0xffff0000, /* logical use mask */
+};
+
+enum pmem_prot { /* hardware implementation */
+	PMEM_PROT_UNKNOWN = 0x00,
+	PMEM_PROT_READ = 0x01, /* PCI bus bridge, IOMMU */
+	PMEM_PROT_WRITE = 0x02, /* PCI bus bridge, IOMMU, MTRR,
+				 * AMD Elan SC520 PAR
+				 */
+	PMEM_PROT_EXEC = 0x04, /* AMD Elan SC520 PAR */
+};
+
+enum pmem_props { /* hardware implementation */
+	PMEM_P_UNKNOWN = 0x00,
+	PMEM_P_WTHRU = 0x01, /* MTRR */
+	PMEM_P_WBACK = 0x02, /* MTRR */
+	PMEM_P_WCOMB = 0x04, /* MTRR */
+	PMEM_P_UNCACHED = 0x08, /* MTRR, AMD Elan SC520 PAR */
+	PMEM_P_PREFETCH = 0x10, /* PCI bus bridge */
+	PMEM_P_32BIT = 0x20, /* 32-bit access */
+	PMEM_P_64BIT = 0x40, /* 64-bit access */
+	PMEM_P_DMA = 0x80, /* DMA-safe memory */
+};
+
+
+typedef enum pmem_type pmem_type_t;
+typedef enum pmem_prot pmem_prot_t;
+typedef enum pmem_props pmem_props_t;
+
+
+struct numa_info;
+struct pmem_arena;
+struct pmem_mapping;
+struct pmem_phys_region;
+
+/* Describe a memory DIMM
+ * that is physically present in the machine.
+ * Some of the information may be provided by MD bootstrap code,
+ * most of it may be provided by spdmem(4).
+ */
+struct pmem_dimm_spec {
+	paddr_t d_start;
+	paddr_t d_end;
+	pmem_type_t d_type;
+	uint32_t d_serial; /* DIMM serial number */
+};
+
+struct pmem_region_spec {
+	paddr_t r_start;
+	paddr_t r_end;
+	pmem_prot_t r_prot;
+	pmem_props_t r_props;
+	pmem_type_t r_type;
+};
+
+struct pmem_region {
+	struct pmem_region_spec r_spec;
+
+	u_int r_refcount;
+	struct pmem_phys_region *r_physregion;
+};
+
+/* One arena per NUMA-node */
+struct numa_info;
+struct pmem_arena;
+
+struct pmem_mapping {
+	int dummy; /* TBD */
+};
+
+typedef uint32_t pmem_metric_t;
+
+
+/* Create arena. [start, end) describes the address range
+ * of the arena including all holes.
+ */
+struct pmem_arena *
+pmem_arena_create(struct numa_info *ni, paddr_t start, paddr_t end);
+
+/* Set arena size incrementally. Only needed if you can't get the
+ * information in an ordered way.
+ */
+int
+pmem_arena_loadrange(struct pmem_arena *arena, paddr_t start, paddr_t end,
+    pmem_type_t type);
+
+/* Add new memory module to arena. */
+int
+pmem_arena_add_dimm(struct pmem_arena *arena,
+    paddr_t start, paddr_t end, pmem_type_t type, uint32_t serial);
+
+/* Load physical addresses [start, end) having the given default properties.
+ */
+int
+pmem_region_create(paddr_t start, paddr_t end,
+    pmem_type_t type, pmem_prot_t prot, pmem_props_t props);
+
+/* Connect loaded physical addresses with this arena. */
+int
+pmem_arena_add_regions(struct pmem_arena *arena);
+
+/* Load arena with physical addresses [start, end) having the given
+ * default properties. This basically does the same as
+ * pmem_region_create() and pmem_arena_add_regions() in one step,
+ * but can't be used in very early MD bootstrapping.
+ */
+int
+pmem_arena_prime(struct pmem_arena *arena, paddr_t start, paddr_t end,
+    pmem_type_t type, pmem_prot_t prot, pmem_props_t props);
+
+/* Connect two arenas. */
+int
+pmem_arena_connect(struct pmem_arena *left, struct pmem_arena *right,
+    struct pmem_mapping *m, pmem_metric_t metric);
+
+/* Reserve a region in arena that meets the given criteria.
+ * The region is returned with a reference count of at least 1.
+ */
+struct pmem_region *
+pmem_alloc(struct pmem_arena *arena, paddr_t minaddr, paddr_t maxaddr,
+    pmem_prot_t prot, pmem_props_t props, pmem_type_t type,
+    size_t align, size_t phase, size_t size, size_t nocross,
+    vm_flag_t flags, pmem_metric_t maxmetric);
+
+int
+pmem_free(struct pmem_region **r);
+
+/* Get/set properties on the region `r'. */
+int
+pmem_get(struct pmem_region *r, pmem_prot_t *prot, pmem_props_t *props,
+    pmem_type_t *type);
+
+int
+pmem_set(struct pmem_region *r, pmem_prot_t prot, pmem_props_t props,
+    pmem_type_t type);
+
+/* Count another reference to region `r'. */
+void
+pmem_incref(struct pmem_region *r);
+
+/* Reduce the reference count on `r' by one. pmem_decref may reclaim the
+ * resources held by `r'.
+ */
+void
+pmem_decref(struct pmem_region *r);
+
+/* Map region `r' into arena `a'.
+ *
+ * Returns NULL on failure. `paddr' is undefined on failure.
+ *
+ * On success, return `r' if region `r' belongs to arena `a', or else
+ * return an alias for region `r' in `a'. The returned region's reference
+ * count is increased by one. Set `paddr' to the physical address of
+ * the start of the region `r' in arena `a'.
+ */
+struct pmem_region *
+pmem_map(struct pmem_arena *arena, struct pmem_region *r, paddr_t *paddr);
+
+/* Remove a mapping of `r' from its arena. Decrease the reference count
+ * by one.
+ */
+void
+pmem_unmap(struct pmem_region *r);
+
+
+/* debug */
+void pmem_regions_dump(struct pmem_arena *arena);
+void pmem_dimms_dump(struct pmem_arena *arena);
+void pmem_arenas_dump(bool dump_regions, bool dump_dimms);
+
+#endif /* _SYS_PMEM_H_ */