mm: refactor page frame allocator

This is part 3 of the mm subsystem overhaul.
The allocator no longer relies on a mutex and
instead uses individual per-order spinlocks.
It is also aware of multiple memory zones (normal
and DMA) as well as emergency reserves.
Page bitmap overhead is reduced by 50 %.
main
anna 2 years ago
parent 825a981d67
commit 385af1b7ef
Signed by: fef
GPG Key ID: EC22E476DC2D3D84

@ -202,7 +202,7 @@ static inline bool latom_flip_bit(latom_t *latom, int pos)
__asm__ volatile(
X86_LOCK_PREFIX
" btcq %1, (%2) \n"
" btcq %q1, (%2) \n"
" setc %b0 \n"
: "+r"(ret)
: "r"(pos), "r"(&latom->_value)

@ -5,7 +5,6 @@
#include <arch/multiboot.h>
#include <arch/vmparam.h>
#include <gay/linker.h>
#include <gay/mm.h>
#include <gay/vm/page.h>
#include <gay/systm.h>
@ -15,7 +14,7 @@
#include <string.h>
struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET;
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
/* this gets updated in x86_setup_paging() once we know how big the array is */
vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH);
#endif
@ -41,6 +40,51 @@ static void register_area(struct mb2_mmap_entry *entry)
}
}
/**
* @brief Map the entire physical memory to `DMAP_OFFSET`.
*
* This may overshoot by up to 1 GB because we only use gigapages, but considering
* the fact that mapping literally the entire physical RAM is probably the
* bigger problem here, I'd say it's fine.
*
* @param end End of physical memory
*/
static void map_direct_area(vm_paddr_t end)
{
vm_paddr_t ppos = 0;
void *vpos = __v(0);
void *const vend = __v(end);
/* This assertion fails if > 4 TB of physical memory are available.
* Sorry gamers, we don't support enough RAM for all your Chrome tabs. */
KASSERT(vend < DMAP_END);
while (vpos < vend) {
x86_pml4te_t *pml4te = X86_PML4TE(vpos);
vm_paddr_t pdpt_phys = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdpt_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for direct mapping");
__boot_clear_page(pdpt_phys);
pml4te->val = pdpt_phys | __P_PRESENT | __P_RW | __P_NOEXEC;
vm_flush();
for (int pdpti = 0; pdpti < 512; pdpti++) {
x86_pdpte_t *pdpte = X86_PDPTE(vpos);
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
ppos += GIGAPAGE_SIZE;
vpos += GIGAPAGE_SIZE;
if (vpos >= vend)
break;
}
pml4te->flags.global = 1;
}
vm_flush();
}
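For context, a minimal sketch of what the direct map established above buys us; __v() and __p() are the helpers from gay/mm.h shown further down in this commit, the wrapper function and the physical address are purely illustrative.
static void dmap_example(void)
{
/* illustrative only: after map_direct_area(), every physical address
 * has a fixed virtual alias in the direct mapping region */
vm_paddr_t pa = 0x1000;    /* arbitrary example physical address */
u32 *alias = __v(pa);      /* == (void *)pa + DMAP_OFFSET */
KASSERT(__p(alias) == pa); /* __p() is the inverse of __v() */
}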
/*
* "Oh cool another deeply nested 100-liner that nobody understands"
*/
@ -68,16 +112,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
* (this is gonna be a long one)
*/
struct vm_page *vm_page_array_end = vm_page_array + (end >> PAGE_SHIFT);
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
_vm_page_array_end = vm_page_array_end;
#endif
void *map_pos = vm_page_array;
usize remaining_size = (void *)vm_page_array_end - (void *)vm_page_array;
remaining_size = align_ceil(remaining_size, PAGE_SIZE);
kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size);
void *map_end = map_pos + ((void *)vm_page_array_end - (void *)vm_page_array);
kprintf("Mapping %zu bytes for vm_page_array\n", map_end - map_pos);
/* PML4T loop */
while (remaining_size != 0) {
while (map_pos < map_end) {
/* Is vm_page_array so huge that it spans almost the entire 2 TB
* kernel region? If that's the case, something has gone terribly
* wrong, unless we somehow happen to have about an Exabyte of RAM
@ -85,7 +128,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
KASSERT(map_pos < (void *)KERNBASE);
x86_pml4te_t *pml4te = X86_PML4TE(map_pos);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array");
__boot_clear_page(pml4te_val);
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
@ -98,8 +141,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
vm_paddr_t pdpte_val;
/* try allocating a 1 GB gigapage first */
if (remaining_size >= 1 << X86_PDPT_SHIFT) {
pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT);
if (map_end - map_pos >= GIGAPAGE_SIZE) {
pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT, MM_ZONE_NORMAL);
/* CLion is warning about this condition being always true, but
* that is not the case. I've checked the disassembly with -O2,
* and clang is emitting the check. So it's fine, I guess. */
@ -107,16 +150,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
| __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
remaining_size -= 1 << X86_PDPT_SHIFT;
map_pos += 1 << X86_PDPT_SHIFT;
if (remaining_size == 0)
map_pos += GIGAPAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
continue;
}
}
/* couldn't use a gigapage, continue in hugepage steps */
pdpte_val = __boot_pmalloc(PAGE_SHIFT);
pdpte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdpte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdpte_val);
@ -130,22 +172,21 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
vm_paddr_t pdte_val;
/* try allocating a 2 MB hugepage first */
if (remaining_size >= (1 << X86_PDT_SHIFT)) {
pdte_val = __boot_pmalloc(X86_PDT_SHIFT);
if (map_end - map_pos >= HUGEPAGE_SIZE) {
pdte_val = __boot_pmalloc(X86_PDT_SHIFT, MM_ZONE_NORMAL);
if (pdte_val != BOOT_PMALLOC_ERR) {
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
pdte->val = pdte_val;
remaining_size -= 1 << X86_PDT_SHIFT;
map_pos += 1 << X86_PDT_SHIFT;
if (remaining_size == 0)
map_pos += HUGEPAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
continue;
}
}
/* couldn't use a hugepage, continue in page steps */
pdte_val = __boot_pmalloc(PAGE_SHIFT);
pdte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdte_val);
@ -156,15 +197,14 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
/* PT loop */
for (int pt_index = 0; pt_index < 512; pt_index++) {
x86_pte_t *pte = X86_PTE(map_pos);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT, MM_ZONE_NORMAL);
panic_if(pte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pte->val = pte_val;
remaining_size -= 1 << X86_PT_SHIFT;
map_pos += 1 << X86_PT_SHIFT;
if (remaining_size == 0)
map_pos += PAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
} /* end of PT loop */
} /* end of PDT loop */
@ -172,26 +212,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
} /* end of PML4T loop */
map_done:
vm_flush();
}
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
{
KASSERT(start <= end);
vm_page_t cursor = vm_page_array + (start >> PAGE_SHIFT);
usize count = (end - start) >> PAGE_SHIFT;
if (flags == 0) {
memset(cursor, 0, count * sizeof(*cursor));
} else {
while (count--) {
atom_init(&cursor->count, 0);
cursor->flags = flags;
cursor->try_free = nil;
cursor->extra = nil;
cursor++;
}
}
map_direct_area(end);
paging_init(end);
}
/*
@ -199,7 +221,7 @@ static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
* a page table, yet also need to reference it in the page table structures
* (thereby mapping it into virtual memory) before we can zero it out.
* This little hack temporarily maps the area at one PDP entry before KERNBASE
* (meaning index 1022 of _pdp0), zeroes the area, and then unmaps it again.
* (meaning index 510 of _pdp0), zeroes the area, and then unmaps it again.
*/
void __boot_clear_page(vm_paddr_t paddr)
{

@ -4,15 +4,10 @@
#include <arch/trap.h>
#include <gay/cdefs.h>
#include <gay/config.h>
#include <gay/errno.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <string.h>
/*
* Initial Page Directory Pointer Table and Page Map Level 4 Table for the
* assembly startup routine (see setup64.S). Used for statically mapping the
@ -21,46 +16,6 @@
__asmlink x86_pdpt_t _pdpt0;
__asmlink x86_pml4t_t _pml4t;
int map_page(uintptr_t phys, void *virt, enum pflags flags)
{
flags |= P_PRESENT;
x86_pml4te_t *pml4e = X86_PML4TE(virt);
if (!pml4e->flags.present) {
void *page = get_pages(0, M_ATOMIC);
if (page == nil)
return -ENOMEM;
pml4e->val = __p(page) | P_PRESENT | P_RW;
}
return 0;
}
/*
* The only difference between this and map_page() is that we can't allocate
* new pages using get_pages() but have to use __early_get_page() instead here.
* So, all we need to do is ensure that map_page() doesn't need to allocate new
* pages when we call it, which it only does if pflags does not have P_HUGE
* set and the page table doesn't exist (present bit in the page directory is
* clear). Therefore, we just need to make sure that, if P_HUGE is *not*
* set, the page table is already allocated and marked as present in the page
* directory.
*/
void __early_map_page(uintptr_t phys, void *virt, enum pflags pflags)
{
}
uintptr_t unmap_page(void *virt)
{
}
enum pflags get_pflags(void *page)
{
}
int set_pflags(void *page, enum pflags pflags)
{
}
void x86_isr_page_fault(trap_frame_t *frame, u32 error_code)
{
void *address;

@ -14,6 +14,10 @@ option(CFG_POISON_PAGES "Poison pages after allocate and free" ON)
option(CFG_POISON_HEAP "Poison heap memory after kmalloc() and kfree()" ON)
set(CFG_PAGE_EMERG_DENOM "16" CACHE STRING "Denominator for the fraction of pages kept in emergency reserves")
set(CFG_PAGE_EMERG_MAX "1024" CACHE STRING "Absolute maximum number of pages kept in emergency reserves")
option(CFG_SMP "Enable Symmetric Multiprocessing" ON)
set(CFG_MAX_CPU "64" CACHE STRING "Maximum number of logical processors")
@ -28,6 +32,8 @@ option(CFG_DEBUG_PAGE_ALLOCS "Debug page frame allocations" OFF)
option(CFG_DEBUG_PAGE_ALLOCS_NOISY "Debug page frame allocations in full detail (VERY noisy)" OFF)
option(CFG_DEBUG_PGADDRS "Sanitize page frame addresses" OFF)
option(CFG_DEBUG_SLAB_ALLOCS "Debug slab allocations" OFF)
option(CFG_DEBUG_SLAB_ALLOCS_NOISY "Debug slab allocations in full detail (VERY noisy)" OFF)

@ -31,6 +31,12 @@
/** @brief Poison heap areas after `kmalloc()` and `kfree()` */
#cmakedefine01 CFG_POISON_HEAP
/** @brief Denominator for the fraction of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@
/** @brief Absolute maximum number of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_MAX@
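A rough worked example of how these two values interact, based on the zone->thrsh.emerg calculation further down in kernel/mm/page.c (the RAM size is made up):
/* Example: a zone with 262144 free 4 KiB pages (1 GiB) and the defaults above:
 * emerg = 262144 / CFG_PAGE_EMERG_DENOM = 16384 pages, which exceeds
 * CFG_PAGE_EMERG_MAX, so the reserve is capped at 1024 pages (4 MiB). */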
/** @brief Enable Symmetric Multiprocessing */
#cmakedefine01 CFG_SMP
@ -52,6 +58,9 @@
/** @brief Spit out the full details of page allocations */
#cmakedefine01 CFG_DEBUG_PAGE_ALLOCS_NOISY
/** @brief Sanitize page frame addresses */
#cmakedefine01 CFG_DEBUG_PGADDRS
/** @brief Debug slab allocations */
#cmakedefine01 CFG_DEBUG_SLAB_ALLOCS

@ -16,6 +16,13 @@
* bigger areas of memory that are not physically contiguous (for regular user
* allocations). The entire physical memory is mapped statically in the range
* `DMAP_START - DMAP_END`.
*
* Memory is split up into (currently) two zones: `MM_ZONE_NORMAL` and
* `MM_ZONE_DMA`. As their names suggest, the former is for general purpose
* allocations and the latter for getting memory suitable for DMA transfers.
* Zones are further divided into pools, each of which hold a list of groups of
* free pages. The size of these page groups is determined by the pool's order,
* where the pool of order `n` holds groups of `1 << n` pages.
*/
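To make the order/pool relationship above concrete (assuming 4 KiB pages):
/* order 0 -> groups of 1 page    (4 KiB)
 * order 3 -> groups of 8 pages   (32 KiB)
 * order 9 -> groups of 512 pages (2 MiB), the largest order with MM_NR_ORDERS == 10 */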
#ifdef _KERNEL
@ -23,10 +30,14 @@
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mutex.h>
#include <gay/types.h>
#include <string.h>
#define _M_ZONE_NORMAL 0
#define _M_ZONE_DMA 1
#define _M_ZONE_INDEX(flags) ((flags) & 1)
@ -40,15 +51,34 @@ enum mm_zone_type {
MM_NR_ZONES
};
/** @brief Boot memory area. */
struct _bmem_area {
struct clist link; /* -> struct mm_zone::_bmem_areas */
vm_paddr_t start;
vm_paddr_t end;
};
struct mm_pool {
struct clist freelist; /* -> vm_page_t::link */
/** @brief Number of items in `freelist`. */
usize free_entries;
/** @brief One bit per buddy *pair*, 1 if exactly one is allocated. */
latom_t *bitmap;
spin_t lock;
};
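A short sketch of how the pair bitmap is meant to be read, mirroring pg_flip_bit() and free_pages() later in this commit (the page frame numbers are examples):
/* Pages 6 and 7 are order-0 buddies and share bit (6 >> 1) = 3 in pools[0].bitmap.
 * Freeing page 6 flips that bit: if it becomes 1, page 7 is still allocated and no
 * coalescing happens; if it becomes 0, both buddies are free and can be merged
 * into a single order-1 group. */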
#define MM_NR_ORDERS 10
#define MM_MAX_ORDER (MM_NR_ORDERS - 1)
struct mm_zone {
patom_t freelist; /* -> struct vm_page */
usize length;
/** @brief Current number of free pages in all pools */
latom_t free_count;
/** @brief Thresholds for OOM behavior */
struct {
/** @brief Minimum number of pages reserved for emergency allocations */
u_long emerg;
} thrsh;
struct mm_pool pools[MM_NR_ORDERS];
struct clist _bmem_areas; /* -> struct _bmem_area */
};
@ -59,7 +89,7 @@ struct mm_zone {
* The mm subsystem isn't NUMA aware, because it's not really a thing on desktop
* grade machines anyway and would only complicate things unnecessarily.
*/
extern struct mm_zone mm_zones[MM_NR_ZONES];
extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
/**
* @brief Memory allocation flags passed to `kmalloc()`.
@ -122,108 +152,33 @@ enum pflags {
#endif
};
/*
* Terrible hack that allows us to map pages before the page frame allocator is
* set up. Don't ever use these anywhere, because they *will* break everything.
*/
void __early_map_page(uintptr_t phys, void *virt, enum pflags flags);
/* This just shrinks phys_end by PAGE_SIZE and returns the page */
uintptr_t __early_get_page(void);
/**
* @brief Map a page in physical memory to a virtual address.
* Remember that if `vm` is the memory map currently in use, you will most
* likely need to call `vm_update()` when you've finished mapping everything
* to flush the TLB.
*
* @param phys Physical address of the page
* @param virt Virtual address to map the page to
* @param flags Flags to apply to the page
* @returns 0 on success, or `-ENOMEM` if OOM (for allocating new page tables)
*/
int map_page(uintptr_t phys, void *virt, enum pflags flags);
/**
* @brief Remove a page mapping.
*
* @param virt Virtual address the page is mapped to, must be page aligned
* @returns The physical page address that was being mapped
*/
uintptr_t unmap_page(void *virt);
/**
* @brief Get a page's flags in the page tables.
*
* @param page Page to get the flags of (if the page is in a hugepage area,
* the flags for that hugepage will be returned with `P_HUGE = 1`)
* @return The flags, as currently stored in the page table structures
* (but not necessarily applied if they have been modified and `vm_flush()`
* has not been called yet!)
*/
enum pflags get_pflags(void *page);
/**
* @brief Update a page's flags in the page tables.
* You should always use this in conjunction with `get_pflags()`, as in getting
* the flags first, then toggling the flags you need to, and then setting them
* in the tables again. This is because this method will clear *any* previous
* flags.
*
* @param page Page to set flags for (if flags has `P_HUGE` set, must be
* `HUGEPAGE_SIZE` aligned, otherwise `PAGE_SIZE` aligned)
* @param flags Flags to set
* @return 0 on success, or a negative value if either a page table allocation
* failed or
*/
int set_pflags(void *page, enum pflags flags);
/**
* @brief Initialize the memory allocator.
*
* This can only be called once, from the early `_boot()` routine.
*
* @param _phys_start Physical start address of the page area
* @param _phys_end Physical end address of the page area
* @returns 0 on success, or -1 if the pointers were garbage
*/
int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end);
/** @brief Start of the mapped, physically contiguous kernel heap */
extern void *kheap_start;
/** @brief End of the mapped, physically contiguous kernel heap */
extern void *kheap_end;
/** @brief Start of the kernel heap in physical memory */
extern uintptr_t phys_start;
/** @brief End of the kernel heap in physical memory */
extern uintptr_t phys_end;
/**
* @brief Initialize the buddy page frame allocator.
* This is only called once, internally from `kmalloc_init()`.
*
* @return 0 on success, or -1 if it messed up
* This is only called once, from the arch-dependent counterpart after it has
* reserved memory for and mapped `vm_page_array`, as well as mapped the direct
* area.
*/
int pages_init(void);
void paging_init(vm_paddr_t phys_end);
/**
* @brief Allocate a contiguous region in physical memory.
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long.
*
* @param order Order of magnitude (as in `1 << order`) for the region size
* @param flags How to allocate (`order` must be 0 if `M_NOWAIT` is specified)
* **The pages are not initialized.**
* If you want zeroed pages, use `get_zero_pages()`.
*
* @param order Order of magnitude (as in `1 << order` pages)
* @param flags How to allocate
* @return A pointer to the beginning of the region in the direct mapping area,
* or `nil` if the allocation failed
*/
void *get_pages(int order, enum mflags flags) __malloc_like;
#ifdef __HAVE_HUGEPAGES
#define GET_PAGE_ORDERS (HUGEPAGE_SHIFT - PAGE_SHIFT + 1)
#else
#define GET_PAGE_ORDERS 10
#endif
#define GET_PAGE_MAX_ORDER (GET_PAGE_ORDERS - 1)
void *get_pages(u_int order, enum mflags flags) __malloc_like;
void *get_page(enum mflags flags) __malloc_like;
void *get_zero_pages(u_int order, enum mflags flags) __malloc_like;
void *get_zero_page(enum mflags flags) __malloc_like;
void free_pages(void *ptr);
#define free_page(ptr) free_pages(ptr)
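A hedged usage sketch of the allocation API declared above; the wrapper function is only for illustration, and M_KERN is the flag used elsewhere in the tree.
static void get_pages_example(void)
{
/* allocate a physically contiguous, zeroed 4-page (16 KiB) region and release it */
void *buf = get_zero_pages(2, M_KERN);
if (buf != nil)
free_pages(buf);
}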
/**
* @brief Initialize the slab caches.
@ -240,14 +195,8 @@ void slab_init(void);
* @param phys Physical address
* @return Virtual address
*/
static inline void *__v(uintptr_t phys)
static inline void *__v(vm_paddr_t phys)
{
# ifdef DEBUG
if (phys > phys_end) {
kprintf("__v(%p): phys ptr out of range!\n", (void *)phys);
return nil;
}
# endif
return (void *)phys + DMAP_OFFSET;
}
@ -262,7 +211,7 @@ static inline void *__v(uintptr_t phys)
* @return The physical address, i.e. `virt - DMAP_OFFSET`
* @see vtophys()
*/
static inline uintptr_t __p(void *virt)
static inline vm_paddr_t __p(void *virt)
{
# ifdef DEBUG
if (virt < DMAP_START || virt >= DMAP_END) {

@ -5,9 +5,23 @@
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/systm.h>
#include <gay/types.h>
/*
* I'm trying really hard to keep the size of struct vm_page a power of two
* on LP64 systems, because that way we can quickly get to the page frame number
* by shifting the byte offset of the vm_page_t in vm_page_array to the right
* rather than doing a costly divide instruction (or store the page frame number
* within the structure itself, which takes up precious space).
*
* There is insane pressure on the size of this structure, because a typical
* system will have millions of instances of it. Every additional byte makes
* a significant difference in memory management overhead.
*/
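To make the shift trick above concrete (the 32-byte figure is an assumption derived from the members shown below, not something this commit asserts):
/* If sizeof(struct vm_page) == 32, a power of two, then
 * pfn = ((uintptr_t)page - (uintptr_t)vm_page_array) >> 5,
 * i.e. the pointer subtraction in pg2pfn() compiles to a shift instead of a divide. */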
/**
* @brief Stores information about a single page in physical memory.
* There is exactly one of these for every physical page, no matter what that
@ -16,66 +30,94 @@
struct vm_page {
/** @brief Reference count (0 = unused) */
atom_t count;
unsigned order:8;
/** @brief Various flags describing how and for what the page is used, see below */
u_int flags;
/** @brief Singly linked list, if the page is free */
patom_t next;
/**
* @brief Request this page to be freed if possible.
* This callback may be `nil` unless the `PG_FREEABLE` bit in `flags`
* is set. The presence of this bit does *not* guarantee that the page
* is actually reclaimable, it's merely a performance optimization to
* avoid having to call this function on pages that can never be
* reclaimed anyway.
*
* @param page Pointer to the page itself
* @return 0 if the page could be reclaimed and is now free
*/
int (*try_free)(struct vm_page *page);
unsigned flags:24;
struct clist link;
/**
* @brief Optional extra data pointer, reserved for private use.
* The current owner of the page may use this to track the underlying
* object in memory (or pretty much anything else), for example the
* `struct slab` if this page is currently used by the slab allocator.
* Useful for implementing the `try_free()` callback.
*/
void *extra;
};
typedef struct vm_page *vm_page_t;
/* values for struct page::flags */
/* values for struct vm_page::flags */
/** @brief Page must never be accessed */
#define PG_RESERVED (1 << 0)
/** @brief Page is in an atomic per-cpu cache */
#define PG_ATOMIC (1 << 1)
/** @brief Page is in a per-cpu cache */
#define PG_PCPU (1 << 1)
/** @brief Page is used by the slab allocator */
#define PG_SLAB (1 << 2)
/** @brief It **might** be possible to reclaim this page using `try_free()` */
#define PG_FREEABLE (1 << 3)
/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */
#define PG_DMA (1u << 3)
/** @brief Array of every single page in physical memory, indexed by page frame number. */
extern struct vm_page *const vm_page_array;
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
extern vm_page_t _vm_page_array_end;
#define PGADDR_ASSERT(x) KASSERT(x)
#else
#define PGADDR_ASSERT(x) ({})
#endif
static inline bool page_get(vm_page_t page)
{
return atom_inc(&page->count);
}
static inline bool page_put(vm_page_t page)
{
return atom_dec(&page->count);
}
/** @brief Get the page frame number of a page. */
__pure2 static inline u_long pg2pfn(vm_page_t page)
__pure2
static inline u_long pg2pfn(vm_page_t page)
{
KASSERT(page < _vm_page_array_end);
PGADDR_ASSERT(page < _vm_page_array_end);
return page - vm_page_array;
}
__pure2 static inline u_long paddr2pfn(vm_paddr_t paddr)
__pure2
static inline vm_page_t vaddr2pg(void *vaddr)
{
PGADDR_ASSERT(vaddr >= DMAP_START && vaddr < (void *)_vm_page_array_end);
uintptr_t offset = (uintptr_t)vaddr - DMAP_OFFSET;
return &vm_page_array[offset >> PAGE_SHIFT];
}
__pure2
static inline u_long vaddr2pfn(void *vaddr)
{
u_long pfn = ((uintptr_t)vaddr - DMAP_OFFSET) >> PAGE_SHIFT;
PGADDR_ASSERT(vaddr >= DMAP_START && &vm_page_array[pfn] < _vm_page_array_end);
return pfn;
}
__pure2
static inline u_long paddr2pfn(vm_paddr_t paddr)
{
KASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end);
PGADDR_ASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end);
return paddr >> PAGE_SHIFT;
}
__pure2 static inline vm_page_t paddr2pg(vm_paddr_t paddr)
__pure2
static inline vm_page_t paddr2pg(vm_paddr_t paddr)
{
vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT);
KASSERT(page < _vm_page_array_end);
PGADDR_ASSERT(page < _vm_page_array_end);
return page;
}
__pure2
static inline void *pfn2vaddr(u_long pfn)
{
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
return DMAP_START + (pfn << PAGE_SHIFT);
}
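A small consistency sketch for the helpers above, assuming DMAP_START is the virtual address corresponding to DMAP_OFFSET (which the conversions imply); the wrapper function and page frame number are illustrative.
static void pfn_roundtrip_example(void)
{
void *va = pfn2vaddr(42);                    /* DMAP_START + 42 * PAGE_SIZE */
KASSERT(vaddr2pg(va) == &vm_page_array[42]); /* assumes DMAP_START == (void *)DMAP_OFFSET */
KASSERT(paddr2pfn((vm_paddr_t)42 << PAGE_SHIFT) == 42);
}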

@ -2,7 +2,6 @@
target_sources(gay_kernel PRIVATE
boot.c
kmalloc.c
page.c
slab.c
)

@ -13,7 +13,7 @@ static CLIST(bmem_area_freelist);
#ifdef DEBUG
#define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; })
#define debug_get_bmem_area(area) KASSERT((area)->start != ~(vm_paddr_t)0)
#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0)
#else
#define debug_free_bmem_area(area) ({})
#define debug_get_bmem_area(area) ({})
@ -62,6 +62,9 @@ void __boot_pmalloc_init(void)
debug_free_bmem_area(area);
clist_add(&bmem_area_freelist, &area->link);
}
for (int i = 0; i < MM_NR_ZONES; i++)
clist_init(&mm_zones[i]._bmem_areas);
}
void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)

@ -1,74 +0,0 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/types.h>
#include <gay/util.h>
extern void _image_start_phys;
extern void _image_end_phys;
/* these are initialized by pages_init() */
void *kheap_start;
void *kheap_end;
int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end)
{
phys_start = _phys_start;
phys_end = _phys_end;
/*
* The kernel image is very likely gonna be within the physical memory
* range, so we're gonna need to do some cropping in order to not hand
* out pages that actually contain kernel code.
* Furthermore, somebody should probably clean up this mess somehow.
*/
uintptr_t image_start_phys = (uintptr_t)&_image_start_phys;
uintptr_t image_end_phys = (uintptr_t)&_image_end_phys;
if (phys_start < image_start_phys && phys_end > image_start_phys) {
if (image_start_phys - phys_start > phys_end - image_start_phys)
phys_end = image_start_phys;
else
phys_start = image_end_phys;
}
if (phys_start < image_end_phys && _phys_end > image_end_phys) {
if (image_end_phys - phys_start > phys_end - image_end_phys)
phys_end = image_start_phys;
else
phys_start = image_end_phys;
}
phys_start = align_ceil(phys_start, HUGEPAGE_SIZE);
/*
* This is intentionally not aligned to hugepages, because __early_get_page()
* shrinks it in single PAGE_SIZE steps whenever it is called anyway.
* I know, this is a terrible hack, but it will be aligned to a hugepage
* from within pages_init(), right after the entire physical memory has
* been mapped to the direct area (which is the only reason we need to
* be able to allocate pages before the page frame allocator is set up
* in the first place).
*/
phys_end = align_floor(phys_end, PAGE_SIZE);
int err = pages_init();
if (err)
return err;
slab_init();
return 0;
}
__weak void *malloc(usize size)
{
return kmalloc(size, M_KERN);
}
__weak void free(void *ptr)
{
kfree(ptr);
}
/*
* Looking for kmalloc() and kfree()?
* Those two are in slab.c for purely organizational reasons.
*/

@ -1,32 +1,25 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/util.h>
#include <gay/vm/page.h>
#include <limits.h>
#include <string.h>
#include <strings.h>
#ifndef __HAVE_HUGEPAGES
#error "Systems without huge pages are currently unsupported because i'm a dumb bitch"
#endif
#if DMAP_OFFSET % HUGEPAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE"
#endif
/* this should be impossible because arch/page.h must also define PAGE_SHIFT
* and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */
#if HUGEPAGE_SIZE % PAGE_SIZE != 0
#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE"
#if DMAP_OFFSET % PAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE"
#endif
#if PAGE_SIZE % LONG_BIT != 0
@ -40,6 +33,7 @@
#if CFG_DEBUG_PAGE_ALLOCS
# define PAGE_ASSERT(x) KASSERT(x)
# define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# define PAGE_DEBUG_BLOCK
# if CFG_DEBUG_PAGE_ALLOCS_NOISY
# define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# else
@ -47,359 +41,419 @@
# endif
#else
# define PAGE_ASSERT(x) ({})
# define PAGE_DEBUG_BLOCK if (0)
# define page_debug(msg, ...) ({})
# define page_debug_noisy(msg, ...) ({})
#endif
/**
* We have cache levels for areas ranging from a single page up to a huge page
* on a logarithmic scale. Every level covers double the pages per entry than
* the one below it, starting at one page per entry. The effective result is
* that a single entry in the cache on level L covers `(1 << L)` pages.
*/
#define CACHE_ORDERS GET_PAGE_ORDERS
#define ORDER_SHIFT(order) (PAGE_SHIFT + (order))
#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order))
/** @brief There is one of this for every cache order. */
struct cache_pool {
/**
* @brief List of free blocks on this order of granularity.
* The individual entries sit right at the beginning of each free block,
* and are always aligned to `entry_size` bytes.
*/
struct clist freelist;
/**
* @brief Bitmap that stores the allocated status of each entry.
* 1 means allocated, 0 means not.
*/
unsigned long *bitmap;
/** @brief Number of items in `freelist`. */
usize free_entries;
};
static struct cache_pool caches[CACHE_ORDERS];
static MTX(caches_lock);
/* these get set in kmalloc_init() */
uintptr_t phys_start;
uintptr_t phys_end;
uintptr_t __early_get_page(void)
{
phys_end -= PAGE_SIZE;
return phys_end;
}
/* this should be the same as LONG_BIT because latom_t is really just a
* long wrapped in a struct, but my trust in compilers is exactly zero */
#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT)
struct mm_zone mm_zones[MM_NR_ZONES];
static int sanity_check(void)
static inline u_int paddr_find_order(vm_paddr_t addr)
{
KASSERT(phys_start < phys_end);
KASSERT(phys_start == HUGEPAGE_ALIGN(phys_start));
/* phys_end is only page aligned, see kmalloc_init() */
KASSERT(phys_end == PAGE_ALIGN(phys_end));
if ((phys_end - phys_start) < (32 * 1024 * 1024)) {
kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n");
return 1;
}
int bit = ffsll((long long)addr) - 1;
if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER))
bit = ORDER_SHIFT(MM_MAX_ORDER);
return 0;
KASSERT(bit >= PAGE_SHIFT);
return bit - PAGE_SHIFT;
}
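A couple of concrete inputs for paddr_find_order(), assuming PAGE_SHIFT == 12 and MM_MAX_ORDER == 9:
/* paddr_find_order(0x1000)  -> lowest set bit 12 -> order 0 (page aligned only)
 * paddr_find_order(0x40000) -> lowest set bit 18 -> order 6 (aligned to 64 pages)
 * paddr_find_order(0x0)     -> ffsll() returns 0  -> clamped to MM_MAX_ORDER (9) */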
/*
* Map the entire physical memory into the direct contiguous area.
* __early_map_page() might call __early_get_page() in order to allocate
* new page table structures, which in turn shrinks the physical memory
* size (see above).
*/
static inline void map_direct_area(void)
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
{
#ifdef __HAVE_HUGEPAGES
const usize step = HUGEPAGE_SIZE;
const enum pflags flags = P_PRESENT | P_RW | P_HUGE;
#else
const usize step = PAGE_SIZE;
const enum pflags flags = P_PRESENT | P_RW;
#endif
vm_paddr_t start = area->start;
vm_paddr_t end = area->end;
vm_paddr_t pos = start;
vm_size_t nr_pages = (end - start) / PAGE_SIZE;
latom_add(&zone->free_count, (long)nr_pages);
struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
u_int order = paddr_find_order(start);
/* make sure the boot memory allocator cannot under any circumstances hand
* out pages from this area anymore, even though that should be unnecessary */
clist_del(&area->link);
/*
* It might be necessary to use a volatile pointer to phys_end for this
* loop in case clang does The Optimization and caches its value for
* whatever reason, even though at least for x86 this is not the case
* (and i don't even thing the C standard allows it when calling
* external functions in between, but still, Never Trust The Compiler).
* We want to insert pages at the highest possible order. However, the
* start and end pointers of the area are only guaranteed to be page
* aligned. Therefore, we start with the highest possible order based
* on the start address, and then increment the order in every loop
* iteration (up to MM_MAX_ORDER). We do this until we approach the
* end which, again, is only guaranteed to be page aligned, and then
* lower the order again as needed.
*/
for (uintptr_t pos = phys_start; pos <= phys_end - step; pos += step)
__early_map_page(pos, __v(pos), flags);
while (pos < end) {
struct mm_pool *pool = &zone->pools[order];
clist_add(&pool->freelist, &page->link);
pool->free_entries++;
/* only the first page in the order group is inserted into
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1 << order); i++) {
atom_init(&page[i].count, 0);
page[i].flags = 0;
page[i].order = 0;
}
vm_flush();
/*
* order
* ^
* | _________ < MM_MAX_ORDER
* | / |
* start | / \ < end order
* order > |/
* |--------------|----> pos
* start end
*/
pos += ORDER_SIZE(order);
page += (1 << order);
if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
/* this makes the rising part of the graph */
order++;
} else if (order > 0 && pos > end) {
/* we have overshot, lower the order */
pos -= ORDER_SIZE(order);
page -= (1 << order);
/* this makes the abrupt downwards jump at the end of the graph */
while (--order) {
if (pos + ORDER_SIZE(order) <= end) {
pos += ORDER_SIZE(order);
page += (1 << order);
break;
}
}
}
}
}
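As a simple illustration of the loop above for a well-aligned area (4 KiB pages and MM_MAX_ORDER == 9 assumed):
/* An area covering [0x0, 0x1000000) is 16 MiB. paddr_find_order(0) clamps to
 * MM_MAX_ORDER, so the loop inserts eight 512-page (2 MiB) groups into pools[9]
 * and never needs the falling edge, because the end is 2 MiB aligned as well. */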
/*
* This function maps the entire physical memory into the direct region
* (DMAP_START - DMAP_END) and sets up the caches.
* The bitmaps are stored one after another at the end of physical memory, and
*
*/
int pages_init(void)
void paging_init(vm_paddr_t phys_end)
{
if (sanity_check() != 0)
return 1;
map_direct_area();
/* Sizes of the individual bitmaps per order, rounded up to the
* next full longword. We use the same bitmaps in all zones. */
usize bitmap_sizes[MM_NR_ORDERS];
/* size of all bitmaps combined */
usize bitmap_total_size = 0;
for (int order = 0; order < MM_NR_ORDERS; order++) {
usize pages = phys_end >> ORDER_SHIFT(order + 1);
pages = align_ceil(pages, LATOM_BIT * 2);
usize bytes = pages / (CHAR_BIT * 2);
bitmap_sizes[order] = bytes;
bitmap_total_size += bytes;
}
/* phys_end gets aligned, as promised by the comment in kmalloc_init() */
phys_end = align_floor(phys_end, HUGEPAGE_SIZE);
usize phys_size = phys_end - phys_start;
page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size);
/*
* calculate the size of each bitmap, as well as their combined size
* allocate memory for the bitmaps and zero them out
*/
usize bitmap_bytes = 0;
for (int i = 0; i < CACHE_ORDERS; i++) {
usize bits = phys_size >> ORDER_SHIFT(i);
bits = align_ceil(bits, LONG_BIT);
bitmap_bytes += bits / 8;
}
page_debug("Page frame overhead = %zu bytes, %zu bytes total\n", bitmap_bytes, phys_size);
u_int bitmap_size_log2 = flsl((long)bitmap_total_size);
KASSERT(bitmap_size_log2 != 0);
bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
if (bitmap_total_size ^ (1ul << bitmap_size_log2))
bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for the page bitmaps");
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
/*
* zero out all bitmaps
* initialize the pools
*/
uintptr_t bitmap_start_phys = phys_end - bitmap_bytes;
unsigned long *bitmap_start = __v(bitmap_start_phys);
memset(bitmap_start, 0, bitmap_bytes);
for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
struct mm_zone *zone = &mm_zones[zone_index];
latom_t *bitmap_pos = __v(bitmap_start_phys);
for (int order = 0; order < MM_NR_ORDERS; order++) {
zone->pools[order].bitmap = bitmap_pos;
clist_init(&zone->pools[order].freelist);
zone->pools[order].free_entries = 0;
latom_init(&zone->free_count, 0);
bitmap_pos += bitmap_sizes[order];
}
}
/*
* populate the remaining members of the cache_pool structures and
* preallocate entries that can't be handed out (i.e. the cache bitmaps)
* mark *all* pages as reserved first
*
* XXX this is totally unnecessary and I'm only doing it because I'm
* too tired to work out an algorithm that finds all pages that are
* not in the _bmem_areas lists of the mm_zones
*
* if the reserved bit is set, all other fields in the page are invalid.
*/
unsigned long *bitmap_pos = bitmap_start;
for (int i = 0; i < CACHE_ORDERS; i++) {
/* total amount of entries on this level */
usize total_bits = phys_size >> ORDER_SHIFT(i);
/* number of entries on this level that the bitmap itself takes up */
usize wasted_bits = bitmap_bytes >> ORDER_SHIFT(i);
if (wasted_bits == 0)
wasted_bits = 1;
bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits);
caches[i].bitmap = bitmap_pos;
bitmap_pos += total_bits / LONG_BIT;
clist_init(&caches[i].freelist);
caches[i].free_entries = 0;
for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
/* This is merely an optimization to simplify checking whether
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[i].count, 1);
vm_page_array[i].flags = PG_RESERVED;
}
/* kheap_start and kheap_end are globals */
kheap_start = __v(phys_start);
kheap_end = align_floor(bitmap_start, HUGEPAGE_SIZE);
/*
* populate the freelist on the highest order, all orders beneath it
* stay empty until one of the large blocks gets split up
* populate the freelists
*/
struct cache_pool *high_pool = &caches[CACHE_ORDERS - 1];
usize step = 1 << ORDER_SHIFT(CACHE_ORDERS - 1);
for (void *pos = kheap_start; pos < kheap_end; pos += step) {
struct clist *entry = pos;
clist_add(&high_pool->freelist, entry);
high_pool->free_entries++;
for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) {
struct mm_zone *zone = &mm_zones[i];
struct _bmem_area *area, *tmp;
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
claim_bmem_pages(zone, area);
}
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
return 0;
}
/**
* @brief Split a block and return the lower half.
* The block is assumed to already have been removed from its freelist.
* The high half (i.e. the block that is *not* returned) is inserted into the
* freelist one level below `level`.
*
* @param ptr Pointer to the block
* @param level Current level of the block
* (`ptr` must be aligned to `1 << level` pages)
*/
static void *split_buddy(void *ptr, int level);
/**
* @brief Attempt to coalesce a block with its buddy.
* If coalition is possible, the buddy is removed from its freelist at `order`.
*
* @param ptr Pointer to the block
* @param order Cache order, must be less than `CACHE_ORDERS - 1` (because you
* can't join blocks at the highest cache order)
* @return The joined block, or `nil` if coalition was not possible
*/
static void *try_join_buddy(void *ptr, int order);
static inline usize get_bit_number(void *ptr, int order)
static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
{
return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> ORDER_SHIFT(order);
usize bit = pfn >> (order + 1);
latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT];
return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
}
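A quick index calculation for pg_flip_bit() on a 64-bit system (the page frame number is an example):
/* pfn = 1028 at order 2: bit = 1028 >> 3 = 128, which lives in bitmap word
 * 128 / LATOM_BIT = 2, at bit 128 % LATOM_BIT = 0. Its buddy (pfn 1028 ^ 4 = 1024)
 * maps to the same bit, which is what makes the flip-and-test in free_pages() work. */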
void *get_pages(int order, enum mflags flags)
__malloc_like
static void *__get_pages(u_int order, enum mflags flags)
{
PAGE_ASSERT(order >= 0);
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
if (order >= GET_PAGE_ORDERS) {
if (order > MM_MAX_ORDER) {
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
return nil;
}
if (flags & M_NOWAIT) {
kprintf("get_pages(): M_NOWAIT requested, this is not implemented yet :(\n");
return nil;
long count_after = (long)latom_sub(&zone->free_count, (1 << order)) - (1 << order);
if (count_after < (long)zone->thrsh.emerg) {
if (count_after < 0 || !(flags & _M_EMERG)) {
latom_add(&zone->free_count, (1 << order));
return nil;
}
}
mtx_lock(&caches_lock);
struct clist *entry = nil;
int entry_order;
for (entry_order = order; entry_order < CACHE_ORDERS; entry_order++) {
if (caches[entry_order].free_entries > 0) {
entry = caches[entry_order].freelist.next;
break;
register_t cpuflags = read_flags();
/*
* Search for a free page. Start looking at the freelist for the
* requested order, and if it's empty, go over to the next higher order.
* Repeat until we found a page, or we've reached the highest order.
*/
vm_page_t page = nil;
u_int page_order = order;
while (page == nil && page_order < MM_NR_ORDERS) {
struct mm_pool *pool = &zone->pools[page_order];
disable_intr();
spin_lock(&pool->lock);
if (pool->free_entries > 0) {
page = clist_del_first_entry(&pool->freelist, typeof(*page), link);
/* increment the reference count while we hold the lock on the pool,
* so that no other processor can try to coalesce this block if its
* buddy is being freed (coalition is only possible if the buddy
* has a reference count of zero, and while holding the pool lock) */
page_get(page);
pool->free_entries--;
} else {
page_order++;
}
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
if (entry_order != CACHE_ORDERS) {
clist_del(entry);
caches[entry_order].free_entries--;
usize bit_number = get_bit_number(entry, entry_order);
while (entry_order > order) {
entry = split_buddy(entry, entry_order);
bit_set(caches[entry_order].bitmap, bit_number);
entry_order--;
bit_number <<= 1;
/*
* if we found a page, check if we need to split it up
* (which is the case if we took one from a higher order freelist)
*/
if (page != nil) {
usize pfn = pg2pfn(page);
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
order, pfn, page_order);
pg_flip_bit(zone, pfn, page_order);
/* split the page and insert the upper halves into the
* respective freelist until we reach the requested order */
while (page_order-- > order) {
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
struct mm_pool *pool = &zone->pools[page_order];
vm_page_t buddy = page + (1 << page_order);
buddy->order = page_order;
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
disable_intr();
spin_lock(&pool->lock);
clist_add_first(&pool->freelist, &buddy->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
bit_set(caches[order].bitmap, bit_number);
# if CFG_POISON_PAGES
memset(entry, 'a', 1 << ORDER_SHIFT(order));
# endif
page->order = order;
void *vaddr = pfn2vaddr(pfn);
return vaddr;
} else {
return nil;
}
}
mtx_unlock(&caches_lock);
return (void *)entry;
/* faster memset for whole pages */
static inline void init_pages(u_long *start, u_long val, u_int order)
{
u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
do {
*start++ = val;
} while (start != end);
}
void free_pages(void *ptr)
void *get_pages(u_int order, enum mflags flags)
{
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % PAGE_SIZE) {
kprintf("free_pages(%p): unaligned ptr!\n", ptr);
return;
}
# endif
void *pages = __get_pages(order, flags);
if (sus_nil(ptr)) {
page_debug("free_pages(%p): tried to free NULL!\n", ptr);
return;
}
#if CFG_POISON_PAGES
if (pages != nil)
init_pages(pages, PAGE_POISON_ALLOC, order);
#endif
int order = 0;
usize bit_number = get_bit_number(ptr, order);
for (; order < CACHE_ORDERS; order++) {
if (bit_tst(caches[order].bitmap, bit_number))
break;
bit_number >>= 1;
}
return pages;
}
if (order == CACHE_ORDERS) {
page_debug("free_pages(%p): double free!\n", ptr);
return;
}
int original_order = order;
void *get_page(enum mflags flags)
{
void *pages = __get_pages(0, flags);
mtx_lock(&caches_lock);
#if CFG_POISON_PAGES
if (pages != nil)
init_pages(pages, PAGE_POISON_ALLOC, 0);
#endif
while (order < CACHE_ORDERS - 1) {
bit_clr(caches[order].bitmap, bit_number);
return pages;
}
void *tmp = try_join_buddy(ptr, order);
if (tmp == nil)
break;
void *get_zero_pages(u_int order, enum mflags flags)
{
void *pages = __get_pages(order, flags);
ptr = tmp;
order++;
bit_number >>= 1;
}
if (pages != nil)
init_pages(pages, 0, order);
if (order == CACHE_ORDERS - 1 && original_order != CACHE_ORDERS - 1)
set_pflags(HUGEPAGE_ALIGN(ptr), P_HUGE | P_RW);
return pages;
}
#if CFG_POISON_PAGES
memset(ptr, 'A', 1 << ORDER_SHIFT(order));
#endif
void *get_zero_page(enum mflags flags)
{
void *page = __get_pages(0, flags);
clist_add(&caches[order].freelist, (struct clist *)ptr);
caches[order].free_entries++;
if (page != nil)
init_pages(page, 0, 0);
mtx_unlock(&caches_lock);
return page;
}
static inline void *split_buddy(void *ptr, int level)
/*
* Two buddies can be merged if:
* - you currently hold the lock for the pool
* - they both have a reference count of zero
* - they are in the same zone
* - neither of them is reserved
*
* This is only called from within the critical section of free_pages(),
* so execution speed is prioritized over anything else.
*/
static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % (1 << ORDER_SHIFT(level))) {
kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level);
return nil;
}
if (level < 1 || level >= CACHE_ORDERS) {
kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level);
return nil;
}
# endif
bool merge = (atom_read(&buddy->count) == 0);
struct clist *high_buddy = ptr + (1 << ORDER_SHIFT(level - 1));
clist_add(&caches[level - 1].freelist, high_buddy);
caches[level - 1].free_entries++;
/* we know that `page` doesn't have PG_RESERVED set,
* because we check that flag before anything else */
const unsigned mask = PG_RESERVED | PG_DMA;
merge &= (page->flags & mask) == (buddy->flags & mask);
page_debug_noisy("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level);
return ptr;
return merge;
}
static void *try_join_buddy(void *ptr, int order)
void free_pages(void *ptr)
{
const usize entry_size = 1 << ORDER_SHIFT(order);
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % entry_size) {
kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, order);
return nil;
PAGE_DEBUG_BLOCK {
if (ptr < DMAP_START || ptr >= DMAP_END) {
panic("free_pages(%p): not in DMAP region\n", ptr);
}
/* order must be < CACHE_ORDERS - 1 because you
* can't join blocks on the topmost order */
if (order >= CACHE_ORDERS - 1) {
kprintf("try_join_buddy(%p, %d): order >= CACHE_ORDERS - 1!\n", ptr, order);
return nil;
}
# endif
}
/*
* Test whether the buddy block is allocated and return nil if it is.
* entry_size is a power of 2, so we can quickly get to the buddy block
* with a cheap XOR of the address and the entry size without the need
* for any if branches.
*/
uintptr_t buddy = (uintptr_t)ptr ^ entry_size;
usize buddy_bitnum = get_bit_number((void *)buddy, order);
if (bit_tst(caches[order].bitmap, buddy_bitnum))
return nil;
register_t cpuflags = read_flags();
page_debug_noisy("join (%p:%p), order=%d\n", ptr, (void *)buddy, order);
vm_page_t page = vaddr2pg(ptr);
panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);
/* If the buddy is free, we remove it from the freelist ... */
clist_del((struct clist *)buddy);
caches[order].free_entries--;
u_int order = page->order;
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
u_long pfn = vaddr2pfn(ptr);
/*
* ... and return a pointer to the coalesced block.
* We use the same trick as above to get to the even (lower) block, just
* that this time we're zeroing the bit out rather than flipping it.
*/
uintptr_t even = (uintptr_t)ptr & ~entry_size;
return (void *)even;
#if CFG_POISON_PAGES
init_pages(ptr, PAGE_POISON_FREE, order);
#endif
int old_count = atom_sub(&page->count, 1);
if (old_count != 1) {
if (old_count == 0)
panic("double free of page %p", ptr);
else
panic("attempted to free page %p with references", ptr);
}
struct mm_zone *zone;
if (page->flags & PG_DMA)
zone = &mm_zones[MM_ZONE_DMA];
else
zone = &mm_zones[MM_ZONE_NORMAL];
latom_add(&zone->free_count, (1 << order));
/* try to coalesce free buddy blocks until we've reached the highest order */
while (order < MM_MAX_ORDER) {
if (pg_flip_bit(zone, pfn, order))
break;
page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order);
/* precompute all values we need inside the critical section
* to avoid blocking other CPUs for longer than necessary */
vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
struct mm_pool *current_order_pool = &zone->pools[order];
disable_intr();
spin_lock(&current_order_pool->lock);
bool merged = can_merge(page, buddy);
if (merged) {
/* detach the buddy; the coalesced block is inserted into the
* freelist of its final order after the loop */
clist_del(&buddy->link);
current_order_pool->free_entries--;
buddy->order = order + 1;
page->order = order + 1;
}
spin_unlock(&current_order_pool->lock);
intr_restore(cpuflags);
if (!merged)
break;
/* continue with the coalesced (lower) block */
page = low;
pfn &= ~(1ul << order);
order++;
}
/* finally, we need to insert the page at its freelist */
struct mm_pool *pool = &zone->pools[order];
disable_intr();
spin_lock(&pool->lock);
clist_add(&pool->freelist, &page->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
