mm: refactor page frame allocator
This is part 3 of the mm subsystem overhaul. The page frame allocator no longer relies on a single global mutex; every per-order pool is protected by its own spinlock instead. The allocator is also aware of multiple memory zones (normal and DMA) and keeps an emergency reserve per zone. Page bitmap overhead is down by 50 % because only one bit is stored per buddy pair.
parent 825a981d67
commit 385af1b7ef

11 changed files with 566 additions and 601 deletions
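Reviewer note: the halved bitmap overhead comes from keeping one bit per *pair* of buddies per order and flipping it on every allocation and free at that order, which is what `pg_flip_bit()` in the diff below is used for. The snippet here is a minimal standalone sketch of that invariant, not kernel code; the names, the toy bitmap sizes, and the convention that the helper returns the post-flip bit value are assumptions made for the example only.

```c
/* Standalone illustration of the one-bit-per-buddy-pair scheme.
 * Build with: cc -std=c11 -o pairdemo pairdemo.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LONG_BITS (sizeof(unsigned long) * 8)

/* one demo bitmap per order, sized for a tiny 1024-page toy memory */
static _Atomic unsigned long pair_bitmap[10][1024 / LONG_BITS];

/* Flip the bit covering the buddy pair that `pfn` belongs to at `order` and
 * return its new value. Because the bit is flipped on every alloc *and* free
 * at this order, it always equals "exactly one buddy is allocated". So after
 * a free, 1 means the buddy is still busy and 0 means both are free, i.e. the
 * pair can be coalesced. */
static bool flip_pair_bit(unsigned long pfn, unsigned int order)
{
	unsigned long pair = pfn >> (order + 1);  /* one bit per buddy pair */
	unsigned long mask = 1ul << (pair % LONG_BITS);
	unsigned long old = atomic_fetch_xor(&pair_bitmap[order][pair / LONG_BITS], mask);
	return ((old ^ mask) & mask) != 0;        /* post-flip value of the bit */
}

int main(void)
{
	flip_pair_bit(0, 0);  /* allocate pfn 0 */
	flip_pair_bit(1, 0);  /* allocate pfn 1 (its buddy) */

	/* free pfn 0: bit becomes 1, buddy still allocated, no coalescing */
	printf("after freeing pfn 0: buddy busy = %d\n", flip_pair_bit(0, 0));
	/* free pfn 1: bit becomes 0, both buddies free, coalesce to order 1 */
	printf("after freeing pfn 1: buddy busy = %d\n", flip_pair_bit(1, 0));
	return 0;
}
```

Compared to one bit per block, one bit per pair is all the coalescing decision actually needs, which is where the 50 % figure in the message comes from.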
@@ -2,7 +2,6 @@
target_sources(gay_kernel PRIVATE
	boot.c
	kmalloc.c
	page.c
	slab.c
)
@@ -13,7 +13,7 @@ static CLIST(bmem_area_freelist);
#ifdef DEBUG
#define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; })
#define debug_get_bmem_area(area) KASSERT((area)->start != ~(vm_paddr_t)0)
#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0)
#else
#define debug_free_bmem_area(area) ({})
#define debug_get_bmem_area(area) ({})

@@ -62,6 +62,9 @@ void __boot_pmalloc_init(void)
		debug_free_bmem_area(area);
		clist_add(&bmem_area_freelist, &area->link);
	}

	for (int i = 0; i < MM_NR_ZONES; i++)
		clist_init(&mm_zones[i]._bmem_areas);
}

void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
@@ -1,74 +0,0 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/types.h>
#include <gay/util.h>

extern void _image_start_phys;
extern void _image_end_phys;

/* these are initialized by pages_init() */
void *kheap_start;
void *kheap_end;

int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end)
{
	phys_start = _phys_start;
	phys_end = _phys_end;

	/*
	 * The kernel image is very likely gonna be within the physical memory
	 * range, so we're gonna need to do some cropping in order to not hand
	 * out pages that actually contain kernel code.
	 * Furthermore, somebody should probably clean up this mess somehow.
	 */
	uintptr_t image_start_phys = (uintptr_t)&_image_start_phys;
	uintptr_t image_end_phys = (uintptr_t)&_image_end_phys;
	if (phys_start < image_start_phys && phys_end > image_start_phys) {
		if (image_start_phys - phys_start > phys_end - image_start_phys)
			phys_end = image_start_phys;
		else
			phys_start = image_end_phys;
	}
	if (phys_start < image_end_phys && _phys_end > image_end_phys) {
		if (image_end_phys - phys_start > phys_end - image_end_phys)
			phys_end = image_start_phys;
		else
			phys_start = image_end_phys;
	}

	phys_start = align_ceil(phys_start, HUGEPAGE_SIZE);
	/*
	 * This is intentionally not aligned to hugepages, because __early_get_page()
	 * shrinks it in single PAGE_SIZE steps whenever it is called anyway.
	 * I know, this is a terrible hack, but it will be aligned to a hugepage
	 * from within pages_init(), right after the entire physical memory has
	 * been mapped to the direct area (which is the only reason we need to
	 * be able to allocate pages before the page frame allocator is set up
	 * in the first place).
	 */
	phys_end = align_floor(phys_end, PAGE_SIZE);

	int err = pages_init();
	if (err)
		return err;

	slab_init();
	return 0;
}

__weak void *malloc(usize size)
{
	return kmalloc(size, M_KERN);
}

__weak void free(void *ptr)
{
	kfree(ptr);
}

/*
 * Looking for kmalloc() and kfree()?
 * Those two are in slab.c for purely organizational reasons.
 */

kernel/mm/page.c

@@ -1,32 +1,25 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <arch/cpufunc.h>
#include <arch/page.h>

#include <gay/bits.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/util.h>
#include <gay/vm/page.h>

#include <limits.h>
#include <string.h>
#include <strings.h>

#ifndef __HAVE_HUGEPAGES
#error "Systems without huge pages are currently unsupported because i'm a dumb bitch"
#endif

#if DMAP_OFFSET % HUGEPAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE"
#endif

/* this should be impossible because arch/page.h must also define PAGE_SHIFT
 * and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */
#if HUGEPAGE_SIZE % PAGE_SIZE != 0
#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE"
#if DMAP_OFFSET % PAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE"
#endif

#if PAGE_SIZE % LONG_BIT != 0
@@ -40,6 +33,7 @@
#if CFG_DEBUG_PAGE_ALLOCS
# define PAGE_ASSERT(x) KASSERT(x)
# define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# define PAGE_DEBUG_BLOCK
# if CFG_DEBUG_PAGE_ALLOCS_NOISY
# define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# else
@@ -47,359 +41,419 @@
# endif
#else
# define PAGE_ASSERT(x) ({})
# define PAGE_DEBUG_BLOCK if (0)
# define page_debug(msg, ...) ({})
# define page_debug_noisy(msg, ...) ({})
#endif

/**
 * We have cache levels for areas ranging from a single page up to a huge page
 * on a logarithmic scale. Every level covers double the pages per entry than
 * the one below it, starting at one page per entry. The effective result is
 * that a single entry in the cache on level L covers `(1 << L)` pages.
 */
#define CACHE_ORDERS GET_PAGE_ORDERS

#define ORDER_SHIFT(order) (PAGE_SHIFT + (order))
#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order))

/** @brief There is one of these for every cache order. */
struct cache_pool {
	/**
	 * @brief List of free blocks on this order of granularity.
	 * The individual entries sit right at the beginning of each free block,
	 * and are always aligned to `entry_size` bytes.
	 */
	struct clist freelist;
	/**
	 * @brief Bitmap that stores the allocated status of each entry.
	 * 1 means allocated, 0 means not.
	 */
	unsigned long *bitmap;
	/** @brief Number of items in `freelist`. */
	usize free_entries;
};
static struct cache_pool caches[CACHE_ORDERS];
static MTX(caches_lock);
/* this should be the same as LONG_BIT because latom_t is really just a
 * long wrapped in a struct, but my trust in compilers is exactly zero */
#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT)

/* these get set in kmalloc_init() */
uintptr_t phys_start;
uintptr_t phys_end;
struct mm_zone mm_zones[MM_NR_ZONES];

uintptr_t __early_get_page(void)
static inline u_int paddr_find_order(vm_paddr_t addr)
{
	phys_end -= PAGE_SIZE;
	return phys_end;
	int bit = ffsll((long long)addr) - 1;
	if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER))
		bit = ORDER_SHIFT(MM_MAX_ORDER);

	KASSERT(bit >= PAGE_SHIFT);
	return bit - PAGE_SHIFT;
}

static int sanity_check(void)
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
{
	KASSERT(phys_start < phys_end);
	KASSERT(phys_start == HUGEPAGE_ALIGN(phys_start));
	/* phys_end is only page aligned, see kmalloc_init() */
	KASSERT(phys_end == PAGE_ALIGN(phys_end));
	vm_paddr_t start = area->start;
	vm_paddr_t end = area->end;
	vm_paddr_t pos = start;
	vm_size_t nr_pages = (end - start) / PAGE_SIZE;
	latom_add(&zone->free_count, (long)nr_pages);

	if ((phys_end - phys_start) < (32 * 1024 * 1024)) {
		kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n");
		return 1;
	struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
	u_int order = paddr_find_order(start);
	/* make sure the boot memory allocator cannot under any circumstances hand
	 * out pages from this area anymore, even though that should be unnecessary */
	clist_del(&area->link);

	/*
	 * We want to insert pages at the highest possible order. However, the
	 * start and end pointers of the area are only guaranteed to be page
	 * aligned. Therefore, we start with the highest possible order based
	 * on the start address, and then increment the order in every loop
	 * iteration (up to MM_MAX_ORDER). We do this until we have reached
	 * the end which, again, is only guaranteed to be page aligned, and
	 * subsequently lower the order again.
	 */
	while (pos < end) {
		struct mm_pool *pool = &zone->pools[order];
		clist_add(&pool->freelist, &page->link);
		pool->free_entries++;

		/* only the first page in the order group is inserted into
		 * the freelist, but all of them need to be initialized */
		for (u_int i = 0; i < (1 << order); i++) {
			atom_init(&page[i].count, 0);
			page[i].flags = 0;
			page[i].order = 0;
		}

		/*
		 * order
		 *   ^
		 *   |       _________   < MM_MAX_ORDER
		 *   |      /         |
		 * start   | /         \  < end order
		 * order > |/
		 *   |--------------|----> pos
		 *  start          end
		 */
		pos += ORDER_SIZE(order);
		page += (1 << order);
		if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
			/* this makes the rising part of the graph */
			order++;
		} else if (order > 0 && pos > end) {
			/* we have overshot, lower the order */
			pos -= ORDER_SIZE(order);
			page -= (1 << order);
			/* this makes the abrupt downwards jump at the end of the graph */
			while (--order) {
				if (pos + ORDER_SIZE(order) <= end) {
					pos += ORDER_SIZE(order);
					page += (1 << order);
					break;
				}
			}
		}
	}
}

void paging_init(vm_paddr_t phys_end)
{
	/* Sizes of the individual bitmaps per order, rounded up to the
	 * next full longword. We use the same bitmaps in all zones. */
	usize bitmap_sizes[MM_NR_ORDERS];
	/* size of all bitmaps combined */
	usize bitmap_total_size = 0;

	for (int order = 0; order < MM_NR_ORDERS; order++) {
		usize pages = phys_end >> ORDER_SHIFT(order + 1);
		pages = align_ceil(pages, LATOM_BIT * 2);
		usize bytes = pages / (CHAR_BIT * 2);
		bitmap_sizes[order] = bytes;
		bitmap_total_size += bytes;
	}

	return 0;
}

/*
 * Map the entire physical memory into the direct contiguous area.
 * __early_map_page() might call __early_get_page() in order to allocate
 * new page table structures, which in turn shrinks the physical memory
 * size (see above).
 */
static inline void map_direct_area(void)
{
#ifdef __HAVE_HUGEPAGES
	const usize step = HUGEPAGE_SIZE;
	const enum pflags flags = P_PRESENT | P_RW | P_HUGE;
#else
	const usize step = PAGE_SIZE;
	const enum pflags flags = P_PRESENT | P_RW;
#endif
	page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size);

	/*
	 * It might be necessary to use a volatile pointer to phys_end for this
	 * loop in case clang does The Optimization and caches its value for
	 * whatever reason, even though at least for x86 this is not the case
	 * (and i don't even think the C standard allows it when calling
	 * external functions in between, but still, Never Trust The Compiler).
	 * allocate memory for the bitmaps and zero them out
	 */
	for (uintptr_t pos = phys_start; pos <= phys_end - step; pos += step)
		__early_map_page(pos, __v(pos), flags);

	vm_flush();
}

/*
 * This function maps the entire physical memory into the direct region
 * (DMAP_START - DMAP_END) and sets up the caches.
 * The bitmaps are stored one after another at the end of physical memory, and
 *
 */
int pages_init(void)
{
	if (sanity_check() != 0)
		return 1;

	map_direct_area();

	/* phys_end gets aligned, as promised by the comment in kmalloc_init() */
	phys_end = align_floor(phys_end, HUGEPAGE_SIZE);
	usize phys_size = phys_end - phys_start;
	u_int bitmap_size_log2 = flsl((long)bitmap_total_size);
	KASSERT(bitmap_size_log2 != 0);
	bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
	if (bitmap_total_size ^ (1ul << bitmap_size_log2))
		bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
	uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
	panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
		 "cannot allocate memory for the page bitmaps");
	memset(__v(bitmap_start_phys), 0, bitmap_total_size);

	/*
	 * calculate the size of each bitmap, as well as their combined size
	 * initialize the pools
	 */
	usize bitmap_bytes = 0;
	for (int i = 0; i < CACHE_ORDERS; i++) {
		usize bits = phys_size >> ORDER_SHIFT(i);
		bits = align_ceil(bits, LONG_BIT);
		bitmap_bytes += bits / 8;
	for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
		struct mm_zone *zone = &mm_zones[zone_index];
		latom_t *bitmap_pos = __v(bitmap_start_phys);
		for (int order = 0; order < MM_NR_ORDERS; order++) {
			zone->pools[order].bitmap = bitmap_pos;
			clist_init(&zone->pools[order].freelist);
			zone->pools[order].free_entries = 0;
			latom_init(&zone->free_count, 0);

			bitmap_pos += bitmap_sizes[order];
		}
	}

	page_debug("Page frame overhead = %zu bytes, %zu bytes total\n", bitmap_bytes, phys_size);

	/*
	 * zero out all bitmaps
	 * mark *all* pages as reserved first
	 *
	 * XXX this is totally unnecessary and i'm only doing it because i'm
	 * too tired to work out an algorithm that finds all pages that are
	 * not in the _bmem_areas lists of the mm_zones
	 *
	 * if the reserved bit is set, all other fields in the page are invalid.
	 */
	uintptr_t bitmap_start_phys = phys_end - bitmap_bytes;
	unsigned long *bitmap_start = __v(bitmap_start_phys);
	memset(bitmap_start, 0, bitmap_bytes);

	/*
	 * populate the remaining members of the cache_pool structures and
	 * preallocate entries that can't be handed out (i.e. the cache bitmaps)
	 */
	unsigned long *bitmap_pos = bitmap_start;
	for (int i = 0; i < CACHE_ORDERS; i++) {
		/* total amount of entries on this level */
		usize total_bits = phys_size >> ORDER_SHIFT(i);
		/* number of entries on this level that the bitmap itself takes up */
		usize wasted_bits = bitmap_bytes >> ORDER_SHIFT(i);
		if (wasted_bits == 0)
			wasted_bits = 1;
		bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits);

		caches[i].bitmap = bitmap_pos;
		bitmap_pos += total_bits / LONG_BIT;

		clist_init(&caches[i].freelist);
		caches[i].free_entries = 0;
	for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
		/* This is merely an optimization to simplify checking whether
		 * two buddies can be coalesced into one. In reality, the
		 * reference count is invalid because the page is reserved. */
		atom_init(&vm_page_array[i].count, 1);
		vm_page_array[i].flags = PG_RESERVED;
	}

	/* kheap_start and kheap_end are globals */
	kheap_start = __v(phys_start);
	kheap_end = align_floor(bitmap_start, HUGEPAGE_SIZE);

	/*
	 * populate the freelist on the highest order, all orders beneath it
	 * stay empty until one of the large blocks gets split up
	 * populate the freelists
	 */
	struct cache_pool *high_pool = &caches[CACHE_ORDERS - 1];
	usize step = 1 << ORDER_SHIFT(CACHE_ORDERS - 1);
	for (void *pos = kheap_start; pos < kheap_end; pos += step) {
		struct clist *entry = pos;
		clist_add(&high_pool->freelist, entry);
		high_pool->free_entries++;
	for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) {
		struct mm_zone *zone = &mm_zones[i];
		struct _bmem_area *area, *tmp;
		clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
			claim_bmem_pages(zone, area);
		}
		zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
		if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
			zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
	}

	return 0;
}

/**
 * @brief Split a block and return the lower half.
 * The block is assumed to already have been removed from its freelist.
 * The high half (i.e. the block that is *not* returned) is inserted into the
 * freelist one level below `level`.
 *
 * @param ptr Pointer to the block
 * @param level Current level of the block
 *        (`ptr` must be aligned to `1 << level` pages)
 */
static void *split_buddy(void *ptr, int level);

/**
 * @brief Attempt to coalesce a block with its buddy.
 * If coalition is possible, the buddy is removed from its freelist at `order`.
 *
 * @param ptr Pointer to the block
 * @param order Cache order, must be less than `CACHE_ORDERS - 1` (because you
 *        can't join blocks at the highest cache order)
 * @return The joined block, or `nil` if coalition was not possible
 */
static void *try_join_buddy(void *ptr, int order);

static inline usize get_bit_number(void *ptr, int order)
static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
{
	return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> ORDER_SHIFT(order);
	usize bit = pfn >> (order + 1);
	latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT];
	return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
}

void *get_pages(int order, enum mflags flags)
__malloc_like
static void *__get_pages(u_int order, enum mflags flags)
{
	PAGE_ASSERT(order >= 0);
	struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];

	if (order >= GET_PAGE_ORDERS) {
	if (order > MM_MAX_ORDER) {
		page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
		return nil;
	}

	if (flags & M_NOWAIT) {
		kprintf("get_pages(): M_NOWAIT requested, this is not implemented yet :(\n");
	u_long count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
	if (count_after < zone->thrsh.emerg) {
		if (count_after < 0 || !(flags & _M_EMERG)) {
			latom_add(&zone->free_count, (1 << order));
			return nil;
		}
	}

	register_t cpuflags = read_flags();

	/*
	 * Search for a free page. Start looking at the freelist for the
	 * requested order, and if it's empty, go over to the next higher order.
	 * Repeat until we found a page, or we've reached the highest order.
	 */
	vm_page_t page = nil;
	u_int page_order = order;
	while (page == nil && page_order < MM_NR_ORDERS) {
		struct mm_pool *pool = &zone->pools[page_order];

		disable_intr();
		spin_lock(&pool->lock);
		if (pool->free_entries > 0) {
			page = clist_del_first_entry(&pool->freelist, typeof(*page), link);
			/* increment the reference count while we hold the lock on the pool,
			 * so that no other processor can try to coalesce this block if its
			 * buddy is being freed (coalition is only possible if the buddy
			 * has a reference count of zero, and while holding the pool lock) */
			page_get(page);
			pool->free_entries--;
		} else {
			page_order++;
		}
		spin_unlock(&pool->lock);
		intr_restore(cpuflags);
	}

	/*
	 * if we found a page, check if we need to split it up
	 * (which is the case if we took one from a higher order freelist)
	 */
	if (page != nil) {
		usize pfn = pg2pfn(page);
		page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
				 order, pfn, page_order);
		pg_flip_bit(zone, pfn, page_order);

		/* split the page and insert the upper halves into the
		 * respective freelist until we reach the requested order */
		while (page_order-- > order) {
			page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
			struct mm_pool *pool = &zone->pools[page_order];
			vm_page_t buddy = page + (1 << page_order);
			buddy->order = page_order;
			pg_flip_bit(zone, pfn + (1 << page_order), page_order);

			disable_intr();
			spin_lock(&pool->lock);
			clist_add_first(&pool->freelist, &buddy->link);
			pool->free_entries++;
			spin_unlock(&pool->lock);
			intr_restore(cpuflags);
		}

		page->order = order;
		void *vaddr = pfn2vaddr(pfn);

		return vaddr;
	} else {
		return nil;
	}
	mtx_lock(&caches_lock);
}

	struct clist *entry = nil;
	int entry_order;
	for (entry_order = order; entry_order < CACHE_ORDERS; entry_order++) {
		if (caches[entry_order].free_entries > 0) {
			entry = caches[entry_order].freelist.next;
			break;
		}
	}
/* faster memset for whole pages */
static inline void init_pages(u_long *start, u_long val, u_int order)
{
	u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
	do {
		*start++ = val;
	} while (start != end);
}

	if (entry_order != CACHE_ORDERS) {
		clist_del(entry);
		caches[entry_order].free_entries--;
void *get_pages(u_int order, enum mflags flags)
{
	void *pages = __get_pages(order, flags);

		usize bit_number = get_bit_number(entry, entry_order);
		while (entry_order > order) {
			entry = split_buddy(entry, entry_order);
			bit_set(caches[entry_order].bitmap, bit_number);
			entry_order--;
			bit_number <<= 1;
		}
		bit_set(caches[order].bitmap, bit_number);
#if CFG_POISON_PAGES
	if (pages != nil)
		init_pages(pages, PAGE_POISON_ALLOC, order);
#endif

# if CFG_POISON_PAGES
		memset(entry, 'a', 1 << ORDER_SHIFT(order));
# endif
	}
	return pages;
}

	mtx_unlock(&caches_lock);
	return (void *)entry;
void *get_page(enum mflags flags)
{
	void *pages = __get_pages(0, flags);

#if CFG_POISON_PAGES
	if (pages != nil)
		init_pages(pages, PAGE_POISON_ALLOC, 0);
#endif

	return pages;
}

void *get_zero_pages(u_int order, enum mflags flags)
{
	void *pages = __get_pages(order, flags);

	if (pages != nil)
		init_pages(pages, 0, order);

	return pages;
}

void *get_zero_page(enum mflags flags)
{
	void *page = __get_pages(0, flags);

	if (page != nil)
		init_pages(page, 0, 0);

	return page;
}

/*
 * Two buddies can be merged if:
 * - you currently hold the lock for the pool
 * - they both have a reference count of zero
 * - they are in the same zone
 * - neither of them is reserved
 *
 * This is only called from within the critical section of free_pages(),
 * so execution speed is prioritized over anything else.
 */
static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
	bool merge = (atom_read(&buddy->count) == 0);

	/* we know that `page` doesn't have PG_RESERVED set,
	 * because we check that flag before anything else */
	const unsigned mask = PG_RESERVED | PG_DMA;
	merge &= (page->flags & mask) == (buddy->flags & mask);

	return merge;
}

void free_pages(void *ptr)
{
# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % PAGE_SIZE) {
		kprintf("free_pages(%p): unaligned ptr!\n", ptr);
		return;
	PAGE_DEBUG_BLOCK {
		if (ptr < DMAP_START || ptr >= DMAP_END) {
			panic("free_pages(%p): not in DMAP region\n", ptr);
		}
# endif

	if (sus_nil(ptr)) {
		page_debug("free_pages(%p): tried to free NULL!\n", ptr);
		return;
	}

	int order = 0;
	usize bit_number = get_bit_number(ptr, order);
	for (; order < CACHE_ORDERS; order++) {
		if (bit_tst(caches[order].bitmap, bit_number))
			break;
		bit_number >>= 1;
	}
	register_t cpuflags = read_flags();

	if (order == CACHE_ORDERS) {
		page_debug("free_pages(%p): double free!\n", ptr);
		return;
	}
	int original_order = order;
	vm_page_t page = vaddr2pg(ptr);
	panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);

	mtx_lock(&caches_lock);

	while (order < CACHE_ORDERS - 1) {
		bit_clr(caches[order].bitmap, bit_number);

		void *tmp = try_join_buddy(ptr, order);
		if (tmp == nil)
			break;

		ptr = tmp;
		order++;
		bit_number >>= 1;
	}

	if (order == CACHE_ORDERS - 1 && original_order != CACHE_ORDERS - 1)
		set_pflags(HUGEPAGE_ALIGN(ptr), P_HUGE | P_RW);
	u_int order = page->order;
	PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
	u_long pfn = vaddr2pfn(ptr);

#if CFG_POISON_PAGES
	memset(ptr, 'A', 1 << ORDER_SHIFT(order));
	init_pages(ptr, PAGE_POISON_FREE, order);
#endif

	clist_add(&caches[order].freelist, (struct clist *)ptr);
	caches[order].free_entries++;
	int old_count = atom_sub(&page->count, 1);
	if (old_count != 1) {
		if (old_count == 0)
			panic("double free of page %p", ptr);
		else
			panic("attempted to free page %p with references", ptr);
	}

	mtx_unlock(&caches_lock);
}

static inline void *split_buddy(void *ptr, int level)
{
# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % (1 << ORDER_SHIFT(level))) {
		kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level);
		return nil;
	}
	if (level < 1 || level >= CACHE_ORDERS) {
		kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level);
		return nil;
	}
# endif

	struct clist *high_buddy = ptr + (1 << ORDER_SHIFT(level - 1));
	clist_add(&caches[level - 1].freelist, high_buddy);
	caches[level - 1].free_entries++;

	page_debug_noisy("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level);

	return ptr;
}

static void *try_join_buddy(void *ptr, int order)
{
	const usize entry_size = 1 << ORDER_SHIFT(order);

# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % entry_size) {
		kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, order);
		return nil;
	}
	/* order must be < CACHE_ORDERS - 1 because you
	 * can't join blocks on the topmost order */
	if (order >= CACHE_ORDERS - 1) {
		kprintf("try_join_buddy(%p, %d): order >= CACHE_ORDERS - 1!\n", ptr, order);
		return nil;
	}
# endif

	/*
	 * Test whether the buddy block is allocated and return nil if it is.
	 * entry_size is a power of 2, so we can quickly get to the buddy block
	 * with a cheap XOR of the address and the entry size without the need
	 * for any if branches.
	 */
	uintptr_t buddy = (uintptr_t)ptr ^ entry_size;
	usize buddy_bitnum = get_bit_number((void *)buddy, order);
	if (bit_tst(caches[order].bitmap, buddy_bitnum))
		return nil;

	page_debug_noisy("join (%p:%p), order=%d\n", ptr, (void *)buddy, order);

	/* If the buddy is free, we remove it from the freelist ... */
	clist_del((struct clist *)buddy);
	caches[order].free_entries--;

	/*
	 * ... and return a pointer to the coalesced block.
	 * We use the same trick as above to get to the even (lower) block, just
	 * that this time we're zeroing the bit out rather than flipping it.
	 */
	uintptr_t even = (uintptr_t)ptr & ~entry_size;
	return (void *)even;
	struct mm_zone *zone;
	if (page->flags & PG_DMA)
		zone = &mm_zones[MM_ZONE_DMA];
	else
		zone = &mm_zones[MM_ZONE_NORMAL];

	latom_add(&zone->free_count, (1 << order));

	/* try to coalesce free buddy blocks until we've reached the highest order */
	while (order < MM_MAX_ORDER) {
		if (pg_flip_bit(zone, pfn, order))
			break;

		page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order);

		/* precompute all values we need inside the critical section
		 * to avoid blocking other CPUs for longer than necessary */
		vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
		vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
		struct mm_pool *current_order_pool = &zone->pools[order];
		struct mm_pool *next_order_pool = &zone->pools[order + 1];

		disable_intr();
		spin_lock(&zone->pools[order].lock);
		if (can_merge(page, buddy)) {
			clist_del(&buddy->link);
			current_order_pool->free_entries--;
			buddy->order = order + 1;
			page->order = order + 1;
			clist_add(&next_order_pool->freelist, &low->link);
			next_order_pool->free_entries++;
		} else {
			order = MM_MAX_ORDER; /* break out of the loop */
		}
		spin_unlock(&zone->pools[order].lock);
		intr_restore(cpuflags);

		page = low;
		order++;
	}

	/* finally, we need to insert the page at its freelist */
	struct mm_pool *pool = &zone->pools[order];
	disable_intr();
	spin_lock(&pool->lock);
	clist_add(&pool->freelist, &page->link);
	pool->free_entries++;
	spin_unlock(&zone->pools[order].lock);
	intr_restore(cpuflags);
}
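The other mechanism worth calling out for reviewers is the per-zone emergency reserve: `__get_pages()` charges `zone->free_count` up front with `latom_sub()` and rolls the charge back if the result would dip into `zone->thrsh.emerg` (unless the caller passed an emergency flag). Below is a minimal standalone sketch of that pattern only; the struct, function, and threshold names are made up for the example, and a signed counter is assumed so that underflow past zero is detectable.

```c
/* Standalone sketch of the watermark check done before touching any freelist.
 * Build with: cc -std=c11 -o reservedemo reservedemo.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_zone {
	_Atomic long free_count;  /* number of free pages in the zone */
	long emerg_threshold;     /* pages held back for emergency allocations */
};

/* Optimistically charge 2^order pages to the zone. If that dips into the
 * emergency reserve (or below zero), undo the charge and fail, unless the
 * caller is explicitly allowed to tap the reserve. */
static bool charge_pages(struct demo_zone *zone, unsigned int order, bool emerg)
{
	long nr = 1l << order;
	long count_after = atomic_fetch_sub(&zone->free_count, nr) - nr;

	if (count_after < zone->emerg_threshold) {
		if (count_after < 0 || !emerg) {
			atomic_fetch_add(&zone->free_count, nr); /* roll back */
			return false;
		}
	}
	return true;
}

int main(void)
{
	struct demo_zone zone = { .free_count = 40, .emerg_threshold = 32 };

	printf("order-3 normal alloc: %d\n", charge_pages(&zone, 3, false)); /* 1: ok, 32 left */
	printf("order-3 normal alloc: %d\n", charge_pages(&zone, 3, false)); /* 0: would eat reserve */
	printf("order-3 emerg alloc:  %d\n", charge_pages(&zone, 3, true));  /* 1: reserve allowed */
	return 0;
}
```

Charging the counter before taking any pool lock keeps the reserve decision entirely lock-free; only the freelist manipulation itself needs the per-order spinlock.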