mm: generalize boot allocator

The boot page frame allocator is now architecture independent.
This is part 2 of the mm subsystem refactor.
main
anna 3 years ago
parent 7285c2e076
commit d19e665d47
Signed by: fef
GPG Key ID: EC22E476DC2D3D84

@ -42,8 +42,8 @@
.extern _boot /* main boot routine -- see ./boot.c */
/* initial page maps -- see ../mm/amd64/page.c */
.extern _pml4
.extern _pdp0
.extern _pml4t
.extern _pdpt0
/* GDT stuff -- see ../mm/segment.S */
.extern _x86_gdt_desc
@ -137,8 +137,8 @@ ENTRY(_setup)
#endif
#define V48 0xffff000000000000
#define PDP_OFFSET(ptr) (( (((ptr) - V48) >> X86_PDPT_SHIFT) % 512 ) * 8)
#define PML4_OFFSET(ptr) ( ((ptr) - V48) >> (X86_PML4T_SHIFT) * 8 )
#define PDPT_OFFSET(ptr) (( (((ptr) - V48) >> X86_PDPT_SHIFT) % 512 ) * 8)
#define PML4T_OFFSET(ptr) ( ((ptr) - V48) >> (X86_PML4T_SHIFT) * 8 )
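/*
 * Worked example (illustrative; assumes the usual amd64 layout with
 * KERNBASE = 0xffffffff80000000, X86_PML4T_SHIFT = 39, X86_PDPT_SHIFT = 30):
 *   (KERNBASE - V48) >> 39 = 511          -> PML4T_OFFSET(KERNBASE) = 511 * 8 = 4088 (last PML4T entry)
 *   ((KERNBASE - V48) >> 30) % 512 = 510  -> PDPT_OFFSET(KERNBASE)  = 510 * 8 = 4080 (second to last PDPT entry)
 */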
/*
* statically map the low 2 GB to itself and to the high kernel half
@ -150,15 +150,15 @@ ENTRY(_setup)
* both low and high memory, so technically this creates a total of four
* mappings (+0 GB, +510 GB, -512 GB, -2 GB), but we remove all except
* the -2GB one once we have transitioned to high memory. */
movl $0x00000083, PADDR(_pdpt0 + PDP_OFFSET(KERNBASE))
movl $0x40000083, PADDR(_pdpt0 + PDP_OFFSET(KERNBASE + 0x40000000))
movl $0x00000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE))
movl $0x40000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE + 0x40000000))
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1), huge (7) */
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t + PML4_OFFSET(KERNBASE))
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t + PML4T_OFFSET(KERNBASE))
/* map the PML4 to itself */
movl $PADDR(_pml4t + 0x003), PADDR(_pml4t + PML4_OFFSET(X86_PMAP_OFFSET))
movb $0x80, PADDR(_pml4t + PML4_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
movl $PADDR(_pml4t + 0x003), PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET))
movb $0x80, PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
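/*
 * Flag reference for the constants above (standard amd64 page table bits):
 *   0x00000083 / 0x40000083 = present (bit 0) | writable (bit 1) | huge (bit 7),
 *       i.e. two 1 GiB pages backed by physical 0x00000000 and 0x40000000.
 *   PADDR(_pdpt0) + 0x003   = present | writable, pointing the PML4T at _pdpt0.
 *   The recursive PML4T entry gets the same 0x003 flags plus NX (bit 63),
 *   which is what the movb into its top byte sets.
 */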
/*
* ensure paging is disabled by clearing CR0.PG (bit 31)
@ -192,7 +192,7 @@ ENTRY(_setup)
/*
* enable:
* CR0.PG (Paging, bit31)
* CR0.PG (Paging, bit 31)
* CR0.WP (Write Protect, bit 16)
*/
movl %cr0, %eax

@ -63,8 +63,7 @@
/** @brief Binary logarithm of `HUGEPAGE_SIZE`. */
#define HUGEPAGE_SHIFT X86_PDT_SHIFT
/** @brief Binary logarithm of `GIGAPAGE_SIZE`. */
#define GIGAPAGE_SHIFT
#define GIGAPAGE_SIZE (1 << GIGAPAGE_SHIFT)
#define GIGAPAGE_SHIFT X86_PDPT_SHIFT
#ifndef _ASM_SOURCE

@ -0,0 +1,6 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#pragma once
/** @brief Maximum address for legacy DMA transfers */
#define DMA_LIMIT (1 << 24)
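/* 1 << 24 = 16 MiB, i.e. the highest address the 24-bit legacy ISA DMA controller can reach */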

@ -23,6 +23,9 @@
#endif
#define HUGEPAGE_SIZE (1 << HUGEPAGE_SHIFT)
#ifdef __HAVE_GIGAPAGES
#define GIGAPAGE_SIZE (1 << GIGAPAGE_SHIFT)
#endif
#ifndef _ASM_SOURCE
@ -38,6 +41,10 @@ void x86_paging_init(struct mb2_tag_mmap *mmap);
#define PAGE_ALIGN(ptr) ((typeof(ptr))( (uintptr_t)(ptr) & PAGE_MASK ))
#define HUGEPAGE_ALIGN(ptr) ((typeof(ptr))( (uintptr_t)(ptr) & HUGEPAGE_MASK ))
#ifdef __HAVE_GIGAPAGES
#define GIGAPAGE_MASK ( ~((unsigned long)GIGAPAGE_SIZE - 1) )
#define GIGAPAGE_ALIGN(ptr) ((typeof(ptr))( (uintptr_t)(ptr) & GIGAPAGE_MASK ))
#endif
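/*
 * Illustrative examples (assuming HUGEPAGE_SHIFT == 21 and GIGAPAGE_SHIFT == 30
 * as on amd64):
 *   HUGEPAGE_ALIGN(0x40345678) == 0x40200000
 *   GIGAPAGE_ALIGN(0x40345678) == 0x40000000
 */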
/* page fault status code bits */
#define X86_PF_PRESENT (1u << 0)

@ -1,6 +1,7 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <arch/atom.h>
#include <arch/dma.h>
#include <arch/multiboot.h>
#include <arch/vmparam.h>
@ -13,91 +14,39 @@
#include <inttypes.h>
#include <string.h>
/*
* This file is funny.
* Our job here seems simple at first glance: initialize the vm_page_array.
* The catch is that we can't use the regular kernel memory allocators for
* doing so, because those depend on vm_page_array. Classic chicken/egg stuff.
* So, how do we allocate (and map!) memory for the array? Simple, by using a
* completely separate page frame allocator that is so basic that it can't even
* free pages again. That's not a problem though, because it doesn't need to.
* Memory maps are created manually, which is very painful, but doable.
* HOWEVER! This boot page frame allocator needs to allocate memory for keeping
* track of which memory areas were already allocated and which ones are still
* free, too. Areas might also have to be split, if the region we want to
* allocate is not the exact size of the physical area. Therefore, we have
* *another* allocator, which is basically the most primitive slab allocator in
* existence. It uses a fixed-size "slab" (the `free_areas` array below), and
* keeps track of which free areas are available.
*
* To sum up:
* - The boot "slab" allocator hands out `struct free_area`s to ...
* - the boot page frame allocator, which is used to set up ...
* - the buddy page frame allocator, which serves as a backend to ...
* - the kernel slab allocator.
*
* XXX the boot memory allocator could probably be moved to an architecture
* independent file, because it is not really specific to the x86.
*/
struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET;
#ifdef DEBUG
/* this gets updated in x86_setup_paging() once we know how big the array is */
vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH);
#endif
/**
* @brief Memory area information for the boot page frame allocator.
* The multiboot bootloader gives us an array of memory areas, and tells us
* which ones are available and which aren't. We insert all available areas
* into a circular list (`free_area_list`), and the boot page frame allocator
* iterates over that list for getting memory.
*
* Also, this is probably one of the most unfortunately named structures in the
* entire system, because instances of this structure need to be allocated and,
* well, freed.
*/
struct free_area {
struct clist link;
vm_paddr_t start;
vm_size_t end;
};
/** @brief This is essentially a very basic slab. */
static struct free_area free_areas[16];
/** @brief List of all free memory areas, ordered by ascending address */
static CLIST(free_area_list);
/**
* @brief List of all the unused members in `free_areas`.
* This is essentially a very basic slab freelist.
*/
static CLIST(free_area_freelist);
/**
* @brief VERY early page frame allocator.
*
* Allocates `1 << log2` bytes of memory, aligned to at least its own size.
*
* @param log2 Binary logarithm of the allocation size. Must be at least `PAGE_SHIFT`.
* @returns Physical address of the allocated region, or `BOOT_PMALLOC_ERR` on failure
*/
static vm_paddr_t __boot_pmalloc(u_int log2);
#define BOOT_PMALLOC_ERR (~0ul)
/** @brief Zero out a single page (required for page tables) */
static void __boot_clear_page(vm_paddr_t paddr);
/** @brief Initialize the members of `vm_page_array` within the given range. */
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags);
/** @brief Add a new entry to the list of free memory areas. */
static void insert_free_area(struct mb2_mmap_entry *entry);
static void init_free_area_freelist(void);
static void print_mem_area(struct mb2_mmap_entry *entry);
static void register_area(struct mb2_mmap_entry *entry)
{
vm_paddr_t start = entry->addr;
vm_paddr_t end = start + entry->len;
if (start >= DMA_LIMIT) {
__boot_register_mem_area(start, end, MM_ZONE_NORMAL);
} else if (start < DMA_LIMIT && end > DMA_LIMIT) {
__boot_register_mem_area(start, DMA_LIMIT, MM_ZONE_DMA);
__boot_register_mem_area(DMA_LIMIT, end, MM_ZONE_NORMAL);
} else if (start < DMA_LIMIT && end <= DMA_LIMIT) {
__boot_register_mem_area(start, end, MM_ZONE_DMA);
} else {
panic("congratulations, you reached an unreachable branch");
}
}
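/*
 * Worked example (addresses made up): an available area spanning
 * 0x00a00000-0x02000000 straddles DMA_LIMIT (0x01000000), so it gets split into
 * [0x00a00000, 0x01000000) -> MM_ZONE_DMA and [0x01000000, 0x02000000) -> MM_ZONE_NORMAL.
 */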
/*
* "Oh cool another deeply nested 100-liner that nobody understands"
*/
void x86_paging_init(struct mb2_tag_mmap *mmap)
{
init_free_area_freelist();
__boot_pmalloc_init();
/*
* insert all free areas and find the end of physical memory
@ -110,7 +59,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
end = max(end, entry_end);
print_mem_area(entry);
if (entry->type == MB2_MEMORY_AVAILABLE)
insert_free_area(entry);
register_area(entry);
entry = (void *)entry + mmap->entry_size;
}
@ -127,15 +76,23 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
remaining_size = align_ceil(remaining_size, PAGE_SIZE);
kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size);
/* PML4T loop */
while (remaining_size != 0) {
/* Is vm_page_array so huge that it spans almost the entire 2 TB
* kernel region? If that's the case, something has gone terribly
* wrong, unless we somehow happen to have about an Exabyte of RAM
* (which is not physically addressable by the CPU's 40-bit bus). */
KASSERT(map_pos < (void *)KERNBASE);
x86_pml4te_t *pml4te = X86_PML4TE(map_pos);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pml4te_val != BOOT_PMALLOC_ERR);
panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array");
__boot_clear_page(pml4te_val);
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pml4te->val = pml4te_val;
vm_flush();
/* PDPT loop */
for (int pdpt_index = 0; pdpt_index < 512; pdpt_index++) {
x86_pdpte_t *pdpte = X86_PDPTE(map_pos);
vm_paddr_t pdpte_val;
@ -148,7 +105,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
* and clang is emitting the check. So it's fine, i guess. */
if (pdpte_val != BOOT_PMALLOC_ERR) {
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
| __P_GLOBAL | __P_NOEXEC;
| __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
remaining_size -= 1 << X86_PDPT_SHIFT;
map_pos += 1 << X86_PDPT_SHIFT;
@ -160,12 +117,14 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
/* couldn't use a gigapage, continue in hugepage steps */
pdpte_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pdpte_val != BOOT_PMALLOC_ERR);
panic_if(pdpte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdpte_val);
pdpte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
vm_flush();
/* PDT loop */
for (int pdt_index = 0; pdt_index < 512; pdt_index++) {
x86_pdte_t *pdte = X86_PDTE(map_pos);
vm_paddr_t pdte_val;
@ -175,7 +134,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
pdte_val = __boot_pmalloc(X86_PDT_SHIFT);
if (pdte_val != BOOT_PMALLOC_ERR) {
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
| __P_HUGE | __P_NOEXEC;
pdte->val = pdte_val;
remaining_size -= 1 << X86_PDT_SHIFT;
map_pos += 1 << X86_PDT_SHIFT;
@ -187,16 +146,19 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
/* couldn't use a hugepage, continue in page steps */
pdte_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pdte_val != BOOT_PMALLOC_ERR);
panic_if(pdte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdte_val); /* zero the freshly allocated page table */
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pdte->val = pdte_val;
vm_flush();
/* PT loop */
for (int pt_index = 0; pt_index < 512; pt_index++) {
x86_pte_t *pte = X86_PTE(map_pos);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT);
KASSERT(pte_val != BOOT_PMALLOC_ERR);
panic_if(pte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pte->val = pte_val;
@ -205,104 +167,12 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
if (remaining_size == 0)
goto map_done;
} /* end of PT loop */
} /* end of PD loop */
} /* end of PDP loop */
} /* end of PML4 loop */
} /* end of PDT loop */
} /* end of PDPT loop */
} /* end of PML4T loop */
map_done:
vm_flush();
/*
* initialize the individual pages and calculate the usable RAM size
*/
vm_paddr_t prev_end = 0;
vm_size_t available_ram = 0;
struct free_area *cursor;
clist_foreach_entry(&free_area_list, cursor, link) {
/* list should have been ordered by ascending size */
KASSERT(cursor->start >= prev_end);
if (cursor->start != prev_end) {
vm_paddr_t reserved_start = prev_end;
vm_paddr_t reserved_end = cursor->start;
init_page_range(reserved_start, reserved_end, PG_RESERVED);
}
init_page_range(cursor->start, cursor->end, 0);
prev_end = cursor->end;
available_ram += cursor->end - cursor->start;
}
kprintf("Available RAM: %"PRIdVM_SIZE" bytes\n", available_ram);
}
static struct free_area *alloc_free_area_entry(void)
{
/* XXX this should pretty much never happen, but it would still be nice to
* have at least some sort of error recovery rather than giving up */
if (clist_is_empty(&free_area_freelist))
panic("Boot memory allocator has run out of free_areas");
return clist_del_first_entry(&free_area_freelist, struct free_area, link);
}
static void free_free_area_entry(struct free_area *area)
{
#ifdef DEBUG
area->start = ~0ul;
area->end = ~0ul;
#endif
clist_add(&free_area_freelist, &area->link);
}
static void init_free_area_freelist(void)
{
for (u_int i = 0; i < ARRAY_SIZE(free_areas); i++)
clist_add(&free_area_freelist, &free_areas[i].link);
}
static void insert_free_area(struct mb2_mmap_entry *entry)
{
vm_paddr_t start = align_ceil(entry->addr, PAGE_SIZE);
vm_paddr_t end = align_floor(entry->addr + entry->len, PAGE_SIZE);
if (start <= image_start_phys && end >= image_end_phys) {
/*
* This is the area that the kernel image is loaded in, which we need
* to treat differently than all the others because it gets split up
* into two usable areas. Illustration (addresses are examples only):
*
* 0x01000000 ---------------------- end (high_end)
* : <free real estate>
* 0x00500000 ---------------------- image_end_phys (high_start)
* : <kernel code & data>
* 0x00400000 ---------------------- image_start_phys (low_end)
* : <free real estate>
* 0x00100000 ---------------------- start (low_start)
*
* (we silently assert that the image always spans only one region)
*/
vm_paddr_t low_start = start;
vm_paddr_t low_end = align_floor(image_start_phys, PAGE_SIZE);
if (low_start < low_end) {
struct free_area *area = alloc_free_area_entry();
area->start = low_start;
area->end = low_end;
clist_add(&free_area_list, &area->link);
}
vm_paddr_t high_start = align_ceil(image_end_phys, PAGE_SIZE);
vm_paddr_t high_end = end;
if (high_start < high_end) {
struct free_area *area = alloc_free_area_entry();
area->start = high_start;
area->end = high_end;
clist_add(&free_area_list, &area->link);
}
} else {
struct free_area *area = alloc_free_area_entry();
area->start = start;
area->end = end;
clist_add(&free_area_list, &area->link);
}
}
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
@ -324,64 +194,6 @@ static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
}
}
/*
* This works relatively simple, actually.
* We iterate over the list of `struct free_area`s in reverse order because the
* list is sorted by ascending physical address and i've decided that we prefer
* using higher physical addresses for the page array. The first fit wins, and
* all that's left is to split up the area and insert the top and bottom
* remainder back into the list, if applicable.
*/
static vm_paddr_t __boot_pmalloc(u_int log2)
{
const usize alloc_size = 1 << log2;
KASSERT(log2 >= PAGE_SHIFT); /* never hand out less than a full page */
struct free_area *cursor;
clist_foreach_entry_rev(&free_area_list, cursor, link) {
vm_paddr_t area_start = cursor->start;
vm_paddr_t area_end = cursor->end;
KASSERT(area_start < area_end);
/* the areas tend to be aligned to greater sizes at their beginning */
vm_paddr_t alloc_start = align_ceil(area_start, alloc_size);
vm_paddr_t alloc_end = alloc_start + alloc_size;
if (alloc_start >= area_start && alloc_end <= area_end) {
/*
* Example with log2 == 21 (alloc_size == 0x00200000):
*
* 0x00500000 ------------------- area_end (not aligned)
* : <high_rest>
* 0x00400000 ------------------- alloc_end (aligned to alloc_size)
* : <allocated block>
* 0x00200000 ------------------- alloc_start (aligned to alloc_size)
* : <low_rest>
* 0x00100000 ------------------- area_start (not aligned)
*/
if (alloc_start > area_start) {
struct free_area *low_rest = alloc_free_area_entry();
low_rest->start = area_start;
low_rest->end = alloc_start;
clist_add(&cursor->link, &low_rest->link);
}
if (alloc_end < area_end) {
struct free_area *high_rest = alloc_free_area_entry();
high_rest->start = alloc_end;
high_rest->end = area_end;
clist_add_first(&cursor->link, &high_rest->link);
}
clist_del(&cursor->link);
free_free_area_entry(cursor);
return alloc_start;
}
}
return BOOT_PMALLOC_ERR;
}
/*
* It's really unfortunate that we have to zero a page before we can use it as
* a page table, yet also need to reference it in the page table structures
@ -389,7 +201,7 @@ static vm_paddr_t __boot_pmalloc(u_int log2)
* This little hack temporarily maps the area at one PDP entry before KERNBASE
* (meaning index 1022 of _pdpt0), zeroes the area, and then unmaps it again.
*/
static void __boot_clear_page(vm_paddr_t paddr)
void __boot_clear_page(vm_paddr_t paddr)
{
vm_paddr_t pbase = align_floor(paddr, 1 << X86_PDPT_SHIFT);
vm_offset_t offset = paddr - pbase;

@ -9,15 +9,19 @@
#include <gay/_null.h>
#ifdef __cplusplus
#if defined(_KERNEL) && !defined(_CXX_KERNEL)
#error "C++ cannot be used in kernel code. Define _CXX_KERNEL if you know what you're doing."
#endif
/** @brief Use `__restrict` in header files, and just `restrict` in C code */
#define __restrict
#define __BEGIN_DECLS extern "C" {
#define __END_DECLS }
#else
#else /* not __cplusplus */
#define __BEGIN_DECLS
#define __END_DECLS
#endif
#endif /* __cplusplus */
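/*
 * Usage sketch (hypothetical prototype): headers that may be included from C++
 * simply wrap their declarations:
 *
 *   __BEGIN_DECLS
 *   void some_function(int arg);
 *   __END_DECLS
 */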
/** @brief Annotated symbol is an alias for another symbol. */
#define __alias(name) __attribute__(( alias(#name) ))

@ -47,6 +47,7 @@ void clist_init(struct clist *list);
* @param new New node to insert at the end
*/
void clist_add(struct clist *head, struct clist *new);
#define clist_insert_before(node, new) clist_add(node, new)
/**
* @brief Add a new node at the beginning of a clist.
@ -55,6 +56,7 @@ void clist_add(struct clist *head, struct clist *new);
* @param new New node to insert at the beginning
*/
void clist_add_first(struct clist *head, struct clist *new);
#define clist_insert_after(node, new) clist_add_first(node, new)
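/*
 * Note on the two aliases above: because a clist is circular, appending to a
 * list headed at `node` places the new entry immediately before `node`, and
 * prepending places it immediately after; that is exactly what the
 * insert_before/insert_after names express when `node` is an ordinary member
 * rather than the list head.
 */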
/**
* @brief Remove a node from a clist.

@ -23,6 +23,7 @@
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/types.h>
@ -33,12 +34,22 @@
#define _M_EMERG (1 << 1)
#define _M_NOWAIT (1 << 2)
#define MM_ZONE_NORMAL 0
#define MM_ZONE_DMA 1
enum mm_zone_type {
MM_ZONE_NORMAL = _M_ZONE_NORMAL,
MM_ZONE_DMA = _M_ZONE_DMA,
MM_NR_ZONES
};
struct _bmem_area {
struct clist link; /* -> struct mm_zone::_bmem_areas */
vm_paddr_t start;
vm_paddr_t end;
};
struct mm_zone {
patom_t freelist; /* -> struct page */
patom_t freelist; /* -> struct vm_page */
usize length;
struct clist _bmem_areas; /* -> struct _bmem_area */
};
/**
@ -48,7 +59,7 @@ struct mm_zone {
* The mm subsystem isn't NUMA aware, because it's not really a thing on desktop
* grade machines anyway and would only complicate things unnecessarily.
*/
extern struct mm_zone mm_zones[2];
extern struct mm_zone mm_zones[MM_NR_ZONES];
/**
* @brief Memory allocation flags passed to `kmalloc()`.
@ -262,4 +273,35 @@ static inline uintptr_t __p(void *virt)
return (uintptr_t)virt - DMAP_OFFSET;
}
/*
* Boot page frame allocator stuff, don't use these in regular code
*/
/** @brief Initialize the boot page frame allocator (called from `<arch>_paging_init()`) */
void __boot_pmalloc_init(void);
/**
* @brief Tell the boot page frame allocator about a free area in RAM.
* The area may overlap with the kernel image; this is checked automatically.
*/
void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type);
/**
* @brief Allocate a physical memory area.
*
* @param log2 Binary logarithm of the desired allocation size (must be `>= PAGE_SHIFT`)
* @param zone_type What zone to allocate from (you always want `MM_ZONE_NORMAL`)
* @return Allocated region (will be aligned to at least its own size),
* or `BOOT_PMALLOC_ERR` if the request could not be satisfied either
* due to OOM or because the alignment constraints failed
*/
vm_paddr_t __boot_pmalloc(u_int log2, enum mm_zone_type zone_type);
#define BOOT_PMALLOC_ERR ((vm_paddr_t)0 - 1)
/**
* @brief Zero out a single physical page.
* @param addr Physical address of the page in memory (must be page aligned, obviously)
*/
void __boot_clear_page(vm_paddr_t addr); /* implemented in arch dependent code */
#endif /* _KERNEL */
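/*
 * Illustrative sketch, not part of the actual interface: roughly the call
 * sequence an architecture's paging init is expected to follow (mirroring
 * x86_paging_init() in this commit). The physical addresses are made up;
 * everything else uses the declarations above.
 */
static void __boot_pmalloc_example(void)
{
	__boot_pmalloc_init();

	/* hand every free area reported by the bootloader to the allocator;
	 * overlaps with the kernel image are carved out automatically */
	__boot_register_mem_area(0x00100000, 0x00ef0000, MM_ZONE_DMA);
	__boot_register_mem_area(0x01000000, 0x7ffe0000, MM_ZONE_NORMAL);

	/* grab one page (aligned to at least its own size) and zero it, e.g. for a page table */
	vm_paddr_t pt = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
	panic_if(pt == BOOT_PMALLOC_ERR, "cannot reserve memory for page tables");
	__boot_clear_page(pt);
}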

@ -15,6 +15,10 @@
* @param fmt printf style format string
*/
void panic(const char *fmt, ...) __noreturn __printflike(1, 2);
#define panic_if(condition, msg, ...) do { \
if (__predict_false(condition)) \
panic(msg, ##__VA_ARGS__); \
} while (0)
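/* usage sketch (hypothetical condition and message): panic_if(ptr == NULL, "out of memory in %s", __func__); */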
void print_regs(const trap_frame_t *ctx);

@ -1,6 +1,7 @@
# Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved.
target_sources(gay_kernel PRIVATE
boot.c
kmalloc.c
page.c
slab.c

@ -0,0 +1,183 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <gay/clist.h>
#include <gay/linker.h>
#include <gay/mm.h>
#include <gay/systm.h>
#include <gay/util.h>
#include <limits.h>
static struct _bmem_area _bmem_area_cache[16];
static CLIST(bmem_area_freelist);
#ifdef DEBUG
#define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; })
#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0) /* poison must still be intact */
#else
#define debug_free_bmem_area(area) ({})
#define debug_get_bmem_area(area) ({})
#endif
static struct _bmem_area *get_bmem_area(void)
{
/* XXX this should pretty much never happen, but it would still be nice to
* have at least some sort of error recovery rather than giving up */
if (clist_is_empty(&bmem_area_freelist))
panic("Boot memory allocator has run out of areas");
struct _bmem_area *area = clist_del_first_entry(&bmem_area_freelist, typeof(*area), link);
debug_get_bmem_area(area);
return area;
}
static void free_bmem_area(struct _bmem_area *area)
{
debug_free_bmem_area(area);
clist_add(&bmem_area_freelist, &area->link);
}
static void insert_area_unsafe(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
{
KASSERT((start % PAGE_SIZE) == 0);
KASSERT((end % PAGE_SIZE) == 0);
struct _bmem_area *area = get_bmem_area();
area->start = start;
area->end = end;
struct mm_zone *zone = &mm_zones[zone_type];
struct _bmem_area *cursor;
clist_foreach_entry(&zone->_bmem_areas, cursor, link) {
if (cursor->start > area->start)
break;
}
clist_insert_before(&cursor->link, &area->link);
}
void __boot_pmalloc_init(void)
{
for (int i = 0; i < ARRAY_SIZE(_bmem_area_cache); i++) {
struct _bmem_area *area = &_bmem_area_cache[i];
debug_free_bmem_area(area);
clist_add(&bmem_area_freelist, &area->link);
}
}
void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
{
KASSERT(start < end);
start = align_ceil(start, PAGE_SIZE);
end = align_floor(end, PAGE_SIZE);
if (start == end)
return;
/* check for any overlaps with the kernel image and avoid those regions */
if (start <= image_start_phys && end >= image_end_phys) {
/*
* 0x8000 ---------------------- end (-> high_end)
* 0x7000 <free real estate>
* 0x6000 ---------------------- image_end_phys (-> high_start)
* 0x5000 <kernel code & data>
* 0x4000 ---------------------- image_start_phys (-> low_end)
* 0x3000 <free real estate>
* 0x2000 ---------------------- start (-> low_start)
*/
vm_paddr_t low_start = start;
vm_paddr_t low_end = align_floor(image_start_phys, PAGE_SIZE);
if (low_start < low_end)
insert_area_unsafe(low_start, low_end, zone_type);
vm_paddr_t high_start = align_ceil(image_end_phys, PAGE_SIZE);
vm_paddr_t high_end = end;
if (high_start < high_end)
insert_area_unsafe(high_start, high_end, zone_type);
} else if (start >= image_start_phys && start <= image_end_phys) {
/*
* 0x8000 ---------------------- end (-> high_end)
* 0x7000 <free real estate>
* 0x6000 ---------------------- image_end_phys (-> high_start)
* 0x5000 <kernel code & data>
* 0x4000 ---------------------- start
* 0x3000 <not part of area>
* 0x2000 ---------------------- image_start_phys
*/
vm_paddr_t high_start = align_ceil(image_end_phys, PAGE_SIZE);
vm_paddr_t high_end = end;
if (high_start < high_end)
insert_area_unsafe(high_start, high_end, zone_type);
} else if (end >= image_start_phys && end <= image_end_phys) {
/*
* 0x8000 ---------------------- image_end_phys
* 0x7000 <not part of area>
* 0x6000 ---------------------- end
* 0x5000 <kernel code & data>
* 0x4000 ---------------------- image_start_phys (-> low_end)
* 0x3000 <free real estate>
* 0x2000 ---------------------- start (-> low_start)
*/
vm_paddr_t low_start = start;
vm_paddr_t low_end = align_floor(image_start_phys, PAGE_SIZE);
if (low_start < low_end)
insert_area_unsafe(low_start, low_end, zone_type);
} else {
insert_area_unsafe(start, end, zone_type);
}
}
vm_paddr_t __boot_pmalloc(u_int log2, enum mm_zone_type zone_type)
{
/* never hand out less than a full page */
KASSERT(log2 >= PAGE_SHIFT);
/* this might fail if someone accidentally gives us a size rather than shift */
KASSERT(log2 < sizeof(vm_paddr_t) * CHAR_BIT);
const vm_size_t alloc_size = (vm_size_t)1 << log2;
struct mm_zone *zone = &mm_zones[zone_type];
struct _bmem_area *cursor;
clist_foreach_entry_rev(&zone->_bmem_areas, cursor, link) {
vm_paddr_t area_start = cursor->start;
vm_paddr_t area_end = cursor->end;
KASSERT(area_start < area_end);
/* XXX we should really use a best-fit algorithm for this */
vm_paddr_t alloc_start = align_ceil(area_start, alloc_size);
vm_paddr_t alloc_end = alloc_start + alloc_size;
if (alloc_start >= area_start && alloc_end <= area_end) {
/*
* Example with log2 == 14 (alloc_size == 0x4000):
*
* 0x9000 ------------------- area_end
* 0x8800 <high_rest>
* 0x8000 ------------------- alloc_end (aligned to 0x4000)
* : <allocated block>
* 0x4000 ------------------- alloc_start (aligned to 0x4000)
* 0x3000 <low_rest>
* 0x2000 ------------------- area_start
*/
if (alloc_start > area_start) {
struct _bmem_area *low_rest = get_bmem_area();
low_rest->start = area_start;
low_rest->end = alloc_start;
clist_insert_before(&cursor->link, &low_rest->link);
}
if (alloc_end < area_end) {
struct _bmem_area *high_rest = get_bmem_area();
high_rest->start = alloc_end;
high_rest->end = area_end;
clist_insert_after(&cursor->link, &high_rest->link);
}
clist_del(&cursor->link);
free_bmem_area(cursor);
return alloc_start;
}
}
return BOOT_PMALLOC_ERR;
}