mm: refactor entire mm subsystem, part 1

Another one of those larger endeavours that take
multiple commits.  This first one introduces the
basic vm_page data structure, as well as the x86
bootstrap code for initializing it.
Branch: main
Author: anna, 3 years ago
Parent: 2ace3d3505
Commit: 7285c2e076
Signed by: fef (GPG key ID EC22E476DC2D3D84)

@ -73,14 +73,14 @@ static void fb_init(enum vga_color fg, enum vga_color bg);
static void print_gay_propaganda(void);
static struct mb2_tag *next_tag(struct mb2_tag *tag);
static int handle_tag(struct mb2_tag *tag);
static int handle_mmap_tag(struct mb2_tag_mmap *tag);
static const char *mmap_type_name(u32 type);
static void handle_tag(struct mb2_tag *tag);
extern int main(int argc, char *argv[]);
__asmlink void _boot(void *address)
{
volatile int x = 69420;
while (x == 69420);
kprintf_set_printer(&fb_kprintf_printer);
fb_init(VGA_COLOR_LIGHT_GREY, VGA_COLOR_BLACK);
@ -88,24 +88,17 @@ __asmlink void _boot(void *address)
print_gay_propaganda();
int err = 0;
/* the +8 skips the total_size and reserved u32 fields at the start of
 * the multiboot2 boot information structure; the tag list is embedded
 * right after that fixed 8-byte header. */
for (struct mb2_tag *tag = address + 8; tag != NULL; tag = next_tag(tag)) {
err = handle_tag(tag);
if (err)
break;
}
for (struct mb2_tag *tag = address + 8; tag != NULL; tag = next_tag(tag))
handle_tag(tag);
if (!err)
main(0, NULL);
main(0, nil);
}
static inline int handle_tag(struct mb2_tag *tag)
static inline void handle_tag(struct mb2_tag *tag)
{
int ret = 0;
switch (tag->type) {
case MB2_TAG_TYPE_END:
break;
@ -113,62 +106,12 @@ static inline int handle_tag(struct mb2_tag *tag)
kprintf("Kernel command line: %s\n", ((struct mb2_tag_string *)tag)->string);
break;
case MB2_TAG_TYPE_MMAP:
ret = handle_mmap_tag((struct mb2_tag_mmap *)tag);
x86_paging_init((struct mb2_tag_mmap *)tag);
break;
default:
//kprintf("Unknown tag %u\n", tag->type);
break;
}
return ret;
}
static inline int handle_mmap_tag(struct mb2_tag_mmap *tag)
{
kprintf("Memory map:\n");
uintptr_t region = 0;
usize region_len = 0;
struct mb2_mmap_entry *entry = &tag->entries[0];
while ((void *)entry < (void *)tag + tag->tag.size) {
kprintf(" [0x%016llx-0x%016llx] %s\n",
entry->addr,
entry->addr + entry->len - 1,
mmap_type_name(entry->type));
usize safe_len;
# ifdef __x86_64__
safe_len = entry->len;
# else
if (entry->addr >= (1llu << 32))
safe_len = 0; /* we can't handle 64-bit pointers */
else if (entry->len > (1llu << 32) - entry->addr)
safe_len = (1llu << 32) - entry->addr; /* clip to 32-bit */
else
safe_len = entry->len;
# endif
if (entry->type == MB2_MEMORY_AVAILABLE && safe_len > region_len) {
region = entry->addr;
region_len = safe_len;
}
entry = (void *)entry + tag->entry_size;
}
if (region == 0 || region_len == 0) {
kprintf("No memory available! Aborting.\n");
return 1;
}
int err = kmalloc_init(region, region + region_len);
if (err) {
kprintf("kmalloc_init() failed! Aborting.\n");
return 1;
}
return 0;
}
static inline struct mb2_tag *next_tag(struct mb2_tag *tag)
@ -280,21 +223,3 @@ static void print_gay_propaganda(void)
fb_foreground = fg_before;
kprintf(", be gay do crime!\n\n");
}
static const char *mmap_type_name(u32 type)
{
switch (type) {
case MB2_MEMORY_AVAILABLE:
return "Available";
case MB2_MEMORY_RESERVED:
return "Reserved";
case MB2_MEMORY_ACPI_RECLAIMABLE:
return "ACPI";
case MB2_MEMORY_NVS: /* non-volatile storage */
return "NVS";
case MB2_MEMORY_BADRAM:
return "Bad RAM";
default:
return "Unknown";
}
}

@ -26,7 +26,7 @@ header_start: /* struct mb2_header */
/* header_length */
.long header_end - header_start
/* checksum */
.long (1 << 33) - MB2_HEADER_MAGIC - MB2_ARCHITECTURE_I386 - (header_end - header_start)
.long (1 << 32) - MB2_HEADER_MAGIC - MB2_ARCHITECTURE_I386 - (header_end - header_start)
#if 0 /* TODO: implement graphics */
.align MB2_TAG_ALIGN

@ -129,36 +129,36 @@ ENTRY(_setup)
movl $X86_KERN_TSS, %eax
ltr %ax
#if (KERNBASE % (1 << X86_PDP_SHIFT)) != 0
#if (KERNBASE % (1 << X86_PDPT_SHIFT)) != 0
#error "KERNBASE must be aligned to at least a PDP entry (1 GB)"
#endif
#if (X86_PMAP_OFFSET % (1 << X86_PML4_SHIFT)) != 0
#if (X86_PMAP_OFFSET % (1 << X86_PML4T_SHIFT)) != 0
#error "X86_PMAP_OFFSET must be aligned to at least a PML4 entry (512 GB)"
#endif
#define V48 0xffff000000000000
#define PDP_OFFSET(ptr) (( (((ptr) - V48) >> X86_PDP_SHIFT) % 512 ) * 8)
#define PML4_OFFSET(ptr) ( ((ptr) - V48) >> (X86_PML4_SHIFT) * 8 )
#define PDP_OFFSET(ptr) (( (((ptr) - V48) >> X86_PDPT_SHIFT) % 512 ) * 8)
#define PML4_OFFSET(ptr) ( ((ptr) - V48) >> (X86_PML4T_SHIFT) * 8 )
/*
* statically map the low 2 GB to itself and to the high kernel half
*/
/* for the identity mapping */
movl $0x00000083, PADDR(_pdp0) /* present (0), write (1), huge (7) */
movl $0x40000083, PADDR(_pdp0 + 8)
movl $0x00000083, PADDR(_pdpt0) /* present (0), write (1), huge (7) */
movl $0x40000083, PADDR(_pdpt0 + 8)
/* For the -2GB at the end of virtual memory. We use the same PDP for
* both low and high memory, so technically this creates a total of four
* mappings (+0 GB, +510 GB, -512 GB, -2 GB), but we remove all except
* the -2GB one once we have transitioned to high memory. */
movl $0x00000083, PADDR(_pdp0 + PDP_OFFSET(KERNBASE))
movl $0x40000083, PADDR(_pdp0 + PDP_OFFSET(KERNBASE + 0x40000000))
movl $0x00000083, PADDR(_pdpt0 + PDP_OFFSET(KERNBASE))
movl $0x40000083, PADDR(_pdpt0 + PDP_OFFSET(KERNBASE + 0x40000000))
movl $PADDR(_pdp0 + 0x003), PADDR(_pml4) /* present (0), write (1), huge (7) */
movl $PADDR(_pdp0 + 0x003), PADDR(_pml4 + PML4_OFFSET(KERNBASE))
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1), huge (7) */
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t + PML4_OFFSET(KERNBASE))
/* map the PML4 to itself */
movl $PADDR(_pml4 + 0x003), PADDR(_pml4 + PML4_OFFSET(X86_PMAP_OFFSET))
movb $0x80, PADDR(_pml4 + PML4_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
movl $PADDR(_pml4t + 0x003), PADDR(_pml4t + PML4_OFFSET(X86_PMAP_OFFSET))
movb $0x80, PADDR(_pml4t + PML4_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
/*
* ensure paging is disabled by clearing CR0.PG (bit 31)
@ -178,7 +178,7 @@ ENTRY(_setup)
movl %eax, %cr4
/* load cr3 with the PML4 */
movl $PADDR(_pml4), %eax
movl $PADDR(_pml4t), %eax
movl %eax, %cr3
/*
@ -249,9 +249,9 @@ L_ENTRY(_setup_highmem)
popfq
/* remove the low memory identity mapping and bonk the TLB */
movl $0, _pdp0
movl $0, _pdp0 + 8
movl $0, _pml4
movl $0, _pdpt0
movl $0, _pdpt0 + 8
movl $0, _pml4t
movq %cr3, %rax
movq %rax, %cr3

@ -5,30 +5,77 @@
#error "This file is not meant to be included directly, use <arch/page.h>"
#endif
/** @brief Binary logarithm of `HUGEPAGE_SIZE`. */
#define HUGEPAGE_SHIFT 21
/*
* Common abbreviations used throughout the entire x86 vm code base:
* PT - Page Table
* PDT - Page Directory Table
* PDPT - Page Directory Pointer Table
* PML4T - Page Map Level 4 Table
* PTE - Page Table Entry
* PDTE - Page Directory Table Entry
* PDPTE - Page Directory Pointer Table Entry
* PML4TE - Page Map Level 4 entry
* PTI - Page Table Index (range 0 - 511)
* PDTI - Page Directory Table Index (range 0 - 511)
* PDPTI - Page Directory Pointer Table Index (range 0 - 511)
* PML4TI - Page Map Level 4 Index (range 0 - 511)
*
* Quick recap on how the x86 transes virtual to physical addresses:
*
* |63 48|47 39|38 30|29 21|20 12|11 0|
* +------------------+-----------+-----------+-----------+-----------+--------------+
* | 16 bits | 9 bits | 9 bits | 9 bits | 9 bits | 12 bits |
* +------------------+-----------+-----------+-----------+-----------+--------------+
* (1) | (copy of bit 47) | PML4T | PDPT | PDT | PT | offset (4 K) |
* +------------------+-----------+-----------+-----------+-----------+--------------+
* (2) | (copy of bit 47) | PML4T | PDPT | PDT | offset (2 M) |
* +------------------+-----------+-----------+-----------+--------------------------+
* (3) | (copy of bit 47) | PML4T | PDPT | offset (1 G) |
* +------------------+-----------+-----------+--------------------------------------+
*
* %CR3: pointer to PML4T, 256 TB (2^36 pages)
* PML4T: 512 entries, 512 GB per entry (2^27 pages)
* PDPT: 512 entries, 1 GB per entry (2^18 pages)
* PDT: 512 entries, 2 MB per entry (2^9 pages)
* PT: 512 entries, 4 KB per entry (1 page)
*
* PDPT entries can either reference a PDT or a 1 GB region directly (if __P_HUGE is set)
* PDT entries can either reference a PT or a 2 MB region directly (if __P_HUGE is set)
*
* (1) shows a PML4T -> PDPT -> PDT -> PT regular mapping
* (2) shows a PML4T -> PDPT -> PDT hugepage mapping
* (3) shows a PML4T -> PDPT gigapage mapping
*
* Since the lowest 12 bits are always zero in any page map entry, they are
* used for flags. Additionally, bit 63 stores the NX (no execute) flag.
*/
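As a sanity check on the table above, the following standalone snippet (plain hosted C, purely illustrative and not part of this header) decomposes KERNBASE by hand; the shift amounts mirror the definitions that follow.

/* Illustration only: decompose KERNBASE (0xffffffff80000000, see vmparam.h)
 * into its four page map indices.  Compile and run as a normal user program. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t vaddr = 0xffffffff80000000ull;		/* KERNBASE, -2 GB */
	uint64_t v48 = vaddr & 0x0000ffffffffffffull;	/* strip the sign-extension bits */
	unsigned pml4ti = (v48 >> 39) & 511;		/* -> 511 (last PML4T slot) */
	unsigned pdpti  = (v48 >> 30) & 511;		/* -> 510 (second to last PDPT slot) */
	unsigned pdti   = (v48 >> 21) & 511;		/* -> 0 */
	unsigned pti    = (v48 >> 12) & 511;		/* -> 0 */
	printf("%u %u %u %u\n", pml4ti, pdpti, pdti, pti);
	return 0;
}

This matches what setup64.S does: the kernel image is reached through the last PML4T entry and PDPT entry 510.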
#include <arch/vmparam.h>
#define X86_PT_SHIFT PAGE_SHIFT
#define X86_PD_SHIFT (X86_PT_SHIFT + 9)
#define X86_PDP_SHIFT (X86_PD_SHIFT + 9)
#define X86_PML4_SHIFT (X86_PDP_SHIFT + 9)
#define X86_PT_SHIFT PAGE_SHIFT
#define X86_PDT_SHIFT (X86_PT_SHIFT + 9)
#define X86_PDPT_SHIFT (X86_PDT_SHIFT + 9)
#define X86_PML4T_SHIFT (X86_PDPT_SHIFT + 9)
#define __HAVE_NOEXEC
#define __HAVE_GIGAPAGES
/** @brief Binary logarithm of `HUGEPAGE_SIZE`. */
#define HUGEPAGE_SHIFT X86_PDT_SHIFT
/** @brief Binary logarithm of `GIGAPAGE_SIZE`. */
#define GIGAPAGE_SHIFT X86_PDPT_SHIFT
#define GIGAPAGE_SIZE (1 << GIGAPAGE_SHIFT)
#ifndef _ASM_SOURCE
#include <gay/cdefs.h>
#include <gay/types.h>
#define __HAVE_NOEXEC
/**
* @brief A single 64-bit Page Table Entry.
* @brief A single 64-bit page map entry, split up into its individual bit flags.
* The layout matches that of the Intel SDM, vol 3, sect 4.3, fig 4-4.
* Bits 9 and 10 (`slab` and `atomic`) are marked as AVL in the manual and
* ignored by the MMU. We only use them for `get_pflags()`/`set_pflags()`.
*/
struct x86_page_flags {
struct x86_pmap_flags {
/* 0 */bool present:1; /**< Page Fault on access if 0 */
/* 1 */bool rw:1; /**< Page Fault on write if 0 */
/* 2 */bool user:1; /**< Page Fault on user mode access if 0 */
@ -36,27 +83,30 @@ struct x86_page_flags {
/* 4 */bool cache_disabled:1; /**< Disable caching in TLB */
/* 5 */bool accessed:1; /**< 1 if page has been accessed */
/* 6 */bool dirty:1; /**< 1 if page has been written to */
/* 7 */bool huge:1; /**< only valid for PDPTEs and PDEs */
/* 8 */bool global:1; /**< Don't update the TLB on table swap if 1 */
/* 9 */bool slab:1; /**< Used by the slab allocator */
/* 10 */bool atomic:1; /**< Allocated atomically */
/* 11 */unsigned _unused:1;
/* 12 */uintptr_t shifted_address:51;
/* 63 */bool noexec:1;
/* 7 */bool huge:1; /**< Only valid for PDPTEs and PDTEs */
/* 8 */bool global:1; /**< Entry survives `vm_flush()` if 1 */
/* 9 */unsigned _unused:3;
/* 12 */vm_paddr_t shifted_address:51;
/* 63 */bool noexec:1; /**< Prevent instruction fetches */
} __packed;
#define __PFLAG_PRESENT (1 << 0)
#define __PFLAG_RW (1 << 1)
#define __PFLAG_USER (1 << 2)
#define __PFLAG_WRITE_THROUGH (1 << 3)
#define __PFLAG_NOCACHE (1 << 4)
#define __PFLAG_ACCESSED (1 << 5)
#define __PFLAG_DIRTY (1 << 6)
#define __PFLAG_HUGE (1 << 7)
#define __PFLAG_GLOBAL (1 << 8)
#define __PFLAG_SLAB (1 << 9)
#define __PFLAG_ATOMIC (1 << 10)
#define __PFLAG_NOEXEC (1 << 63)
/* bitmasks for the structure above */
#define __P_PRESENT (1 << 0)
#define __P_RW (1 << 1)
#define __P_USER (1 << 2)
#define __P_WRITE_THROUGH (1 << 3)
#define __P_NOCACHE (1 << 4)
#define __P_ACCESSED (1 << 5)
#define __P_DIRTY (1 << 6)
#define __P_HUGE (1 << 7)
#define __P_GLOBAL (1 << 8)
#define __P_SLAB (1 << 9)
#define __P_ATOMIC (1 << 10)
#define __P_NOEXEC (1ul << 63)
/** @brief Bitmask for extracting the physical address from a page map entry. */
#define X86_PMAP_MASK 0x7ffffffffffff000
/*
* these types are deliberately not merged into one so that the
@ -64,39 +114,82 @@ struct x86_page_flags {
*/
#define __pmap_entry_union union { \
struct x86_page_flags flags; \
uintptr_t val; \
struct x86_pmap_flags flags; \
vm_paddr_t val; \
}
/** @brief x86 Page Table Entry. */
typedef __pmap_entry_union x86_pte_t;
typedef __pmap_entry_union x86_pde_t;
typedef __pmap_entry_union x86_pdpe_t;
typedef __pmap_entry_union x86_pml4e_t;
/** @brief x86 Page Directory Table Entry. */
typedef __pmap_entry_union x86_pdte_t;
/** @brief x86 Page Directory Pointer Table Entry. */
typedef __pmap_entry_union x86_pdpte_t;
/** @brief x86 Page Map Level 4 Table Entry. */
typedef __pmap_entry_union x86_pml4te_t;
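A tiny illustration (not part of the header) of what the union buys us: the bitfield view and the raw value always describe the same 64-bit entry.

/* Illustration only: setting the raw value and reading it back through the
 * bitfield view.  0x1000 | __P_PRESENT | __P_RW describes physical frame 1
 * mapped present and writable. */
static inline bool pte_union_demo(void)
{
	x86_pte_t e;
	e.val = 0x1000 | __P_PRESENT | __P_RW;
	return e.flags.present && e.flags.rw
	    && e.flags.shifted_address == 1;	/* 0x1000 >> 12 */
}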
/** @brief x86 Page Table. */
typedef struct { x86_pte_t entries[512]; } __aligned(PAGE_SIZE) x86_pt_t;
typedef struct { x86_pde_t entries[512]; } __aligned(PAGE_SIZE) x86_pd_t;
typedef struct { x86_pdpe_t entries[512]; } __aligned(PAGE_SIZE) x86_pdp_t;
typedef struct { x86_pml4e_t entries[512]; } __aligned(PAGE_SIZE) x86_pml4_t;
#define X86_PMAP_MASK 0x7ffffffffffff000
/** @brief x86 Page Directory Table. */
typedef struct { x86_pdte_t entries[512]; } __aligned(PAGE_SIZE) x86_pdt_t;
/** @brief x86 Page Directory Pointer Table. */
typedef struct { x86_pdpte_t entries[512]; } __aligned(PAGE_SIZE) x86_pdpt_t;
/** @brief x86 Page Map Level 4 Table. */
typedef struct { x86_pml4te_t entries[512]; } __aligned(PAGE_SIZE) x86_pml4t_t;
/* you aren't expected to understand any of these, they're just nasty offset calculations */
#define __V48_MASK ( ((uintptr_t)1 << 48) - 1 )
/** @brief Get the linear 48-bit address */
#define __V48ADDR(ptr) ((uintptr_t)(ptr) & 0x0000ffffffffffff)
#define X86_PT_INDEX(ptr) (( __V48ADDR(ptr) >> X86_PT_SHIFT ) % 512)
#define X86_PD_INDEX(ptr) (( __V48ADDR(ptr) >> X86_PD_SHIFT ) % 512)
#define X86_PDP_INDEX(ptr) (( __V48ADDR(ptr) >> X86_PDP_SHIFT ) % 512)
#define X86_PML4_INDEX(ptr) ( __V48ADDR(ptr) >> X86_PML4_SHIFT )
#define __PT_BASE X86_PMAP_OFFSET
#define __PD_BASE (__PT_BASE + (__V48ADDR(X86_PMAP_OFFSET) >> X86_PT_SHIFT))
#define __PDP_BASE (__PD_BASE + (__V48ADDR(X86_PMAP_OFFSET) >> X86_PD_SHIFT))
#define __PML4_BASE (__PDP_BASE + (__V48ADDR(X86_PMAP_OFFSET) >> X86_PDP_SHIFT))
#define X86_PTE(ptr) ((x86_pte_t *)( __PT_BASE + (__V48ADDR(ptr) >> X86_PT_SHIFT) ))
#define X86_PDE(ptr) ((x86_pde_t *)( __PD_BASE + (__V48ADDR(ptr) >> X86_PD_SHIFT) ))
#define X86_PDPE(ptr) ((x86_pdpe_t *)( __PDP_BASE + (__V48ADDR(ptr) >> X86_PDP_SHIFT) ))
#define X86_PML4E(ptr) ((x86_pml4e_t *)( __PML4_BASE + (__V48ADDR(ptr) >> X86_PML4_SHIFT) ))
#define __V48(ptr) ((uintptr_t)(ptr) & __V48_MASK)
/**
* @brief Generate a 48-bit virtual address in user space, based on its pmap indices.
* Every index must be less than 512, or you'll get a garbage address.
* `pml4ti` must be less than 256, or you'll hurt the MMU's feelings.
* This is because bits 63-48 of the virtual address must all match bit 47.
*/
#define UV48ADDR(pml4ti, pdpti, pdti, pti) ( \
(vm_paddr_t)(pml4ti) << X86_PML4T_SHIFT | \
(vm_paddr_t)(pdpti) << X86_PDPT_SHIFT | \
(vm_paddr_t)(pdti) << X86_PDT_SHIFT | \
(vm_paddr_t)(pti) << X86_PT_SHIFT \
)
/**
* @brief Generate a 48-bit virtual address in kernel space, based on its pmap indices.
* Every index must be less than 512, or you'll get a garbage address.
* `pml4ti` must be at least 256, or you'll hurt the MMU's feelings.
* This is because bits 63-48 of the virtual address must all match bit 47.
*/
#define KV48ADDR(pml4ti, pdpti, pdti, pti) ( \
(vm_paddr_t)0xffff000000000000 | \
UV48ADDR(pml4ti, pdpti, pdti, pti) \
)
/** @brief Get the Page Table index for a given virtual address. */
#define X86_PTI(ptr) ((__V48(ptr) >> X86_PT_SHIFT ) % 512)
/** @brief Get the Page Directory Table index for a given virtual address. */
#define X86_PDTI(ptr) ((__V48(ptr) >> X86_PDT_SHIFT ) % 512)
/** @brief Get the Page Directory Pointer Table index for a given virtual address. */
#define X86_PDPTI(ptr) ((__V48(ptr) >> X86_PDPT_SHIFT ) % 512)
/** @brief Get the Page Map Level 4 Table index for a given virtual address. */
#define X86_PML4TI(ptr) (__V48(ptr) >> X86_PML4T_SHIFT)
/* Page Map Level 4 Table index for the recursive page map */
#define __PML4TI (X86_PML4TI(X86_PMAP_OFFSET)) /* = 256 */
#define __PT_BASE ( (x86_pt_t *)KV48ADDR(__PML4TI, 0, 0, 0) )
#define __PDT_BASE ( (x86_pdt_t *)KV48ADDR(__PML4TI, __PML4TI, 0, 0) )
#define __PDPT_BASE ( (x86_pdpt_t *)KV48ADDR(__PML4TI, __PML4TI, __PML4TI, 0) )
#define __PML4T_BASE ( (x86_pml4t_t *)KV48ADDR(__PML4TI, __PML4TI, __PML4TI, __PML4TI) )
/** @brief Get the Page Table Entry for a given virtual address. */
#define X86_PTE(ptr) ( &__PT_BASE->entries[__V48(ptr) >> X86_PT_SHIFT] )
/** @brief Get the Page Directory Table Entry for a given virtual address. */
#define X86_PDTE(ptr) ( &__PDT_BASE->entries[__V48(ptr) >> X86_PDT_SHIFT] )
/** @brief Get the Page Directory Pointer Table Entry for a given virtual address. */
#define X86_PDPTE(ptr) ( &__PDPT_BASE->entries[__V48(ptr) >> X86_PDPT_SHIFT] )
/** @brief Get the Page Map Level 4 Table Entry for a given virtual address. */
#define X86_PML4TE(ptr) ( &__PML4T_BASE->entries[__V48(ptr) >> X86_PML4T_SHIFT] )
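To make the recursion concrete, here is a throwaway block of static asserts (literal values only, relying on __PML4TI being 256 as noted above; not meant to be committed) spelling out where each base lands:

/* Illustration only: following the self-referencing slot one more time per
 * level yields exactly the recursive map bases defined above. */
_Static_assert((0xffff000000000000ull | (256ull << 39)) == 0xffff800000000000ull,
	       "__PT_BASE == X86_PMAP_OFFSET");
_Static_assert((0xffff800000000000ull | (256ull << 30)) == 0xffff804000000000ull,
	       "__PDT_BASE");
_Static_assert((0xffff804000000000ull | (256ull << 21)) == 0xffff804020000000ull,
	       "__PDPT_BASE");
_Static_assert((0xffff804020000000ull | (256ull << 12)) == 0xffff804020100000ull,
	       "__PML4T_BASE");

Note that __PML4T_BASE plus one 4 KB page is exactly X86_PMAP_OFFSET + X86_PMAP_LENGTH, i.e. the ~256.5 GB recursive mapping window from vmparam.h.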
#endif /* not _ASM_SOURCE */

@ -6,20 +6,24 @@
#endif
/** @brief Userland memory region */
#define USER_OFFSET 0x0000000000000000 /* +0 TB */
#define USER_LENGTH 0x0000800000000000 /* 128 TB */
#define USER_OFFSET 0x0000000000000000 /* +0 TB */
#define USER_LENGTH 0x0000800000000000 /* 128 TB */
/** @brief Recursive Page Map Level 4 map */
#define X86_PMAP_OFFSET 0xffff800000000000 /* -128 TB */
#define X86_PMAP_LENGTH 0x0000004020101000 /* ~ 256.5 GB */
#define X86_PMAP_OFFSET 0xffff800000000000 /* -128 TB */
#define X86_PMAP_LENGTH 0x0000004020101000 /* ~ 256.5 GB */
/** @brief Direct (contiguous) mapping of physical memory */
#define DMAP_OFFSET 0xfffff80000000000 /* -8 TB */
#define DMAP_LENGTH 0x0000040000000000 /* 4 TB */
#define DMAP_OFFSET 0xfffff80000000000 /* -8 TB */
#define DMAP_LENGTH 0x0000040000000000 /* 4 TB */
/** @brief Kernel region (image, heap, etc) */
#define KERN_OFFSET 0xfffffe0000000000 /* -2 TB */
#define KERN_LENGTH 0x0000020000000000 /* 2 TB */
#define KERN_OFFSET 0xfffffe0000000000 /* -2 TB */
#define KERN_LENGTH 0x0000020000000000 /* 2 TB */
/** @brief Where the kernel image is actually mapped to */
#define KERNBASE 0xffffffff80000000 /* -2 GB */
#define KERNBASE_LENGTH 0x0000000080000000
#define KERNBASE 0xffffffff80000000 /* -2 GB */
#define KERNBASE_LENGTH 0x0000000080000000
#define VM_PAGE_ARRAY_OFFSET KERN_OFFSET
#define VM_PAGE_ARRAY_LENGTH (KERNBASE - KERN_OFFSET)

@ -140,6 +140,15 @@
#define PRIXMAX "jX" /* uintmax_t */
#define PRIXPTR __PRIptr"X" /* uintptr_t */
#ifdef _KERNEL
#define PRIxVM_PADDR __PRI64"x" /* vm_paddr_t */
#define PRIxVM_OFFSET __PRI64"x" /* vm_offset_t */
#define PRIdVM_OFFSET __PRI64"d" /* vm_offset_t */
#define PRIdVM_SIZE __PRI64"d" /* vm_size_t */
#endif /* _KERNEL */
/* fscanf(3) macros for signed integers. */
#define SCNd8 "hhd" /* int8_t */

@ -26,6 +26,11 @@
#ifndef _ASM_SOURCE
#include <arch/multiboot.h>
/** @brief Initialize `vm_page_array` based on the multiboot memory map. */
void x86_paging_init(struct mb2_tag_mmap *mmap);
/** @brief Pointer bitmask to get the base address of their page. */
#define PAGE_MASK ( ~((unsigned long)PAGE_SIZE - 1) )
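As a small illustration (a hypothetical helper, not part of the tree), rounding an address down to its page base with this mask looks like:

/* Hypothetical helper for illustration: round a pointer down to the base of
 * its 4 KB page (same result as align_floor() with PAGE_SIZE). */
static inline void *page_base(void *ptr)
{
	return (void *)((uintptr_t)ptr & PAGE_MASK);
}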
/** @brief Pointer bitmask to get the base address of their huge page. */
@ -48,8 +53,20 @@
* @brief Get the physical address a virtual one is currently mapped to.
*
* @param virt virtual address
* @returns The physical address, or `0` if there is no mapping
* @returns The physical address, or -1 cast to `vm_paddr_t` if there is no mapping
*/
uintptr_t vtophys(void *virt);
vm_paddr_t vtophys(void *virt);
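A minimal sketch of how a caller would honor the new error value; is_mapped() is a hypothetical helper, not something this commit adds. The point of the change is that physical address 0 becomes a valid result, so the "not mapped" sentinel has to move to -1.

/* Hypothetical helper: with the new contract, an unmapped address is
 * signalled by (vm_paddr_t)-1 rather than 0. */
static inline bool is_mapped(void *virt)
{
	return vtophys(virt) != (vm_paddr_t)-1;
}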
static inline void vm_flush(void)
{
register_t tmp;
__asm__ volatile(
" mov %%cr3, %0 \n"
" mov %0, %%cr3 \n"
: "=r"(tmp)
:
: "memory"
);
}
#endif /* not _ASM_SOURCE */

@ -34,17 +34,3 @@ static inline int smp_cpuid(void)
return 0;
#endif /* !CFG_SMP */
}
/*
* This file is part of GayBSD.
* Copyright (c) 2021 fef <owo@fef.moe>.
*
* GayBSD is nonviolent software: you may only use, redistribute, and/or
* modify it under the terms of the Cooperative Nonviolent Public License
* (CNPL) as found in the LICENSE file in the source code root directory
* or at <https://git.pixie.town/thufie/npl-builder>; either version 7
* of the license, or (at your option) any later version.
*
* GayBSD comes with ABSOLUTELY NO WARRANTY, to the extent
* permitted by applicable law. See the CNPL for details.
*/

@ -1,5 +1,6 @@
# Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved.
target_sources(gay_arch PRIVATE
init.c
page.c
)

@ -0,0 +1,428 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <arch/atom.h>
#include <arch/multiboot.h>
#include <arch/vmparam.h>
#include <gay/linker.h>
#include <gay/mm.h>
#include <gay/vm/page.h>
#include <gay/systm.h>
#include <gay/util.h>
#include <inttypes.h>
#include <string.h>
/*
* This file is funny.
* Our job here seems simple at first glance: initialize the vm_page_array.
* The catch is that we can't use the regular kernel memory allocators for
* doing so, because those depend on vm_page_array. Classic chicken/egg stuff.
* So, how do we allocate (and map!) memory for the array? Simple, by using a
* completely separate page frame allocator that is so basic that it can't even
* free pages again. That's not a problem though, because it doesn't need to.
* Memory maps are created manually, which is very painful, but doable.
* HOWEVER! This boot page frame allocator needs to allocate memory for keeping
* track of which memory areas were already allocated and which ones are still
* free, too. Areas might also have to be split, if the region we want to
* allocate is not the exact size of the physical area. Therefore, we have
* *another* allocator, which is basically the most primitive slab allocator in
* existence. It uses a fixed-size "slab" (the `free_areas` array below), and
* keeps track of which free areas are available.
*
* To sum up:
* - The boot "slab" allocator hands out `struct free_area`s to ...
* - the boot page frame allocator, which is used to set up ...
* - the buddy page frame allocator, which serves as a backend to ...
* - the kernel slab allocator.
*
* XXX the boot memory allocator could probably be moved to an architecture
* independent file, because it is not really specific to the x86.
*/
struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET;
#ifdef DEBUG
/* this gets updated in x86_setup_paging() once we know how big the array is */
vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH);
#endif
/**
* @brief Memory area information for the boot page frame allocator.
* The multiboot bootloader gives us an array of memory areas, and tells us
* which ones are available and which aren't. We insert all available areas
* into a circular list (`free_area_list`), and the boot page frame allocator
* iterates over that list for getting memory.
*
* Also, this is probably one of the most unfortunately named structures in the
* entire system, because instances of this structure need to be allocated and,
* well, freed.
*/
struct free_area {
struct clist link;
vm_paddr_t start;
vm_paddr_t end;
};
/** @brief This is essentially a very basic slab. */
static struct free_area free_areas[16];
/** @brief List of all free memory areas, ordered by ascending address */
static CLIST(free_area_list);
/**
* @brief List of all the unused members in `free_areas`.
* This is essentially a very basic slab freelist.
*/
static CLIST(free_area_freelist);
/**
* @brief VERY early page frame allocator.
*
* Allocates `1 << log2` bytes of memory, aligned to at least its own size.
*
* @param log2 Binary logarithm of the allocation size. Must be at least `PAGE_SHIFT`.
* @returns Physical address of the allocated region, or `BOOT_PMALLOC_ERR` on failure
*/
static vm_paddr_t __boot_pmalloc(u_int log2);
#define BOOT_PMALLOC_ERR (~0ul)
/** @brief Zero out a single page (required for page tables) */
static void __boot_clear_page(vm_paddr_t paddr);
/** @brief Initialize the members of `vm_page_array` within the given range. */
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags);
/** @brief Add a new entry to the list of free memory areas. */
static void insert_free_area(struct mb2_mmap_entry *entry);
static void init_free_area_freelist(void);
static void print_mem_area(struct mb2_mmap_entry *entry);
/*
* "Oh cool another deeply nested 100-liner that nobody understands"
*/
void x86_paging_init(struct mb2_tag_mmap *mmap)
{
init_free_area_freelist();
/*
* insert all free areas and find the end of physical memory
*/
struct mb2_mmap_entry *entry = mmap->entries;
vm_paddr_t end = 0;
kprintf("Memory map:\n");
while ((void *)entry - (void *)mmap < mmap->tag.size) {
vm_paddr_t entry_end = entry->addr + entry->len;
end = max(end, entry_end);
print_mem_area(entry);
if (entry->type == MB2_MEMORY_AVAILABLE)
insert_free_area(entry);
entry = (void *)entry + mmap->entry_size;
}
/*
* allocate and map vm_page_array into virtual memory at VM_PAGE_ARRAY_OFFSET
* (this is gonna be a long one)
*/
struct vm_page *vm_page_array_end = vm_page_array + (end >> PAGE_SHIFT);
#ifdef DEBUG
_vm_page_array_end = vm_page_array_end;
#endif
void *map_pos = vm_page_array;
usize remaining_size = (void *)vm_page_array_end - (void *)vm_page_array;
remaining_size = align_ceil(remaining_size, PAGE_SIZE);
kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size);
while (remaining_size != 0) {
x86_pml4te_t *pml4te = X86_PML4TE(map_pos);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pml4te_val != BOOT_PMALLOC_ERR);
__boot_clear_page(pml4te_val);
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pml4te->val = pml4te_val;
vm_flush();
for (int pdpt_index = 0; pdpt_index < 512; pdpt_index++) {
x86_pdpte_t *pdpte = X86_PDPTE(map_pos);
vm_paddr_t pdpte_val;
/* try allocating a 1 GB gigapage first */
if (remaining_size >= 1 << X86_PDPT_SHIFT) {
pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT);
/* CLion is warning about this condition being always true, but
* that is not the case. I've checked the disassembly with -O2,
* and clang is emitting the check. So it's fine, i guess. */
if (pdpte_val != BOOT_PMALLOC_ERR) {
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
| __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
remaining_size -= 1 << X86_PDPT_SHIFT;
map_pos += 1 << X86_PDPT_SHIFT;
if (remaining_size == 0)
goto map_done;
continue;
}
}
/* couldn't use a gigapage, continue in hugepage steps */
pdpte_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pdpte_val != BOOT_PMALLOC_ERR);
__boot_clear_page(pdpte_val);
pdpte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
vm_flush();
for (int pdt_index = 0; pdt_index < 512; pdt_index++) {
x86_pdte_t *pdte = X86_PDTE(map_pos);
vm_paddr_t pdte_val;
/* try allocating a 2 MB hugepage first */
if (remaining_size >= (1 << X86_PDT_SHIFT)) {
pdte_val = __boot_pmalloc(X86_PDT_SHIFT);
if (pdte_val != BOOT_PMALLOC_ERR) {
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
pdte->val = pdte_val;
remaining_size -= 1 << X86_PDT_SHIFT;
map_pos += 1 << X86_PDT_SHIFT;
if (remaining_size == 0)
goto map_done;
continue;
}
}
/* couldn't use a hugepage, continue in page steps */
pdte_val = __boot_pmalloc(PAGE_SHIFT);
KASSERT(pdte_val != BOOT_PMALLOC_ERR);
__boot_clear_page(pdte_val);
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pdte->val = pdte_val;
vm_flush();
for (int pt_index = 0; pt_index < 512; pt_index++) {
x86_pte_t *pte = X86_PTE(map_pos);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT);
KASSERT(pte_val != BOOT_PMALLOC_ERR);
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pte->val = pte_val;
remaining_size -= 1 << X86_PT_SHIFT;
map_pos += 1 << X86_PT_SHIFT;
if (remaining_size == 0)
goto map_done;
} /* end of PT loop */
} /* end of PD loop */
} /* end of PDP loop */
} /* end of PML4 loop */
map_done:
vm_flush();
/*
* initialize the individual pages and calculate the usable RAM size
*/
vm_paddr_t prev_end = 0;
vm_size_t available_ram = 0;
struct free_area *cursor;
clist_foreach_entry(&free_area_list, cursor, link) {
/* list should have been ordered by ascending address */
KASSERT(cursor->start >= prev_end);
if (cursor->start != prev_end) {
vm_paddr_t reserved_start = prev_end;
vm_paddr_t reserved_end = cursor->start;
init_page_range(reserved_start, reserved_end, PG_RESERVED);
}
init_page_range(cursor->start, cursor->end, 0);
prev_end = cursor->end;
available_ram += cursor->end - cursor->start;
}
kprintf("Available RAM: %"PRIdVM_SIZE" bytes\n", available_ram);
}
static struct free_area *alloc_free_area_entry(void)
{
/* XXX this should pretty much never happen, but it would still be nice to
* have at least some sort of error recovery rather than giving up */
if (clist_is_empty(&free_area_freelist))
panic("Boot memory allocator has run out of free_areas");
return clist_del_first_entry(&free_area_freelist, struct free_area, link);
}
static void free_free_area_entry(struct free_area *area)
{
#ifdef DEBUG
area->start = ~0ul;
area->end = ~0ul;
#endif
clist_add(&free_area_freelist, &area->link);
}
static void init_free_area_freelist(void)
{
for (u_int i = 0; i < ARRAY_SIZE(free_areas); i++)
clist_add(&free_area_freelist, &free_areas[i].link);
}
static void insert_free_area(struct mb2_mmap_entry *entry)
{
vm_paddr_t start = align_ceil(entry->addr, PAGE_SIZE);
vm_paddr_t end = align_floor(entry->addr + entry->len, PAGE_SIZE);
if (start <= image_start_phys && end >= image_end_phys) {
/*
* This is the area that the kernel image is loaded in, which we need
* to treat differently than all the others because it gets split up
* into two usable areas. Illustration (addresses are examples only):
*
* 0x01000000 ---------------------- end (high_end)
* : <free real estate>
* 0x00500000 ---------------------- image_end_phys (high_start)
* : <kernel code & data>
* 0x00400000 ---------------------- image_start_phys (low_end)
* : <free real estate>
* 0x00100000 ---------------------- start (low_start)
*
* (we silently assert that the image always spans only one region)
*/
vm_paddr_t low_start = start;
vm_paddr_t low_end = align_floor(image_start_phys, PAGE_SIZE);
if (low_start < low_end) {
struct free_area *area = alloc_free_area_entry();
area->start = low_start;
area->end = low_end;
clist_add(&free_area_list, &area->link);
}
vm_paddr_t high_start = align_ceil(image_end_phys, PAGE_SIZE);
vm_paddr_t high_end = end;
if (high_start < high_end) {
struct free_area *area = alloc_free_area_entry();
area->start = high_start;
area->end = high_end;
clist_add(&free_area_list, &area->link);
}
} else {
struct free_area *area = alloc_free_area_entry();
area->start = start;
area->end = end;
clist_add(&free_area_list, &area->link);
}
}
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
{
KASSERT(start <= end);
vm_page_t cursor = vm_page_array + (start >> PAGE_SHIFT);
usize count = (end - start) >> PAGE_SHIFT;
if (flags == 0) {
memset(cursor, 0, count * sizeof(*cursor));
} else {
while (count--) {
atom_init(&cursor->count, 0);
cursor->flags = flags;
cursor->try_free = nil;
cursor->extra = nil;
cursor++;
}
}
}
/*
* This works in a relatively simple way, actually.
* We iterate over the list of `struct free_area`s in reverse order because the
* list is sorted by ascending physical address and i've decided that we prefer
* using higher physical addresses for the page array. The first fit wins, and
* all that's left is to split up the area and insert the top and bottom
* remainder back into the list, if applicable.
*/
static vm_paddr_t __boot_pmalloc(u_int log2)
{
const usize alloc_size = 1 << log2;
KASSERT(log2 >= PAGE_SHIFT); /* never hand out less than a full page */
struct free_area *cursor;
clist_foreach_entry_rev(&free_area_list, cursor, link) {
vm_paddr_t area_start = cursor->start;
vm_paddr_t area_end = cursor->end;
KASSERT(area_start < area_end);
/* the areas tend to be aligned to greater sizes at their beginning */
vm_paddr_t alloc_start = align_ceil(area_start, alloc_size);
vm_paddr_t alloc_end = alloc_start + alloc_size;
if (alloc_start >= area_start && alloc_end <= area_end) {
/*
* Example with log2 == 21 (alloc_size == 0x00200000):
*
* 0x00500000 ------------------- area_end (not aligned)
* : <high_rest>
* 0x00400000 ------------------- alloc_end (aligned to alloc_size)
* : <allocated block>
* 0x00200000 ------------------- alloc_start (aligned to alloc_size)
* : <low_rest>
* 0x00100000 ------------------- area_start (not aligned)
*/
if (alloc_start > area_start) {
struct free_area *low_rest = alloc_free_area_entry();
low_rest->start = area_start;
low_rest->end = alloc_start;
clist_add(&cursor->link, &low_rest->link);
}
if (alloc_end < area_end) {
struct free_area *high_rest = alloc_free_area_entry();
high_rest->start = alloc_end;
high_rest->end = area_end;
clist_add_first(&cursor->link, &high_rest->link);
}
clist_del(&cursor->link);
free_free_area_entry(cursor);
return alloc_start;
}
}
return BOOT_PMALLOC_ERR;
}
/*
* It's really unfortunate that we have to zero a page before we can use it as
* a page table, yet also need to reference it in the page table structures
* (thereby mapping it into virtual memory) before we can zero it out.
* This little hack temporarily maps the area at the PDPT entry just below
* the one KERNBASE lives in (within _pdpt0), zeroes the area, and then
* unmaps it again.
*/
static void __boot_clear_page(vm_paddr_t paddr)
{
vm_paddr_t pbase = align_floor(paddr, 1 << X86_PDPT_SHIFT);
vm_offset_t offset = paddr - pbase;
void *vbase = (void *)KERNBASE - (1 << X86_PDPT_SHIFT);
x86_pdpte_t *pdpe = X86_PDPTE(vbase);
pdpe->val = pbase | __P_PRESENT | __P_RW | __P_HUGE | __P_NOEXEC;
vm_flush();
memset(vbase + offset, 0, PAGE_SIZE);
pdpe->flags.present = false;
vm_flush();
}
static void print_mem_area(struct mb2_mmap_entry *entry)
{
const char *name;
switch (entry->type) {
case MB2_MEMORY_AVAILABLE:
name = "Available";
break;
case MB2_MEMORY_RESERVED:
name = "Reserved";
break;
case MB2_MEMORY_ACPI_RECLAIMABLE:
name = "ACPI (reclaimable)";
break;
case MB2_MEMORY_NVS:
name = "Non-Volatile Storage";
break;
case MB2_MEMORY_BADRAM:
name = "Bad RAM";
break;
default:
name = "Unknown";
break;
}
kprintf(" [0x%016"PRIxVM_PADDR"-0x%016"PRIxVM_PADDR"] %s\n",
entry->addr, entry->addr + entry->len - 1, name);
}

@ -13,17 +13,18 @@
#include <string.h>
/* from linker script */
extern void _image_start_phys;
extern void _image_end_phys;
__asmlink x86_pdp_t _pdp0;
__asmlink x86_pml4_t _pml4;
/*
* Initial Page Directory Pointer Table and Page Map Level 4 Table for the
* assembly startup routine (see setup64.S). Used for statically mapping the
* lowest 2 GB of physical memory into the -2 GB virtual area.
*/
__asmlink x86_pdpt_t _pdpt0;
__asmlink x86_pml4t_t _pml4t;
int map_page(uintptr_t phys, void *virt, enum pflags flags)
{
flags |= P_PRESENT;
x86_pml4e_t *pml4e = X86_PML4E(virt);
x86_pml4te_t *pml4e = X86_PML4TE(virt);
if (!pml4e->flags.present) {
void *page = get_pages(0, M_ATOMIC);
if (page == nil)
@ -95,43 +96,31 @@ void x86_isr_page_fault(trap_frame_t *frame, u32 error_code)
panic("Page fault");
}
uintptr_t vtophys(void *virt)
vm_paddr_t vtophys(void *virt)
{
x86_pml4e_t *pml4e = X86_PML4E(virt);
if (!pml4e->flags.present)
return 0;
x86_pdpe_t *pdpe = X86_PDPE(virt);
if (!pml4e->flags.present)
return 0;
if (pml4e->flags.huge) {
uintptr_t phys_base = pdpe->val & X86_PMAP_MASK;
return phys_base + ((uintptr_t)virt % (1 << X86_PDP_SHIFT));
x86_pml4te_t *pml4te = X86_PML4TE(virt);
if (!pml4te->flags.present)
return (vm_paddr_t)-1;
x86_pdpte_t *pdpte = X86_PDPTE(virt);
if (!pdpte->flags.present)
return (vm_paddr_t)-1;
if (pdpte->flags.huge) {
vm_paddr_t phys_base = pdpte->val & X86_PMAP_MASK;
return phys_base + ((vm_paddr_t)virt % (1 << X86_PDPT_SHIFT));
}
x86_pde_t *pde = X86_PDE(virt);
if (!pde->flags.present)
return 0;
if (pde->flags.huge) {
uintptr_t phys_base = pde->val & X86_PMAP_MASK;
return phys_base + ((uintptr_t)virt % (1 << X86_PD_SHIFT));
x86_pdte_t *pdte = X86_PDTE(virt);
if (!pdte->flags.present)
return (vm_paddr_t)-1;
if (pdte->flags.huge) {
vm_paddr_t phys_base = pdte->val & X86_PMAP_MASK;
return phys_base + ((vm_paddr_t)virt % (1 << X86_PDT_SHIFT));
}
x86_pte_t *pte = X86_PTE(virt);
if (!pte->flags.present)
return 0;
uintptr_t phys_base = pte->val & X86_PMAP_MASK;
return phys_base + ((uintptr_t)virt % (1 << X86_PT_SHIFT));
}
void vm_flush(void)
{
register_t tmp;
__asm__ volatile(
" mov %%cr3, %0 \n"
" mov %0, %%cr3 \n"
: "=r"(tmp)
:
: "memory"
);
return (vm_paddr_t)-1;
vm_paddr_t phys_base = pte->val & X86_PMAP_MASK;
return phys_base + ((vm_paddr_t)virt % (1 << X86_PT_SHIFT));
}

@ -11,6 +11,12 @@
#ifdef __cplusplus
/** @brief Use `__restrict` in header files, and just `restrict` in C code */
#define __restrict
#define __BEGIN_DECLS extern "C" {
#define __END_DECLS }
#else
#define __BEGIN_DECLS
#define __END_DECLS
#endif
/** @brief Annotated symbol is an alias for another symbol. */
@ -110,12 +116,6 @@
* These are hints for clang's branch optimizer which will try to arrange the
* code to yield the best performance when a condition is true or false.
*
* - Use them sparingly and only in performance critical places because they
* come with a sometimes very significant code size overhead due to branches
* being rearranged and aligned
* - Only use them if you know *for sure* that a particular branch is *very*
* unlikely to be hit, for example when
*
* Use it sparingly and only in performance critical places because the overhead
* from rearranging and aligning the individual instructions can quickly make
* the kernel image too big.
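A usage sketch, assuming the branch hint this header declares follows the usual BSD __predict_false() naming (that name is an assumption, not confirmed by this hunk; everything else below is made up for illustration):

/* Sketch only: __predict_false() is assumed to be this header's "unlikely"
 * hint.  The rare error path is what gets moved off the hot path. */
static int checked_div(int a, int b, int *out)
{
	if (__predict_false(b == 0))
		return -1;	/* cold error path */
	*out = a / b;		/* hot path stays fall-through */
	return 0;
}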

@ -40,7 +40,7 @@ struct kprintf_printer {
/**
* @brief Write to the kernel log.
* The data itself may be cached in a buffer rather than written to the
* target immediately; `krpintf()` will call `flush()` when needed.
* target immediately; `kprintf()` will call `flush()` when needed.
*
* @param printer A reference to the original structure
* @param buf Data to write
@ -48,7 +48,7 @@ struct kprintf_printer {
* @returns The amount of bytes actually written,
* or a negative code from `errno.h` on failure
*/
ssize_t (*write)(struct kprintf_printer *printer, const void *buf, size_t len);
isize (*write)(struct kprintf_printer *printer, const void *buf, usize len);
/**
* @brief Flush the kernel log buffer.
* On implementations that don't have a buffer, this can be a no-op.
@ -58,7 +58,7 @@ struct kprintf_printer {
* @returns The amount of bytes flushed out (0 if none),
* or a negative code from `errno.h` on failure
*/
ssize_t (*flush)(struct kprintf_printer *printer);
isize (*flush)(struct kprintf_printer *printer);
};
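For illustration, a minimal backend matching the isize-based callbacks might look like the sketch below; serial_putc() is a made-up output primitive, and only the struct fields and kprintf_set_printer() (used in _boot()) appear in this commit.

/* Illustrative, unbuffered printer backend. */
static isize serial_write(struct kprintf_printer *printer, const void *buf, usize len)
{
	const char *p = buf;
	for (usize i = 0; i < len; i++)
		serial_putc(p[i]);	/* hypothetical output primitive */
	return (isize)len;
}

static isize serial_flush(struct kprintf_printer *printer)
{
	return 0;	/* nothing buffered, nothing to flush */
}

static struct kprintf_printer serial_kprintf_printer = {
	.write = serial_write,
	.flush = serial_flush,
};

/* somewhere in early init: kprintf_set_printer(&serial_kprintf_printer); */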
/**

@ -0,0 +1,29 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#pragma once
#include <gay/types.h>
extern void _image_start_phys;
#define image_start_phys ((vm_paddr_t)&_image_start_phys)
extern void _image_end_phys;
#define image_end_phys ((vm_paddr_t)&_image_end_phys)
extern void _image_start;
#define image_start (&_image_start)
extern void _image_end;
#define image_end (&_image_end)
extern void _kernel_start_phys;
#define kern_start_phys ((vm_paddr_t)&_kernel_start_phys)
extern void _kernel_end_phys;
#define kern_end_phys ((vm_paddr_t)&_kernel_end_phys)
extern void _kernel_start;
#define kern_start (&_kernel_start)
extern void _kernel_end;
#define kern_end (&_kernel_end)

@ -6,9 +6,10 @@
* @file include/gay/mm.h
* @brief Header for dynamic memory management
*
* To avoid possible confusion, physical memory addresses always use type
* `uintptr_t` and virtual ones are `void *`. This should give us at least some
* type of compiler warning if they are accidentally mixed up.
* To avoid possible confusion (and not break 32-bit systems, even though they
* aren't really supported anyway), physical memory addresses always use type
* `vm_paddr_t` and virtual ones are `void *`. This should give us at least
* some type of compiler warning if they are accidentally mixed up.
*
* GayBSD uses a classic slab algorithm for its own data structures, which is
* backed by a buddy page frame allocator. The latter is also used for getting
@ -25,22 +26,44 @@
#include <gay/kprintf.h>
#include <gay/types.h>
#define _M_ZONE_NORMAL 0
#define _M_ZONE_DMA 1
#define _M_ZONE_INDEX(flags) ((flags) & 1)
#define _M_EMERG (1 << 1)
#define _M_NOWAIT (1 << 2)
#define MM_ZONE_NORMAL 0
#define MM_ZONE_DMA 1
struct mm_zone {
patom_t freelist; /* -> struct page */
usize length;
};
/**
* @brief Map of all memory zones.
*
* Memory is currently divided into two zones: DMA and normal.
* The mm subsystem isn't NUMA aware, because it's not really a thing on desktop
* grade machines anyway and would only complicate things unnecessarily.
*/
extern struct mm_zone mm_zones[2];
/**
* @brief Memory allocation flags passed to `kmalloc()`.
*/
enum mflags {
/** @brief Physically contiguous memory for DMA. */
M_CONTIG = (1 << 0),
/** @brief Use emergency memory reserves if necessary. */
M_EMERG = (1 << 1),
/** @brief Don't sleep during the allocation. */
M_NOSLEEP = (1 << 2),
/** @brief Allocate userspace memory. */
M_USER = (1 << 4),
/** @brief Kernel memory */
M_KERN = M_CONTIG,
/** @brief Allocate memory in atomic (irq) context. */
M_ATOMIC = M_EMERG | M_NOSLEEP,
/** @brief Use emergency memory reserves if necessary */
M_EMERG = _M_EMERG,
/** @brief Don't sleep during the allocation (required for atomic context) */
M_NOWAIT = _M_NOWAIT,
/** @brief Regular kernel memory */
M_KERN = _M_ZONE_NORMAL,
/** @brief Don't sleep, and use emergency reserves if necessary */
M_ATOMIC = _M_EMERG | _M_NOWAIT,
/** @brief Allocate low memory suitable for DMA transfers */
M_DMA = _M_ZONE_DMA,
};
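A hedged usage sketch follows; the exact kmalloc() prototype isn't shown in this hunk, so the (size, flags) argument order is an assumption.

/* Sketch only: allocating scratch memory from interrupt context, where we
 * must not sleep, so the no-sleep and emergency-reserve bits are combined
 * via M_ATOMIC. */
static void *grab_irq_scratch(void)
{
	void *buf = kmalloc(64, M_ATOMIC);
	if (buf == nil)
		return nil;	/* atomic allocations may legitimately fail */
	return buf;
}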
/**
@ -69,22 +92,22 @@ void kfree(void *ptr);
* layout for better performance (no shifting around required).
*/
enum pflags {
P_PRESENT = __PFLAG_PRESENT, /**< @brief Page exists */
P_RW = __PFLAG_RW, /**< @brief Page is writable */
P_USER = __PFLAG_USER, /**< @brief Page is accessible from ring 3 */
P_ACCESSED = __PFLAG_ACCESSED, /**< @brief Page has been accessed */
P_DIRTY = __PFLAG_DIRTY, /**< @brief Page has been written */
P_GLOBAL = __PFLAG_GLOBAL, /**< @brief The entry survives `vm_flush()` */
P_NOCACHE = __PFLAG_NOCACHE, /**< @brief The TLB won't cache this entry */
P_SLAB = __PFLAG_SLAB, /**< @brief Page is used by the slab allocator */
P_NOSLEEP = __PFLAG_ATOMIC, /**< @brief Page is atomic */
P_PRESENT = __P_PRESENT, /**< @brief Page exists */
P_RW = __P_RW, /**< @brief Page is writable */
P_USER = __P_USER, /**< @brief Page is accessible from ring 3 */
P_ACCESSED = __P_ACCESSED, /**< @brief Page has been accessed */
P_DIRTY = __P_DIRTY, /**< @brief Page has been written */
P_GLOBAL = __P_GLOBAL, /**< @brief The entry survives `vm_flush()` */
P_NOCACHE = __P_NOCACHE, /**< @brief The TLB won't cache this entry */
P_SLAB = __P_SLAB, /**< @brief Page is used by the slab allocator */
P_NOSLEEP = __P_ATOMIC, /**< @brief Page is atomic */
#ifdef __HAVE_HUGEPAGES
/** @brief This page is `HUGEPAGE_SIZE` bytes long, rather than `PAGE_SIZE` */
P_HUGE = __PFLAG_HUGE,
P_HUGE = __P_HUGE,
#endif
#ifdef __HAVE_NOEXEC
/** @brief No instructions can be fetched from this page */
P_NOEXEC = __PFLAG_NOEXEC,
P_NOEXEC = __P_NOEXEC,
#endif
};
@ -143,9 +166,6 @@ enum pflags get_pflags(void *page);
*/
int set_pflags(void *page, enum pflags flags);
/** @brief Flush the TLB. */
void vm_flush(void);
/**
* @brief Initialize the memory allocator.
*
@ -180,7 +200,7 @@ int pages_init(void);
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long.
*
* @param order Order of magnitude (as in `1 << order`) for the region size
* @param flags How to allocate (`order` must be 0 if `M_NOSLEEP` is specified)
* @param flags How to allocate (`order` must be 0 if `M_NOWAIT` is specified)
* @return A pointer to the beginning of the region in the direct mapping area,
* or `nil` if the allocation failed
*/

@ -0,0 +1,81 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#pragma once
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/systm.h>
#include <gay/types.h>
/**
* @brief Stores information about a single page in physical memory.
* There is exactly one of these for every physical page, no matter what that
* page is used for or whether it is usable at all.
*/
struct vm_page {
/** @brief Reference count (0 = unused) */
atom_t count;
/** @brief Various flags describing how and for what the page is used, see below */
u_int flags;
/** @brief Singly linked list, if the page is free */
patom_t next;
/**
* @brief Request this page to be freed if possible.
* This callback may be `nil` unless the `PG_FREEABLE` bit in `flags`
* is set. The presence of this bit does *not* guarantee that the page
* is actually reclaimable; it's merely a performance optimization to
* avoid having to call this function on pages that can never be
* reclaimed anyway.
*
* @param page Pointer to the page itself
* @return 0 if the page could be reclaimed and is now free
*/
int (*try_free)(struct vm_page *page);
/**
* @brief Optional extra data pointer, reserved for private use.
* The current owner of the page may use this to track the underlying
* object in memory (or pretty much anything else), for example the
* `struct slab` if this page is currently used by the slab allocator.
* Useful for implementing the `try_free()` callback.
*/
void *extra;
};
typedef struct vm_page *vm_page_t;
/* values for struct page::flags */
/** @brief Page must never be accessed */
#define PG_RESERVED (1 << 0)
/** @brief Page is in an atomic per-cpu cache */
#define PG_ATOMIC (1 << 1)
/** @brief Page is used by the slab allocator */
#define PG_SLAB (1 << 2)
/** @brief It **might** be possible to reclaim this page using `try_free()` */
#define PG_FREEABLE (1 << 3)
/** @brief Array of every single page in physical memory, indexed by page frame number. */
extern struct vm_page *const vm_page_array;
#ifdef DEBUG
extern vm_page_t _vm_page_array_end;
#endif
/** @brief Get the page frame number of a page. */
__pure2 static inline u_long pg2pfn(vm_page_t page)
{
KASSERT(page < _vm_page_array_end);
return page - vm_page_array;
}
__pure2 static inline u_long paddr2pfn(vm_paddr_t paddr)
{
KASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end);
return paddr >> PAGE_SHIFT;
}
__pure2 static inline vm_page_t paddr2pg(vm_paddr_t paddr)
{
vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT);
KASSERT(page < _vm_page_array_end);
return page;
}
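A small illustrative check (not part of the header) showing that the three helpers agree with each other:

/* Illustration only: physical address -> vm_page -> page frame number is
 * just a shift, whichever helper you go through. */
static inline bool pfn_helpers_consistent(vm_paddr_t paddr)
{
	vm_page_t pg = paddr2pg(paddr);
	return pg2pfn(pg) == paddr2pfn(paddr)
	    && pg2pfn(pg) == (u_long)(paddr >> PAGE_SHIFT);
}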

@ -43,8 +43,8 @@ typedef ___wchar_t wchar_t;
#endif
typedef struct {
intmax_t quot; /* Quotient. */
intmax_t rem; /* Remainder. */
__intmax_t quot; /* Quotient. */
__intmax_t rem; /* Remainder. */
} imaxdiv_t;
/* TODO: these haven't been ported over yet */
