mm: refactor page allocator
This is hopefully the last time in a while that something in the mm subsystem needs a refactor this large. There are two main changes:

- The page frame allocator returns a vm_page_t rather than a virtual address.
- Data for the slab allocator is now stored in struct vm_page, which means there is no overhead in the slab itself so the space is used in a more efficient manner.
parent f8a85a1541
commit b4ed811920
16 changed files with 369 additions and 344 deletions
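For orientation, a minimal sketch of how the reworked interface fits together, using only names that appear in the diff below (`page_alloc()`, `pg2pfn()`, `pfn2vaddr()`, `page_free()`, `nil`, and the two headers included by the changed files); the flag name `M_KERN` is an assumption and may not match the actual tree:

```c
/* Sketch only -- not code from this commit; M_KERN is an assumed flag name. */
#include <gay/mm.h>      /* page_alloc(), page_free(), enum mflags */
#include <gay/vm/page.h> /* vm_page_t, pg2pfn(), pfn2vaddr() */

static void example_alloc(void)
{
	/* order 1 requests a physically contiguous group of (1 << 1) = 2 pages;
	 * the allocator now returns a vm_page_t instead of a virtual address */
	vm_page_t page = page_alloc(1, M_KERN);
	if (!page)            /* "evaluates false on failure" per the new doc comment */
		return;

	/* a virtual address can still be derived through the direct map */
	void *vaddr = pfn2vaddr(pg2pfn(page));
	(void)vaddr;          /* ... use the zero-initialized pages ... */

	page_free(page);      /* freeing also goes through the vm_page_t now */
}
```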
|
@ -116,7 +116,7 @@ ENTRY(_setup)
|
|||
* because the page directory is being interpreted as a page table.
|
||||
* This allows us to manipulate the table while we are in virtual memory.
|
||||
*/
|
||||
movl $(PADDR(pd0) + 0x003), PADDR(pd0) + 1023 * 4 /* 0xffc00000 */
|
||||
movl $(PADDR(pd0) + 0x013), PADDR(pd0) + 1023 * 4 /* 0xffc00000 */
|
||||
|
||||
/* set the Page Size Extensions (4) and Page Global Enable (7) bits in cr4 */
|
||||
mov %cr4, %ecx
|
||||
|
|
|
@ -160,11 +160,11 @@ ENTRY(_setup)
|
|||
movl $0x00000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE))
|
||||
movl $0x40000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE + 0x40000000))
|
||||
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1), huge (7) */
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1) */
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t + PML4T_OFFSET(KERNBASE))
|
||||
|
||||
/* map the PML4 to itself */
|
||||
movl $PADDR(_pml4t + 0x003), PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET))
|
||||
/* map the PML4 to itself (set the cache disable bit (4)) */
|
||||
movl $PADDR(_pml4t + 0x013), PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET))
|
||||
movb $0x80, PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
|
||||
|
||||
/*
|
||||
|
|
|
@ -14,10 +14,6 @@
|
|||
.code32
|
||||
.section .multiboot.text, "ax", @progbits
|
||||
|
||||
/*
|
||||
* miscellaneous utility routines
|
||||
*/
|
||||
|
||||
/* void _x86_write_tss_base(u64 *gdt_entry, struct x86_tss *tss) */
|
||||
ENTRY(_x86_write_tss_base)
|
||||
movl 4(%esp), %edi
|
||||
|
|
|
@ -68,8 +68,8 @@ static inline void vm_flush(void)
|
|||
{
|
||||
register_t tmp;
|
||||
__asm__ volatile(
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
: "=r"(tmp)
|
||||
:
|
||||
: "memory"
|
||||
|
|
arch/x86/include/arch/string.h | 5 (new file)
|
@ -0,0 +1,5 @@
|
|||
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <arch/string/memset.h>
|
|
@ -27,14 +27,33 @@ static void register_area(struct mb2_mmap_entry *entry)
|
|||
vm_paddr_t end = start + entry->len;
|
||||
|
||||
if (start >= DMA_LIMIT) {
|
||||
/*
|
||||
* --------------------- end
|
||||
* MM_ZONE_NORMAL
|
||||
* --------------------- start
|
||||
* <not part of entry>
|
||||
* --------------------- DMA_LIMIT
|
||||
*/
|
||||
__boot_register_mem_area(start, end, MM_ZONE_NORMAL);
|
||||
} else if (start < DMA_LIMIT && end > DMA_LIMIT) {
|
||||
} else if (end > DMA_LIMIT) {
|
||||
/*
|
||||
* ----------------- end
|
||||
* MM_ZONE_NORMAL
|
||||
* ----------------- DMA_LIMIT
|
||||
* MM_ZONE_DMA
|
||||
* ----------------- start
|
||||
*/
|
||||
__boot_register_mem_area(start, DMA_LIMIT, MM_ZONE_DMA);
|
||||
__boot_register_mem_area(DMA_LIMIT, end, MM_ZONE_NORMAL);
|
||||
} else if (start < DMA_LIMIT && end <= DMA_LIMIT) {
|
||||
__boot_register_mem_area(start, end, MM_ZONE_DMA);
|
||||
} else {
|
||||
panic("congratulations, you reached an unreachable branch");
|
||||
/*
|
||||
* --------------------- DMA_LIMIT
|
||||
* <not part of entry>
|
||||
* --------------------- end
|
||||
* MM_ZONE_DMA
|
||||
* --------------------- start
|
||||
*/
|
||||
__boot_register_mem_area(start, end, MM_ZONE_DMA);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,8 +87,8 @@ static void map_direct_area(vm_paddr_t end)
|
|||
|
||||
for (int pdpti = 0; pdpti < 512; pdpti++) {
|
||||
x86_pdpte_t *pdpte = X86_PDPTE(vpos);
|
||||
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_GLOBAL
|
||||
| __P_HUGE | __P_NOEXEC;
|
||||
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_NOCACHE | __P_WRITE_THROUGH
|
||||
| __P_GLOBAL | __P_HUGE | __P_NOEXEC;
|
||||
|
||||
ppos += GIGAPAGE_SIZE;
|
||||
vpos += GIGAPAGE_SIZE;
|
||||
|
@ -129,7 +148,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
|
||||
panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pml4te_val);
|
||||
pml4te_val |= __P_PRESENT | __P_RW | __P_NOCACHE | __P_GLOBAL | __P_NOEXEC;
|
||||
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pml4te->val = pml4te_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -145,8 +164,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
* that is not the case. I've checked the disassembly with -O2,
|
||||
* and clang is emitting the check. So it's fine, i guess. */
|
||||
if (pdpte_val != BOOT_PMALLOC_ERR) {
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_HUGE | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte->val = pdpte_val;
|
||||
map_pos += GIGAPAGE_SIZE;
|
||||
if (map_pos >= map_end)
|
||||
|
@ -160,7 +179,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
panic_if(pdpte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pdpte_val);
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_NOCACHE | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte->val = pdpte_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -173,8 +192,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
if (map_end - map_pos >= HUGEPAGE_SIZE) {
|
||||
pdte_val = __boot_pmalloc(X86_PDT_SHIFT, MM_ZONE_NORMAL);
|
||||
if (pdte_val != BOOT_PMALLOC_ERR) {
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_HUGE | __P_NOEXEC;
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
|
||||
| __P_HUGE | __P_NOEXEC;
|
||||
pdte->val = pdte_val;
|
||||
map_pos += HUGEPAGE_SIZE;
|
||||
if (map_pos >= map_end)
|
||||
|
@ -188,8 +207,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
panic_if(pdte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pdpte_val);
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pdte->val = pdte_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -199,8 +217,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT, MM_ZONE_NORMAL);
|
||||
panic_if(pte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
pte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pte->val = pte_val;
|
||||
|
||||
map_pos += PAGE_SIZE;
|
||||
|
@ -228,8 +245,10 @@ void __boot_clear_page(vm_paddr_t paddr)
|
|||
vm_paddr_t pbase = align_floor(paddr, 1 << X86_PDPT_SHIFT);
|
||||
vm_offset_t offset = paddr - pbase;
|
||||
void *vbase = (void *)KERNBASE - (1 << X86_PDPT_SHIFT);
|
||||
x86_pdpte_t *pdpe = X86_PDPTE(vbase);
|
||||
pdpe->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
|
||||
x86_pdpte_t *pdpte = X86_PDPTE(vbase);
|
||||
x86_pdpte_t old_pdpte = *pdpte;
|
||||
old_pdpte.val = pdpte->val;
|
||||
pdpte->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
|
||||
vm_flush();
|
||||
memset64(vbase + offset, 0, PAGE_SIZE);
|
||||
pdpe->val = 0;
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
|
||||
|
||||
#include <arch/cpufunc.h>
|
||||
#include <arch/page.h>
|
||||
#include <arch/segment.h>
|
||||
#include <arch/trap.h>
|
||||
|
||||
#include <gay/cdefs.h>
|
||||
#include <gay/kprintf.h>
|
||||
#include <gay/ktrace.h>
|
||||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
#include <gay/vm/page.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Initial Page Directory Pointer Table and Page Map Level 4 Table for the
|
||||
|
@ -48,7 +54,10 @@ void x86_isr_page_fault(trap_frame_t *frame, u32 error_code)
|
|||
kprintf("\n########## B O N K ##########\n");
|
||||
kprintf("Illegal %s %s%s address %p!\n", space, rwx, present, address);
|
||||
print_regs(frame);
|
||||
panic("Page fault");
|
||||
/* print a stack trace if this came from kernel space */
|
||||
if (frame->hw_frame.cs == X86_64_KERN_CS)
|
||||
ktrace_print_from((void *)frame->rbp);
|
||||
panic_notrace("Page fault");
|
||||
}
|
||||
|
||||
vm_paddr_t vtophys(void *virt)
|
||||
|
@ -79,3 +88,14 @@ vm_paddr_t vtophys(void *virt)
|
|||
vm_paddr_t phys_base = pte->val & X86_PMAP_MASK;
|
||||
return phys_base + ((vm_paddr_t)virt % (1 << X86_PT_SHIFT));
|
||||
}
|
||||
|
||||
void page_clear(vm_page_t page)
|
||||
{
|
||||
register_t cpuflags = intr_disable();
|
||||
page_lock(page);
|
||||
u64 *dest = DMAP_START + (pg2pfn(page) << PAGE_SHIFT);
|
||||
usize nbyte = (usize)1 << (pga_order(page) + PAGE_SHIFT);
|
||||
memset64(dest, 0, nbyte);
|
||||
page_unlock(page);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
* address `0xfffff000-0xffffffff`, then points to the page directory itself.
|
||||
*/
|
||||
|
||||
#include <arch/cpufunc.h>
|
||||
#include <arch/page.h>
|
||||
#include <arch/trap.h>
|
||||
|
||||
|
@ -19,6 +20,7 @@
|
|||
#include <gay/mm.h>
|
||||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
#include <gay/vm/page.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
|
@ -275,14 +277,12 @@ uintptr_t vtophys(void *virt)
|
|||
return phys;
|
||||
}
|
||||
|
||||
void vm_flush(void)
|
||||
void page_clear(vm_page_t page)
|
||||
{
|
||||
register_t tmp;
|
||||
__asm__ volatile(
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
: "=r"(tmp)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
register_t cpuflags = intr_disable();
|
||||
page_lock(page);
|
||||
u32 *dest = DMAP_START + (pg2pfn(page) << PAGE_SHIFT);
|
||||
usize nbyte = (usize)1 << (pga_order(page) + PAGE_SHIFT);
|
||||
memset32(dest, 0, nbyte);
|
||||
page_unlock(page);
|
||||
}
|
||||
|
|
|
@ -31,20 +31,24 @@ It also kind of makes you appreciate the sheer vastness of 64-bit address space.
|
|||
|
||||
Kernel space addresses start at `0xffff800000000000` because the MMU "only"
|
||||
supports 48-bit linear addresses.
|
||||
The way i've understood it, the Intel spec says the 17 MSBs of virtual
|
||||
addresses must be all the same, but other than that are ignored.
|
||||
The way i've understood it, the Intel spec says bits 63:48 of virtual
|
||||
addresses must be copies of bit 47, but other than that are ignored.
|
||||
So, as far as the MMU is concerned, the huge hole doesn't even exist:
|
||||
Userspace ranges from `0x000000000000~0x7fffffffffff`,
|
||||
and everything belonging to the kernel from `0x800000000000~0xffffffffffff`
|
||||
(note how the leading 0's/f's are missing, these are 48-bit values).
|
||||
|
||||
The linear physical memory is a direct mapping of physical RAM, which is
|
||||
required because `kmalloc()` needs to be able to allocate *physically*
|
||||
contiguous memory for DMA transfers.
|
||||
required because `kmalloc()` and friends need to be able to allocate
|
||||
*physically* contiguous memory for DMA transfers and i don't have the energy
|
||||
to update kernel page maps every time the kernel needs a new page.
|
||||
|
||||
The kernel image itself is loaded into physical memory at `0x00400000` by
|
||||
default, and the entire low 2 GB of physical memory are statically mapped to
|
||||
the end of virtual memory (-2 GB). That way, we can use `-mcmodel=kernel`,
|
||||
which prevents the compiler from emitting raw address loads and absolute jumps
|
||||
(this is significantly faster).
|
||||
All kernel code resides within the -2 GB region.
|
||||
All kernel code resides within the -2 GB region.
|
||||
|
||||
The `vm_page_array`, which keeps track of what each individual page is used for,
|
||||
starts directly at the beginning of the kernel area at -2 TB.
|
||||
|
|
|
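The canonical-address rule restated in the documentation hunk above (bits 63:48 must be copies of bit 47, so only 48 bits are significant) can be expressed as a tiny check. This is a generic illustration of that rule, not code from this repository:

```c
#include <stdbool.h>
#include <stdint.h>

/* x86-64 with 4-level paging: a virtual address is canonical when bits 63:47
 * (the 17 most significant bits) are either all 0 (user half) or all 1
 * (kernel half). */
static bool is_canonical(uint64_t vaddr)
{
	uint64_t upper = vaddr >> 47;
	return upper == 0 || upper == 0x1ffff;
}

/* 0x00007fffffffffff -> canonical (top of the user half)
 * 0xffff800000000000 -> canonical (start of the kernel half)
 * 0x0000800000000000 -> not canonical (inside the hole)      */
```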
@ -88,6 +88,9 @@
|
|||
/** @brief Mark the symbol as used, even if it really isn't. */
|
||||
#define __used __attribute__(( used ))
|
||||
|
||||
/** @brief Tell the compiler that a struct member is intentionally unused. */
|
||||
#define __unused __attribute__(( unused ))
|
||||
|
||||
/** @brief Symbol may be silently redefined. */
|
||||
#define __weak __attribute__(( weak ))
|
||||
|
||||
|
|
|
@ -6,16 +6,19 @@
|
|||
* @file include/gay/mm.h
|
||||
* @brief Header for dynamic memory management
|
||||
*
|
||||
* To avoid possible confusion (and Not break 32-bit systems, even though they
|
||||
* aren't really supported anyway), physical memory addresses always use type
|
||||
* `vm_paddr_t` and virtual ones are `void *`. This should give us at least
|
||||
* some type of compiler warning if they are accidentally mixed up.
|
||||
* To avoid possible confusion (and Not break systems where virtual addresses
|
||||
* are less wide than physical ones, like IA-32 with PAE), physical memory
|
||||
* addresses always use type `vm_paddr_t` and virtual ones are `void *`.
|
||||
* This should give us at least some type of compiler warning if they are
|
||||
* accidentally mixed up.
|
||||
*
|
||||
* GayBSD uses a classic slab algorithm for its own data structures, which is
|
||||
* backed by a buddy page frame allocator. The latter is also used for getting
|
||||
* bigger areas of memory that are not physically contiguous (for regular user
|
||||
* allocations). The entire physical memory is mapped statically in the range
|
||||
* `DMAP_START - DMAP_END`.
|
||||
* `DMAP_START - DMAP_END` in order to make clearing pages without a specific
|
||||
* mapping easier, even though regular code outside the mm subsystem should be
|
||||
* completely oblivious to this fact.
|
||||
*
|
||||
* Memory is split up into (currently) two zones: `MM_ZONE_NORMAL` and
|
||||
* `MM_ZONE_DMA`. As their names suggest, the former is for general purpose
|
||||
|
@ -23,6 +26,10 @@
|
|||
* Zones are further divided into pools, each of which hold a list of groups of
|
||||
* free pages. The size of these page groups is determined by the pool's order,
|
||||
* where the pool of order `n` holds groups of `1 << n` pages.
|
||||
*
|
||||
* The mm subsystem needs to allocate memory for initializing itself.
|
||||
* Therefore, there is an additional boot page frame allocator, which gets the
|
||||
* free areas from architecture dependent code (`arch/mm/.../init.c`).
|
||||
*/
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
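To put numbers on the pool orders described in the header comment above: the pool of order `n` holds groups of `1 << n` pages, so the block sizes scale as below. `ORDER_BYTES` is a made-up name for illustration only; `PAGE_SHIFT` is the macro used elsewhere in this diff, and 4 KiB base pages are an assumption.

```c
/* illustrative only; the real tree uses its own ORDER_SIZE()/ORDER_SHIFT() macros */
#define ORDER_BYTES(order) ((usize)1 << ((order) + PAGE_SHIFT))
/* with 4 KiB pages: ORDER_BYTES(0) == 4 KiB, ORDER_BYTES(3) == 32 KiB, ORDER_BYTES(10) == 4 MiB */
```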
@ -38,17 +45,24 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define _M_ZONE_NORMAL 0
|
||||
#define _M_ZONE_DMA 1
|
||||
#define _M_ZONE_INDEX(flags) ((flags) & 1)
|
||||
#define _M_ZONE_DMA 0
|
||||
#define _M_ZONE_NORMAL 1
|
||||
/* we use 2 bits because there are likely gonna be additional zones in the future */
|
||||
#define _M_ZONE_INDEX(flags) ((flags) & 3)
|
||||
|
||||
#define _M_EMERG (1 << 1)
|
||||
#define _M_NOWAIT (1 << 2)
|
||||
#define _M_EMERG (1 << 2)
|
||||
#define _M_NOWAIT (1 << 3)
|
||||
|
||||
#ifndef _HAVE_VM_PAGE_T
|
||||
#define _HAVE_VM_PAGE_T 1
|
||||
struct vm_page;
|
||||
typedef struct vm_page *vm_page_t;
|
||||
#endif
|
||||
|
||||
enum mm_zone_type {
|
||||
MM_ZONE_NORMAL = _M_ZONE_NORMAL,
|
||||
MM_ZONE_DMA = _M_ZONE_DMA,
|
||||
MM_NR_ZONES
|
||||
MM_ZONE_NORMAL = _M_ZONE_NORMAL,
|
||||
MM_NR_ZONES = 2
|
||||
};
|
||||
|
||||
/** @brief Boot memory area. */
|
||||
|
@ -76,7 +90,7 @@ struct mm_zone {
|
|||
/** @brief Thresholds for OOM behavior */
|
||||
struct {
|
||||
/** @brief Minimum number of pages reserved for emergency allocations */
|
||||
u_long emerg;
|
||||
long emerg;
|
||||
} thrsh;
|
||||
struct mm_pool pools[MM_NR_ORDERS];
|
||||
struct clist _bmem_areas; /* -> struct _bmem_area */
|
||||
|
@ -92,7 +106,9 @@ struct mm_zone {
|
|||
extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
|
||||
|
||||
/**
|
||||
* @brief Memory allocation flags passed to `kmalloc()`.
|
||||
* @brief Memory allocation flags commonly used by all allocators.
|
||||
* All of them are eventually passed down to `page_alloc()`, the physical page
|
||||
* frame allocator,
|
||||
*/
|
||||
enum mflags {
|
||||
/** @brief Use emergency memory reserves if necessary */
|
||||
|
@ -107,6 +123,9 @@ enum mflags {
|
|||
M_DMA = _M_ZONE_DMA,
|
||||
};
|
||||
|
||||
/** @brief Initialize the slab allocator. */
|
||||
void kmalloc_init(void);
|
||||
|
||||
/**
|
||||
* @brief Allocate memory.
|
||||
*
|
||||
|
@ -125,33 +144,6 @@ void *kmalloc(size_t size, enum mflags flags) __malloc_like __alloc_size(1);
|
|||
*/
|
||||
void kfree(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Flags for the paging structures.
|
||||
*
|
||||
* The macros with two underscores in front of them are defined in `arch/page.h`
|
||||
* and match the respective bit positions in the platform's native hardware
|
||||
* layout for better performance (no shifting around required).
|
||||
*/
|
||||
enum pflags {
|
||||
P_PRESENT = __P_PRESENT, /**< @brief Page exists */
|
||||
P_RW = __P_RW, /**< @brief Page is writable */
|
||||
P_USER = __P_USER, /**< @brief Page is accessible from ring 3 */
|
||||
P_ACCESSED = __P_ACCESSED, /**< @brief Page has been accessed */
|
||||
P_DIRTY = __P_DIRTY, /**< @brief Page has been written */
|
||||
P_GLOBAL = __P_GLOBAL, /**< @brief The entry survives `vm_flush()` */
|
||||
P_NOCACHE = __P_NOCACHE, /**< @brief The TLB won't cache this entry */
|
||||
P_SLAB = __P_SLAB, /**< @brief Page is used by the slab allocator */
|
||||
P_NOSLEEP = __P_ATOMIC, /**< @brief Page is atomic */
|
||||
#ifdef __HAVE_HUGEPAGES
|
||||
/** @brief This page is `HUGEPAGE_SIZE` bytes long, rather than `PAGE_SIZE` */
|
||||
P_HUGE = __P_HUGE,
|
||||
#endif
|
||||
#ifdef __HAVE_NOEXEC
|
||||
/** @brief No instructions can be fetched from this page */
|
||||
P_NOEXEC = __P_NOEXEC,
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Initialize the buddy page frame allocator.
|
||||
* This is only called once, from the arch dependent counterpart after it has
|
||||
|
@ -161,11 +153,22 @@ enum pflags {
|
|||
void paging_init(vm_paddr_t phys_end);
|
||||
|
||||
/**
|
||||
* @brief Allocate a contiguous region in physical memory.
|
||||
* @brief Allocate a physically contiguous region and initialize it with zeroes.
|
||||
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long.
|
||||
*
|
||||
* **The pages are not initialized.**
|
||||
* If you want zeroed pages, use `get_zero_pages()`.
|
||||
* @param order Order of magnitude (as in `1 << order` pages)
|
||||
* @param flags How to allocate
|
||||
* @return The page group that was allocated (evaluates false on failure)
|
||||
*/
|
||||
vm_page_t page_alloc(u_int order, enum mflags flags) __malloc_like;
|
||||
|
||||
/**
|
||||
* @brief Allocate and map a physically contiguous region in memory.
|
||||
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long,
|
||||
* and initialized with zeroes.
|
||||
*
|
||||
* If filling the page with zeroes takes too much time, use `page_alloc()`.
|
||||
* But only if you're careful and it's not an allocation for user space.
|
||||
*
|
||||
* @param order Order of magnitude (as in `1 << order` pages)
|
||||
* @param flags How to allocate
|
||||
|
@ -173,12 +176,11 @@ void paging_init(vm_paddr_t phys_end);
|
|||
* or `nil` if the allocation failed
|
||||
*/
|
||||
void *get_pages(u_int order, enum mflags flags) __malloc_like;
|
||||
/** @brief Alias for `get_pages(0, flags)`. */
|
||||
void *get_page(enum mflags flags) __malloc_like;
|
||||
void *get_zero_pages(u_int order, enum mflags flags) __malloc_like;
|
||||
void *get_zero_page(enum mflags flags) __malloc_like;
|
||||
|
||||
void free_pages(void *ptr);
|
||||
#define free_page(ptr) free_pages(ptr)
|
||||
/** @brief Free a page from `page_alloc()`. */
|
||||
void page_free(vm_page_t page);
|
||||
|
||||
/**
|
||||
* @brief Initialize the slab caches.
|
||||
|
|
|
@ -11,18 +11,6 @@
|
|||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
|
||||
/*
|
||||
* I'm trying really hard to keep the size of struct vm_page a power of two
|
||||
* on LP64 systems, because that way we can quickly get to the page frame number
|
||||
* by shifting the byte offset of the vm_page_t in vm_page_array to the right
|
||||
* rather than doing a costly divide instruction (or store the page frame number
|
||||
* within the structure itself, which takes up precious space).
|
||||
*
|
||||
* There is insane pressure on the size of this structure, because a typical
|
||||
* system will have millions of instances of it. Every additional byte makes
|
||||
* a significant difference in memory management overhead.
|
||||
*/
|
||||
|
||||
union vm_page_attr {
|
||||
int _val;
|
||||
struct {
|
||||
|
@ -49,6 +37,9 @@ union vm_page_attr {
|
|||
|
||||
typedef union vm_page_attr vm_page_attr_t;
|
||||
|
||||
/* defined in kernel/mm/slab.c */
|
||||
struct slab_pool;
|
||||
|
||||
/**
|
||||
* @brief Stores information about a single page in physical memory.
|
||||
* There is exactly one of these for every physical page, no matter what that
|
||||
|
@ -59,18 +50,31 @@ struct vm_page {
|
|||
atom_t count;
|
||||
/** @brief Page attributes, use the macros below to access this */
|
||||
atom_t attr;
|
||||
/** @brief If the page is free, this is its freelist. */
|
||||
struct clist link;
|
||||
/** @brief Page frame number */
|
||||
u_long pfn;
|
||||
/**
|
||||
* @brief Optional extra data pointer, reserved for private use.
|
||||
* The current owner of the page may use this to track the underlying
|
||||
* object in memory (or pretty much anything else), for example the
|
||||
* `struct slab` if this page is currently used by the slab allocator.
|
||||
* @brief If the page is free, this is its freelist.
|
||||
* If the page is used in the slab allocator, this is the list for the
|
||||
* pool in which it currently resides.
|
||||
*/
|
||||
void *extra;
|
||||
struct clist link;
|
||||
union {
|
||||
struct {
|
||||
void **freelist;
|
||||
struct slab_pool *pool;
|
||||
u_int entry_size;
|
||||
u_int free_count;
|
||||
} slab;
|
||||
};
|
||||
};
|
||||
|
||||
#define INVALID_PAGE nil
|
||||
#define SLAB(page) (&(page)->slab)
|
||||
|
||||
#ifndef _HAVE_VM_PAGE_T
|
||||
#define _HAVE_VM_PAGE_T 1
|
||||
typedef struct vm_page *vm_page_t;
|
||||
#endif
|
||||
|
||||
/** @brief Array of every single page in physical memory, indexed by page frame number. */
|
||||
extern struct vm_page *const vm_page_array;
|
||||
|
@ -82,6 +86,9 @@ extern vm_page_t _vm_page_array_end;
|
|||
#define PGADDR_ASSERT(x) ({})
|
||||
#endif
|
||||
|
||||
/** @brief Fill a page with zeroes (size depends on the current page order). */
|
||||
void page_clear(vm_page_t page);
|
||||
|
||||
static inline u8 pga_order(vm_page_t page)
|
||||
{
|
||||
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
|
||||
|
@ -211,7 +218,7 @@ __pure2
|
|||
static inline u_long pg2pfn(vm_page_t page)
|
||||
{
|
||||
PGADDR_ASSERT(page < _vm_page_array_end);
|
||||
return page - vm_page_array;
|
||||
return page->pfn;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -224,7 +231,8 @@ static inline vm_page_t vaddr2pg(void *vaddr)
|
|||
{
|
||||
PGADDR_ASSERT(vaddr >= DMAP_START && vaddr < (void *)_vm_page_array_end);
|
||||
uintptr_t offset = (uintptr_t)vaddr - DMAP_OFFSET;
|
||||
return &vm_page_array[offset >> PAGE_SHIFT];
|
||||
struct vm_page *page = &vm_page_array[offset >> PAGE_SHIFT];
|
||||
return page - page->pfn % (1 << pga_order(page));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -254,7 +262,7 @@ static inline vm_page_t paddr2pg(vm_paddr_t paddr)
|
|||
{
|
||||
vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT);
|
||||
PGADDR_ASSERT(page < _vm_page_array_end);
|
||||
return page;
|
||||
return page - page->pfn % (1 << pga_order(page));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -267,19 +275,3 @@ static inline void *pfn2vaddr(u_long pfn)
|
|||
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
|
||||
return DMAP_START + (pfn << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to be careful in this macro, because only the first page in the
|
||||
* order group has the correct order set. So we can only read it once at
|
||||
* the beginning of the loop, since the page pointer is being updated.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Iterate over every page in its order group.
|
||||
*
|
||||
* @param page The first `vm_page_t` in the group.
|
||||
*/
|
||||
#define vm_page_foreach_in_order(page) \
|
||||
for (int __i = 1 << pga_order(page); \
|
||||
__i >= 0; \
|
||||
__i = ({ ++(page); --__i; }))
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
#include <limits.h>
|
||||
|
||||
static struct _bmem_area _bmem_area_cache[16];
|
||||
static struct _bmem_area _bmem_area_cache[128];
|
||||
static CLIST(bmem_area_freelist);
|
||||
|
||||
#ifdef DEBUG
|
||||
|
@ -37,6 +37,7 @@ static void free_bmem_area(struct _bmem_area *area)
|
|||
clist_add(&bmem_area_freelist, &area->link);
|
||||
}
|
||||
|
||||
/* insert an area when we already know there are no intersections with reserved memory */
|
||||
static void insert_area_unsafe(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
|
||||
{
|
||||
KASSERT((start % PAGE_SIZE) == 0);
|
||||
|
|
kernel/mm/page.c | 246
|
@ -66,19 +66,18 @@ static inline u_int paddr_find_order(vm_paddr_t addr)
|
|||
}
|
||||
|
||||
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
|
||||
static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area)
|
||||
static inline void claim_bmem_area(struct mm_zone *zone, const struct _bmem_area *area)
|
||||
{
|
||||
vm_paddr_t start = area->start;
|
||||
vm_paddr_t end = area->end;
|
||||
vm_paddr_t pos = start;
|
||||
vm_size_t nr_pages = end - start / PAGE_SIZE;
|
||||
latom_add(&zone->free_count, (long)nr_pages);
|
||||
u_int order = paddr_find_order(area->start);
|
||||
while (area->start + ORDER_SIZE(order) > area->end)
|
||||
order--;
|
||||
|
||||
struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
|
||||
u_int order = paddr_find_order(start);
|
||||
/* make sure the boot memory allocator cannot under any circumstances hand
|
||||
* out pages from this area anymore, even though that should be unnecessary */
|
||||
clist_del(&area->link);
|
||||
struct vm_page *const start = paddr2pg(area->start);
|
||||
struct vm_page *const end = paddr2pg(area->end);
|
||||
struct vm_page *pos = start;
|
||||
|
||||
const vm_size_t nr_pages = end->pfn - start->pfn;
|
||||
latom_add(&zone->free_count, (long)nr_pages);
|
||||
|
||||
/*
|
||||
* We want to insert pages at the highest possible order. However, the
|
||||
|
@ -90,15 +89,21 @@ static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area
|
|||
* subsequently lower the order again.
|
||||
*/
|
||||
while (pos < end) {
|
||||
struct mm_pool *pool = &zone->pools[order];
|
||||
clist_add(&pool->freelist, &page->link);
|
||||
struct mm_pool *const pool = &zone->pools[order];
|
||||
clist_add(&pool->freelist, &pos->link);
|
||||
pool->free_entries++;
|
||||
|
||||
/* only the first page in the order group is inserted into
|
||||
* the freelist, but all of them need to be initialized */
|
||||
for (u_int i = 0; i < (1 << order); i++) {
|
||||
atom_init(&page[i].count, 0);
|
||||
atom_init(&page[i].attr, 0);
|
||||
for (u_int i = 0; i < (1u << order); i++) {
|
||||
if (pos >= end)
|
||||
panic("page %p out of range", pos);
|
||||
if (atom_read(&pos->count) != 420)
|
||||
panic("page %p double initialized\n", pos);
|
||||
atom_init(&pos->count, 0);
|
||||
atom_init(&pos->attr, 0);
|
||||
|
||||
pos++;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -111,22 +116,14 @@ static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area
|
|||
* |---------------------|----> pos
|
||||
* start end
|
||||
*/
|
||||
pos += ORDER_SIZE(order);
|
||||
page += (1 << order);
|
||||
if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
|
||||
if (order < MM_MAX_ORDER && pos + (1 << (order + 1)) <= end) {
|
||||
/* this makes the rising part of the graph */
|
||||
order++;
|
||||
} else if (order > 0 && pos > end) {
|
||||
/* we have overshot, lower the order */
|
||||
pos -= ORDER_SIZE(order);
|
||||
page -= (1 << order);
|
||||
} else if (order > 0 && pos + (1 << order) > end) {
|
||||
/* this makes the abrupt downwards jump at the end of the graph */
|
||||
while (--order) {
|
||||
if (pos + ORDER_SIZE(order) <= end) {
|
||||
pos += ORDER_SIZE(order);
|
||||
page += (1 << order);
|
||||
if (pos + (1 << order) <= end)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -141,7 +138,7 @@ void paging_init(vm_paddr_t phys_end)
|
|||
usize bitmap_total_size = 0;
|
||||
|
||||
for (int order = 0; order < MM_NR_ORDERS; order++) {
|
||||
usize pages = phys_end >> ORDER_SHIFT(order + 1);
|
||||
usize pages = phys_end >> ORDER_SHIFT(order);
|
||||
pages = align_ceil(pages, LATOM_BIT * 2);
|
||||
usize bytes = pages / (CHAR_BIT * 2);
|
||||
bitmap_sizes[order] = bytes;
|
||||
|
@ -158,7 +155,7 @@ void paging_init(vm_paddr_t phys_end)
|
|||
bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
|
||||
if (bitmap_total_size ^ (1ul << bitmap_size_log2))
|
||||
bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
|
||||
uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
|
||||
vm_paddr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
|
||||
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
|
||||
"cannot allocate memory for the page bitmaps");
|
||||
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
|
||||
|
@ -168,12 +165,15 @@ void paging_init(vm_paddr_t phys_end)
|
|||
*/
|
||||
for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
|
||||
struct mm_zone *zone = &mm_zones[zone_index];
|
||||
latom_init(&zone->free_count, 0);
|
||||
/* we use the same bitmaps for all zones */
|
||||
latom_t *bitmap_pos = __v(bitmap_start_phys);
|
||||
for (int order = 0; order < MM_NR_ORDERS; order++) {
|
||||
zone->pools[order].bitmap = bitmap_pos;
|
||||
clist_init(&zone->pools[order].freelist);
|
||||
zone->pools[order].free_entries = 0;
|
||||
latom_init(&zone->free_count, 0);
|
||||
struct mm_pool *pool = &zone->pools[order];
|
||||
pool->bitmap = bitmap_pos;
|
||||
pool->free_entries = 0;
|
||||
clist_init(&pool->freelist);
|
||||
spin_init(&pool->lock);
|
||||
|
||||
bitmap_pos += bitmap_sizes[order];
|
||||
}
|
||||
|
@ -188,12 +188,13 @@ void paging_init(vm_paddr_t phys_end)
|
|||
*
|
||||
* if the reserved bit is set, all other fields in the page are invalid.
|
||||
*/
|
||||
for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
|
||||
for (u_long pfn = 0; pfn < phys_end >> PAGE_SHIFT; pfn++) {
|
||||
/* This is merely an optimization to simplify checking whether
|
||||
* two buddies can be coalesced into one. In reality, the
|
||||
* reference count is invalid because the page is reserved. */
|
||||
atom_init(&vm_page_array[i].count, 1);
|
||||
atom_init(&vm_page_array[i].attr, _PGA_RSVD_MASK);
|
||||
atom_init(&vm_page_array[pfn].count, 420);
|
||||
atom_init(&vm_page_array[pfn].attr, _PGA_RSVD_MASK);
|
||||
vm_page_array[pfn].pfn = pfn;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -203,11 +204,15 @@ void paging_init(vm_paddr_t phys_end)
|
|||
struct mm_zone *zone = &mm_zones[i];
|
||||
struct _bmem_area *area, *tmp;
|
||||
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
|
||||
/* make sure the boot memory allocator cannot under any circumstances hand
|
||||
* out pages from this area anymore, even though that should be unnecessary */
|
||||
clist_del(&area->link);
|
||||
|
||||
claim_bmem_area(zone, area);
|
||||
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
|
||||
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
|
||||
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
|
||||
}
|
||||
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
|
||||
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
|
||||
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -218,22 +223,27 @@ static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
|
|||
return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
|
||||
}
|
||||
|
||||
__malloc_like
|
||||
static void *__get_pages(u_int order, enum mflags flags)
|
||||
vm_page_t page_alloc(u_int order, enum mflags flags)
|
||||
{
|
||||
PAGE_ASSERT(order >= 0);
|
||||
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
|
||||
|
||||
if (order > MM_MAX_ORDER) {
|
||||
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
|
||||
return nil;
|
||||
}
|
||||
|
||||
u_long count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
|
||||
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
|
||||
long count_after;
|
||||
try_next_zone:
|
||||
count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
|
||||
if (count_after < zone->thrsh.emerg) {
|
||||
if (count_after < 0 || !(flags & _M_EMERG)) {
|
||||
latom_add(&zone->free_count, (1 << order));
|
||||
return nil;
|
||||
/* if we can't allocate from ZONE_NORMAL, fall back to ZONE_DMA */
|
||||
if (zone > &mm_zones[0]) {
|
||||
zone--;
|
||||
goto try_next_zone;
|
||||
} else {
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -266,93 +276,76 @@ static void *__get_pages(u_int order, enum mflags flags)
|
|||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
if (page == nil) {
|
||||
if (zone > &mm_zones[0]) {
|
||||
/*
|
||||
* If we reach this, the current zone technically had enough free
|
||||
* pages for the allocation, but those pages were split up into
|
||||
* smaller chunks rather than a contiguous area. However, we don't
|
||||
* give up quite yet: If possible, we fall back to a lower memory
|
||||
* zone (ZONE_NORMAL -> ZONE_DMA) and start over from the top.
|
||||
*/
|
||||
zone--;
|
||||
goto try_next_zone;
|
||||
} else {
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if we found a page, check if we need to split it up
|
||||
* (which is the case if we took one from a higher order freelist)
|
||||
*/
|
||||
if (page != nil) {
|
||||
usize pfn = pg2pfn(page);
|
||||
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
|
||||
order, pfn, page_order);
|
||||
pg_flip_bit(zone, pfn, page_order);
|
||||
usize pfn = pg2pfn(page);
|
||||
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
|
||||
order, pfn, page_order);
|
||||
pg_flip_bit(zone, pfn, page_order);
|
||||
|
||||
/* split the page and insert the upper halves into the
|
||||
* respective freelist until we reach the requested order */
|
||||
while (page_order-- > order) {
|
||||
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
|
||||
struct mm_pool *pool = &zone->pools[page_order];
|
||||
vm_page_t buddy = page + (1 << page_order);
|
||||
pga_set_order(buddy, page_order);
|
||||
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
|
||||
/* split the page and insert the upper halves into the
|
||||
* respective freelist until we reach the requested order */
|
||||
while (page_order-- > order) {
|
||||
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
|
||||
struct mm_pool *pool = &zone->pools[page_order];
|
||||
vm_page_t buddy = page + (1 << page_order);
|
||||
pga_set_order(buddy, page_order);
|
||||
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
|
||||
|
||||
disable_intr();
|
||||
spin_lock(&pool->lock);
|
||||
clist_add_first(&pool->freelist, &buddy->link);
|
||||
pool->free_entries++;
|
||||
spin_unlock(&pool->lock);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
pga_set_order(page, order);
|
||||
void *vaddr = pfn2vaddr(pfn);
|
||||
|
||||
return vaddr;
|
||||
} else {
|
||||
return nil;
|
||||
disable_intr();
|
||||
spin_lock(&pool->lock);
|
||||
clist_add_first(&pool->freelist, &buddy->link);
|
||||
pool->free_entries++;
|
||||
spin_unlock(&pool->lock);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
for (u_int i = 0; i < (1 << order); i++)
|
||||
pga_set_order(&page[i], order);
|
||||
page_clear(page);
|
||||
return page;
|
||||
}
|
||||
|
||||
/* faster memset for whole pages */
|
||||
static inline void init_pages(u_long *start, u_long val, u_int order)
|
||||
{
|
||||
u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
|
||||
do {
|
||||
*start++ = val;
|
||||
} while (start != end);
|
||||
}
|
||||
/*
|
||||
* XXX get_page() and get_pages() shouldn't depend on the direct map
|
||||
*
|
||||
* XXX Do we need these at all? I don't think so.
|
||||
*/
|
||||
|
||||
void *get_pages(u_int order, enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(order, flags);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
if (pages != nil)
|
||||
init_pages(pages, PAGE_POISON_ALLOC, order);
|
||||
#endif
|
||||
|
||||
return pages;
|
||||
vm_page_t page = page_alloc(order, flags);
|
||||
if (page)
|
||||
return pfn2vaddr(pg2pfn(page));
|
||||
else
|
||||
return nil;
|
||||
}
|
||||
|
||||
void *get_page(enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(0, flags);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
if (pages != nil)
|
||||
init_pages(pages, PAGE_POISON_ALLOC, 0);
|
||||
#endif
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
void *get_zero_pages(u_int order, enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(order, flags);
|
||||
|
||||
if (pages != nil)
|
||||
init_pages(pages, 0, order);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
void *get_zero_page(enum mflags flags)
|
||||
{
|
||||
void *page = __get_pages(0, flags);
|
||||
|
||||
if (page != nil)
|
||||
init_pages(page, 0, 0);
|
||||
|
||||
return page;
|
||||
vm_page_t page = page_alloc(0, flags);
|
||||
if (page)
|
||||
return pfn2vaddr(pg2pfn(page));
|
||||
else
|
||||
return nil;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -377,26 +370,13 @@ static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
|
|||
return merge;
|
||||
}
|
||||
|
||||
void free_pages(void *ptr)
|
||||
void page_free(vm_page_t page)
|
||||
{
|
||||
PAGE_DEBUG_BLOCK {
|
||||
if (ptr < DMAP_START || ptr >= DMAP_END) {
|
||||
panic("free_pages(%p): not in DMAP region\n", ptr);
|
||||
}
|
||||
}
|
||||
|
||||
register_t cpuflags = read_flags();
|
||||
|
||||
vm_page_t page = vaddr2pg(ptr);
|
||||
panic_if(pga_rsvd(page), "tried to free reserved page %p", ptr);
|
||||
|
||||
u_int order = pga_order(page);
|
||||
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
|
||||
u_long pfn = vaddr2pfn(ptr);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
init_pages(ptr, PAGE_POISON_FREE, order);
|
||||
#endif
|
||||
u_long pfn = pg2pfn(page);
|
||||
|
||||
PAGE_DEBUG_BLOCK {
|
||||
int old_count = atom_sub(&page->count, 1);
|
||||
|
@ -407,6 +387,8 @@ void free_pages(void *ptr)
|
|||
page_debug("attempted to free %p with references", ptr);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
atom_dec(&page->count);
|
||||
}
|
||||
|
||||
struct mm_zone *zone = &mm_zones[pga_zone(page)];
|
||||
|
|
kernel/mm/slab.c | 140
|
@ -21,7 +21,7 @@
|
|||
|
||||
#if CFG_POISON_SLABS
|
||||
struct slab_poison {
|
||||
void *_pad; /**< @brief That's where the freelist pointer is stored */
|
||||
void *_pad __unused; /**< @brief That's where the freelist pointer is stored */
|
||||
void *alloc_source; /**< @brief Code address that made the alloc call */
|
||||
u_long exact_size;
|
||||
u_long low_poison;
|
||||
|
@ -33,32 +33,6 @@ static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, voi
|
|||
static void poison_after_free(struct slab_poison *poison);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief This header sits at the beginning of each slab.
|
||||
* The individual entries follow immediately after the struct itself.
|
||||
*/
|
||||
struct slab {
|
||||
struct clist link;
|
||||
void **freelist;
|
||||
struct slab_pool *pool;
|
||||
/** @brief For `link` */
|
||||
spin_t lock;
|
||||
/**
|
||||
* @brief Number of free entries.
|
||||
* The slabs are sorted within their pool by this value, so that we
|
||||
* always hand out entries from the fullest slabs (increases locality
|
||||
* and thus decreases the stress on the TLB).
|
||||
*
|
||||
* This is intentionally not a `usize` because entry sizes are really
|
||||
* small anyway (we currently refuse to allocate anything bigger than
|
||||
* `PAGE_SIZE`), so this saves a couple of bytes on systems where `int`
|
||||
* is smaller than `usize`.
|
||||
*/
|
||||
u_int free_entries;
|
||||
};
|
||||
|
||||
#define SLAB_OVERHEAD (sizeof(struct slab))
|
||||
|
||||
#if CFG_DEBUG_SLAB_ALLOCS
|
||||
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
|
||||
# define SLAB_DEBUG_BLOCK
|
||||
|
@ -77,12 +51,12 @@ struct slab {
|
|||
|
||||
struct slab_pool {
|
||||
const u_int entry_size; /**< @brief Size of one entry in bytes */
|
||||
const int entries_per_slab; /**< @brief Max number of entries per slab */
|
||||
const u_int entries_per_slab; /**< @brief Max number of entries per slab */
|
||||
atom_t total_used; /**< @brief Total allocated entries */
|
||||
const u_int page_order; /**< @brief Order passed to `get_pages()` */
|
||||
struct clist empty_list; /* -> struct slab::link */
|
||||
struct clist partial_list; /* -> struct slab::link */
|
||||
struct clist full_list; /* -> struct slab::link */
|
||||
struct clist empty_list; /* -> struct vm_page::link */
|
||||
struct clist partial_list; /* -> struct vm_page::link */
|
||||
struct clist full_list; /* -> struct vm_page::link */
|
||||
spin_t empty_lock; /**< @brief Lock for `empty_list` */
|
||||
spin_t partial_lock; /**< @brief Lock for `partial_list` */
|
||||
spin_t full_lock; /**< @brief Lock for `full_list` */
|
||||
|
@ -98,12 +72,10 @@ struct slab_pool {
|
|||
* powers of two and perfectly aligned then.
|
||||
*/
|
||||
#define _MIN1(x) ((x) < 1 ? 1 : (x))
|
||||
#define POOL_ENTRY_SIZE(sz) (( (sz) - ( SLAB_OVERHEAD / _MIN1(PAGE_SIZE / (sz)) ) ) & ~0xfu)
|
||||
#define POOL_ENTRIES_PER_TABLE(sz) \
|
||||
_MIN1((PAGE_SIZE - SLAB_OVERHEAD) / POOL_ENTRY_SIZE(sz))
|
||||
#define POOL_ENTRIES_PER_TABLE(sz) _MIN1(PAGE_SIZE / (sz))
|
||||
|
||||
#define POOL_DEFINE(sz) { \
|
||||
.entry_size = POOL_ENTRY_SIZE(sz), \
|
||||
.entry_size = (sz), \
|
||||
.entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
|
||||
.total_used = ATOM_DEFINE(0), \
|
||||
.page_order = ((sz) - 1) / PAGE_SIZE, \
|
||||
|
@ -127,7 +99,7 @@ static struct slab_pool slab_pools_normal[] = {
|
|||
POOL_DEFINE(8192),
|
||||
POOL_DEFINE(16384),
|
||||
POOL_DEFINE(32768),
|
||||
{ .entry_size = 0 } /* terminator */
|
||||
{ /* terminator */ }
|
||||
};
|
||||
static struct slab_pool slab_pools_dma[] = {
|
||||
POOL_DEFINE(32),
|
||||
|
@ -136,16 +108,16 @@ static struct slab_pool slab_pools_dma[] = {
|
|||
POOL_DEFINE(256),
|
||||
POOL_DEFINE(512),
|
||||
POOL_DEFINE(1024),
|
||||
{ .entry_size = 0 } /* terminator */
|
||||
{ /* terminator */ }
|
||||
};
|
||||
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
|
||||
|
||||
static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
|
||||
[_M_ZONE_NORMAL] = slab_pools_normal,
|
||||
[_M_ZONE_DMA] = slab_pools_dma,
|
||||
[_M_ZONE_NORMAL] = slab_pools_normal,
|
||||
};
|
||||
|
||||
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags);
|
||||
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags);
|
||||
|
||||
void kmalloc_init(void)
|
||||
{
|
||||
|
@ -206,31 +178,31 @@ void *kmalloc(usize size, enum mflags flags)
|
|||
* it can't possibly be used for allocations anymore.
|
||||
* This is probably not worth the overhead, though.
|
||||
*/
|
||||
struct slab *slab = nil;
|
||||
vm_page_t page = INVALID_PAGE;
|
||||
|
||||
/* try to use a slab that is already partially used first */
|
||||
register_t cpuflags = intr_disable();
|
||||
spin_lock(&pool->partial_lock);
|
||||
if (!clist_is_empty(&pool->partial_list)) {
|
||||
atom_dec(&pool->partial_count);
|
||||
slab = clist_del_first_entry(&pool->partial_list, typeof(*slab), link);
|
||||
page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
|
||||
}
|
||||
spin_unlock(&pool->partial_lock);
|
||||
|
||||
if (slab == nil) {
|
||||
if (!page) {
|
||||
/* no partially used slab available, see if we have a completely free one */
|
||||
spin_lock(&pool->empty_lock);
|
||||
if (!clist_is_empty(&pool->empty_list)) {
|
||||
atom_dec(&pool->empty_count);
|
||||
slab = clist_del_first_entry(&pool->empty_list, typeof(*slab), link);
|
||||
page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
|
||||
}
|
||||
spin_unlock(&pool->empty_lock);
|
||||
|
||||
if (slab == nil) {
|
||||
if (!page) {
|
||||
/* we're completely out of usable slabs, allocate a new one */
|
||||
intr_restore(cpuflags);
|
||||
slab = slab_create(pool, flags);
|
||||
if (slab == nil) {
|
||||
page = slab_create(pool, flags);
|
||||
if (!page) {
|
||||
slab_debug("kernel OOM\n");
|
||||
return nil;
|
||||
}
|
||||
|
@ -238,22 +210,22 @@ void *kmalloc(usize size, enum mflags flags)
|
|||
}
|
||||
}
|
||||
|
||||
/* if we've made it to here, slab != nil and interrupts are disabled */
|
||||
spin_lock(&slab->lock);
|
||||
void *ret = slab->freelist;
|
||||
slab->freelist = *slab->freelist;
|
||||
if (--slab->free_entries == 0) {
|
||||
/* if we've made it to here, we have a slab and interrupts are disabled */
|
||||
page_lock(page);
|
||||
void *ret = page->slab.freelist;
|
||||
SLAB(page)->freelist = *SLAB(page)->freelist;
|
||||
if (--page->slab.free_count == 0) {
|
||||
spin_lock(&pool->full_lock);
|
||||
clist_add(&pool->full_list, &slab->link);
|
||||
clist_add(&pool->full_list, &page->link);
|
||||
spin_unlock(&pool->full_lock);
|
||||
atom_inc(&pool->full_count);
|
||||
} else {
|
||||
spin_lock(&pool->partial_lock);
|
||||
clist_add(&pool->partial_list, &slab->link);
|
||||
clist_add(&pool->partial_list, &page->link);
|
||||
spin_unlock(&pool->partial_lock);
|
||||
atom_inc(&pool->partial_count);
|
||||
}
|
||||
spin_unlock(&slab->lock);
|
||||
page_unlock(page);
|
||||
intr_restore(cpuflags);
|
||||
|
||||
atom_inc(&pool->total_used);
|
||||
|
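Condensed, the kmalloc() slab path shown in the hunks above boils down to the following outline. Names are taken from this diff; locking, interrupt masking, the per-pool statistics and the retry after creating a new slab are omitted, so treat it as a sketch rather than the real function:

```c
/* Outline of kmalloc()'s slab path after this commit -- locking, interrupt
 * masking and the pool counters are intentionally left out. */
static void *kmalloc_outline(struct slab_pool *pool, enum mflags flags)
{
	vm_page_t page = INVALID_PAGE;

	/* prefer a partially used slab, then a completely empty one */
	if (!clist_is_empty(&pool->partial_list))
		page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
	else if (!clist_is_empty(&pool->empty_list))
		page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
	else
		page = slab_create(pool, flags);    /* now returns a vm_page_t */

	if (!page)
		return nil;                         /* kernel OOM */

	/* pop one entry off the freelist, which now lives in struct vm_page */
	void *ret = SLAB(page)->freelist;
	SLAB(page)->freelist = *SLAB(page)->freelist;

	/* re-file the slab page depending on how full it has become */
	if (--SLAB(page)->free_count == 0)
		clist_add(&pool->full_list, &page->link);
	else
		clist_add(&pool->partial_list, &page->link);

	return ret;
}
```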
@ -275,8 +247,7 @@ void kfree(void *ptr)
|
|||
|
||||
vm_page_t page = vaddr2pg(ptr);
|
||||
SLAB_ASSERT(pga_slab(page));
|
||||
struct slab *slab = page->extra;
|
||||
struct slab_pool *pool = slab->pool;
|
||||
struct slab_pool *pool = SLAB(page)->pool;
|
||||
#if CFG_POISON_SLABS
|
||||
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
|
||||
poison_after_free(poison);
|
||||
|
@ -284,63 +255,63 @@ void kfree(void *ptr)
|
|||
#endif
|
||||
|
||||
register_t cpuflags = intr_disable();
|
||||
spin_lock(&slab->lock);
|
||||
*(void **)ptr = slab->freelist;
|
||||
slab->freelist = (void **)ptr;
|
||||
if (++slab->free_entries == pool->entries_per_slab) {
|
||||
page_lock(page);
|
||||
*(void **)ptr = SLAB(page)->freelist;
|
||||
SLAB(page)->freelist = (void **)ptr;
|
||||
if (++SLAB(page)->free_count == pool->entries_per_slab) {
|
||||
spin_lock(&pool->partial_lock);
|
||||
clist_del(&slab->link);
|
||||
clist_del(&page->link);
|
||||
spin_unlock(&pool->partial_lock);
|
||||
atom_dec(&pool->partial_count);
|
||||
|
||||
spin_lock(&pool->empty_lock);
|
||||
clist_add(&pool->empty_list, &slab->link);
|
||||
clist_add(&pool->empty_list, &page->link);
|
||||
spin_unlock(&pool->empty_lock);
|
||||
atom_inc(&pool->empty_count);
|
||||
}
|
||||
spin_unlock(&slab->lock);
|
||||
page_unlock(page);
|
||||
atom_dec(&pool->total_used);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags)
|
||||
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags)
|
||||
{
|
||||
slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
|
||||
struct slab *slab = get_zero_pages(pool->page_order, flags);
|
||||
vm_page_t page = page_alloc(pool->page_order, flags);
|
||||
|
||||
if (slab != nil) {
|
||||
vm_page_t page = vaddr2pg(slab);
|
||||
/* XXX it's probably sufficient to only do this for the lowest page */
|
||||
vm_page_foreach_in_order(page) {
|
||||
pga_set_slab(page, true);
|
||||
page->extra = slab;
|
||||
}
|
||||
|
||||
spin_init(&slab->lock);
|
||||
slab->pool = pool;
|
||||
slab->free_entries = pool->entries_per_slab;
|
||||
if (page) {
|
||||
pga_set_slab(page, true);
|
||||
SLAB(page)->pool = pool;
|
||||
SLAB(page)->free_count = pool->entries_per_slab;
|
||||
void *prev = nil;
|
||||
void *end = (void *)slab + (1 << (pool->page_order + PAGE_SHIFT));
|
||||
/* XXX this should not rely on a direct map */
|
||||
void *start = pfn2vaddr(pg2pfn(page));
|
||||
void *end = start + (1 << (pool->page_order + PAGE_SHIFT));
|
||||
void *pos = end;
|
||||
do {
|
||||
pos -= pool->entry_size;
|
||||
*(void **)pos = prev;
|
||||
prev = pos;
|
||||
} while (pos >= (void *)&slab[1] + pool->entry_size);
|
||||
slab->freelist = pos;
|
||||
} while (pos > start);
|
||||
SLAB(page)->freelist = pos;
|
||||
}
|
||||
|
||||
return slab;
|
||||
return page;
|
||||
}
|
||||
|
||||
#if CFG_POISON_SLABS
|
||||
|
||||
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
|
||||
void *alloc_source)
|
||||
{
|
||||
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
|
||||
u_long *poison_start = &poison->low_poison;
|
||||
|
||||
/* slabs are zeroed out when they are newly allocated */
|
||||
/*
|
||||
* page_alloc() always initializes the allocated page to zeroes.
|
||||
* Therefore, if exact_size is 0, we know this particular slab entry has
|
||||
* never been used before, and we can skip the check.
|
||||
*/
|
||||
if (poison->exact_size != 0) {
|
||||
for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
|
||||
if (*pos != SLAB_POISON_FREE) {
|
||||
|
@ -377,7 +348,12 @@ static inline void poison_after_free(struct slab_poison *poison)
|
|||
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
|
||||
*pos = SLAB_POISON_FREE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CFG_POISON_SLABS */
|
||||
|
||||
/*
|
||||
* for certain libc routines
|
||||
*/
|
||||
|
||||
__weak void *malloc(usize size)
|
||||
{
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <arch/string.h>
|
||||
|
||||
#include <gay/cdefs.h>
|
||||
#include <gay/types.h>
|
||||
|
||||
|
@ -71,6 +73,7 @@ void *memcpy(void *__restrict dest, const void *__restrict src, usize n);
|
|||
*/
|
||||
__pure int memcmp(const void *s1, const void *s2, usize n);
|
||||
|
||||
#ifndef __HAVE_ARCH_MEMSET
|
||||
/**
|
||||
* @brief Starting from `ptr`, fill `n` bytes with the constant byte `c`.
|
||||
*
|
||||
|
@ -80,6 +83,28 @@ __pure int memcmp(const void *s1, const void *s2, usize n);
|
|||
* @returns A pointer to `ptr`
|
||||
*/
|
||||
void *memset(void *ptr, int c, usize n);
|
||||
#endif
|
||||
|
||||
#if _GAY_SOURCE >= 202109L
|
||||
#ifndef __HAVE_ARCH_MEMSET16
|
||||
void *memset16(u16 *dest, u16 c, usize nbyte);
|
||||
#endif
|
||||
#ifndef __HAVE_ARCH_MEMSET32
|
||||
void *memset32(u32 *dest, u32 c, usize nbyte);
|
||||
#endif
|
||||
#ifndef __HAVE_ARCH_MEMSET64
|
||||
void *memset64(u64 *dest, u64 c, usize nbyte);
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
#if LONG_BIT == 32
|
||||
#define memsetl memset32
|
||||
#elif LONG_BIT == 64
|
||||
#define memsetl memset64
|
||||
#else
|
||||
#error "Unsupported sizeof(long)"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Copy a memory area.
|
||||
|
|