Compare commits


3 Commits

Author SHA1 Message Date
anna f5db4e4a25
mm/slab: add object caches
This was the last major obstacle in being able to
manage virtual memory maps.  Object caches are
custom allocators that allow for more fine-grained
allocation policies, including being able to use
memory from the DMAP region.
2 years ago
anna 3910c85cac
x86/mm: fix __boot_clear_page 2 years ago
anna 30df044cec
mm/page: yet another overhaul
This is primarily for the slab allocator update
that's about to come, but to be completely honest
not even i myself am sure what i should include
here because i took a longer break and there are
like 40 modified files that are all interlinked.
2 years ago

@ -251,7 +251,7 @@ void __boot_clear_page(vm_paddr_t paddr)
pdpte->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
vm_flush();
memset64(vbase + offset, 0, PAGE_SIZE);
pdpe->val = 0;
pdpte->val = old_pdpte.val;
vm_flush();
}

@ -28,8 +28,9 @@
* where the pool of order `n` holds groups of `1 << n` pages.
*
* The mm subsystem needs to allocate memory for initializing itself.
* Therefore, there is an additional boot page frame allocator, which gets the
* free areas from architecture dependent code (`arch/mm/.../init.c`).
* Therefore, there is an additional boot page frame allocator, which is
* initialized with the initial usable memory areas from arch dependent
* bootstrap code (`arch/mm/.../init.c`).
*/
#ifdef _KERNEL
@ -52,6 +53,7 @@
#define _M_EMERG (1 << 2)
#define _M_NOWAIT (1 << 3)
#define _M_ZERO (1 << 4)
#ifndef _HAVE_VM_PAGE_T
#define _HAVE_VM_PAGE_T 1
@ -108,7 +110,10 @@ extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
/**
* @brief Memory allocation flags commonly used by all allocators.
* All of them are eventually passed down to `page_alloc()`, the physical page
* frame allocator,
* frame allocator.
*
* You always need to pick either `M_KERN` or `M_DMA` (depending on what you
* need the memory for), and then combine it with optional flags.
*/
enum mflags {
/** @brief Use emergency memory reserves if necessary */
@ -121,6 +126,8 @@ enum mflags {
M_ATOMIC = _M_EMERG | _M_NOWAIT,
/** @brief Allocate low memory suitable for DMA transfers */
M_DMA = _M_ZONE_DMA,
/** @brief Zero out pages before returning them */
M_ZERO = _M_ZERO,
};
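A short usage sketch for these flags; the sizes and variable names below are illustrative only:

        void *buf = kmalloc(256, M_KERN | M_ZERO);      /* regular kernel memory, zeroed */
        void *dma_buf = kmalloc(512, M_DMA | M_ATOMIC); /* low memory, won't sleep, may be nil */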
/** @brief Initialize the slab allocator. */
@ -144,6 +151,57 @@ void *kmalloc(size_t size, enum mflags flags) __malloc_like __alloc_size(1);
*/
void kfree(void *ptr);
/* see kernel/mm/slab.c */
struct kmem_cache;
typedef struct kmem_cache *kmem_cache_t;
/** @brief Flags for `kmem_cache_create()` */
enum slab_flags {
SLAB_ZONE_NORMAL = _M_ZONE_NORMAL,
SLAB_ZONE_DMA = _M_ZONE_DMA,
/** @brief Poison objects before alloc and after free */
SLAB_POISON = (1u << 2),
/** @brief Only operate on memory within the direct mapping */
SLAB_DMAP = (1u << 3),
};
/**
* @brief Register a custom object cache with the slab allocator.
* Objects can then be allocated from the cache using `kmem_cache_alloc()`.
* Use `kmem_cache_deregister()` when you don't need the cache anymore.
*
* @param name Unique name for the object cache, use only `[a-z0-9\\-]`
* @param obj_size Size of a single object in bytes
* @param flags Flags
* @param ctor Constructor
* @param dtor Destructor
* @return Handle to the newly created object cache, or `nil` on failure
*/
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache));
/**
* @brief Allocate from an object cache.
* Caches can be created using `kmem_cache_register()`.
* Allocated objects can be released using `kfree()`.
*
* @param cache Object cache to allocate from
* @return An initialized object from the cache, in the state that the `ctor`
* function passed to `kmem_cache_register()` left it in, or `nil` if OOM
*/
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags) __malloc_like;
/**
* @brief Deregister a custom object cache.
* When calling this function, the cache must not have any remaining allocations.
*
* @param cache The cache handle you got from `kmem_cache_register()`
* @return 0 on success, or a negative number if the cache still contains
* allocated objects
*/
int kmem_cache_deregister(kmem_cache_t cache);
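A minimal usage sketch for this API; `struct thing`, its constructor, and the wrapper functions are hypothetical and only meant to show the call sequence:

struct thing {
        int id;
        struct clist link;
};

static kmem_cache_t thing_cache;

static void thing_ctor(void *ptr, kmem_cache_t cache)
{
        struct thing *t = ptr;
        t->id = 0;
        clist_init(&t->link);
}

void thing_init(void)
{
        thing_cache = kmem_cache_register("thing", sizeof(struct thing),
                                          SLAB_ZONE_NORMAL | SLAB_POISON,
                                          thing_ctor, nil);
        /* thing_cache evaluates false here if registration failed */
}

struct thing *thing_alloc(void)
{
        /* object comes back in the state thing_ctor() left it in, or nil if OOM */
        return kmem_cache_alloc(thing_cache, M_KERN);
}

void thing_free(struct thing *t)
{
        /* objects from a cache are released through the regular kfree() */
        kfree(t);
}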
/**
* @brief Initialize the buddy page frame allocator.
* This is only called once, from the arch dependent counterpart after it has

@ -20,6 +20,7 @@ union vm_page_attr {
bool pcpu:1; /**< @brief Page is in a per-cpu cache */
bool slab:1; /**< @brief Page is used by the slab allocator */
unsigned zone:2; /**< @brief Index into `mm_zones` */
bool zero:1; /**< @brief Page is known to contain only zeroes */
};
};
#define _PGA_ORDER_SHIFT 0
@ -34,11 +35,13 @@ union vm_page_attr {
#define _PGA_SLAB_MASK (1 << _PGA_SLAB_SHIFT)
#define _PGA_ZONE_SHIFT 12
#define _PGA_ZONE_MASK (3 << _PGA_ZONE_SHIFT)
#define _PGA_ZERO_SHIFT 14
#define _PGA_ZERO_MASK (1 << _PGA_ZERO_SHIFT)
typedef union vm_page_attr vm_page_attr_t;
/* defined in kernel/mm/slab.c */
struct slab_pool;
struct kmem_cache_node;
/**
* @brief Stores information about a single page in physical memory.
@ -50,7 +53,7 @@ struct vm_page {
atom_t count;
/** @brief Page attributes, use the macros below to access this */
atom_t attr;
/** @brief Page frame number */
/** @brief Page frame number (= `paddr >> PAGE_SHIFT`) */
u_long pfn;
/**
* @brief If the page is free, this is its freelist.
@ -59,17 +62,12 @@ struct vm_page {
*/
struct clist link;
union {
struct {
void **freelist;
struct slab_pool *pool;
u_int entry_size;
u_int free_count;
} slab;
struct kmem_cache_node *slab;
};
};
#define INVALID_PAGE nil
#define SLAB(page) (&(page)->slab)
#define SLAB(page) ((page)->slab)
#ifndef _HAVE_VM_PAGE_T
#define _HAVE_VM_PAGE_T 1
@ -89,6 +87,11 @@ extern vm_page_t _vm_page_array_end;
/** @brief Fill a page with zeroes (size depends on the current page order). */
void page_clear(vm_page_t page);
static __always_inline struct kmem_cache_node *page_slab(vm_page_t page)
{
return page->slab;
}
static inline u8 pga_order(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
@ -113,6 +116,12 @@ static inline bool pga_slab(vm_page_t page)
return attr.slab;
}
static inline bool pga_zero(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.zero;
}
static inline enum mm_zone_type pga_zone(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
@ -157,6 +166,14 @@ static inline enum mm_zone_type pga_set_zone(vm_page_t page, enum mm_zone_type z
}
}
static inline bool pga_set_zero(vm_page_t page, bool zero)
{
if (zero)
return atom_set_bit(&page->attr, _PGA_ZERO_SHIFT);
else
return atom_clr_bit(&page->attr, _PGA_ZERO_SHIFT);
}
static __always_inline bool page_get(vm_page_t page)
{
return atom_inc(&page->count);
@ -172,7 +189,7 @@ static __always_inline bool page_put(vm_page_t page)
static inline void page_lock(vm_page_t page)
{
spin_loop {
if (atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
if (!atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
break;
}
}
@ -182,9 +199,16 @@ static __always_inline void page_unlock(vm_page_t page)
atom_clr_bit(&page->attr, _PGA_LOCK_SHIFT);
}
/**
* @brief Attempt to lock a page.
* Must be called with interrupts disabled.
*
* @param page Page to lock.
* @return `true` if you claimed the lock, `false` if not.
*/
static __always_inline bool page_trylock(vm_page_t page)
{
return atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
return !atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
}
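A sketch of the intended calling pattern (error handling elided):

        register_t cpuflags = intr_disable();
        if (page_trylock(page)) {
                /* ... inspect or update the page ... */
                page_unlock(page);
        }
        intr_restore(cpuflags);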
static inline void __page_set_flag(vm_page_t page, unsigned flag)
@ -275,3 +299,9 @@ static inline void *pfn2vaddr(u_long pfn)
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
return DMAP_START + (pfn << PAGE_SHIFT);
}
__pure2
static inline vm_paddr_t pg2paddr(vm_page_t page)
{
return (vm_paddr_t)page->pfn << PAGE_SHIFT;
}
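/* e.g. assuming 4 KiB pages (PAGE_SHIFT == 12): pfn 0x1a3 corresponds to paddr 0x1a3000 */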

@ -96,10 +96,6 @@ static inline void claim_bmem_area(struct mm_zone *zone, const struct _bmem_area
/* only the first page in the order group is inserted into
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1u << order); i++) {
if (pos >= end)
panic("page %p out of range", pos);
if (atom_read(&pos->count) != 420)
panic("page %p double initialized\n", pos);
atom_init(&pos->count, 0);
atom_init(&pos->attr, 0);
@ -158,7 +154,8 @@ void paging_init(vm_paddr_t phys_end)
vm_paddr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for the page bitmaps");
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
for (int i = 0; i < (1 << bitmap_size_log2); i++)
__boot_clear_page(bitmap_start_phys + (i * PAGE_SIZE));
/*
* initialize the pools
@ -192,7 +189,7 @@ void paging_init(vm_paddr_t phys_end)
/* This is merely an optimization to simplify checking whether
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[pfn].count, 420);
atom_init(&vm_page_array[pfn].count, INT_MIN);
atom_init(&vm_page_array[pfn].attr, _PGA_RSVD_MASK);
vm_page_array[pfn].pfn = pfn;
}
@ -207,12 +204,12 @@ void paging_init(vm_paddr_t phys_end)
/* make sure the boot memory allocator cannot under any circumstances hand
* out pages from this area anymore, even though that should be unnecessary */
clist_del(&area->link);
claim_bmem_area(zone, area);
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
}
@ -227,9 +224,18 @@ vm_page_t page_alloc(u_int order, enum mflags flags)
{
if (order > MM_MAX_ORDER) {
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
return nil;
return INVALID_PAGE;
}
/*
* See if the requested zone has enough free pages for the allocation.
* If not, fall back to lower physical memory (i.e. use a zone with
* smaller index). Repeat until we either find a zone that has enough
* free pages, or until we've run out of zones (in which case the
* allocation failed). Just because we found a zone doesn't mean we've
* succeeded, since the pages in that zone might not be contiguous.
* If they're not, we have to try again (see further down below).
*/
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
long count_after;
try_next_zone:
@ -242,7 +248,7 @@ try_next_zone:
zone--;
goto try_next_zone;
} else {
return nil;
return INVALID_PAGE;
}
}
}
@ -254,9 +260,9 @@ try_next_zone:
* requested order, and if it's empty, move on to the next higher order.
* Repeat until we find a page, or we've reached the highest order.
*/
vm_page_t page = nil;
vm_page_t page = INVALID_PAGE;
u_int page_order = order;
while (page == nil && page_order < MM_NR_ORDERS) {
while (!page && page_order < MM_NR_ORDERS) {
struct mm_pool *pool = &zone->pools[page_order];
disable_intr();
@ -276,7 +282,7 @@ try_next_zone:
intr_restore(cpuflags);
}
if (page == nil) {
if (!page) {
if (zone > &mm_zones[0]) {
/*
* If we reach this, the current zone technically had enough free
@ -288,7 +294,7 @@ try_next_zone:
zone--;
goto try_next_zone;
} else {
return nil;
return INVALID_PAGE;
}
}
@ -312,7 +318,7 @@ try_next_zone:
disable_intr();
spin_lock(&pool->lock);
clist_add_first(&pool->freelist, &buddy->link);
clist_add(&pool->freelist, &buddy->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
@ -320,7 +326,14 @@ try_next_zone:
for (u_int i = 0; i < (1 << order); i++)
pga_set_order(&page[i], order);
page_clear(page);
/* future versions will have a background thread that
* clears pages in the freelist when the cpu is idle */
if ((flags & _M_ZERO) && !pga_zero(page))
page_clear(page);
/* XXX only clear the zero flag when the page actually becomes dirty */
pga_set_zero(page, false);
return page;
}
@ -378,21 +391,14 @@ void page_free(vm_page_t page)
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
u_long pfn = pg2pfn(page);
PAGE_DEBUG_BLOCK {
int old_count = atom_sub(&page->count, 1);
if (old_count != 1) {
if (old_count == 0)
page_debug("double free of %p", ptr);
else
page_debug("attempted to free %p with references", ptr);
return;
}
} else {
atom_dec(&page->count);
if (atom_dec(&page->count)) {
page_debug("Double free of %p", page);
return;
}
struct mm_zone *zone = &mm_zones[pga_zone(page)];
latom_add(&zone->free_count, (1 << order));
struct mm_pool *pool = &zone->pools[order];
/* try to coalesce free buddy blocks until we've reached the highest order */
while (order < MM_MAX_ORDER) {
@ -405,30 +411,30 @@ void page_free(vm_page_t page)
* to avoid blocking other CPUs for longer than necessary */
vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
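/* e.g. at order 1, pfn 4: buddy = 4 ^ 2 = 6 and low = 4 & ~2 = 4 (illustrative numbers) */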
struct mm_pool *current_order_pool = &zone->pools[order];
struct mm_pool *next_order_pool = &zone->pools[order + 1];
disable_intr();
spin_lock(&zone->pools[order].lock);
spin_lock(&pool->lock);
if (can_merge(page, buddy)) {
/* remove buddy from the low order freelist */
clist_del(&buddy->link);
current_order_pool->free_entries--;
pool->free_entries--;
spin_unlock(&pool->lock);
pga_set_order(buddy, order + 1);
pga_set_order(page, order + 1);
clist_add(&next_order_pool->freelist, &low->link);
next_order_pool->free_entries++;
} else {
order = MM_MAX_ORDER; /* break out of the loop */
spin_unlock(&pool->lock);
intr_restore(cpuflags);
break;
}
spin_unlock(&zone->pools[order].lock);
intr_restore(cpuflags);
page = low;
pfn = pg2pfn(page);
order++;
pool++;
}
/* finally, we need to insert the page into its freelist */
struct mm_pool *pool = &zone->pools[order];
disable_intr();
spin_lock(&pool->lock);
clist_add(&pool->freelist, &page->link);

@ -1,9 +1,14 @@
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
/*
* slabbing slabs onto the slab for slabs slab slab slahsdf ashklfghdsla
*/
#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
@ -15,9 +20,7 @@
#include <gay/types.h>
#include <gay/vm/page.h>
/*
* XXX this implementation is still missing object caches
*/
#include <strings.h>
#if CFG_POISON_SLABS
struct slab_poison {
@ -29,8 +32,8 @@ struct slab_poison {
u_long high_poison[1];
};
static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, void *alloc_source);
static void poison_after_free(struct slab_poison *poison);
static void poison_on_alloc(struct slab_poison *poison, u_long exact_size, void *alloc_source);
static void poison_on_free(struct slab_poison *poison);
#endif
#if CFG_DEBUG_SLAB_ALLOCS
@ -49,126 +52,323 @@ static void poison_after_free(struct slab_poison *poison);
# define slab_debug_noisy(msg, ...) ({})
#endif
struct slab_pool {
const u_int entry_size; /**< @brief Size of one entry in bytes */
const u_int entries_per_slab; /**< @brief Max number of entries per slab */
atom_t total_used; /**< @brief Total allocated entries */
const u_int page_order; /**< @brief Order passed to `get_pages()` */
struct clist empty_list; /* -> struct vm_page::link */
struct clist partial_list; /* -> struct vm_page::link */
struct clist full_list; /* -> struct vm_page::link */
spin_t empty_lock; /**< @brief Lock for `empty_list` */
spin_t partial_lock; /**< @brief Lock for `partial_list` */
spin_t full_lock; /**< @brief Lock for `full_list` */
atom_t empty_count; /**< @brief Number of empty slabs */
atom_t partial_count; /**< @brief Number of partially empty slabs */
atom_t full_count; /**< @brief Number of full slabs */
/**
* @brief Single node in the object cache system.
* Each node owns a page that its objects are carved out of.
*/
struct kmem_cache_node {
struct clist link; /* -> struct kmem_cache_pool::list */
void **freelist; /**< @brief Stack of free objects */
struct kmem_cache *cache; /**< @brief Object cache this node belongs to */
spin_t lock; /**< @brief Lock for `freelist` */
u_int free_count;
vm_page_t page; /**< @brief Physical page this node manages */
};
/*
* Fun size calculations because the slab header takes up some overhead at the
* beginning of each page. We should ideally try to cram all the info we need
* into struct vm_page, because the individual slab entry sizes could be even
* powers of two and perfectly aligned then.
struct kmem_cache_pool {
struct clist list; /* -> struct kmem_cache_node::link */
spin_t lock;
atom_t count;
};
/**
* @brief Cache for one particular object type.
* Each cache keeps its nodes in three pools (empty, partial, full); every node holds the same number of slabs.
*/
struct kmem_cache {
u_int object_size; /**< @brief Object size in bytes */
u_int page_order; /**< @brief Order passed to `get_pages()` */
enum slab_flags flags; /**< @brief Flags for how to allocate */
u_int slabs_per_node; /**< @brief Max number of slabs per cache node */
latom_t total_used; /**< @brief Total allocated entries */
const char *name; /**< @brief Unique name for this object type */
void (*ctor)(void *ptr, kmem_cache_t cache);
void (*dtor)(void *ptr, kmem_cache_t cache);
struct kmem_cache_pool empty;
struct kmem_cache_pool partial;
struct kmem_cache_pool full;
struct clist link; /**< @brief List of all kmem caches */
};
/* values for struct kmem_cache::flags */
/** @brief Zone to request pages from (using `page_alloc()`) */
#define SLAB_ZONE(flags) ((flags) & 3)
/** @brief List of all currently registered `struct kmem_cache`s. */
static CLIST(kmem_cache_list);
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define POOL_ENTRIES_PER_TABLE(sz) _MIN1(PAGE_SIZE / (sz))
#define SLABS_PER_NODE(sz) _MIN1(PAGE_SIZE / (sz))
#define POOL_DEFINE(sz) { \
.entry_size = (sz), \
.entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
.total_used = ATOM_DEFINE(0), \
#define CACHE_DEFINE(sz, _name, _flags) { \
.object_size = (sz), \
.page_order = ((sz) - 1) / PAGE_SIZE, \
.empty_lock = SPIN_DEFINE, \
.partial_lock = SPIN_DEFINE, \
.full_lock = SPIN_DEFINE, \
.empty_count = ATOM_DEFINE(0), \
.partial_count = ATOM_DEFINE(0), \
.full_count = ATOM_DEFINE(0), \
.flags = (_flags), \
.slabs_per_node = SLABS_PER_NODE(sz), \
.total_used = ATOM_DEFINE(0), \
.name = (_name), \
}
static struct slab_pool slab_pools_normal[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
POOL_DEFINE(2048),
POOL_DEFINE(4096),
POOL_DEFINE(8192),
POOL_DEFINE(16384),
POOL_DEFINE(32768),
static struct kmem_cache kmem_caches[] = {
CACHE_DEFINE(32, "kmem_32", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(64, "kmem_64", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(128, "kmem_128", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(256, "kmem_256", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(512, "kmem_512", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_1024", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(2048, "kmem_2048", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(4096, "kmem_4096", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(8192, "kmem_8192", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(16384, "kmem_16384", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(32768, "kmem_32768", _M_ZONE_NORMAL | SLAB_POISON),
{ /* terminator */ }
};
static struct slab_pool slab_pools_dma[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
static struct kmem_cache kmem_dma_caches[] = {
CACHE_DEFINE(32, "kmem_dma_32", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(64, "kmem_dma_64", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(128, "kmem_dma_128", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(256, "kmem_dma_256", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(512, "kmem_dma_512", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_dma_1024", _M_ZONE_DMA | SLAB_POISON),
{ /* terminator */ }
};
/**
* This is a little fucked.
*
* So, every `vm_page_t` in use by the slab allocator gets a corresponding
* `struct kmem_cache_node` that keeps track of everything we need to know to
* make allocations. However, the memory for those structs themselves doesn't
* magically grow on trees. In other words, we need to allocate memory in
* order to be able to allocate memory.
*
* So what we have here is a separate object cache for `struct kmem_cache_node`
* that works slightly differently than all the other ones: Instead of making
* an extra allocation for the cache node, that node sits at the beginning of
* the page that we allocate from itself. Other caches don't do this because
* it destroys the perfect page alignment of the allocated area itself, but that
* doesn't matter here.
*/
static struct kmem_cache kmem_cache_node_caches =
CACHE_DEFINE(sizeof(struct kmem_cache_node), "kmem_cache_node", _M_ZONE_NORMAL | SLAB_DMAP);
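/*
 * Rough layout sketch of a page owned by this bootstrap cache: the first
 * object slot doubles as the managing node itself, which is why
 * kmalloc_init() decrements its slabs_per_node by one.
 *
 *   +---------------------------+----------+----------+-----+----------+
 *   | struct kmem_cache_node    | object 1 | object 2 | ... | object N |
 *   | (manages this very page)  |          |          |     |          |
 *   +---------------------------+----------+----------+-----+----------+
 */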
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
[_M_ZONE_DMA] = slab_pools_dma,
[_M_ZONE_NORMAL] = slab_pools_normal,
static struct kmem_cache *kmem_cache_zones[MM_NR_ZONES] = {
[_M_ZONE_DMA] = kmem_dma_caches,
[_M_ZONE_NORMAL] = kmem_caches,
};
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags);
static void cache_pool_init(struct kmem_cache_pool *pool)
{
clist_init(&pool->list);
atom_init(&pool->count, 0);
spin_init(&pool->lock);
}
void kmalloc_init(void)
{
for (int i = 0; i < MM_NR_ZONES; i++) {
struct slab_pool *pool = slab_zone_pools[i];
cache_pool_init(&kmem_cache_node_caches.empty);
cache_pool_init(&kmem_cache_node_caches.partial);
cache_pool_init(&kmem_cache_node_caches.full);
/* for the management node at the beginning of the page */
kmem_cache_node_caches.slabs_per_node--;
clist_add(&kmem_cache_list, &kmem_cache_node_caches.link);
while (pool->entry_size != 0) {
clist_init(&pool->empty_list);
clist_init(&pool->partial_list);
clist_init(&pool->full_list);
pool++;
for (int i = 0; i < MM_NR_ZONES; i++) {
struct kmem_cache *cache = kmem_cache_zones[i];
while (cache->object_size != 0) {
clist_init(&cache->empty.list);
clist_init(&cache->partial.list);
clist_init(&cache->full.list);
clist_add(&kmem_cache_list, &cache->link);
cache++;
}
}
}
void *kmalloc(usize size, enum mflags flags)
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache))
{
if (size == 0)
obj_size = align_ceil(obj_size, sizeof(long));
/* we only support objects up to PAGE_SIZE for now */
if (obj_size > PAGE_SIZE || obj_size == 0)
return nil;
#if CFG_POISON_SLABS
size += sizeof(struct slab_poison);
#endif
struct kmem_cache *cache = kmalloc(sizeof(*cache), M_KERN);
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmalloc() called from irq without M_NOWAIT "
"(caller: %p)\n", ktrace_return_addr());
flags |= _M_NOWAIT;
if (cache) {
cache->name = name;
cache->object_size = obj_size;
cache->flags = flags;
cache->ctor = ctor;
cache->dtor = dtor;
cache_pool_init(&cache->empty);
cache_pool_init(&cache->partial);
cache_pool_init(&cache->full);
/* XXX this is pretty wasteful for larger obj_sizes */
cache->slabs_per_node = PAGE_SIZE / obj_size;
cache->page_order = 0;
clist_add(&kmem_cache_list, &cache->link);
}
return cache;
}
static inline void **freelist_init(vm_page_t page, struct kmem_cache *cache)
{
void *prev = nil;
void *start = __v(pg2paddr(page));
void *end = start + align_floor(1 << (cache->page_order + PAGE_SHIFT), cache->object_size);
void *pos = end;
do {
pos -= cache->object_size;
if (cache->ctor)
cache->ctor(pos, cache);
*(void **)pos = prev;
prev = pos;
} while (pos >= start + cache->object_size);
return (void **)pos;
}
/** Attempt to remove the first cache node from a pool's list and return it */
/* call with interrupts disabled */
static inline struct kmem_cache_node *pool_del_first_node(struct kmem_cache_pool *pool)
{
struct kmem_cache_node *node = nil;
spin_lock(&pool->lock);
if (!clist_is_empty(&pool->list)) {
atom_dec(&pool->count);
node = clist_del_first_entry(&pool->list, typeof(*node), link);
}
spin_unlock(&pool->lock);
return node;
}
/* call with interrupts disabled */
static inline void pool_del_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
atom_dec(&pool->count);
spin_lock(&pool->lock);
clist_del(&node->link);
spin_unlock(&pool->lock);
}
/* call with interrupts disabled */
static inline void pool_add_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
spin_lock(&pool->lock);
clist_add(&pool->list, &node->link);
spin_unlock(&pool->lock);
atom_inc(&pool->count);
}
/* call with interrupts disabled */
static inline void *pop_freelist_and_insert(struct kmem_cache *cache, struct kmem_cache_node *node)
{
spin_lock(&node->lock);
void *ret = node->freelist;
node->freelist = *node->freelist;
u_int free_count = --node->free_count;
spin_unlock(&node->lock);
latom_inc(&cache->total_used);
if (free_count == 0)
pool_add_node(&cache->full, node);
else
pool_add_node(&cache->partial, node);
return ret;
}
/* call with interrupts disabled */
static struct kmem_cache_node *node_alloc(void)
{
/*
* This is really the same basic procedure as kmem_cache_alloc(),
* except that we allocate everything manually if we run out of cache nodes
* and interrupts are disabled.
* It definitely needs a cleanup at some point; most of the stuff here
* can probably be eliminated if kmem_cache_alloc() is split up.
*/
struct kmem_cache_node *mgmt_node = pool_del_first_node(&kmem_cache_node_caches.partial);
if (!mgmt_node) {
mgmt_node = pool_del_first_node(&kmem_cache_node_caches.empty);
if (!mgmt_node) {
vm_page_t page = page_alloc(0, M_ATOMIC);
if (!page)
return nil;
void **freelist = freelist_init(page, &kmem_cache_node_caches);
mgmt_node = (struct kmem_cache_node *)freelist;
mgmt_node->freelist = *freelist;
mgmt_node = __v(pg2paddr(page));
spin_init(&mgmt_node->lock);
mgmt_node->free_count = kmem_cache_node_caches.slabs_per_node;
mgmt_node->cache = &kmem_cache_node_caches;
mgmt_node->page = page;
}
}
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
struct slab_pool *pool = slab_zone_pools[_M_ZONE_INDEX(flags)];
while (pool->entry_size != 0) {
if (pool->entry_size >= size)
break;
pool++;
struct kmem_cache_node *new_node = pop_freelist_and_insert(&kmem_cache_node_caches,
mgmt_node);
return new_node;
}
/* call with interrupts disabled */
static inline struct kmem_cache_node *node_create(struct kmem_cache *cache, enum mflags flags,
register_t cpuflags)
{
struct kmem_cache_node *node = node_alloc();
if (node) {
intr_restore(cpuflags);
vm_page_t page = page_alloc(cache->page_order, flags | M_ZERO);
if (page) {
pga_set_slab(page, true);
page->slab = node;
node->freelist = freelist_init(page, cache);
spin_init(&node->lock);
node->free_count = cache->slabs_per_node;
node->cache = cache;
node->page = page;
} else {
kfree(node);
node = nil;
}
intr_disable();
}
if (pool->entry_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), pool[-1].entry_size);
return nil;
return node;
}
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags)
{
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmem_cache_alloc() called from irq %p w/o M_NOWAIT\n",
ktrace_return_addr());
flags |= _M_NOWAIT;
}
}
slab_debug_noisy("alloc %zu bytes from zone %d, pool size %u\n",
size, _M_ZONE_INDEX(flags), pool->entry_size);
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
slab_debug_noisy("alloc %zu bytes from zone %d, cache %s\n",
size, _M_ZONE_INDEX(flags), cache->name);
/*
* Before locking a slab, we always remove it from its pool.
* Before locking a node, we always remove it from its cache pool.
* This is far from optimal, because if multiple CPUs allocate from the
* same pool at the same time, we could end up creating several slabs
* with one used entry each (not to mention the overhead of the mostly
@ -178,62 +378,63 @@ void *kmalloc(usize size, enum mflags flags)
* it can't possibly be used for allocations anymore.
* This is probably not worth the overhead, though.
*/
vm_page_t page = INVALID_PAGE;
struct kmem_cache_node *node = nil;
/* try to use a slab that is already partially used first */
register_t cpuflags = intr_disable();
spin_lock(&pool->partial_lock);
if (!clist_is_empty(&pool->partial_list)) {
atom_dec(&pool->partial_count);
page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
}
spin_unlock(&pool->partial_lock);
if (!page) {
/* no partially used slab available, see if we have a completely free one */
spin_lock(&pool->empty_lock);
if (!clist_is_empty(&pool->empty_list)) {
atom_dec(&pool->empty_count);
page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
}
spin_unlock(&pool->empty_lock);
if (!page) {
/* we're completely out of usable slabs, allocate a new one */
intr_restore(cpuflags);
page = slab_create(pool, flags);
if (!page) {
node = pool_del_first_node(&cache->partial);
if (!node) {
/* no partially used node available, see if we have a completely free one */
node = pool_del_first_node(&cache->empty);
if (!node) {
/* we're completely out of usable nodes, allocate a new one */
node = node_create(cache, flags, cpuflags);
if (!node) {
slab_debug("kernel OOM\n");
return nil;
}
intr_disable();
}
}
/* if we've made it to here, we have a slab and interrupts are disabled */
page_lock(page);
void *ret = page->slab.freelist;
SLAB(page)->freelist = *SLAB(page)->freelist;
if (--page->slab.free_count == 0) {
spin_lock(&pool->full_lock);
clist_add(&pool->full_list, &page->link);
spin_unlock(&pool->full_lock);
atom_inc(&pool->full_count);
} else {
spin_lock(&pool->partial_lock);
clist_add(&pool->partial_list, &page->link);
spin_unlock(&pool->partial_lock);
atom_inc(&pool->partial_count);
}
page_unlock(page);
/* if we've made it to here, we have a cache node and interrupts are disabled */
void *ret = pop_freelist_and_insert(cache, node);
intr_restore(cpuflags);
atom_inc(&pool->total_used);
return ret;
}
void *kmalloc(usize size, enum mflags flags)
{
if (size == 0)
return nil;
#if CFG_POISON_SLABS
struct slab_poison *poison = ret;
poison_after_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
size += sizeof(struct slab_poison);
#endif
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
struct kmem_cache *cache = kmem_cache_zones[_M_ZONE_INDEX(flags)];
while (cache->object_size != 0) {
if (cache->object_size >= size)
break;
cache++;
}
if (cache->object_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), cache[-1].object_size);
return nil;
}
void *ret = kmem_cache_alloc(cache, flags);
#if CFG_POISON_SLABS
if (ret) {
struct slab_poison *poison = ret;
poison_on_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
}
#endif
return ret;
}
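/*
 * Worked example (numbers are illustrative): with CFG_POISON_SLABS enabled,
 * kmalloc(100, M_KERN) first grows the request to 100 + sizeof(struct
 * slab_poison), then walks kmem_caches[] until object_size >= size, so the
 * allocation is served by the smallest cache that fits (kmem_128 or kmem_256,
 * depending on the poison header size).
 */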
@ -247,64 +448,39 @@ void kfree(void *ptr)
vm_page_t page = vaddr2pg(ptr);
SLAB_ASSERT(pga_slab(page));
struct slab_pool *pool = SLAB(page)->pool;
struct kmem_cache_node *node = page_slab(page);
struct kmem_cache *cache = node->cache;
#if CFG_POISON_SLABS
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_after_free(poison);
ptr = poison;
if (cache->flags & SLAB_POISON) {
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_on_free(poison);
ptr = poison;
}
#endif
register_t cpuflags = intr_disable();
page_lock(page);
*(void **)ptr = SLAB(page)->freelist;
spin_lock(&node->lock);
*(void **)ptr = node->freelist;
SLAB(page)->freelist = (void **)ptr;
if (++SLAB(page)->free_count == pool->entries_per_slab) {
spin_lock(&pool->partial_lock);
clist_del(&page->link);
spin_unlock(&pool->partial_lock);
atom_dec(&pool->partial_count);
spin_lock(&pool->empty_lock);
clist_add(&pool->empty_list, &page->link);
spin_unlock(&pool->empty_lock);
atom_inc(&pool->empty_count);
}
page_unlock(page);
atom_dec(&pool->total_used);
intr_restore(cpuflags);
}
u_int free_count = ++node->free_count;
spin_unlock(&node->lock);
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags)
{
slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
vm_page_t page = page_alloc(pool->page_order, flags);
if (page) {
pga_set_slab(page, true);
SLAB(page)->pool = pool;
SLAB(page)->free_count = pool->entries_per_slab;
void *prev = nil;
/* XXX this should not rely on a direct map */
void *start = pfn2vaddr(pg2pfn(page));
void *end = start + (1 << (pool->page_order + PAGE_SHIFT));
void *pos = end;
do {
pos -= pool->entry_size;
*(void **)pos = prev;
prev = pos;
} while (pos > start);
SLAB(page)->freelist = pos;
if (free_count == cache->slabs_per_node) {
pool_del_node(&cache->partial, node);
pool_add_node(&cache->empty, node);
}
return page;
latom_dec(&cache->total_used);
intr_restore(cpuflags);
}
#if CFG_POISON_SLABS
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
void *alloc_source)
static inline void poison_on_alloc(struct slab_poison *poison, u_long exact_size,
void *alloc_source)
{
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long *poison_start = &poison->low_poison;
/*
@ -331,9 +507,9 @@ static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_si
*pos = SLAB_POISON_ALLOC;
}
static inline void poison_after_free(struct slab_poison *poison)
static inline void poison_on_free(struct slab_poison *poison)
{
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
if (poison->low_poison != SLAB_POISON_ALLOC) {
kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
