kern/kernel/mm/slab.c
commit f5db4e4a25 by fef, 2022-04-01 17:09:03 +02:00

mm/slab: add object caches

This was the last major obstacle in being able to manage virtual memory
maps.  Object caches are custom allocators that allow for more fine-grained
allocation policies, including being able to use memory from the DMAP region.

/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
/*
* slabbing slabs onto the slab for slabs slab slab slahsdf ashklfghdsla
*/
#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/ktrace.h>
#include <gay/mm.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/vm/page.h>
#include <strings.h>
#if CFG_POISON_SLABS
struct slab_poison {
void *_pad __unused; /**< @brief That's where the freelist pointer is stored */
void *alloc_source; /**< @brief Code address that made the alloc call */
u_long exact_size;
u_long low_poison;
u8 data[0];
u_long high_poison[1];
};
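/*
* Rough layout of a poisoned allocation as managed by poison_on_alloc() and
* poison_on_free() below (sizes not to scale, exact_size rounded up to a
* whole number of longs):
*
*  +------+--------------+------------+------------+- - - - - - -+-------------+
*  | _pad | alloc_source | exact_size | low_poison |   data[]    | high poison |
*  +------+--------------+------------+------------+- - - - - - -+-------------+
*
* The caller only ever sees a pointer to data[]; everything around it is
* bookkeeping for catching use-after-free and out-of-bounds writes.
*/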
static void poison_on_alloc(struct slab_poison *poison, u_long exact_size, void *alloc_source);
static void poison_on_free(struct slab_poison *poison);
#endif
#if CFG_DEBUG_SLAB_ALLOCS
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# define SLAB_DEBUG_BLOCK
# define SLAB_ASSERT KASSERT
# if CFG_DEBUG_SLAB_ALLOCS_NOISY
# define slab_debug_noisy(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# else
# define slab_debug_noisy(msg, ...) ({})
# endif
#else
# define SLAB_DEBUG_BLOCK if (0)
# define SLAB_ASSERT(x) ({})
# define slab_debug(msg, ...) ({})
# define slab_debug_noisy(msg, ...) ({})
#endif
/**
* @brief Single node in the object cache system.
* Each node owns one page (or a contiguous block of pages for higher order
* caches) and hands out fixed size objects from it.
*/
struct kmem_cache_node {
struct clist link; /* -> struct kmem_cache_pool::list */
void **freelist; /**< @brief Stack of free objects */
struct kmem_cache *cache; /**< @brief Object cache this node belongs to */
spin_t lock; /**< @brief Lock for `freelist` */
u_int free_count;
vm_page_t page; /**< @brief Physical page this node manages */
};
struct kmem_cache_pool {
struct clist list; /* -> struct kmem_cache_node::link */
spin_t lock;
atom_t count;
};
/**
* @brief Cache for one particular object type.
* A pool holds multiple nodes, each of which holds the same number of slabs.
*/
struct kmem_cache {
u_int object_size; /**< @brief Object size in bytes */
u_int page_order; /**< @brief Order passed to `get_pages()` */
enum slab_flags flags; /**< @brief Flags for how to allocate */
u_int slabs_per_node; /**< @brief Max number of slabs per cache node */
latom_t total_used; /**< @brief Total allocated entries */
const char *name; /**< @brief Unique name for this object type */
void (*ctor)(void *ptr, kmem_cache_t cache);
void (*dtor)(void *ptr, kmem_cache_t cache);
struct kmem_cache_pool empty;
struct kmem_cache_pool partial;
struct kmem_cache_pool full;
struct clist link; /**< @brief List of all kmem caches */
};
/* values for struct kmem_cache::flags */
/** @brief Zone to request pages from (using `page_alloc()`) */
#define SLAB_ZONE(flags) ((flags) & 3)
/** @brief List of all currently registered `struct kmem_cache`s. */
static CLIST(kmem_cache_list);
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define SLABS_PER_NODE(sz) _MIN1(PAGE_SIZE / (sz))
#define CACHE_DEFINE(sz, _name, _flags) { \
.object_size = (sz), \
.page_order = ((sz) - 1) / PAGE_SIZE, \
.flags = (_flags), \
.slabs_per_node = SLABS_PER_NODE(sz), \
.total_used = ATOM_DEFINE(0), \
.name = (_name), \
}
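/*
* As a rough illustration (assuming a 4 KiB PAGE_SIZE, which is architecture
* dependent): CACHE_DEFINE(512, ...) yields slabs_per_node = 8 and
* page_order = 0, while CACHE_DEFINE(8192, ...) yields slabs_per_node = 1
* and page_order = 1.
*/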
static struct kmem_cache kmem_caches[] = {
CACHE_DEFINE(32, "kmem_32", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(64, "kmem_64", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(128, "kmem_128", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(256, "kmem_256", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(512, "kmem_512", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_1024", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(2048, "kmem_2048", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(4096, "kmem_4096", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(8192, "kmem_8192", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(16384, "kmem_16384", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(32768, "kmem_32768", _M_ZONE_NORMAL | SLAB_POISON),
{ /* terminator */ }
};
static struct kmem_cache kmem_dma_caches[] = {
CACHE_DEFINE(32, "kmem_dma_32", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(64, "kmem_dma_64", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(128, "kmem_dma_128", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(256, "kmem_dma_256", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(512, "kmem_dma_512", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_dma_1024", _M_ZONE_DMA | SLAB_POISON),
{ /* terminator */ }
};
/**
* This is a little fucked.
*
* So, every `vm_page_t` in use by the slab allocator gets a corresponding
* `struct kmem_cache_node` that keeps track of everything we need to know to
* make allocations. However, the memory for those structs themselves doesn't
* magically grow on trees. In other words, we need to allocate memory in
* order to be able to allocate memory.
*
* So what we have here is a separate object cache for `struct kmem_cache_node`
* that works slightly differently from all the other ones: instead of making
* an extra allocation for the cache node, the node sits at the beginning of
* the very page it manages.  Other caches don't do this because it would
* destroy the perfect page alignment of the allocated area, but that doesn't
* matter here.
*/
static struct kmem_cache kmem_cache_node_caches =
CACHE_DEFINE(sizeof(struct kmem_cache_node), "kmem_cache_node", _M_ZONE_NORMAL | SLAB_DMAP);
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
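/*
* Rough layout of a page used by kmem_cache_node_caches under this scheme
* (illustrative, not to scale):
*
*  +-----------------+------+------+------+- - - -+------+
*  | management node | node | node | node |       | node |
*  +-----------------+------+------+------+- - - -+------+
*  ^ page start
*
* The management node occupies the first object slot and its freelist points
* at the remaining slots, which is why kmalloc_init() decrements
* slabs_per_node for this cache by one.
*/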
static struct kmem_cache *kmem_cache_zones[MM_NR_ZONES] = {
[_M_ZONE_DMA] = kmem_dma_caches,
[_M_ZONE_NORMAL] = kmem_caches,
};
static void cache_pool_init(struct kmem_cache_pool *pool)
{
clist_init(&pool->list);
atom_init(&pool->count, 0);
spin_init(&pool->lock);
}
void kmalloc_init(void)
{
cache_pool_init(&kmem_cache_node_caches.empty);
cache_pool_init(&kmem_cache_node_caches.partial);
cache_pool_init(&kmem_cache_node_caches.full);
/* for the management node at the beginning of the page */
kmem_cache_node_caches.slabs_per_node--;
clist_add(&kmem_cache_list, &kmem_cache_node_caches.link);
for (int i = 0; i < MM_NR_ZONES; i++) {
struct kmem_cache *cache = kmem_cache_zones[i];
while (cache->object_size != 0) {
clist_init(&cache->empty.list);
clist_init(&cache->partial.list);
clist_init(&cache->full.list);
clist_add(&kmem_cache_list, &cache->link);
cache++;
}
}
}
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache))
{
obj_size = align_ceil(obj_size, sizeof(long));
/* we only support objects up to PAGE_SIZE for now */
if (obj_size > PAGE_SIZE || obj_size == 0)
return nil;
struct kmem_cache *cache = kmalloc(sizeof(*cache), M_KERN);
if (cache) {
cache->name = name;
cache->object_size = obj_size;
cache->flags = flags;
cache->ctor = ctor;
cache->dtor = dtor;
cache_pool_init(&cache->empty);
cache_pool_init(&cache->partial);
cache_pool_init(&cache->full);
/* XXX this is pretty wasteful for larger obj_sizes */
cache->slabs_per_node = PAGE_SIZE / obj_size;
cache->page_order = 0;
clist_add(&kmem_cache_list, &cache->link);
}
return cache;
}
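/*
* Example usage sketch (struct frob, frob_setup(), frob_init() and the
* refcount field are made up for illustration):
*
*	static kmem_cache_t frob_cache;
*
*	static void frob_init(void *ptr, kmem_cache_t cache)
*	{
*		((struct frob *)ptr)->refcount = 0;
*	}
*
*	void frob_setup(void)
*	{
*		frob_cache = kmem_cache_register("frob", sizeof(struct frob),
*		                                 _M_ZONE_NORMAL, frob_init, nil);
*	}
*
*	struct frob *frob_new(void)
*	{
*		return kmem_cache_alloc(frob_cache, M_KERN);
*	}
*
* There is no dedicated kmem_cache_free(); kfree() below looks up the owning
* node (and therefore the cache) from the page, so it works for objects from
* registered caches as well.
*/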
static inline void **freelist_init(vm_page_t page, struct kmem_cache *cache)
{
void *prev = nil;
void *start = __v(pg2paddr(page));
void *end = start + align_floor(1 << (cache->page_order + PAGE_SHIFT), cache->object_size);
void *pos = end;
do {
pos -= cache->object_size;
if (cache->ctor)
cache->ctor(pos, cache);
*(void **)pos = prev;
prev = pos;
} while (pos >= start + cache->object_size);
return (void **)pos;
}
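/*
* Illustration of what freelist_init() produces for a node with four objects:
*
*  obj 0 -> obj 1 -> obj 2 -> obj 3 -> nil
*  ^ returned as the freelist head (lowest address in the page)
*
* Each free object stores the pointer to the next free one in its first
* machine word.  Note that the constructor (if any) runs before that link is
* written, so the link clobbers the first word of each constructed object
* until it is handed out.
*/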
/** Attempt to remove the first cache node from a pool and return it (nil if the pool is empty) */
/* call with interrupts disabled */
static inline struct kmem_cache_node *pool_del_first_node(struct kmem_cache_pool *pool)
{
struct kmem_cache_node *node = nil;
spin_lock(&pool->lock);
if (!clist_is_empty(&pool->list)) {
atom_dec(&pool->count);
node = clist_del_first_entry(&pool->list, typeof(*node), link);
}
spin_unlock(&pool->lock);
return node;
}
/* call with interrupts disabled */
static inline void pool_del_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
atom_dec(&pool->count);
spin_lock(&pool->lock);
clist_del(&node->link);
spin_unlock(&pool->lock);
}
/* call with interrupts disabled */
static inline void pool_add_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
spin_lock(&pool->lock);
clist_add(&pool->list, &node->link);
spin_unlock(&pool->lock);
atom_inc(&pool->count);
}
/* call with interrupts disabled */
static inline void *pop_freelist_and_insert(struct kmem_cache *cache, struct kmem_cache_node *node)
{
spin_lock(&node->lock);
void *ret = node->freelist;
node->freelist = *node->freelist;
u_int free_count = --node->free_count;
spin_unlock(&node->lock);
latom_inc(&cache->total_used);
if (free_count == 0)
pool_add_node(&cache->full, node);
else
pool_add_node(&cache->partial, node);
return ret;
}
/* call with interrupts disabled */
static struct kmem_cache_node *node_alloc(void)
{
/*
* This is really the same basic procedure as kmem_cache_alloc(),
* except that we allocate everything manually if we run out of free
* nodes, and that interrupts are already disabled here.
* It definitely needs a cleanup at some point, most of the stuff here
* can probably be eliminated if kmem_cache_alloc() is split up.
*/
struct kmem_cache_node *mgmt_node = pool_del_first_node(&kmem_cache_node_caches.partial);
if (!mgmt_node) {
mgmt_node = pool_del_first_node(&kmem_cache_node_caches.empty);
if (!mgmt_node) {
vm_page_t page = page_alloc(0, M_ATOMIC);
if (!page)
return nil;
void **freelist = freelist_init(page, &kmem_cache_node_caches);
mgmt_node = (struct kmem_cache_node *)freelist;
mgmt_node->freelist = *freelist;
mgmt_node = __v(pg2paddr(page));
spin_init(&mgmt_node->lock);
mgmt_node->free_count = kmem_cache_node_caches.slabs_per_node;
mgmt_node->cache = &kmem_cache_node_caches;
mgmt_node->page = page;
}
}
struct kmem_cache_node *new_node = pop_freelist_and_insert(&kmem_cache_node_caches,
mgmt_node);
return new_node;
}
/* call with interrupts disabled */
static inline struct kmem_cache_node *node_create(struct kmem_cache *cache, enum mflags flags,
register_t cpuflags)
{
struct kmem_cache_node *node = node_alloc();
if (node) {
intr_restore(cpuflags);
vm_page_t page = page_alloc(cache->page_order, flags | M_ZERO);
if (page) {
pga_set_slab(page, true);
page->slab = node;
node->freelist = freelist_init(page, cache);
spin_init(&node->lock);
node->free_count = cache->slabs_per_node;
node->cache = cache;
node->page = page;
} else {
kfree(node);
node = nil;
}
intr_disable();
}
return node;
}
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags)
{
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmem_cache_alloc() called from irq %p w/o M_NOWAIT\n",
ktrace_return_addr());
flags |= _M_NOWAIT;
}
}
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(kmem_cache_zones));
slab_debug_noisy("alloc %u bytes from zone %d, cache %s\n",
cache->object_size, _M_ZONE_INDEX(flags), cache->name);
/*
* Before locking a node, we always remove it from its cache pool.
* This is far from optimal, because if multiple CPUs allocate from the
* same pool at the same time, we could end up creating several slabs
* with one used entry each (not to mention the overhead of the mostly
* unnecessary list deletions/insertions). However, it allows me to be
* lazier when freeing unused slabs from a background thread since that
* thread knows for sure that once it has removed a slab from free_list,
* it can't possibly be used for allocations anymore.
* This is probably not worth the overhead, though.
*/
struct kmem_cache_node *node = nil;
/* try to use a slab that is already partially used first */
register_t cpuflags = intr_disable();
node = pool_del_first_node(&cache->partial);
if (!node) {
/* no partially used node available, see if we have a completely free one */
node = pool_del_first_node(&cache->empty);
if (!node) {
/* we're completely out of usable nodes, allocate a new one */
node = node_create(cache, flags, cpuflags);
if (!node) {
slab_debug("kernel OOM\n");
return nil;
}
}
}
/* if we've made it to here, we have a cache node and interrupts are disabled */
void *ret = pop_freelist_and_insert(cache, node);
intr_restore(cpuflags);
return ret;
}
void *kmalloc(usize size, enum mflags flags)
{
if (size == 0)
return nil;
#if CFG_POISON_SLABS
size += sizeof(struct slab_poison);
#endif
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(kmem_cache_zones));
struct kmem_cache *cache = kmem_cache_zones[_M_ZONE_INDEX(flags)];
while (cache->object_size != 0) {
if (cache->object_size >= size)
break;
cache++;
}
if (cache->object_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), cache[-1].object_size);
return nil;
}
void *ret = kmem_cache_alloc(cache, flags);
#if CFG_POISON_SLABS
if (ret) {
struct slab_poison *poison = ret;
poison_on_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
}
#endif
return ret;
}
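/*
* Worked example, assuming CFG_POISON_SLABS is enabled and an LP64 system
* where sizeof(struct slab_poison) == 40: kmalloc(100, M_KERN) bumps the
* request to 140 bytes, which is served from the kmem_256 cache, and the
* pointer handed back to the caller is poison->data within that 256 byte
* slot.
*/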
void kfree(void *ptr)
{
if (ptr == nil)
return;
SLAB_ASSERT(ptr >= DMAP_START && ptr < DMAP_END);
vm_page_t page = vaddr2pg(ptr);
SLAB_ASSERT(pga_slab(page));
struct kmem_cache_node *node = page_slab(page);
struct kmem_cache *cache = node->cache;
#if CFG_POISON_SLABS
if (cache->flags & SLAB_POISON) {
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_on_free(poison);
ptr = poison;
}
#endif
register_t cpuflags = intr_disable();
spin_lock(&node->lock);
*(void **)ptr = node->freelist;
node->freelist = (void **)ptr;
u_int free_count = ++node->free_count;
spin_unlock(&node->lock);
if (free_count == cache->slabs_per_node) {
/* the node is completely unused again, move it to the empty pool */
pool_del_node(&cache->partial, node);
pool_add_node(&cache->empty, node);
} else if (free_count == 1) {
/* the node was full before this free, put it back into circulation */
pool_del_node(&cache->full, node);
pool_add_node(&cache->partial, node);
}
latom_dec(&cache->total_used);
intr_restore(cpuflags);
}
#if CFG_POISON_SLABS
static inline void poison_on_alloc(struct slab_poison *poison, u_long exact_size,
void *alloc_source)
{
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long *poison_start = &poison->low_poison;
/*
* page_alloc() always initializes the allocated page to zeroes.
* Therefore, if exact_size is 0, we know this particular slab entry has
* never been used before, and we can skip the check.
*/
if (poison->exact_size != 0) {
for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
if (*pos != SLAB_POISON_FREE) {
kprintf("Use-after-free in %p (alloc by %p)\n",
poison->data, poison->alloc_source);
break;
}
}
}
/* update offset to the new size */
offset = align_ceil(exact_size, sizeof(long)) / sizeof(long);
poison->alloc_source = alloc_source;
poison->exact_size = exact_size;
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_ALLOC;
}
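/*
* Continuing the kmalloc(100, M_KERN) example above, on an LP64 system
* (sizeof(long) == 8): exact_size == 100 gives
* offset = align_ceil(100, 8) / 8 = 13, i.e. the high poison word lives
* 13 longs (104 bytes) past the start of data[], immediately after the
* aligned user data.
*/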
static inline void poison_on_free(struct slab_poison *poison)
{
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
if (poison->low_poison != SLAB_POISON_ALLOC) {
kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
if (poison->high_poison[offset] != SLAB_POISON_ALLOC) {
kprintf("High out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_FREE;
}
#endif /* CFG_POISON_SLABS */
/*
* for certain libc routines
*/
__weak void *malloc(usize size)
{
return kmalloc(size, M_KERN);
}
__weak void free(void *ptr)
{
kfree(ptr);
}