kern/kernel/mm/slab.c
commit f5db4e4a25 by fef, 2022-04-01 17:09:03 +02:00

mm/slab: add object caches

This was the last major obstacle in being able to manage virtual memory
maps.  Object caches are custom allocators that allow for more fine-grained
allocation policies, including being able to use memory from the DMAP region.

/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
/*
* slabbing slabs onto the slab for slabs slab slab slahsdf ashklfghdsla
*/
#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/ktrace.h>
#include <gay/mm.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/vm/page.h>
#include <strings.h>
#if CFG_POISON_SLABS
struct slab_poison {
void *_pad __unused; /**< @brief That's where the freelist pointer is stored */
void *alloc_source; /**< @brief Code address that made the alloc call */
u_long exact_size;
u_long low_poison;
u8 data[0];
u_long high_poison[1];
};
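/*
* Rough layout of a poisoned allocation as managed by poison_on_alloc() and
* poison_on_free() below (sizes not to scale, exact_size rounded up to a
* whole number of longs):
*
*  +------+--------------+------------+------------+- - - - - - -+-------------+
*  | _pad | alloc_source | exact_size | low_poison |   data[]    | high poison |
*  +------+--------------+------------+------------+- - - - - - -+-------------+
*
* The caller only ever sees a pointer to data[]; everything around it is
* bookkeeping for catching use-after-free and out-of-bounds writes.
*/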
static void poison_on_alloc(struct slab_poison *poison, u_long exact_size, void *alloc_source);
static void poison_on_free(struct slab_poison *poison);
#endif
#if CFG_DEBUG_SLAB_ALLOCS
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# define SLAB_DEBUG_BLOCK
# define SLAB_ASSERT KASSERT
# if CFG_DEBUG_SLAB_ALLOCS_NOISY
# define slab_debug_noisy(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# else
# define slab_debug_noisy(msg, ...) ({})
# endif
#else
# define SLAB_DEBUG_BLOCK if (0)
# define SLAB_ASSERT(x) ({})
# define slab_debug(msg, ...) ({})
# define slab_debug_noisy(msg, ...) ({})
#endif
/**
* @brief Single node in the object cache system.
* Each node owns one page (or a contiguous block of pages for higher order
* caches) and hands out fixed size objects from it.
*/
struct kmem_cache_node {
struct clist link; /* -> struct kmem_cache_pool::list */
void **freelist; /**< @brief Stack of free objects */
struct kmem_cache *cache; /**< @brief Object cache this node belongs to */
spin_t lock; /**< @brief Lock for `freelist` */
u_int free_count;
vm_page_t page; /**< @brief Physical page this node manages */
};
struct kmem_cache_pool {
struct clist list; /* -> struct kmem_cache_node::link */
spin_t lock;
atom_t count;
};
/**
* @brief Cache for one particular object type.
* A pool holds multiple nodes, each of which holds the same number of slabs.
*/
struct kmem_cache {
u_int object_size; /**< @brief Object size in bytes */
u_int page_order; /**< @brief Order passed to `get_pages()` */
enum slab_flags flags; /**< @brief Flags for how to allocate */
u_int slabs_per_node; /**< @brief Max number of slabs per cache node */
latom_t total_used; /**< @brief Total allocated entries */
const char *name; /**< @brief Unique name for this object type */
void (*ctor)(void *ptr, kmem_cache_t cache);
void (*dtor)(void *ptr, kmem_cache_t cache);
struct kmem_cache_pool empty;
struct kmem_cache_pool partial;
struct kmem_cache_pool full;
struct clist link; /**< @brief List of all kmem caches */
};
/* values for struct kmem_cache::flags */
/** @brief Zone to request pages from (using `page_alloc()`) */
#define SLAB_ZONE(flags) ((flags) & 3)
/** @brief List of all currently registered `struct kmem_cache`s. */
static CLIST(kmem_cache_list);
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define SLABS_PER_NODE(sz) _MIN1(PAGE_SIZE / (sz))
#define CACHE_DEFINE(sz, _name, _flags) { \
.object_size = (sz), \
.page_order = ((sz) - 1) / PAGE_SIZE, \
.flags = (_flags), \
.slabs_per_node = SLABS_PER_NODE(sz), \
.total_used = ATOM_DEFINE(0), \
.name = (_name), \
}
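/*
* As a rough illustration (assuming a 4 KiB PAGE_SIZE, which is architecture
* dependent): CACHE_DEFINE(512, ...) yields slabs_per_node = 8 and
* page_order = 0, while CACHE_DEFINE(8192, ...) yields slabs_per_node = 1
* and page_order = 1.
*/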
static struct kmem_cache kmem_caches[] = {
CACHE_DEFINE(32, "kmem_32", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(64, "kmem_64", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(128, "kmem_128", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(256, "kmem_256", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(512, "kmem_512", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_1024", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(2048, "kmem_2048", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(4096, "kmem_4096", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(8192, "kmem_8192", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(16384, "kmem_16384", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(32768, "kmem_32768", _M_ZONE_NORMAL | SLAB_POISON),
{ /* terminator */ }
};
static struct kmem_cache kmem_dma_caches[] = {
CACHE_DEFINE(32, "kmem_dma_32", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(64, "kmem_dma_64", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(128, "kmem_dma_128", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(256, "kmem_dma_256", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(512, "kmem_dma_512", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_dma_1024", _M_ZONE_DMA | SLAB_POISON),
{ /* terminator */ }
};
/**
* This is a little fucked.
*
* So, every `vm_page_t` in use by the slab allocator gets a corresponding
* `struct kmem_cache_node` that keeps track of everything we need to know to
* make allocations. However, the memory for those structs themselves doesn't
* magically grow on trees. In other words, we need to allocate memory in
* order to be able to allocate memory.
*
* So what we have here is a separate object cache for `struct kmem_cache_node`
* that works slightly differently from all the other ones: instead of making
* an extra allocation for the cache node, the node sits at the beginning of
* the very page it manages.  Other caches don't do this because it would
* destroy the perfect page alignment of the allocated area, but that doesn't
* matter here.
*/
static struct kmem_cache kmem_cache_node_caches =
CACHE_DEFINE(sizeof(struct kmem_cache_node), "kmem_cache_node", _M_ZONE_NORMAL | SLAB_DMAP);
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
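/*
* Rough layout of a page used by kmem_cache_node_caches under this scheme
* (illustrative, not to scale):
*
*  +-----------------+------+------+------+- - - -+------+
*  | management node | node | node | node |       | node |
*  +-----------------+------+------+------+- - - -+------+
*  ^ page start
*
* The management node occupies the first object slot and its freelist points
* at the remaining slots, which is why kmalloc_init() decrements
* slabs_per_node for this cache by one.
*/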
static struct kmem_cache *kmem_cache_zones[MM_NR_ZONES] = {
[_M_ZONE_DMA] = kmem_dma_caches,
[_M_ZONE_NORMAL] = kmem_caches,
};
static void cache_pool_init(struct kmem_cache_pool *pool)
{
clist_init(&pool->list);
atom_init(&pool->count, 0);
spin_init(&pool->lock);
}
void kmalloc_init(void)
{
cache_pool_init(&kmem_cache_node_caches.empty);
cache_pool_init(&kmem_cache_node_caches.partial);
cache_pool_init(&kmem_cache_node_caches.full);
/* for the management node at the beginning of the page */
kmem_cache_node_caches.slabs_per_node--;
clist_add(&kmem_cache_list, &kmem_cache_node_caches.link);
for (int i = 0; i < MM_NR_ZONES; i++) {
struct kmem_cache *cache = kmem_cache_zones[i];
while (cache->object_size != 0) {
clist_init(&cache->empty.list);
clist_init(&cache->partial.list);
clist_init(&cache->full.list);
clist_add(&kmem_cache_list, &cache->link);
cache++;
}
}
}
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache))
{
obj_size = align_ceil(obj_size, sizeof(long));
/* we only support objects up to PAGE_SIZE for now */
if (obj_size > PAGE_SIZE || obj_size == 0)
return nil;
struct kmem_cache *cache = kmalloc(sizeof(*cache), M_KERN);
if (cache) {
cache->name = name;
cache->object_size = obj_size;
cache->flags = flags;
cache->ctor = ctor;
cache->dtor = dtor;
cache_pool_init(&cache->empty);
cache_pool_init(&cache->partial);
cache_pool_init(&cache->full);
/* XXX this is pretty wasteful for larger obj_sizes */
cache->slabs_per_node = PAGE_SIZE / obj_size;
cache->page_order = 0;
clist_add(&kmem_cache_list, &cache->link);
}
return cache;
}
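/*
* Example usage sketch (struct frob, frob_setup(), frob_init() and the
* refcount field are made up for illustration):
*
*	static kmem_cache_t frob_cache;
*
*	static void frob_init(void *ptr, kmem_cache_t cache)
*	{
*		((struct frob *)ptr)->refcount = 0;
*	}
*
*	void frob_setup(void)
*	{
*		frob_cache = kmem_cache_register("frob", sizeof(struct frob),
*		                                 _M_ZONE_NORMAL, frob_init, nil);
*	}
*
*	struct frob *frob_new(void)
*	{
*		return kmem_cache_alloc(frob_cache, M_KERN);
*	}
*
* There is no dedicated kmem_cache_free(); kfree() below looks up the owning
* node (and therefore the cache) from the page, so it works for objects from
* registered caches as well.
*/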
static inline void **freelist_init(vm_page_t page, struct kmem_cache *cache)
{
void *prev = nil;
void *start = __v(pg2paddr(page));
void *end = start + align_floor(1 << (cache->page_order + PAGE_SHIFT), cache->object_size);
void *pos = end;
do {
pos -= cache->object_size;
if (cache->ctor)
cache->ctor(pos, cache);
*(void **)pos = prev;
prev = pos;
} while (pos >= start + cache->object_size);
return (void **)pos;
}
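/*
* Illustration of what freelist_init() produces for a node with four objects:
*
*  obj 0 -> obj 1 -> obj 2 -> obj 3 -> nil
*  ^ returned as the freelist head (lowest address in the page)
*
* Each free object stores the pointer to the next free one in its first
* machine word.  Note that the constructor (if any) runs before that link is
* written, so the link clobbers the first word of each constructed object
* until it is handed out.
*/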
/** Attempt to remove the first cache node from a pool and return it (nil if the pool is empty) */
/* call with interrupts disabled */
static inline struct kmem_cache_node *pool_del_first_node(struct kmem_cache_pool *pool)
{
struct kmem_cache_node *node = nil;
spin_lock(&pool->lock);
if (!clist_is_empty(&pool->list)) {
atom_dec(&pool->count);
node = clist_del_first_entry(&pool->list, typeof(*node), link);
}
spin_unlock(&pool->lock);
return node;
}
/* call with interrupts disabled */
static inline void pool_del_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
atom_dec(&pool->count);
spin_lock(&pool->lock);
clist_del(&node->link);
spin_unlock(&pool->lock);
}
/* call with interrupts disabled */
static inline void pool_add_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
spin_lock(&pool->lock);
clist_add(&pool->list, &node->link);
spin_unlock(&pool->lock);
atom_inc(&pool->count);
}
/* call with interrupts disabled */
static inline void *pop_freelist_and_insert(struct kmem_cache *cache, struct kmem_cache_node *node)
{
spin_lock(&node->lock);
void *ret = node->freelist;
node->freelist = *node->freelist;
u_int free_count = --node->free_count;
spin_unlock(&node->lock);
latom_inc(&cache->total_used);
if (free_count == 0)
pool_add_node(&cache->full, node);
else
pool_add_node(&cache->partial, node);
return ret;
}
/* call with interrupts disabled */
static struct kmem_cache_node *node_alloc(void)
{
/*
* This is really the same basic procedure as kmem_cache_alloc(),
* except that we allocate everything manually if we run out of free
* nodes, and that interrupts are already disabled here.
* It definitely needs a cleanup at some point, most of the stuff here
* can probably be eliminated if kmem_cache_alloc() is split up.
*/
struct kmem_cache_node *mgmt_node = pool_del_first_node(&kmem_cache_node_caches.partial);
if (!mgmt_node) {
mgmt_node = pool_del_first_node(&kmem_cache_node_caches.empty);
if (!mgmt_node) {
vm_page_t page = page_alloc(0, M_ATOMIC);
if (!page)
return nil;
void **freelist = freelist_init(page, &kmem_cache_node_caches);
mgmt_node = (struct kmem_cache_node *)freelist;
mgmt_node->freelist = *freelist;
mgmt_node = __v(pg2paddr(page));
spin_init(&mgmt_node->lock);
mgmt_node->free_count = kmem_cache_node_caches.slabs_per_node;
mgmt_node->cache = &kmem_cache_node_caches;
mgmt_node->page = page;
}
}
struct kmem_cache_node *new_node = pop_freelist_and_insert(&kmem_cache_node_caches,
mgmt_node);
return new_node;
}
/* call with interrupts disabled */
static inline struct kmem_cache_node *node_create(struct kmem_cache *cache, enum mflags flags,
register_t cpuflags)
{
struct kmem_cache_node *node = node_alloc();
if (node) {
intr_restore(cpuflags);
vm_page_t page = page_alloc(cache->page_order, flags | M_ZERO);
if (page) {
pga_set_slab(page, true);
page->slab = node;
node->freelist = freelist_init(page, cache);
spin_init(&node->lock);
node->free_count = cache->slabs_per_node;
node->cache = cache;
node->page = page;
} else {
kfree(node);
node = nil;
}
intr_disable();
}
return node;
}
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags)
{
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmem_cache_alloc() called from irq %p w/o M_NOWAIT\n",
ktrace_return_addr());
flags |= _M_NOWAIT;
}
}
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(kmem_cache_zones));
slab_debug_noisy("alloc %u bytes from zone %d, cache %s\n",
cache->object_size, _M_ZONE_INDEX(flags), cache->name);
/*
* Before locking a node, we always remove it from its cache pool.
* This is far from optimal, because if multiple CPUs allocate from the
* same pool at the same time, we could end up creating several slabs
* with one used entry each (not to mention the overhead of the mostly
* unnecessary list deletions/insertions). However, it allows me to be
* lazier when freeing unused slabs from a background thread since that
* thread knows for sure that once it has removed a slab from free_list,
* it can't possibly be used for allocations anymore.
* This is probably not worth the overhead, though.
*/
struct kmem_cache_node *node = nil;
/* try to use a slab that is already partially used first */
register_t cpuflags = intr_disable();
node = pool_del_first_node(&cache->partial);
if (!node) {
/* no partially used node available, see if we have a completely free one */
node = pool_del_first_node(&cache->empty);
if (!node) {
/* we're completely out of usable nodes, allocate a new one */
node = node_create(cache, flags, cpuflags);
if (!node) {
slab_debug("kernel OOM\n");
return nil;
}
}
}
/* if we've made it to here, we have a cache node and interrupts are disabled */
void *ret = pop_freelist_and_insert(cache, node);
intr_restore(cpuflags);
return ret;
}
void *kmalloc(usize size, enum mflags flags)
{
if (size == 0)
return nil;
#if CFG_POISON_SLABS
size += sizeof(struct slab_poison);
#endif
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(kmem_cache_zones));
struct kmem_cache *cache = kmem_cache_zones[_M_ZONE_INDEX(flags)];
while (cache->object_size != 0) {
if (cache->object_size >= size)
break;
cache++;
}
if (cache->object_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), cache[-1].object_size);
return nil;
}
void *ret = kmem_cache_alloc(cache, flags);
#if CFG_POISON_SLABS
if (ret) {
struct slab_poison *poison = ret;
poison_on_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
}
#endif
return ret;
}
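/*
* Worked example, assuming CFG_POISON_SLABS is enabled and an LP64 system
* where sizeof(struct slab_poison) == 40: kmalloc(100, M_KERN) bumps the
* request to 140 bytes, which is served from the kmem_256 cache, and the
* pointer handed back to the caller is poison->data within that 256 byte
* slot.
*/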
void kfree(void *ptr)
{
if (ptr == nil)
return;
SLAB_ASSERT(ptr >= DMAP_START && ptr < DMAP_END);
vm_page_t page = vaddr2pg(ptr);
SLAB_ASSERT(pga_slab(page));
struct kmem_cache_node *node = page_slab(page);
struct kmem_cache *cache = node->cache;
#if CFG_POISON_SLABS
if (cache->flags & SLAB_POISON) {
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_on_free(poison);
ptr = poison;
}
#endif
register_t cpuflags = intr_disable();
spin_lock(&node->lock);
*(void **)ptr = node->freelist;
node->freelist = (void **)ptr;
u_int free_count = ++node->free_count;
spin_unlock(&node->lock);
if (free_count == cache->slabs_per_node) {
/* the node is completely unused again, move it to the empty pool */
pool_del_node(&cache->partial, node);
pool_add_node(&cache->empty, node);
} else if (free_count == 1) {
/* the node was full before this free, put it back into circulation */
pool_del_node(&cache->full, node);
pool_add_node(&cache->partial, node);
}
latom_dec(&cache->total_used);
intr_restore(cpuflags);
}
#if CFG_POISON_SLABS
static inline void poison_on_alloc(struct slab_poison *poison, u_long exact_size,
void *alloc_source)
{
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long *poison_start = &poison->low_poison;
/*
* page_alloc() always initializes the allocated page to zeroes.
* Therefore, if exact_size is 0, we know this particular slab entry has
* never been used before, and we can skip the check.
*/
if (poison->exact_size != 0) {
for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
if (*pos != SLAB_POISON_FREE) {
kprintf("Use-after-free in %p (alloc by %p)\n",
poison->data, poison->alloc_source);
break;
}
}
}
/* update offset to the new size */
offset = align_ceil(exact_size, sizeof(long)) / sizeof(long);
poison->alloc_source = alloc_source;
poison->exact_size = exact_size;
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_ALLOC;
}
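/*
* Continuing the kmalloc(100, M_KERN) example above, on an LP64 system
* (sizeof(long) == 8): exact_size == 100 gives
* offset = align_ceil(100, 8) / 8 = 13, i.e. the high poison word lives
* 13 longs (104 bytes) past the start of data[], immediately after the
* aligned user data.
*/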
static inline void poison_on_free(struct slab_poison *poison)
{
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
if (poison->low_poison != SLAB_POISON_ALLOC) {
kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
if (poison->high_poison[offset] != SLAB_POISON_ALLOC) {
kprintf("High out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_FREE;
}
#endif /* CFG_POISON_SLABS */
/*
* for certain libc routines
*/
__weak void *malloc(usize size)
{
return kmalloc(size, M_KERN);
}
__weak void free(void *ptr)
{
kfree(ptr);
}