Compare commits


3 Commits

Author SHA1 Message Date
anna f5db4e4a25
mm/slab: add object caches
This was the last major obstacle in being able to
manage virtual memory maps.  Object caches are
custom allocators that allow for more fine-grained
allocation policies, including being able to use
memory from the DMAP region.
2 years ago
anna 3910c85cac
x86/mm: fix __boot_clear_page 2 years ago
anna 30df044cec
mm/page: yet another overhaul
This is primarily for the slab allocator update
that's about to come, but to be completely honest
not even i myself am sure what i should include
here because i took a longer break and there are
like 40 modified files that are all interlinked.
2 years ago

@ -251,7 +251,7 @@ void __boot_clear_page(vm_paddr_t paddr)
pdpte->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
vm_flush();
memset64(vbase + offset, 0, PAGE_SIZE);
pdpe->val = 0;
pdpte->val = old_pdpte.val;
vm_flush();
}

@ -28,8 +28,9 @@
* where the pool of order `n` holds groups of `1 << n` pages.
*
* The mm subsystem needs to allocate memory for initializing itself.
* Therefore, there is an additional boot page frame allocator, which gets the
* free areas from architecture dependent code (`arch/mm/.../init.c`).
* Therefore, there is an additional boot page frame allocator, which is
* initialized with the initial usable memory areas from arch dependent
* bootstrap code (`arch/mm/.../init.c`).
*/
#ifdef _KERNEL
@ -52,6 +53,7 @@
#define _M_EMERG (1 << 2)
#define _M_NOWAIT (1 << 3)
#define _M_ZERO (1 << 4)
#ifndef _HAVE_VM_PAGE_T
#define _HAVE_VM_PAGE_T 1
@ -108,7 +110,10 @@ extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
/**
* @brief Memory allocation flags commonly used by all allocators.
* All of them are eventually passed down to `page_alloc()`, the physical page
* frame allocator,
* frame allocator.
*
* You always need to pick either `M_KERN` or `M_DMA` (depending on what you
* need the memory for), and then combine it with optional flags.
*/
enum mflags {
/** @brief Use emergency memory reserves if necessary */
@ -121,6 +126,8 @@ enum mflags {
M_ATOMIC = _M_EMERG | _M_NOWAIT,
/** @brief Allocate low memory suitable for DMA transfers */
M_DMA = _M_ZONE_DMA,
/** @brief Zero out pages before returning them */
M_ZERO = _M_ZERO,
};
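A short usage sketch for these flags; the sizes and variable names below are illustrative only:

        void *buf = kmalloc(256, M_KERN | M_ZERO);      /* regular kernel memory, zeroed */
        void *dma_buf = kmalloc(512, M_DMA | M_ATOMIC); /* low memory, won't sleep, may be nil */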
/** @brief Initialize the slab allocator. */
@ -144,6 +151,57 @@ void *kmalloc(size_t size, enum mflags flags) __malloc_like __alloc_size(1);
*/
void kfree(void *ptr);
/* see kernel/mm/slab.c */
struct kmem_cache;
typedef struct kmem_cache *kmem_cache_t;
/** @brief Flags for `kmem_cache_create()` */
enum slab_flags {
SLAB_ZONE_NORMAL = _M_ZONE_NORMAL,
SLAB_ZONE_DMA = _M_ZONE_DMA,
/** @brief Poison objects before alloc and after free */
SLAB_POISON = (1u << 2),
/** @brief Only operate on memory within the direct mapping */
SLAB_DMAP = (1u << 3),
};
/**
* @brief Register a custom object cache with the slab allocator.
* Objects can then be allocated from the cache using `kmem_cache_alloc()`.
* Use `kmem_cache_deregister()` when you don't need the cache anymore.
*
* @param name Unique name for the object cache, use only `[a-z0-9\\-]`
* @param obj_size Size of a single object in bytes
* @param flags Flags
* @param ctor Constructor
* @param dtor Destructor
* @return Handle to the newly created object cache, or `nil` on failure
*/
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache));
/**
* @brief Allocate from an object cache.
* Caches can be created using `kmem_cache_register()`.
* Allocated objects can be released using `kfree()`.
*
* @param cache Object cache to allocate from
* @return An initialized object from the cache, in the state that the `ctor`
* function passed to `kmem_cache_register()` left it in, or `nil` if OOM
*/
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags) __malloc_like;
/**
* @brief Deregister a custom object cache.
* When calling this function, the cache must not have any remaining allocations.
*
* @param cache The cache handle you got from `kmem_cache_register()`
* @return 0 on success, or a negative number if the cache still contains
* allocated objects
*/
int kmem_cache_deregister(kmem_cache_t cache);
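A minimal usage sketch for this API; `struct thing`, its constructor, and the wrapper functions are hypothetical and only meant to show the call sequence:

struct thing {
        int id;
        struct clist link;
};

static kmem_cache_t thing_cache;

static void thing_ctor(void *ptr, kmem_cache_t cache)
{
        struct thing *t = ptr;
        t->id = 0;
        clist_init(&t->link);
}

void thing_init(void)
{
        thing_cache = kmem_cache_register("thing", sizeof(struct thing),
                                          SLAB_ZONE_NORMAL | SLAB_POISON,
                                          thing_ctor, nil);
        /* thing_cache evaluates false here if registration failed */
}

struct thing *thing_alloc(void)
{
        /* object comes back in the state thing_ctor() left it in, or nil if OOM */
        return kmem_cache_alloc(thing_cache, M_KERN);
}

void thing_free(struct thing *t)
{
        /* objects from a cache are released through the regular kfree() */
        kfree(t);
}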
/**
* @brief Initialize the buddy page frame allocator.
* This is only called once, from the arch dependent counterpart after it has

@ -20,6 +20,7 @@ union vm_page_attr {
bool pcpu:1; /**< @brief Page is in a per-cpu cache */
bool slab:1; /**< @brief Page is used by the slab allocator */
unsigned zone:2; /**< @brief Index into `mm_zones` */
bool zero:1; /**< @brief Page is known to contain only zeroes */
};
};
#define _PGA_ORDER_SHIFT 0
@ -34,11 +35,13 @@ union vm_page_attr {
#define _PGA_SLAB_MASK (1 << _PGA_SLAB_SHIFT)
#define _PGA_ZONE_SHIFT 12
#define _PGA_ZONE_MASK (3 << _PGA_ZONE_SHIFT)
#define _PGA_ZERO_SHIFT 14
#define _PGA_ZERO_MASK (1 << _PGA_ZERO_SHIFT)
typedef union vm_page_attr vm_page_attr_t;
/* defined in kernel/mm/slab.c */
struct slab_pool;
struct kmem_cache_node;
/**
* @brief Stores information about a single page in physical memory.
@ -50,7 +53,7 @@ struct vm_page {
atom_t count;
/** @brief Page attributes, use the macros below to access this */
atom_t attr;
/** @brief Page frame number */
/** @brief Page frame number (= `paddr >> PAGE_SHIFT`) */
u_long pfn;
/**
* @brief If the page is free, this is its freelist.
@ -59,17 +62,12 @@ struct vm_page {
*/
struct clist link;
union {
struct {
void **freelist;
struct slab_pool *pool;
u_int entry_size;
u_int free_count;
} slab;
struct kmem_cache_node *slab;
};
};
#define INVALID_PAGE nil
#define SLAB(page) (&(page)->slab)
#define SLAB(page) ((page)->slab)
#ifndef _HAVE_VM_PAGE_T
#define _HAVE_VM_PAGE_T 1
@ -89,6 +87,11 @@ extern vm_page_t _vm_page_array_end;
/** @brief Fill a page with zeroes (size depends on the current page order). */
void page_clear(vm_page_t page);
static __always_inline struct kmem_cache_node *page_slab(vm_page_t page)
{
return page->slab;
}
static inline u8 pga_order(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
@ -113,6 +116,12 @@ static inline bool pga_slab(vm_page_t page)
return attr.slab;
}
static inline bool pga_zero(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.zero;
}
static inline enum mm_zone_type pga_zone(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
@ -157,6 +166,14 @@ static inline enum mm_zone_type pga_set_zone(vm_page_t page, enum mm_zone_type z
}
}
static inline bool pga_set_zero(vm_page_t page, bool zero)
{
if (zero)
return atom_set_bit(&page->attr, _PGA_ZERO_SHIFT);
else
return atom_clr_bit(&page->attr, _PGA_ZERO_SHIFT);
}
static __always_inline bool page_get(vm_page_t page)
{
return atom_inc(&page->count);
@ -172,7 +189,7 @@ static __always_inline bool page_put(vm_page_t page)
static inline void page_lock(vm_page_t page)
{
spin_loop {
if (atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
if (!atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
break;
}
}
@ -182,9 +199,16 @@ static __always_inline void page_unlock(vm_page_t page)
atom_clr_bit(&page->attr, _PGA_LOCK_SHIFT);
}
/**
* @brief Attempt to lock a page.
* Must be called with interrupts disabled.
*
* @param page Page to lock.
* @return `true` if you claimed the lock, `false` if not.
*/
static __always_inline bool page_trylock(vm_page_t page)
{
return atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
return !atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
}
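A sketch of the intended calling pattern (error handling elided):

        register_t cpuflags = intr_disable();
        if (page_trylock(page)) {
                /* ... inspect or update the page ... */
                page_unlock(page);
        }
        intr_restore(cpuflags);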
static inline void __page_set_flag(vm_page_t page, unsigned flag)
@ -275,3 +299,9 @@ static inline void *pfn2vaddr(u_long pfn)
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
return DMAP_START + (pfn << PAGE_SHIFT);
}
__pure2
static inline vm_paddr_t pg2paddr(vm_page_t page)
{
return (vm_paddr_t)page->pfn << PAGE_SHIFT;
}
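/* e.g. assuming 4 KiB pages (PAGE_SHIFT == 12): pfn 0x1a3 corresponds to paddr 0x1a3000 */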

@ -96,10 +96,6 @@ static inline void claim_bmem_area(struct mm_zone *zone, const struct _bmem_area
/* only the first page in the order group is inserted into
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1u << order); i++) {
if (pos >= end)
panic("page %p out of range", pos);
if (atom_read(&pos->count) != 420)
panic("page %p double initialized\n", pos);
atom_init(&pos->count, 0);
atom_init(&pos->attr, 0);
@ -158,7 +154,8 @@ void paging_init(vm_paddr_t phys_end)
vm_paddr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for the page bitmaps");
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
for (int i = 0; i < (1 << bitmap_size_log2); i++)
__boot_clear_page(bitmap_start_phys + (i * PAGE_SIZE));
/*
* initialize the pools
@ -192,7 +189,7 @@ void paging_init(vm_paddr_t phys_end)
/* This is merely an optimization to simplify checking whether
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[pfn].count, 420);
atom_init(&vm_page_array[pfn].count, INT_MIN);
atom_init(&vm_page_array[pfn].attr, _PGA_RSVD_MASK);
vm_page_array[pfn].pfn = pfn;
}
@ -207,12 +204,12 @@ void paging_init(vm_paddr_t phys_end)
/* make sure the boot memory allocator cannot under any circumstances hand
* out pages from this area anymore, even though that should be unnecessary */
clist_del(&area->link);
claim_bmem_area(zone, area);
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
}
@ -227,9 +224,18 @@ vm_page_t page_alloc(u_int order, enum mflags flags)
{
if (order > MM_MAX_ORDER) {
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
return nil;
return INVALID_PAGE;
}
/*
* See if the requested zone has enough free pages for the allocation.
* If not, fall back to lower physical memory (i.e. use a zone with
* smaller index). Repeat until we either find a zone that has enough
* free pages, or until we've run out of zones (in which case the
* allocation failed). Just because we found a zone doesn't mean we've
* succeeded, since the pages in that zone might not be contiguous.
* If they're not, we have to try again (see further down below).
*/
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
long count_after;
try_next_zone:
@ -242,7 +248,7 @@ try_next_zone:
zone--;
goto try_next_zone;
} else {
return nil;
return INVALID_PAGE;
}
}
}
@ -254,9 +260,9 @@ try_next_zone:
* requested order, and if it's empty, move on to the next higher order.
* Repeat until we find a page, or we've reached the highest order.
*/
vm_page_t page = nil;
vm_page_t page = INVALID_PAGE;
u_int page_order = order;
while (page == nil && page_order < MM_NR_ORDERS) {
while (!page && page_order < MM_NR_ORDERS) {
struct mm_pool *pool = &zone->pools[page_order];
disable_intr();
@ -276,7 +282,7 @@ try_next_zone:
intr_restore(cpuflags);
}
if (page == nil) {
if (!page) {
if (zone > &mm_zones[0]) {
/*
* If we reach this, the current zone technically had enough free
@ -288,7 +294,7 @@ try_next_zone:
zone--;
goto try_next_zone;
} else {
return nil;
return INVALID_PAGE;
}
}
@ -312,7 +318,7 @@ try_next_zone:
disable_intr();
spin_lock(&pool->lock);
clist_add_first(&pool->freelist, &buddy->link);
clist_add(&pool->freelist, &buddy->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
@ -320,7 +326,14 @@ try_next_zone:
for (u_int i = 0; i < (1 << order); i++)
pga_set_order(&page[i], order);
page_clear(page);
/* future versions will have a background thread that
* clears pages in the freelist when the cpu is idle */
if ((flags & _M_ZERO) && !pga_zero(page))
page_clear(page);
/* XXX only clear the zero flag when the page actually becomes dirty */
pga_set_zero(page, false);
return page;
}
@ -378,21 +391,14 @@ void page_free(vm_page_t page)
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
u_long pfn = pg2pfn(page);
PAGE_DEBUG_BLOCK {
int old_count = atom_sub(&page->count, 1);
if (old_count != 1) {
if (old_count == 0)
page_debug("double free of %p", ptr);
else
page_debug("attempted to free %p with references", ptr);
return;
}
} else {
atom_dec(&page->count);
if (atom_dec(&page->count)) {
page_debug("Double free of %p", page);
return;
}
struct mm_zone *zone = &mm_zones[pga_zone(page)];
latom_add(&zone->free_count, (1 << order));
struct mm_pool *pool = &zone->pools[order];
/* try to coalesce free buddy blocks until we've reached the highest order */
while (order < MM_MAX_ORDER) {
@ -405,30 +411,30 @@ void page_free(vm_page_t page)
* to avoid blocking other CPUs for longer than necessary */
vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
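/* e.g. at order 1, pfn 4: buddy = 4 ^ 2 = 6 and low = 4 & ~2 = 4 (illustrative numbers) */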
struct mm_pool *current_order_pool = &zone->pools[order];
struct mm_pool *next_order_pool = &zone->pools[order + 1];
disable_intr();
spin_lock(&zone->pools[order].lock);
spin_lock(&pool->lock);
if (can_merge(page, buddy)) {
/* remove buddy from the low order freelist */
clist_del(&buddy->link);
current_order_pool->free_entries--;
pool->free_entries--;
spin_unlock(&pool->lock);
pga_set_order(buddy, order + 1);
pga_set_order(page, order + 1);
clist_add(&next_order_pool->freelist, &low->link);
next_order_pool->free_entries++;
} else {
order = MM_MAX_ORDER; /* break out of the loop */
spin_unlock(&pool->lock);
intr_restore(cpuflags);
break;
}
spin_unlock(&zone->pools[order].lock);
intr_restore(cpuflags);
page = low;
pfn = pg2pfn(page);
order++;
pool++;
}
/* finally, we need to insert the page into its freelist */
struct mm_pool *pool = &zone->pools[order];
disable_intr();
spin_lock(&pool->lock);
clist_add(&pool->freelist, &page->link);

@ -1,9 +1,14 @@
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
/*
* slabbing slabs onto the slab for slabs slab slab slahsdf ashklfghdsla
*/
#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
@ -15,9 +20,7 @@
#include <gay/types.h>
#include <gay/vm/page.h>
/*
* XXX this implementation is still missing object caches
*/
#include <strings.h>
#if CFG_POISON_SLABS
struct slab_poison {
@ -29,8 +32,8 @@ struct slab_poison {
u_long high_poison[1];
};
static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, void *alloc_source);
static void poison_after_free(struct slab_poison *poison);
static void poison_on_alloc(struct slab_poison *poison, u_long exact_size, void *alloc_source);
static void poison_on_free(struct slab_poison *poison);
#endif
#if CFG_DEBUG_SLAB_ALLOCS
@ -49,126 +52,323 @@ static void poison_after_free(struct slab_poison *poison);
# define slab_debug_noisy(msg, ...) ({})
#endif
struct slab_pool {
const u_int entry_size; /**< @brief Size of one entry in bytes */
const u_int entries_per_slab; /**< @brief Max number of entries per slab */
atom_t total_used; /**< @brief Total allocated entries */
const u_int page_order; /**< @brief Order passed to `get_pages()` */
struct clist empty_list; /* -> struct vm_page::link */
struct clist partial_list; /* -> struct vm_page::link */
struct clist full_list; /* -> struct vm_page::link */
spin_t empty_lock; /**< @brief Lock for `empty_list` */
spin_t partial_lock; /**< @brief Lock for `partial_list` */
spin_t full_lock; /**< @brief Lock for `full_list` */
atom_t empty_count; /**< @brief Number of empty slabs */
atom_t partial_count; /**< @brief Number of partially empty slabs */
atom_t full_count; /**< @brief Number of full slabs */
/**
* @brief Single node in the object cache system.
* Each node owns a page that its objects are carved out of.
*/
struct kmem_cache_node {
struct clist link; /* -> struct kmem_cache_pool::list */
void **freelist; /**< @brief Stack of free objects */
struct kmem_cache *cache; /**< @brief Object cache this node belongs to */
spin_t lock; /**< @brief Lock for `freelist` */
u_int free_count;
vm_page_t page; /**< @brief Physical page this node manages */
};
/*
* Fun size calculations because the slab header takes up some overhead at the
* beginning of each page. We should ideally try to cram all the info we need
* into struct vm_page, because the individual slab entry sizes could be even
* powers of two and perfectly aligned then.
struct kmem_cache_pool {
struct clist list; /* -> struct kmem_cache_node::link */
spin_t lock;
atom_t count;
};
/**
* @brief Cache for one particular object type.
* Each cache keeps its nodes in three pools (empty, partial, full); every node holds the same number of slabs.
*/
struct kmem_cache {
u_int object_size; /**< @brief Object size in bytes */
u_int page_order; /**< @brief Order passed to `get_pages()` */
enum slab_flags flags; /**< @brief Flags for how to allocate */
u_int slabs_per_node; /**< @brief Max number of slabs per cache node */
latom_t total_used; /**< @brief Total allocated entries */
const char *name; /**< @brief Unique name for this object type */
void (*ctor)(void *ptr, kmem_cache_t cache);
void (*dtor)(void *ptr, kmem_cache_t cache);
struct kmem_cache_pool empty;
struct kmem_cache_pool partial;
struct kmem_cache_pool full;
struct clist link; /**< @brief List of all kmem caches */
};
/* values for struct kmem_cache::flags */
/** @brief Zone to request pages from (using `page_alloc()`) */
#define SLAB_ZONE(flags) ((flags) & 3)
/** @brief List of all currently registered `struct kmem_cache`s. */
static CLIST(kmem_cache_list);
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define POOL_ENTRIES_PER_TABLE(sz) _MIN1(PAGE_SIZE / (sz))
#define SLABS_PER_NODE(sz) _MIN1(PAGE_SIZE / (sz))
#define POOL_DEFINE(sz) { \
.entry_size = (sz), \
.entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
.total_used = ATOM_DEFINE(0), \
#define CACHE_DEFINE(sz, _name, _flags) { \
.object_size = (sz), \
.page_order = ((sz) - 1) / PAGE_SIZE, \
.empty_lock = SPIN_DEFINE, \
.partial_lock = SPIN_DEFINE, \
.full_lock = SPIN_DEFINE, \
.empty_count = ATOM_DEFINE(0), \
.partial_count = ATOM_DEFINE(0), \
.full_count = ATOM_DEFINE(0), \
.flags = (_flags), \
.slabs_per_node = SLABS_PER_NODE(sz), \
.total_used = ATOM_DEFINE(0), \
.name = (_name), \
}
static struct slab_pool slab_pools_normal[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
POOL_DEFINE(2048),
POOL_DEFINE(4096),
POOL_DEFINE(8192),
POOL_DEFINE(16384),
POOL_DEFINE(32768),
static struct kmem_cache kmem_caches[] = {
CACHE_DEFINE(32, "kmem_32", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(64, "kmem_64", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(128, "kmem_128", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(256, "kmem_256", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(512, "kmem_512", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_1024", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(2048, "kmem_2048", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(4096, "kmem_4096", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(8192, "kmem_8192", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(16384, "kmem_16384", _M_ZONE_NORMAL | SLAB_POISON),
CACHE_DEFINE(32768, "kmem_32768", _M_ZONE_NORMAL | SLAB_POISON),
{ /* terminator */ }
};
static struct slab_pool slab_pools_dma[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
static struct kmem_cache kmem_dma_caches[] = {
CACHE_DEFINE(32, "kmem_dma_32", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(64, "kmem_dma_64", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(128, "kmem_dma_128", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(256, "kmem_dma_256", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(512, "kmem_dma_512", _M_ZONE_DMA | SLAB_POISON),
CACHE_DEFINE(1024, "kmem_dma_1024", _M_ZONE_DMA | SLAB_POISON),
{ /* terminator */ }
};
/**
* This is a little fucked.
*
* So, every `vm_page_t` in use by the slab allocator gets a corresponding
* `struct kmem_cache_node` that keeps track of everything we need to know to
* make allocations. However, the memory for those structs themselves doesn't
* magically grow on trees. In other words, we need to allocate memory in
* order to be able to allocate memory.
*
* So what we have here is a separate object cache for `struct kmem_cache_node`
* that works slightly differently than all the other ones: Instead of making
* an extra allocation for the cache node, that node sits at the beginning of
* the page that we allocate from itself. Other caches don't do this because
* it destroys the perfect page alignment of the allocated area itself, but that
* doesn't matter here.
*/
static struct kmem_cache kmem_cache_node_caches =
CACHE_DEFINE(sizeof(struct kmem_cache_node), "kmem_cache_node", _M_ZONE_NORMAL | SLAB_DMAP);
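/*
 * Rough layout sketch of a page owned by this bootstrap cache: the first
 * object slot doubles as the managing node itself, which is why
 * kmalloc_init() decrements its slabs_per_node by one.
 *
 *   +---------------------------+----------+----------+-----+----------+
 *   | struct kmem_cache_node    | object 1 | object 2 | ... | object N |
 *   | (manages this very page)  |          |          |     |          |
 *   +---------------------------+----------+----------+-----+----------+
 */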
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
[_M_ZONE_DMA] = slab_pools_dma,
[_M_ZONE_NORMAL] = slab_pools_normal,
static struct kmem_cache *kmem_cache_zones[MM_NR_ZONES] = {
[_M_ZONE_DMA] = kmem_dma_caches,
[_M_ZONE_NORMAL] = kmem_caches,
};
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags);
static void cache_pool_init(struct kmem_cache_pool *pool)
{
clist_init(&pool->list);
atom_init(&pool->count, 0);
spin_init(&pool->lock);
}
void kmalloc_init(void)
{
for (int i = 0; i < MM_NR_ZONES; i++) {
struct slab_pool *pool = slab_zone_pools[i];
cache_pool_init(&kmem_cache_node_caches.empty);
cache_pool_init(&kmem_cache_node_caches.partial);
cache_pool_init(&kmem_cache_node_caches.full);
/* for the management node at the beginning of the page */
kmem_cache_node_caches.slabs_per_node--;
clist_add(&kmem_cache_list, &kmem_cache_node_caches.link);
while (pool->entry_size != 0) {
clist_init(&pool->empty_list);
clist_init(&pool->partial_list);
clist_init(&pool->full_list);
pool++;
for (int i = 0; i < MM_NR_ZONES; i++) {
struct kmem_cache *cache = kmem_cache_zones[i];
while (cache->object_size != 0) {
clist_init(&cache->empty.list);
clist_init(&cache->partial.list);
clist_init(&cache->full.list);
clist_add(&kmem_cache_list, &cache->link);
cache++;
}
}
}
void *kmalloc(usize size, enum mflags flags)
kmem_cache_t kmem_cache_register(const char *name, u_int obj_size, enum slab_flags flags,
void (*ctor)(void *ptr, kmem_cache_t cache),
void (*dtor)(void *ptr, kmem_cache_t cache))
{
if (size == 0)
obj_size = align_ceil(obj_size, sizeof(long));
/* we only support objects up to PAGE_SIZE for now */
if (obj_size > PAGE_SIZE || obj_size == 0)
return nil;
#if CFG_POISON_SLABS
size += sizeof(struct slab_poison);
#endif
struct kmem_cache *cache = kmalloc(sizeof(*cache), M_KERN);
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmalloc() called from irq without M_NOWAIT "
"(caller: %p)\n", ktrace_return_addr());
flags |= _M_NOWAIT;
if (cache) {
cache->name = name;
cache->object_size = obj_size;
cache->flags = flags;
cache->ctor = ctor;
cache->dtor = dtor;
cache_pool_init(&cache->empty);
cache_pool_init(&cache->partial);
cache_pool_init(&cache->full);
/* XXX this is pretty wasteful for larger obj_sizes */
cache->slabs_per_node = PAGE_SIZE / obj_size;
cache->page_order = 0;
clist_add(&kmem_cache_list, &cache->link);
}
return cache;
}
static inline void **freelist_init(vm_page_t page, struct kmem_cache *cache)
{
void *prev = nil;
void *start = __v(pg2paddr(page));
void *end = start + align_floor(1 << (cache->page_order + PAGE_SHIFT), cache->object_size);
void *pos = end;
do {
pos -= cache->object_size;
if (cache->ctor)
cache->ctor(pos, cache);
*(void **)pos = prev;
prev = pos;
} while (pos >= start + cache->object_size);
return (void **)pos;
}
/** Attempt to remove the first cache node from a pool's list and return it */
/* call with interrupts disabled */
static inline struct kmem_cache_node *pool_del_first_node(struct kmem_cache_pool *pool)
{
struct kmem_cache_node *node = nil;
spin_lock(&pool->lock);
if (!clist_is_empty(&pool->list)) {
atom_dec(&pool->count);
node = clist_del_first_entry(&pool->list, typeof(*node), link);
}
spin_unlock(&pool->lock);
return node;
}
/* call with interrupts disabled */
static inline void pool_del_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
atom_dec(&pool->count);
spin_lock(&pool->lock);
clist_del(&node->link);
spin_unlock(&pool->lock);
}
/* call with interrupts disabled */
static inline void pool_add_node(struct kmem_cache_pool *pool, struct kmem_cache_node *node)
{
spin_lock(&pool->lock);
clist_add(&pool->list, &node->link);
spin_unlock(&pool->lock);
atom_inc(&pool->count);
}
/* call with interrupts disabled */
static inline void *pop_freelist_and_insert(struct kmem_cache *cache, struct kmem_cache_node *node)
{
spin_lock(&node->lock);
void *ret = node->freelist;
node->freelist = *node->freelist;
u_int free_count = --node->free_count;
spin_unlock(&node->lock);
latom_inc(&cache->total_used);
if (free_count == 0)
pool_add_node(&cache->full, node);
else
pool_add_node(&cache->partial, node);
return ret;
}
/* call with interrupts disabled */
static struct kmem_cache_node *node_alloc(void)
{
/*
* This is really the same basic procedure as kmem_cache_alloc(),
* except that we allocate everything manually if we run out of cache nodes
* and interrupts are disabled.
* It definitely needs a cleanup at some point; most of the stuff here
* can probably be eliminated if kmem_cache_alloc() is split up.
*/
struct kmem_cache_node *mgmt_node = pool_del_first_node(&kmem_cache_node_caches.partial);
if (!mgmt_node) {
mgmt_node = pool_del_first_node(&kmem_cache_node_caches.empty);
if (!mgmt_node) {
vm_page_t page = page_alloc(0, M_ATOMIC);
if (!page)
return nil;
void **freelist = freelist_init(page, &kmem_cache_node_caches);
mgmt_node = (struct kmem_cache_node *)freelist;
mgmt_node->freelist = *freelist;
mgmt_node = __v(pg2paddr(page));
spin_init(&mgmt_node->lock);
mgmt_node->free_count = kmem_cache_node_caches.slabs_per_node;
mgmt_node->cache = &kmem_cache_node_caches;
mgmt_node->page = page;
}
}
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
struct slab_pool *pool = slab_zone_pools[_M_ZONE_INDEX(flags)];
while (pool->entry_size != 0) {
if (pool->entry_size >= size)
break;
pool++;
struct kmem_cache_node *new_node = pop_freelist_and_insert(&kmem_cache_node_caches,
mgmt_node);
return new_node;
}
/* call with interrupts disabled */
static inline struct kmem_cache_node *node_create(struct kmem_cache *cache, enum mflags flags,
register_t cpuflags)
{
struct kmem_cache_node *node = node_alloc();
if (node) {
intr_restore(cpuflags);
vm_page_t page = page_alloc(cache->page_order, flags | M_ZERO);
if (page) {
pga_set_slab(page, true);
page->slab = node;
node->freelist = freelist_init(page, cache);
spin_init(&node->lock);
node->free_count = cache->slabs_per_node;
node->cache = cache;
node->page = page;
} else {
kfree(node);
node = nil;
}
intr_disable();
}
if (pool->entry_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), pool[-1].entry_size);
return nil;
return node;
}
void *kmem_cache_alloc(kmem_cache_t cache, enum mflags flags)
{
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmem_cache_alloc() called from irq %p w/o M_NOWAIT\n",
ktrace_return_addr());
flags |= _M_NOWAIT;
}
}
slab_debug_noisy("alloc %zu bytes from zone %d, pool size %u\n",
size, _M_ZONE_INDEX(flags), pool->entry_size);
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
slab_debug_noisy("alloc %zu bytes from zone %d, cache %s\n",
size, _M_ZONE_INDEX(flags), cache->name);
/*
* Before locking a slab, we always remove it from its pool.
* Before locking a node, we always remove it from its cache pool.
* This is far from optimal, because if multiple CPUs allocate from the
* same pool at the same time, we could end up creating several slabs
* with one used entry each (not to mention the overhead of the mostly
@ -178,62 +378,63 @@ void *kmalloc(usize size, enum mflags flags)
* it can't possibly be used for allocations anymore.
* This is probably not worth the overhead, though.
*/
vm_page_t page = INVALID_PAGE;
struct kmem_cache_node *node = nil;
/* try to use a slab that is already partially used first */
register_t cpuflags = intr_disable();
spin_lock(&pool->partial_lock);
if (!clist_is_empty(&pool->partial_list)) {
atom_dec(&pool->partial_count);
page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
}
spin_unlock(&pool->partial_lock);
if (!page) {
/* no partially used slab available, see if we have a completely free one */
spin_lock(&pool->empty_lock);
if (!clist_is_empty(&pool->empty_list)) {
atom_dec(&pool->empty_count);
page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
}
spin_unlock(&pool->empty_lock);
if (!page) {
/* we're completely out of usable slabs, allocate a new one */
intr_restore(cpuflags);
page = slab_create(pool, flags);
if (!page) {
node = pool_del_first_node(&cache->partial);
if (!node) {
/* no partially used node available, see if we have a completely free one */
node = pool_del_first_node(&cache->empty);
if (!node) {
/* we're completely out of usable nodes, allocate a new one */
node = node_create(cache, flags, cpuflags);
if (!node) {
slab_debug("kernel OOM\n");
return nil;
}
intr_disable();
}
}
/* if we've made it to here, we have a slab and interrupts are disabled */
page_lock(page);
void *ret = page->slab.freelist;
SLAB(page)->freelist = *SLAB(page)->freelist;
if (--page->slab.free_count == 0) {
spin_lock(&pool->full_lock);
clist_add(&pool->full_list, &page->link);
spin_unlock(&pool->full_lock);
atom_inc(&pool->full_count);
} else {
spin_lock(&pool->partial_lock);
clist_add(&pool->partial_list, &page->link);
spin_unlock(&pool->partial_lock);
atom_inc(&pool->partial_count);
}
page_unlock(page);
/* if we've made it to here, we have a cache node and interrupts are disabled */
void *ret = pop_freelist_and_insert(cache, node);
intr_restore(cpuflags);
atom_inc(&pool->total_used);
return ret;
}
void *kmalloc(usize size, enum mflags flags)
{
if (size == 0)
return nil;
#if CFG_POISON_SLABS
struct slab_poison *poison = ret;
poison_after_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
size += sizeof(struct slab_poison);
#endif
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
struct kmem_cache *cache = kmem_cache_zones[_M_ZONE_INDEX(flags)];
while (cache->object_size != 0) {
if (cache->object_size >= size)
break;
cache++;
}
if (cache->object_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), cache[-1].object_size);
return nil;
}
void *ret = kmem_cache_alloc(cache, flags);
#if CFG_POISON_SLABS
if (ret) {
struct slab_poison *poison = ret;
poison_on_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
}
#endif
return ret;
}
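/*
 * Worked example (numbers are illustrative): with CFG_POISON_SLABS enabled,
 * kmalloc(100, M_KERN) first grows the request to 100 + sizeof(struct
 * slab_poison), then walks kmem_caches[] until object_size >= size, so the
 * allocation is served by the smallest cache that fits (kmem_128 or kmem_256,
 * depending on the poison header size).
 */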
@ -247,64 +448,39 @@ void kfree(void *ptr)
vm_page_t page = vaddr2pg(ptr);
SLAB_ASSERT(pga_slab(page));
struct slab_pool *pool = SLAB(page)->pool;
struct kmem_cache_node *node = page_slab(page);
struct kmem_cache *cache = node->cache;
#if CFG_POISON_SLABS
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_after_free(poison);
ptr = poison;
if (cache->flags & SLAB_POISON) {
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_on_free(poison);
ptr = poison;
}
#endif
register_t cpuflags = intr_disable();
page_lock(page);
*(void **)ptr = SLAB(page)->freelist;
spin_lock(&node->lock);
*(void **)ptr = node->freelist;
SLAB(page)->freelist = (void **)ptr;
if (++SLAB(page)->free_count == pool->entries_per_slab) {
spin_lock(&pool->partial_lock);
clist_del(&page->link);
spin_unlock(&pool->partial_lock);
atom_dec(&pool->partial_count);
spin_lock(&pool->empty_lock);
clist_add(&pool->empty_list, &page->link);
spin_unlock(&pool->empty_lock);
atom_inc(&pool->empty_count);
}
page_unlock(page);
atom_dec(&pool->total_used);
intr_restore(cpuflags);
}
u_int free_count = ++node->free_count;
spin_unlock(&node->lock);
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags)
{
slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
vm_page_t page = page_alloc(pool->page_order, flags);
if (page) {
pga_set_slab(page, true);
SLAB(page)->pool = pool;
SLAB(page)->free_count = pool->entries_per_slab;
void *prev = nil;
/* XXX this should not rely on a direct map */
void *start = pfn2vaddr(pg2pfn(page));
void *end = start + (1 << (pool->page_order + PAGE_SHIFT));
void *pos = end;
do {
pos -= pool->entry_size;
*(void **)pos = prev;
prev = pos;
} while (pos > start);
SLAB(page)->freelist = pos;
if (free_count == cache->slabs_per_node) {
pool_del_node(&cache->partial, node);
pool_add_node(&cache->empty, node);
}
return page;
latom_dec(&cache->total_used);
intr_restore(cpuflags);
}
#if CFG_POISON_SLABS
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
void *alloc_source)
static inline void poison_on_alloc(struct slab_poison *poison, u_long exact_size,
void *alloc_source)
{
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long *poison_start = &poison->low_poison;
/*
@ -331,9 +507,9 @@ static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_si
*pos = SLAB_POISON_ALLOC;
}
static inline void poison_after_free(struct slab_poison *poison)
static inline void poison_on_free(struct slab_poison *poison)
{
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
if (poison->low_poison != SLAB_POISON_ALLOC) {
kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
