mm: rewrite slab allocator

This is the final part of the major mm subsystem
refactor (for now).  The new and improved slab
allocator can do *proper* poisoning, with pretty
accurate out-of-bounds and use-after-free
detection.
vm_page_t has also been restructured; its flags
and order are now combined into one atomic field.
Branch: main
Author: anna (2 years ago)
Parent: 36d53093d4
Commit: 2e32e299d2
Signed by: fef (GPG Key ID: EC22E476DC2D3D84)

@ -27,3 +27,11 @@ void ktrace_print_from(void *frame)
rbp = *rbp;
}
}
__naked void *ktrace_return_addr(void)
{
__asm__ volatile(
" movq 8(%rbp), %rax \n"
" ret \n"
);
}

@ -28,3 +28,11 @@ void ktrace_print_from(void *frame)
rbp = *rbp;
}
}
__naked void *ktrace_return_addr(void)
{
__asm__ volatile(
" movl 4(%ebp), %eax \n"
" ret \n"
);
}
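Since both variants are `__naked`, the compiler emits no prologue, so `%rbp`/`%ebp` still points to the caller's stack frame and the single load fetches the caller's own return address. A minimal usage sketch, simply mirroring what the slab code later in this commit does with the value:

	void *caller = ktrace_return_addr();
	/* e.g. record who requested the allocation, for later poison reports */
	slab_debug("alloc requested by %p\n", caller);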

@ -28,14 +28,14 @@
/** @brief Poison dynamic pages when allocating and freeing them */
#cmakedefine01 CFG_POISON_PAGES
/** @brief Poison heap areas after `kmalloc()` and `kfree()` */
#cmakedefine01 CFG_POISON_HEAP
/** @brief Poison slab allocations (`kmalloc()` and friends) */
#cmakedefine01 CFG_POISON_SLABS
/** @brief Denominator for the fraction of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@
/** @brief Absolute maximum number of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_THRESH@
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_MAX@
/** @brief Enable Symmetric Multiprocessing */
#cmakedefine01 CFG_SMP

@ -9,3 +9,6 @@ void ktrace_print(void);
/** @brief Print a full stack trace to the kernel log, starting from `frame`. */
__asmlink
void ktrace_print_from(void *frame);
/** @brief Get the address the current function call will return to. */
void *ktrace_return_addr(void);

@ -27,3 +27,15 @@ extern void _kernel_start;
extern void _kernel_end;
#define kern_end (&_kernel_end)
extern void _text_start;
#define text_start (&_text_start)
extern void _text_end;
#define text_end (&_text_end)
extern void _isr_start;
#define isr_start (&_isr_start)
extern void _isr_end;
#define isr_end (&_isr_end)

@ -4,6 +4,8 @@
#include <arch/poison.h>
#include <limits.h>
/*
* If possible, arch should define this value as an unmappable base address.
* For example, on the amd64, this is set to 0xdead000000000000 because the
@ -26,5 +28,18 @@
#define PAGE_POISON_ALLOC (POISON_BASE + 0x00000010ul)
#define PAGE_POISON_FREE (POISON_BASE + 0x00000020ul)
#if LONG_BIT == 32
#define SLAB_POISON_ALLOC 0x61616160ul
#define SLAB_POISON_FREE 0x41414140ul
#elif LONG_BIT == 64
#define SLAB_POISON_ALLOC 0x6161616161616161ul
#define SLAB_POISON_FREE 0x4141414141414141ul
#elif LONG_BIT == 128
#define SLAB_POISON_ALLOC 0x61616161616161616161616161616160ul
#define SLAB_POISON_FREE 0x41414141414141414141414141414140ul
#else
#error "Unsupported long size"
#endif
#define CLIST_POISON_PREV (POISON_BASE + 0x000000c4ul)
#define CLIST_POISON_NEXT (POISON_BASE + 0x000000c8ul)

@ -2,6 +2,7 @@
#pragma once
#include <arch/atom.h>
#include <arch/page.h>
#include <gay/cdefs.h>
@ -22,17 +23,43 @@
* a significant difference in memory management overhead.
*/
union vm_page_attr {
int _val;
struct {
unsigned order:8; /**< @brief Index into `mm_zones[zone].pools` */
bool lock:1; /**< @brief Page is locked */
bool rsvd:1; /**< @brief Page is reserved and must never be touched */
bool pcpu:1; /**< @brief Page is in a per-cpu cache */
bool slab:1; /**< @brief Page is used by the slab allocator */
unsigned zone:2; /**< @brief Index into `mm_zones` */
};
};
#define _PGA_ORDER_SHIFT 0
#define _PGA_ORDER_MASK (0xff << _PGA_ORDER_SHIFT)
#define _PGA_LOCK_SHIFT 8
#define _PGA_LOCK_MASK (1 << _PGA_LOCK_SHIFT)
#define _PGA_RSVD_SHIFT 9
#define _PGA_RSVD_MASK (1 << _PGA_RSVD_SHIFT)
#define _PGA_PCPU_SHIFT 10
#define _PGA_PCPU_MASK (1 << _PGA_PCPU_SHIFT)
#define _PGA_SLAB_SHIFT 11
#define _PGA_SLAB_MASK (1 << _PGA_SLAB_SHIFT)
#define _PGA_ZONE_SHIFT 12
#define _PGA_ZONE_MASK (3 << _PGA_ZONE_SHIFT)
typedef union vm_page_attr vm_page_attr_t;
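The `_PGA_*` shift/mask constants must describe the same layout as the bitfield in the union above, which only holds if the compiler allocates bitfields starting at the least significant bit (as the i386/amd64 SysV ABIs do). A small sanity-check sketch, purely illustrative and not part of this commit:

	static inline void pga_layout_selfcheck(void)
	{
		union vm_page_attr attr = { ._val = 0 };
		attr.slab = true;
		attr.zone = 2;
		/* fails if the ABI packs the bitfield differently than the masks assume */
		KASSERT(attr._val == (_PGA_SLAB_MASK | (2 << _PGA_ZONE_SHIFT)));
	}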
/**
* @brief Stores information about a single page in physical memory.
* There is exactly one of these for every physical page, no matter what that
* page is used for or whether it is usable at all.
*/
struct vm_page {
/** @brief Reference count (0 = unused) */
/** @brief Reference count (0 = unused, < 0 = locked) */
atom_t count;
unsigned order:8;
/** @brief Various flags describing how and for what the page is used, see below */
unsigned flags:24;
/** @brief Page attributes, use the macros below to access this */
atom_t attr;
/** @brief If the page is free, this is its freelist. */
struct clist link;
/**
* @brief Optional extra data pointer, reserved for private use.
@ -45,17 +72,6 @@ struct vm_page {
typedef struct vm_page *vm_page_t;
/* values for struct vm_page::flags */
/** @brief Page must never be accessed */
#define PG_RESERVED (1 << 0)
/** @brief Page is in a per-cpu cache */
#define PG_PCPU (1 << 1)
/** @brief Page is used by the slab allocator */
#define PG_SLAB (1 << 2)
/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */
#define PG_DMA (1u << 3)
/** @brief Array of every single page in physical memory, indexed by page frame number. */
extern struct vm_page *const vm_page_array;
@ -66,16 +82,130 @@ extern vm_page_t _vm_page_array_end;
#define PGADDR_ASSERT(x) ({})
#endif
static inline bool page_get(vm_page_t page)
static inline u8 pga_order(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.order;
}
static inline bool pga_rsvd(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.rsvd;
}
static inline bool pga_pcpu(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.pcpu;
}
static inline bool pga_slab(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.slab;
}
static inline enum mm_zone_type pga_zone(vm_page_t page)
{
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
return attr.zone;
}
static inline u8 pga_set_order(vm_page_t page, u8 order)
{
spin_loop {
union vm_page_attr old = { ._val = atom_read(&page->attr) };
union vm_page_attr new = old;
new.order = order;
if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val)
return old.order;
}
}
static inline bool pga_set_pcpu(vm_page_t page, bool pcpu)
{
if (pcpu)
return atom_set_bit(&page->attr, _PGA_PCPU_SHIFT);
else
return atom_clr_bit(&page->attr, _PGA_PCPU_SHIFT);
}
static inline bool pga_set_slab(vm_page_t page, bool slab)
{
if (slab)
return atom_set_bit(&page->attr, _PGA_SLAB_SHIFT);
else
return atom_clr_bit(&page->attr, _PGA_SLAB_SHIFT);
}
static inline enum mm_zone_type pga_set_zone(vm_page_t page, enum mm_zone_type zone)
{
spin_loop {
union vm_page_attr old = { ._val = atom_read(&page->attr) };
union vm_page_attr new = old;
new.zone = zone;
if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val)
return old.zone;
}
}
static __always_inline bool page_get(vm_page_t page)
{
return atom_inc(&page->count);
}
static inline bool page_put(vm_page_t page)
static __always_inline bool page_put(vm_page_t page)
{
return atom_dec(&page->count);
}
/* XXX we should probably use a wait queue for these rather than a spinlock-like thing */
static inline void page_lock(vm_page_t page)
{
spin_loop {
if (atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
break;
}
}
static __always_inline void page_unlock(vm_page_t page)
{
atom_clr_bit(&page->attr, _PGA_LOCK_SHIFT);
}
static __always_inline bool page_trylock(vm_page_t page)
{
return atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
}
static inline void __page_set_flag(vm_page_t page, unsigned flag)
{
atom_or(&page->attr, (int)flag);
}
static inline void __page_clr_flag(vm_page_t page, unsigned mask)
{
atom_and(&page->attr, (int)~mask);
}
static __always_inline void page_attr_load(vm_page_attr_t *attr, vm_page_t page)
{
attr->_val = atom_read(&page->attr);
}
static __always_inline void page_attr_copy(vm_page_attr_t *dest, const vm_page_attr_t *src)
{
dest->_val = src->_val;
}
static __always_inline bool page_attr_cmp_xchg(vm_page_t page, const vm_page_attr_t *cmp,
const vm_page_attr_t *val)
{
return atom_cmp_xchg(&page->attr, cmp->_val, val->_val) == cmp->_val;
}
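For illustration, these helpers compose into the same kind of lock-free read-modify-write loop that `pga_set_order()` uses, just over several fields at once. A sketch, assuming `page_attr_cmp_xchg()` reports success as a boolean (`mark_dma_slab()` is a hypothetical helper, not part of this commit):

	static inline void mark_dma_slab(vm_page_t page)
	{
		vm_page_attr_t old, new;
		spin_loop {
			page_attr_load(&old, page);
			page_attr_copy(&new, &old);
			new.slab = true;
			new.zone = MM_ZONE_DMA;
			if (page_attr_cmp_xchg(page, &old, &new))
				break;
		}
	}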
/** @brief Get the page frame number of a page. */
__pure2
static inline u_long pg2pfn(vm_page_t page)
@ -84,6 +214,11 @@ static inline u_long pg2pfn(vm_page_t page)
return page - vm_page_array;
}
/**
* @brief Get the page that a virtual address points to.
* The address must point to the DMAP region (i.e. an address that is returned
* by either `get_pages()` and friends, or `kmalloc()` and friends).
*/
__pure2
static inline vm_page_t vaddr2pg(void *vaddr)
{
@ -92,6 +227,11 @@ static inline vm_page_t vaddr2pg(void *vaddr)
return &vm_page_array[offset >> PAGE_SHIFT];
}
/**
* @brief Get the page frame number for a virtual address.
* The address must point to the DMAP region (i.e. an address that is returned
* by either `get_pages()` and friends, or `kmalloc()` and friends).
*/
__pure2
static inline u_long vaddr2pfn(void *vaddr)
{
@ -100,6 +240,7 @@ static inline u_long vaddr2pfn(void *vaddr)
return pfn;
}
/** @brief Get the page frame number for a physical address. */
__pure2
static inline u_long paddr2pfn(vm_paddr_t paddr)
{
@ -107,6 +248,7 @@ static inline u_long paddr2pfn(vm_paddr_t paddr)
return paddr >> PAGE_SHIFT;
}
/** @brief Get the page that a physical address belongs to. */
__pure2
static inline vm_page_t paddr2pg(vm_paddr_t paddr)
{
@ -115,9 +257,29 @@ static inline vm_page_t paddr2pg(vm_paddr_t paddr)
return page;
}
/**
* @brief Translate a page frame number to its corresponding virtual address
* in the DMAP region.
*/
__pure2
static inline void *pfn2vaddr(u_long pfn)
{
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
return DMAP_START + (pfn << PAGE_SHIFT);
}
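These translations are only defined for DMAP addresses, i.e. anything handed out by `get_pages()` and friends. A rough round-trip sketch (error handling omitted):

	void *ptr = get_pages(0, M_KERN);	/* one page, mapped in the DMAP region */
	vm_page_t page = vaddr2pg(ptr);
	u_long pfn = pg2pfn(page);
	/* going back through the DMAP yields the original address */
	KASSERT(pfn2vaddr(pfn) == ptr);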
/*
* We have to be careful in this macro, because only the first page in the
* order group has the correct order set. So we can only read it once at
* the beginning of the loop, since the page pointer is being updated.
*/
/**
* @brief Iterate over every page in its order group.
*
* @param page The first `vm_page_t` in the group.
*/
#define vm_page_foreach_in_order(page) \
for (int __i = 1 << pga_order(page); \
__i > 0; \
__i = ({ ++(page); --__i; }))
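Note that the macro advances `page` itself, so after the loop it no longer points to the first page of the group. A small illustrative helper (hypothetical, not part of this commit) that makes this explicit:

	static inline u_int pages_in_order_group(vm_page_t page)
	{
		u_int n = 0;
		vm_page_foreach_in_order(page)
			n++;
		return n;	/* == 1 << pga_order() of the page passed in */
	}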

@ -1,6 +1,7 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <gay/irq.h>
#include <gay/mm.h>
#include <gay/sched.h>
/**
@ -18,6 +19,8 @@ int main(int argc, char *argv[])
{
int err;
kmalloc_init();
irq_init();
err = sched_init();

@ -66,7 +66,7 @@ static inline u_int paddr_find_order(vm_paddr_t addr)
}
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area)
{
vm_paddr_t start = area->start;
vm_paddr_t end = area->end;
@ -98,19 +98,18 @@ static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *are
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1 << order); i++) {
atom_init(&page[i].count, 0);
page[i].flags = 0;
page[i].order = 0;
atom_init(&page[i].attr, 0);
}
/*
* order
* ^
* | _________ < MM_MAX_ORDER
* | / |
* start | / \ < end order
* order > |/
* |--------------|----> pos
* start end
* | ._____._____. < MM_MAX_ORDER
* | .___| |
* start |._| |_.
* order > .| |. < end order
* |---------------------|----> pos
* start end
*/
pos += ORDER_SIZE(order);
page += (1 << order);
@ -194,7 +193,7 @@ void paging_init(vm_paddr_t phys_end)
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[i].count, 1);
vm_page_array[i].flags = PG_RESERVED;
atom_init(&vm_page_array[i].attr, _PGA_RSVD_MASK);
}
/*
@ -204,7 +203,7 @@ void paging_init(vm_paddr_t phys_end)
struct mm_zone *zone = &mm_zones[i];
struct _bmem_area *area, *tmp;
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
claim_bmem_pages(zone, area);
claim_bmem_area(zone, area);
}
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
@ -283,7 +282,7 @@ static void *__get_pages(u_int order, enum mflags flags)
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
struct mm_pool *pool = &zone->pools[page_order];
vm_page_t buddy = page + (1 << page_order);
buddy->order = page_order;
pga_set_order(buddy, page_order);
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
disable_intr();
@ -294,7 +293,7 @@ static void *__get_pages(u_int order, enum mflags flags)
intr_restore(cpuflags);
}
page->order = order;
pga_set_order(page, order);
void *vaddr = pfn2vaddr(pfn);
return vaddr;
@ -370,10 +369,10 @@ static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
bool merge = (atom_read(&buddy->count) == 0);
/* we know that `page` doesn't have PG_RESERVED set,
* because we check that flag before anything else */
const unsigned mask = PG_RESERVED | PG_DMA;
merge &= (page->flags & mask) == (buddy->flags & mask);
/* we know that `page' is not reserved, because we
* check that flag before we even attempt to coalesce */
const unsigned mask = _PGA_RSVD_MASK | _PGA_ZONE_MASK;
merge &= (atom_read(&page->attr) & mask) == (atom_read(&buddy->attr) & mask);
return merge;
}
@ -389,9 +388,9 @@ void free_pages(void *ptr)
register_t cpuflags = read_flags();
vm_page_t page = vaddr2pg(ptr);
panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);
panic_if(pga_rsvd(page), "tried to free reserved page %p", ptr);
u_int order = page->order;
u_int order = pga_order(page);
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
u_long pfn = vaddr2pfn(ptr);
@ -400,19 +399,17 @@ void free_pages(void *ptr)
#endif
int old_count = atom_sub(&page->count, 1);
if (old_count != 1) {
if (old_count == 0)
panic("double free of page %p", ptr);
else
panic("attempted to free page %p with references", ptr);
PAGE_DEBUG_BLOCK {
if (old_count != 1) {
if (old_count == 0)
page_debug("double free of page %p", ptr);
else
page_debug("attempted to free page %p with references", ptr);
return;
}
}
struct mm_zone *zone;
if (page->flags & PG_DMA)
zone = &mm_zones[MM_ZONE_DMA];
else
zone = &mm_zones[MM_ZONE_NORMAL];
struct mm_zone *zone = &mm_zones[pga_zone(page)];
latom_add(&zone->free_count, (1 << order));
/* try to coalesce free buddy blocks until we've reached the highest order */
@ -434,8 +431,8 @@ void free_pages(void *ptr)
if (can_merge(page, buddy)) {
clist_del(&buddy->link);
current_order_pool->free_entries--;
buddy->order = order + 1;
page->order = order + 1;
pga_set_order(buddy, order + 1);
pga_set_order(page, order + 1);
clist_add(&next_order_pool->freelist, &low->link);
next_order_pool->free_entries++;
} else {

@ -1,24 +1,48 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/ktrace.h>
#include <gay/mm.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/vm/page.h>
#include <string.h>
/*
* XXX this implementation is still missing object caches
*/
#if CFG_POISON_SLABS
struct slab_poison {
void *_pad; /**< @brief That's where the freelist pointer is stored */
void *alloc_source; /**< @brief Code address that made the alloc call */
u_long exact_size;
u_long low_poison;
u8 data[0];
u_long high_poison[1];
};
static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, void *alloc_source);
static void poison_after_free(struct slab_poison *poison);
#endif
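For reference, this is the layout of a poisoned entry as handed out by `kmalloc()` below; offsets assume LP64 and are illustrative only:

/*
 *   +0    _pad          overlaps the slab freelist pointer while the entry is free
 *   +8    alloc_source  return address of the kmalloc() caller
 *   +16   exact_size    requested allocation size in bytes
 *   +24   low_poison    guard word below the allocation
 *   +32   data[]        what kmalloc() actually returns (exact_size bytes, long-aligned)
 *   +32+n high_poison   guard word above the allocation
 *
 * poison_after_alloc() fills everything from low_poison through high_poison
 * (including data[]) with SLAB_POISON_ALLOC, and poison_after_free() refills
 * the same range with SLAB_POISON_FREE.  The guard words are checked on
 * kfree() to catch out-of-bounds writes, and the range is checked again on
 * the next allocation of the entry to catch use-after-free writes.
 */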
/**
* @brief This header sits at the beginning of each slab.
* The individual entries follow immediately after the struct itself.
*/
struct slab {
struct clist clink; /* -> pools[entry_size / SLAB_STEP - 1] (see below) */
/** @brief The individual clist nodes sit at the beginning of each free entry */
struct clist freelist;
struct clist link;
void **freelist;
struct slab_pool *pool;
/** @brief For `link` */
spin_t lock;
/**
* @brief Number of free entries.
* The slabs are sorted within their pool by this value, so that we
@ -30,181 +54,337 @@ struct slab {
* `PAGE_SIZE`), so this saves a couple of bytes on systems where `int`
* is smaller than `usize`.
*/
unsigned int free_entries;
/**
* @brief Size of a single slab entry in bytes.
* Sizes must always be an integral multiple of `sizeof(void *)` and
* at least `sizeof(struct clist)`, because that's the data structure
* used for tracking what entries are free (`freelist`).
*
* Like `free_entries`, this is intentionally not a `usize`.
*/
unsigned int entry_size;
/* here would come the individual entries */
u_int free_entries;
};
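The freelist is intrusive: while an entry is free, its first machine word holds the pointer to the next free entry, so a slab needs no bookkeeping memory beyond this header. The pop/push below is exactly what `kmalloc()`/`kfree()` do further down (locking omitted here):

	/* pop the first free entry */
	void *entry = slab->freelist;
	slab->freelist = *slab->freelist;

	/* push it back on free */
	*(void **)entry = slab->freelist;
	slab->freelist = (void **)entry;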
/** @brief All slabs currently have the same size of one full page. */
#define SLAB_SIZE PAGE_SIZE
/**
* @brief All slab entry sizes are an integral multiple of this.
* When allocating memory, the requested size gets rounded upwards.
*/
#define SLAB_STEP (sizeof(struct clist))
#define SLAB_OVERHEAD (sizeof(struct slab))
#define SLAB_MAX_ALLOC (SLAB_SIZE - SLAB_OVERHEAD)
/* slabs are always aligned ... */
#define SLAB_PTR_MASK (~(SLAB_SIZE - 1))
/* ... so we can do this */
#define GET_SLAB(ptr) ( (struct slab *)((uintptr_t)(ptr) & SLAB_PTR_MASK) )
#if CFG_DEBUG_SLAB_ALLOCS
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# define SLAB_DEBUG_BLOCK
# define SLAB_ASSERT KASSERT
# if CFG_DEBUG_SLAB_ALLOCS_NOISY
# define slab_debug_noisy(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# else
# define slab_debug_noisy(msg, ...) ({})
# endif
#else
# define SLAB_DEBUG_BLOCK if (0)
# define SLAB_ASSERT(x) ({})
# define slab_debug(msg, ...) ({})
# define slab_debug_noisy(msg, ...) ({})
#endif
/** @brief All slabs grouped by entry_size, indexed by `entry_size / SLAB_STEP - 1` */
struct clist pools[SLAB_MAX_ALLOC / SLAB_STEP];
struct slab_pool {
const u_int entry_size; /**< @brief Size of one entry in bytes */
const int entries_per_slab; /**< @brief Max number of entries per slab */
atom_t total_used; /**< @brief Total allocated entries */
const u_int page_order; /**< @brief Order passed to `get_pages()` */
struct clist empty_list; /* -> struct slab::link */
struct clist partial_list; /* -> struct slab::link */
struct clist full_list; /* -> struct slab::link */
spin_t empty_lock; /**< @brief Lock for `empty_list` */
spin_t partial_lock; /**< @brief Lock for `partial_list` */
spin_t full_lock; /**< @brief Lock for `full_list` */
atom_t empty_count; /**< @brief Number of empty slabs */
atom_t partial_count; /**< @brief Number of partially empty slabs */
atom_t full_count; /**< @brief Number of full slabs */
};
static void *slab_alloc(usize size, enum mflags flags);
static void slab_free(void *ptr);
/*
* Fun size calculations because the slab header takes up some overhead at the
* beginning of each page. We should ideally try to cram all the info we need
* into struct vm_page, because the individual slab entry sizes could then be
* exact powers of two and perfectly aligned.
*/
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define POOL_ENTRY_SIZE(sz) (( (sz) - ( SLAB_OVERHEAD / _MIN1(PAGE_SIZE / (sz)) ) ) & ~0xfu)
#define POOL_ENTRIES_PER_TABLE(sz) \
_MIN1((PAGE_SIZE - SLAB_OVERHEAD) / POOL_ENTRY_SIZE(sz))
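A worked example of what these macros evaluate to, assuming PAGE_SIZE == 4096 and a slab header of 40 bytes (both values are assumptions; the real ones depend on the architecture and struct layout):

/*
 *   POOL_ENTRY_SIZE(256)        = (256 - (40 / (4096 / 256))) & ~0xfu
 *                               = (256 - 2) & ~0xfu = 240
 *   POOL_ENTRIES_PER_TABLE(256) = (4096 - 40) / 240 = 16
 *
 * i.e. the nominal 256-byte pool hands out 240-byte entries so that the
 * slab header plus 16 entries still fit into a single page.
 */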
static struct slab *slab_create(unsigned int entry_size, enum mflags flags);
#define POOL_DEFINE(sz) { \
.entry_size = POOL_ENTRY_SIZE(sz), \
.entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
.total_used = ATOM_DEFINE(0), \
.page_order = ((sz) - 1) / PAGE_SIZE, \
.empty_lock = SPIN_DEFINE, \
.partial_lock = SPIN_DEFINE, \
.full_lock = SPIN_DEFINE, \
.empty_count = ATOM_DEFINE(0), \
.partial_count = ATOM_DEFINE(0), \
.full_count = ATOM_DEFINE(0), \
}
static inline int get_order(usize size)
static struct slab_pool slab_pools_normal[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
POOL_DEFINE(2048),
POOL_DEFINE(4096),
POOL_DEFINE(8192),
POOL_DEFINE(16384),
POOL_DEFINE(32768),
{ .entry_size = 0 } /* terminator */
};
static struct slab_pool slab_pools_dma[] = {
POOL_DEFINE(32),
POOL_DEFINE(64),
POOL_DEFINE(128),
POOL_DEFINE(256),
POOL_DEFINE(512),
POOL_DEFINE(1024),
{ .entry_size = 0 } /* terminator */
};
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
[_M_ZONE_NORMAL] = slab_pools_normal,
[_M_ZONE_DMA] = slab_pools_dma,
};
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags);
void kmalloc_init(void)
{
int order;
usize order_size = PAGE_SIZE;
for (int i = 0; i < MM_NR_ZONES; i++) {
struct slab_pool *pool = slab_zone_pools[i];
for (order = 0; order <= GET_PAGE_MAX_ORDER; order++) {
if (order_size >= size)
break;
order_size <<= 1;
while (pool->entry_size != 0) {
clist_init(&pool->empty_list);
clist_init(&pool->partial_list);
clist_init(&pool->full_list);
pool++;
}
}
return order;
}
void *kmalloc(usize size, enum mflags flags)
{
if (size > SLAB_MAX_ALLOC) {
if (flags & M_CONTIG) {
int order = get_order(size);
if (order > GET_PAGE_MAX_ORDER) {
slab_debug("Requested alloc size %zu too large for get_pages()\n",
size);
if (size == 0)
return nil;
#if CFG_POISON_SLABS
size += sizeof(struct slab_poison);
#endif
SLAB_DEBUG_BLOCK {
if (!(flags & _M_NOWAIT) && in_irq()) {
slab_debug("kmalloc() called from irq without M_NOWAIT "
"(caller: %p)\n", ktrace_return_addr());
flags |= _M_NOWAIT;
}
}
SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
struct slab_pool *pool = slab_zone_pools[_M_ZONE_INDEX(flags)];
while (pool->entry_size != 0) {
if (pool->entry_size >= size)
break;
pool++;
}
if (pool->entry_size == 0) {
slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
size, _M_ZONE_INDEX(flags), pool[-1].entry_size);
return nil;
}
slab_debug_noisy("alloc %zu bytes from zone %d, pool size %u\n",
size, _M_ZONE_INDEX(flags), pool->entry_size);
/*
* Before locking a slab, we always remove it from its pool.
* This is far from optimal, because if multiple CPUs allocate from the
* same pool at the same time, we could end up creating several slabs
* with one used entry each (not to mention the overhead of the mostly
* unnecessary list deletions/insertions). However, it allows me to be
* lazier when freeing unused slabs from a background thread since that
* thread knows for sure that once it has removed a slab from empty_list,
* it can't possibly be used for allocations anymore.
* This is probably not worth the overhead, though.
*/
struct slab *slab = nil;
/* try to use a slab that is already partially used first */
register_t cpuflags = intr_disable();
spin_lock(&pool->partial_lock);
if (!clist_is_empty(&pool->partial_list)) {
atom_dec(&pool->partial_count);
slab = clist_del_first_entry(&pool->partial_list, typeof(*slab), link);
}
spin_unlock(&pool->partial_lock);
if (slab == nil) {
/* no partially used slab available, see if we have a completely free one */
spin_lock(&pool->empty_lock);
if (!clist_is_empty(&pool->empty_list)) {
atom_dec(&pool->empty_count);
slab = clist_del_first_entry(&pool->empty_list, typeof(*slab), link);
}
spin_unlock(&pool->empty_lock);
if (slab == nil) {
/* we're completely out of usable slabs, allocate a new one */
intr_restore(cpuflags);
slab = slab_create(pool, flags);
if (slab == nil) {
slab_debug("kernel OOM\n");
return nil;
} else {
return get_pages(order, flags);
}
} else {
slab_debug("Refusing to allocate %zu bytes as slabs\n", size);
return nil;
intr_disable();
}
}
/* if we've made it to here, slab != nil and interrupts are disabled */
spin_lock(&slab->lock);
void *ret = slab->freelist;
slab->freelist = *slab->freelist;
if (--slab->free_entries == 0) {
spin_lock(&pool->full_lock);
clist_add(&pool->full_list, &slab->link);
spin_unlock(&pool->full_lock);
atom_inc(&pool->full_count);
} else {
return slab_alloc(size, flags);
spin_lock(&pool->partial_lock);
clist_add(&pool->partial_list, &slab->link);
spin_unlock(&pool->partial_lock);
atom_inc(&pool->partial_count);
}
spin_unlock(&slab->lock);
intr_restore(cpuflags);
atom_inc(&pool->total_used);
#if CFG_POISON_SLABS
struct slab_poison *poison = ret;
poison_after_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
ret = poison->data;
#endif
return ret;
}
void kfree(void *ptr)
{
kprintf("kfree() is not implemented yet lmao\n");
}
if (ptr == nil)
return;
void slab_init(void)
{
slab_debug("Initializing %zu cache pools (%zu~%zu bytes)\n",
ARRAY_SIZE(pools), SLAB_STEP, SLAB_MAX_ALLOC);
for (int i = 0; i < ARRAY_SIZE(pools); i++)
clist_init(&pools[i]);
SLAB_ASSERT(ptr >= DMAP_START && ptr < DMAP_END);
vm_page_t page = vaddr2pg(ptr);
SLAB_ASSERT(pga_slab(page));
struct slab *slab = page->extra;
struct slab_pool *pool = slab->pool;
#if CFG_POISON_SLABS
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
poison_after_free(poison);
ptr = poison;
#endif
register_t cpuflags = intr_disable();
spin_lock(&slab->lock);
*(void **)ptr = slab->freelist;
slab->freelist = (void **)ptr;
if (++slab->free_entries == pool->entries_per_slab) {
spin_lock(&pool->partial_lock);
clist_del(&slab->link);
spin_unlock(&pool->partial_lock);
atom_dec(&pool->partial_count);
spin_lock(&pool->empty_lock);
clist_add(&pool->empty_list, &slab->link);
spin_unlock(&pool->empty_lock);
atom_inc(&pool->empty_count);
}
spin_unlock(&slab->lock);
atom_dec(&pool->total_used);
intr_restore(cpuflags);
}
static inline void *slab_alloc(usize size, enum mflags flags)
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags)
{
size = align_ceil(size, SLAB_STEP);
if (size == 0 || size > SLAB_MAX_ALLOC)
return nil;
slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
struct slab *slab = get_zero_pages(pool->page_order, flags);
struct clist *pool = &pools[size / SLAB_STEP - 1];
struct slab *slab = nil;
struct slab *cursor;
clist_foreach_entry(pool, cursor, clink) {
if (cursor->free_entries > 0) {
slab = cursor;
break;
if (slab != nil) {
vm_page_t page = vaddr2pg(slab);
/* XXX it's probably sufficient to only do this for the lowest page */
vm_page_foreach_in_order(page) {
pga_set_slab(page, true);
page->extra = slab;
}
}
if (slab == nil) {
slab = slab_create(size, flags);
if (slab == nil)
return nil; /* OOM */
clist_add_first(pool, &slab->clink);
spin_init(&slab->lock);
slab->pool = pool;
slab->free_entries = pool->entries_per_slab;
void *prev = nil;
void *end = (void *)slab + (1 << (pool->page_order + PAGE_SHIFT));
void *pos = end;
do {
pos -= pool->entry_size;
*(void **)pos = prev;
prev = pos;
} while (pos >= (void *)&slab[1] + pool->entry_size);
slab->freelist = pos;
}
/* list must have at least one entry, otherwise
* we would have created a completely new slab */
struct clist *ret = slab->freelist.next;
clist_del(ret);
slab->free_entries--;
# if CFG_POISON_HEAP
memset(ret, 'a', size);
# endif
return (void *)ret;
return slab;
}
static inline void slab_free(void *ptr)
#if CFG_POISON_SLABS
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
void *alloc_source)
{
# if CFG_DEBUG_SLAB_ALLOCS
if (ptr < kheap_start || ptr >= kheap_end) {
kprintf("slab_free(%p): invalid ptr!\n", ptr);
return;
}
if ((uintptr_t)ptr % SLAB_STEP) {
kprintf("slab_free(%p): unaligned ptr!\n", ptr);
}
# endif
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
u_long *poison_start = &poison->low_poison;
struct slab *slab = GET_SLAB(ptr);
slab->free_entries++;
/* slabs are zeroed out when they are newly allocated */
if (poison->exact_size != 0) {
for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
if (*pos != SLAB_POISON_FREE) {
kprintf("Use-after-free in %p (alloc by %p)\n",
poison->data, poison->alloc_source);
break;
}
}
}
# if CFG_POISON_HEAP
memset(ptr, 'A', slab->entry_size);
# endif
/* update offset to the new size */
offset = align_ceil(exact_size, sizeof(long)) / sizeof(long);
if (slab->free_entries * slab->entry_size + slab->entry_size > SLAB_MAX_ALLOC) {
/* none of the entries are in use, free the slab */
slab_debug_noisy("Destroying empty cache of size %zu\n", slab->entry_size);
free_pages(slab);
} else {
clist_add(&slab->freelist, (struct clist *)ptr);
}
poison->alloc_source = alloc_source;
poison->exact_size = exact_size;
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_ALLOC;
}
static struct slab *slab_create(unsigned int entry_size, enum mflags flags)
static inline void poison_after_free(struct slab_poison *poison)
{
slab_debug_noisy("Creating new cache for size %zu\n", entry_size);
struct slab *slab = get_pages(SLAB_SIZE / PAGE_SIZE, flags);
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
if (slab != nil) {
clist_init(&slab->freelist);
slab->free_entries = 0;
slab->entry_size = entry_size;
void *startptr = (void *)slab + sizeof(*slab);
void *endptr = (void *)slab + SLAB_SIZE - entry_size;
for (void *pos = startptr; pos <= endptr; pos += entry_size) {
clist_add(&slab->freelist, (struct clist *)pos);
slab->free_entries++;
}
if (poison->low_poison != SLAB_POISON_ALLOC) {
kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
return slab;
if (poison->high_poison[offset] != SLAB_POISON_ALLOC) {
kprintf("High out-of-bounds write to %p (alloc by %p)\n",
poison->data, poison->alloc_source);
}
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
*pos = SLAB_POISON_FREE;
}
#endif
__weak void *malloc(usize size)
{
return kmalloc(size, M_KERN);
}
__weak void free(void *ptr)
{
kfree(ptr);
}
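To illustrate the new poisoning in action: with CFG_POISON_SLABS enabled, a write just past a long-aligned allocation should be reported when the buffer is freed (a sketch, not a test from this commit; writes into the alignment padding of a non-long-aligned size are not caught):

	char *buf = kmalloc(16, M_KERN);
	buf[16] = 'x';	/* one byte out of bounds */
	kfree(buf);	/* poison_after_free() should log a "High out-of-bounds write" */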
