mm: rewrite slab allocator
This is the final part of the major mm subsystem refactor (for now). The new and improved slab allocator can do *proper* poisoning, with pretty accurate out-of-bounds and use-after-free detection. vm_page_t has also been restructured; its flags and order are now combined into one atomic field.
parent 36d53093d4
commit 2e32e299d2
10 changed files with 576 additions and 188 deletions
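For readers unfamiliar with the two mechanisms the message refers to, here are two minimal userspace sketches. They are illustrations only, not code from this commit: the names, sizes and the use of plain C11/libc facilities (stdatomic.h, malloc) are hypothetical stand-ins for the kernel's own arch/atom.h, get_pages() and friends.

The first sketch shows the guard-word idea behind the slab poisoning: a word filled with a known pattern sits directly below and above the user data (compare struct slab_poison and SLAB_POISON_ALLOC/SLAB_POISON_FREE in the diff), so a damaged pattern at free time reveals an out-of-bounds write, and a missing "free" pattern at allocation time would reveal a use-after-free.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* patterns mirroring SLAB_POISON_ALLOC / SLAB_POISON_FREE from the diff */
#define POISON_ALLOC 0x6161616161616161ul
#define POISON_FREE  0x4141414141414141ul

/* hypothetical miniature of the slab_poison layout: one guard word below
 * and one above a fixed-size user area (no padding on typical ABIs) */
struct guarded {
        uint64_t low_poison;
        unsigned char data[32];
        uint64_t high_poison;
};

static struct guarded *guarded_alloc(void)
{
        struct guarded *g = malloc(sizeof(*g));
        if (g == NULL)
                return NULL;
        /* a real slab would also check for the FREE pattern here to catch
         * use-after-free by the previous owner of this memory */
        memset(g->data, 0, sizeof(g->data));
        g->low_poison = POISON_ALLOC;
        g->high_poison = POISON_ALLOC;
        return g;
}

static void guarded_free(struct guarded *g)
{
        /* a damaged guard word means the caller wrote outside data[] */
        if (g->low_poison != POISON_ALLOC)
                fprintf(stderr, "low out-of-bounds write to %p\n", (void *)g->data);
        if (g->high_poison != POISON_ALLOC)
                fprintf(stderr, "high out-of-bounds write to %p\n", (void *)g->data);
        memset(g, 0x41, sizeof(*g)); /* fill with the per-byte FREE pattern */
        free(g);
}

int main(void)
{
        struct guarded *g = guarded_alloc();
        g->data[32] = 0xff;  /* deliberate one-byte overflow into high_poison */
        guarded_free(g);     /* reports the high out-of-bounds write */
        return 0;
}

The second sketch shows the compare-and-swap loop used to update one field of the combined atomic attribute word without disturbing the other bits, roughly mirroring pga_set_order() below; the bit layout and field widths here are illustrative assumptions, not the kernel's exact constants.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative layout: order in bits 0-7, zone in bits 12-13 */
#define ORDER_SHIFT 0u
#define ORDER_MASK  (0xffu << ORDER_SHIFT)
#define ZONE_SHIFT  12u

/* atomically replace only the order bits and return the previous order */
static unsigned set_order(_Atomic uint32_t *attr, unsigned order)
{
        uint32_t old = atomic_load(attr);
        for (;;) {
                uint32_t new = (old & ~ORDER_MASK) | ((order << ORDER_SHIFT) & ORDER_MASK);
                /* on failure, old is refreshed with the current value and we retry */
                if (atomic_compare_exchange_weak(attr, &old, new))
                        return (old & ORDER_MASK) >> ORDER_SHIFT;
        }
}

int main(void)
{
        _Atomic uint32_t attr = 1u << ZONE_SHIFT; /* zone 1, order 0 */
        unsigned prev = set_order(&attr, 3);
        printf("previous order %u, attr now %#x\n", prev, (unsigned)atomic_load(&attr));
        return 0;
}

The kernel version additionally records the allocating caller in slab_poison::alloc_source via ktrace_return_addr(), which is what the first two hunks below add.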
@@ -27,3 +27,11 @@ void ktrace_print_from(void *frame)
                rbp = *rbp;
        }
}

__naked void *ktrace_return_addr(void)
{
        __asm__ volatile(
        " movq 8(%rbp), %rax \n"
        " ret \n"
        );
}
@@ -28,3 +28,11 @@ void ktrace_print_from(void *frame)
                rbp = *rbp;
        }
}

__naked void *ktrace_return_addr(void)
{
        __asm__ volatile(
        " movl 4(%ebp), %eax \n"
        " ret \n"
        );
}
@@ -28,14 +28,14 @@
/** @brief Poison dynamic pages when allocating and freeing them */
#cmakedefine01 CFG_POISON_PAGES

/** @brief Poison heap areas after `kmalloc()` and `kfree()` */
#cmakedefine01 CFG_POISON_HEAP
/** @brief Poison slab allocations (`kmalloc()` and friends) */
#cmakedefine01 CFG_POISON_SLABS

/** @brief Denominator for the fraction of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@

/** @brief Absolute maximum number of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_THRESH@
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_MAX@

/** @brief Enable Symmetric Multiprocessing */
#cmakedefine01 CFG_SMP
@@ -9,3 +9,6 @@ void ktrace_print(void);
/** @brief Print a full stack trace to the kernel log, starting from `frame`. */
__asmlink
void ktrace_print_from(void *frame);

/** @brief Get the address the current function call will return to. */
void *ktrace_return_addr(void);
@@ -27,3 +27,15 @@ extern void _kernel_start;

extern void _kernel_end;
#define kern_end (&_kernel_end)

extern void _text_start;
#define text_start (&_text_start)

extern void _text_end;
#define text_end (&_text_end)

extern void _isr_start;
#define isr_start (&_isr_start)

extern void _isr_end;
#define isr_end (&_isr_end)
@@ -4,6 +4,8 @@

#include <arch/poison.h>

#include <limits.h>

/*
 * If possible, arch should define this value as an unmappable base address.
 * For example, on the amd64, this is set to 0xdead000000000000 because the
@@ -26,5 +28,18 @@
#define PAGE_POISON_ALLOC (POISON_BASE + 0x00000010ul)
#define PAGE_POISON_FREE (POISON_BASE + 0x00000020ul)

#if LONG_BIT == 32
#define SLAB_POISON_ALLOC 0x61616160ul
#define SLAB_POISON_FREE 0x41414140ul
#elif LONG_BIT == 64
#define SLAB_POISON_ALLOC 0x6161616161616161ul
#define SLAB_POISON_FREE 0x4141414141414141ul
#elif LONG_BIT == 128
#define SLAB_POISON_ALLOC 0x61616161616161616161616161616160ul
#define SLAB_POISON_FREE 0x41414141414141414141414141414140ul
#else
#error "Unsupported long size"
#endif

#define CLIST_POISON_PREV (POISON_BASE + 0x000000c4ul)
#define CLIST_POISON_NEXT (POISON_BASE + 0x000000c8ul)
@@ -2,6 +2,7 @@

#pragma once

#include <arch/atom.h>
#include <arch/page.h>

#include <gay/cdefs.h>
@@ -22,17 +23,43 @@
 * a significant difference in memory management overhead.
 */

union vm_page_attr {
        int _val;
        struct {
                unsigned order:8; /**< @brief Index into `mm_zones[zone].pools` */
                bool lock:1; /**< @brief Page is locked */
                bool rsvd:1; /**< @brief Page is reserved and must never be touched */
                bool pcpu:1; /**< @brief Page is in a per-cpu cache */
                bool slab:1; /**< @brief Page is used by the slab allocator */
                unsigned zone:2; /**< @brief Index into `mm_zones` */
        };
};
#define _PGA_ORDER_SHIFT 0
#define _PGA_ORDER_MASK (0xf << _PGA_ORDER_SHIFT)
#define _PGA_LOCK_SHIFT 8
#define _PGA_LOCK_MASK (1 << _PGA_LOCK_SHIFT)
#define _PGA_RSVD_SHIFT 9
#define _PGA_RSVD_MASK (1 << _PGA_RSVD_SHIFT)
#define _PGA_PCPU_SHIFT 10
#define _PGA_PCPU_MASK (1 << _PGA_PCPU_SHIFT)
#define _PGA_SLAB_SHIFT 11
#define _PGA_SLAB_MASK (1 << _PGA_SLAB_SHIFT)
#define _PGA_ZONE_SHIFT 12
#define _PGA_ZONE_MASK (3 << _PGA_ZONE_SHIFT)

typedef union vm_page_attr vm_page_attr_t;

/**
 * @brief Stores information about a single page in physical memory.
 * There is exactly one of these for every physical page, no matter what that
 * page is used for or whether it is usable at all.
 */
struct vm_page {
        /** @brief Reference count (0 = unused) */
        /** @brief Reference count (0 = unused, < 0 = locked) */
        atom_t count;
        unsigned order:8;
        /** @brief Various flags describing how and for what the page is used, see below */
        unsigned flags:24;
        /** @brief Page attributes, use the macros below to access this */
        atom_t attr;
        /** @brief If the page is free, this is its freelist. */
        struct clist link;
        /**
         * @brief Optional extra data pointer, reserved for private use.
@@ -45,17 +72,6 @@ struct vm_page {

typedef struct vm_page *vm_page_t;

/* values for struct vm_page::flags */

/** @brief Page must never be accessed */
#define PG_RESERVED (1 << 0)
/** @brief Page is in a per-cpu cache */
#define PG_PCPU (1 << 1)
/** @brief Page is used by the slab allocator */
#define PG_SLAB (1 << 2)
/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */
#define PG_DMA (1u << 3)

/** @brief Array of every single page in physical memory, indexed by page frame number. */
extern struct vm_page *const vm_page_array;
@@ -66,16 +82,130 @@ extern vm_page_t _vm_page_array_end;
#define PGADDR_ASSERT(x) ({})
#endif

static inline bool page_get(vm_page_t page)
static inline u8 pga_order(vm_page_t page)
{
        union vm_page_attr attr = { ._val = atom_read(&page->attr) };
        return attr.order;
}

static inline bool pga_rsvd(vm_page_t page)
{
        union vm_page_attr attr = { ._val = atom_read(&page->attr) };
        return attr.rsvd;
}

static inline bool pga_pcpu(vm_page_t page)
{
        union vm_page_attr attr = { ._val = atom_read(&page->attr) };
        return attr.pcpu;
}

static inline bool pga_slab(vm_page_t page)
{
        union vm_page_attr attr = { ._val = atom_read(&page->attr) };
        return attr.slab;
}

static inline enum mm_zone_type pga_zone(vm_page_t page)
{
        union vm_page_attr attr = { ._val = atom_read(&page->attr) };
        return attr.zone;
}

static inline u8 pga_set_order(vm_page_t page, u8 order)
{
        spin_loop {
                union vm_page_attr old = { ._val = atom_read(&page->attr) };
                union vm_page_attr new = old;
                new.order = order;
                if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val)
                        return old.order;
        }
}

static inline bool pga_set_pcpu(vm_page_t page, bool pcpu)
{
        if (pcpu)
                return atom_set_bit(&page->attr, _PGA_PCPU_SHIFT);
        else
                return atom_clr_bit(&page->attr, _PGA_PCPU_SHIFT);
}

static inline bool pga_set_slab(vm_page_t page, bool slab)
{
        if (slab)
                return atom_set_bit(&page->attr, _PGA_SLAB_SHIFT);
        else
                return atom_clr_bit(&page->attr, _PGA_SLAB_SHIFT);
}

static inline enum mm_zone_type pga_set_zone(vm_page_t page, enum mm_zone_type zone)
{
        spin_loop {
                union vm_page_attr old = { ._val = atom_read(&page->attr) };
                union vm_page_attr new = old;
                new.zone = zone;
                if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val)
                        return old.zone;
        }
}

static __always_inline bool page_get(vm_page_t page)
{
        return atom_inc(&page->count);
}

static inline bool page_put(vm_page_t page)
static __always_inline bool page_put(vm_page_t page)
{
        return atom_dec(&page->count);
}

/* XXX we should probably use a wait queue for these rather than a spinlock like thing */

static inline void page_lock(vm_page_t page)
{
        spin_loop {
                if (atom_set_bit(&page->attr, _PGA_LOCK_SHIFT))
                        break;
        }
}

static __always_inline void page_unlock(vm_page_t page)
{
        atom_clr_bit(&page->attr, _PGA_LOCK_SHIFT);
}

static __always_inline bool page_trylock(vm_page_t page)
{
        return atom_set_bit(&page->attr, _PGA_LOCK_SHIFT);
}

static inline void __page_set_flag(vm_page_t page, unsigned flag)
{
        atom_or(&page->attr, (int)flag);
}

static inline void __page_clr_flag(vm_page_t page, unsigned mask)
{
        atom_and(&page->attr, (int)~mask);
}

static __always_inline void page_attr_load(vm_page_attr_t *attr, vm_page_t page)
{
        attr->_val = atom_read(&page->attr);
}

static __always_inline void page_attr_copy(vm_page_attr_t *dest, const vm_page_attr_t *src)
{
        dest->_val = src->_val;
}

static __always_inline bool page_attr_cmp_xchg(vm_page_t page, const vm_page_attr_t *cmp,
                                               const vm_page_attr_t *val)
{
        return atom_cmp_xchg(&page->attr, cmp->_val, val->_val);
}

/** @brief Get the page frame number of a page. */
__pure2
static inline u_long pg2pfn(vm_page_t page)
@@ -84,6 +214,11 @@ static inline u_long pg2pfn(vm_page_t page)
        return page - vm_page_array;
}

/**
 * @brief Get the page that a virtual address points to.
 * The address must point to the DMAP region (i.e. an address that is returned
 * by either `get_pages()` and friends, or `kmalloc()` and friends).
 */
__pure2
static inline vm_page_t vaddr2pg(void *vaddr)
{
@@ -92,6 +227,11 @@ static inline vm_page_t vaddr2pg(void *vaddr)
        return &vm_page_array[offset >> PAGE_SHIFT];
}

/**
 * @brief Get the page frame number for a virtual address.
 * The address must point to the DMAP region (i.e. an address that is returned
 * by either `get_pages()` and friends, or `kmalloc()` and friends).
 */
__pure2
static inline u_long vaddr2pfn(void *vaddr)
{
@@ -100,6 +240,7 @@ static inline u_long vaddr2pfn(void *vaddr)
        return pfn;
}

/** @brief Get the page frame number for a physical address. */
__pure2
static inline u_long paddr2pfn(vm_paddr_t paddr)
{
@@ -107,6 +248,7 @@ static inline u_long paddr2pfn(vm_paddr_t paddr)
        return paddr >> PAGE_SHIFT;
}

/** @brief Get the page that a physical address belongs to. */
__pure2
static inline vm_page_t paddr2pg(vm_paddr_t paddr)
{
@@ -115,9 +257,29 @@ static inline vm_page_t paddr2pg(vm_paddr_t paddr)
        return page;
}

/**
 * @brief Translate a page frame number to its corresponding virtual address
 * in the DMAP region.
 */
__pure2
static inline void *pfn2vaddr(u_long pfn)
{
        PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
        return DMAP_START + (pfn << PAGE_SHIFT);
}

/*
 * We have to be careful in this macro, because only the first page in the
 * order group has the correct order set. So we can only read it once at
 * the beginning of the loop, since the page pointer is being updated.
 */

/**
 * @brief Iterate over every page in its order group.
 *
 * @param page The first `vm_page_t` in the group.
 */
#define vm_page_foreach_in_order(page) \
        for (int __i = 1 << pga_order(page); \
             __i >= 0; \
             __i = ({ ++(page); --__i; }))
@@ -1,6 +1,7 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <gay/irq.h>
#include <gay/mm.h>
#include <gay/sched.h>

/**
@@ -18,6 +19,8 @@ int main(int argc, char *argv[])
{
        int err;

        kmalloc_init();

        irq_init();

        err = sched_init();
@@ -66,7 +66,7 @@ static inline u_int paddr_find_order(vm_paddr_t addr)
}

/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area)
{
        vm_paddr_t start = area->start;
        vm_paddr_t end = area->end;
@@ -98,19 +98,18 @@ static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
         * the freelist, but all of them need to be initialized */
        for (u_int i = 0; i < (1 << order); i++) {
                atom_init(&page[i].count, 0);
                page[i].flags = 0;
                page[i].order = 0;
                atom_init(&page[i].attr, 0);
        }

        /*
         * order
         * ^
         * | _________ < MM_MAX_ORDER
         * | / |
         * start | / \ < end order
         * order > |/
         * |--------------|----> pos
         * start end
         * | ._____._____. < MM_MAX_ORDER
         * | .___| |
         * start |._| |_.
         * order > .| |. < end order
         * |---------------------|----> pos
         * start end
         */
        pos += ORDER_SIZE(order);
        page += (1 << order);
@@ -194,7 +193,7 @@ void paging_init(vm_paddr_t phys_end)
                 * two buddies can be coalesced into one. In reality, the
                 * reference count is invalid because the page is reserved. */
                atom_init(&vm_page_array[i].count, 1);
                vm_page_array[i].flags = PG_RESERVED;
                atom_init(&vm_page_array[i].attr, _PGA_RSVD_MASK);
        }

        /*
@@ -204,7 +203,7 @@ void paging_init(vm_paddr_t phys_end)
                struct mm_zone *zone = &mm_zones[i];
                struct _bmem_area *area, *tmp;
                clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
                        claim_bmem_pages(zone, area);
                        claim_bmem_area(zone, area);
                }
                zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
                if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
@@ -283,7 +282,7 @@ static void *__get_pages(u_int order, enum mflags flags)
                page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
                struct mm_pool *pool = &zone->pools[page_order];
                vm_page_t buddy = page + (1 << page_order);
                buddy->order = page_order;
                pga_set_order(buddy, page_order);
                pg_flip_bit(zone, pfn + (1 << page_order), page_order);

                disable_intr();
@@ -294,7 +293,7 @@ static void *__get_pages(u_int order, enum mflags flags)
                intr_restore(cpuflags);
        }

        page->order = order;
        pga_set_order(page, order);
        void *vaddr = pfn2vaddr(pfn);

        return vaddr;
@@ -370,10 +369,10 @@ static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
        bool merge = (atom_read(&buddy->count) == 0);

        /* we know that `page` doesn't have PG_RESERVED set,
         * because we check that flag before anything else */
        const unsigned mask = PG_RESERVED | PG_DMA;
        merge &= (page->flags & mask) == (buddy->flags & mask);
        /* we know that `page' is not reserved, because we
         * check that flag before we even attempt coalition */
        const unsigned mask = _PGA_RSVD_MASK | _PGA_ZONE_MASK;
        merge &= (atom_read(&page->attr) & mask) == (atom_read(&buddy->attr) & mask);

        return merge;
}
@@ -389,9 +388,9 @@ void free_pages(void *ptr)
        register_t cpuflags = read_flags();

        vm_page_t page = vaddr2pg(ptr);
        panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);
        panic_if(pga_rsvd(page), "tried to free reserved page %p", ptr);

        u_int order = page->order;
        u_int order = pga_order(page);
        PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
        u_long pfn = vaddr2pfn(ptr);

@@ -400,19 +399,17 @@ void free_pages(void *ptr)
#endif

        int old_count = atom_sub(&page->count, 1);
        if (old_count != 1) {
                if (old_count == 0)
                        panic("double free of page %p", ptr);
                else
                        panic("attempted to free page %p with references", ptr);
        PAGE_DEBUG_BLOCK {
                if (old_count != 1) {
                        if (old_count == 0)
                                page_debug("double free of page %p", ptr);
                        else
                                page_debug("attempted to free page %p with references", ptr);
                        return;
                }
        }

        struct mm_zone *zone;
        if (page->flags & PG_DMA)
                zone = &mm_zones[MM_ZONE_DMA];
        else
                zone = &mm_zones[MM_ZONE_NORMAL];

        struct mm_zone *zone = &mm_zones[pga_zone(page)];
        latom_add(&zone->free_count, (1 << order));

        /* try to coalesce free buddy blocks until we've reached the highest order */
@@ -434,8 +431,8 @@ void free_pages(void *ptr)
                if (can_merge(page, buddy)) {
                        clist_del(&buddy->link);
                        current_order_pool->free_entries--;
                        buddy->order = order + 1;
                        page->order = order + 1;
                        pga_set_order(buddy, order + 1);
                        pga_set_order(page, order + 1);
                        clist_add(&next_order_pool->freelist, &low->link);
                        next_order_pool->free_entries++;
                } else {

kernel/mm/slab.c

@@ -1,24 +1,48 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <arch/atom.h>
#include <arch/cpufunc.h>
#include <arch/page.h>

#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/ktrace.h>
#include <gay/mm.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/vm/page.h>

#include <string.h>
/*
 * XXX this implementation is still missing object caches
 */

#if CFG_POISON_SLABS
struct slab_poison {
        void *_pad; /**< @brief That's where the freelist pointer is stored */
        void *alloc_source; /**< @brief Code address that made the alloc call */
        u_long exact_size;
        u_long low_poison;
        u8 data[0];
        u_long high_poison[1];
};

static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, void *alloc_source);
static void poison_after_free(struct slab_poison *poison);
#endif

/**
 * @brief This header sits at the beginning of each slab.
 * The individual entries follow immediately after the struct itself.
 */
struct slab {
        struct clist clink; /* -> pools[entry_size / SLAB_STEP - 1] (see below) */
        /** @brief The individual clist nodes sit at the beginning of each free entry */
        struct clist freelist;
        struct clist link;
        void **freelist;
        struct slab_pool *pool;
        /** @brief For `link` */
        spin_t lock;
        /**
         * @brief Number of free entries.
         * The slabs are sorted within their pool by this value, so that we
@@ -30,181 +54,337 @@ struct slab {
         * `PAGE_SIZE`), so this saves a couple of bytes on systems where `int`
         * is smaller than `usize`.
         */
        unsigned int free_entries;
        /**
         * @brief Size of a single slab entry in bytes.
         * Sizes must always be an integral multiple of `sizeof(void *)` and
         * at least `sizeof(struct clist)`, because that's the data structure
         * used for tracking what entries are free (`freelist`).
         *
         * Like `free_entries`, this is intentionally not a `usize`.
         */
        unsigned int entry_size;

        /* here would come the individual entries */
        u_int free_entries;
};

/** @brief All slabs currently have the same size of one full page. */
#define SLAB_SIZE PAGE_SIZE
/**
 * @brief All slab entry sizes are an integral multiple of this.
 * When allocating memory, the requested size gets rounded upwards.
 */
#define SLAB_STEP (sizeof(struct clist))

#define SLAB_OVERHEAD (sizeof(struct slab))
#define SLAB_MAX_ALLOC (SLAB_SIZE - SLAB_OVERHEAD)
/* slabs are always aligned ... */
#define SLAB_PTR_MASK (~(SLAB_SIZE - 1))
/* ... so we can do this */
#define GET_SLAB(ptr) ( (struct slab *)((uintptr_t)(ptr) & SLAB_PTR_MASK) )

#if CFG_DEBUG_SLAB_ALLOCS
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# define SLAB_DEBUG_BLOCK
# define SLAB_ASSERT KASSERT
# if CFG_DEBUG_SLAB_ALLOCS_NOISY
# define slab_debug_noisy(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
# else
# define slab_debug_noisy(msg, ...) ({})
# endif
#else
# define SLAB_DEBUG_BLOCK if (0)
# define SLAB_ASSERT(x) ({})
# define slab_debug(msg, ...) ({})
# define slab_debug_noisy(msg, ...) ({})
#endif

/** @brief All slabs grouped by entry_size, indexed by `entry_size / SLAB_STEP - 1` */
struct clist pools[SLAB_MAX_ALLOC / SLAB_STEP];
struct slab_pool {
        const u_int entry_size; /**< @brief Size of one entry in bytes */
        const int entries_per_slab; /**< @brief Max number of entries per slab */
        atom_t total_used; /**< @brief Total allocated entries */
        const u_int page_order; /**< @brief Order passed to `get_pages()` */
        struct clist empty_list; /* -> struct slab::link */
        struct clist partial_list; /* -> struct slab::link */
        struct clist full_list; /* -> struct slab::link */
        spin_t empty_lock; /**< @brief Lock for `empty_list` */
        spin_t partial_lock; /**< @brief Lock for `partial_list` */
        spin_t full_lock; /**< @brief Lock for `full_list` */
        atom_t empty_count; /**< @brief Number of empty slabs */
        atom_t partial_count; /**< @brief Number of partially empty slabs */
        atom_t full_count; /**< @brief Number of full slabs */
};

static void *slab_alloc(usize size, enum mflags flags);
static void slab_free(void *ptr);
/*
 * Fun size calculations because the slab header takes up some overhead at the
 * beginning of each page. We should ideally try to cram all the info we need
 * into struct vm_page, because the individual slab entry sizes could be even
 * powers of two and perfectly aligned then.
 */
#define _MIN1(x) ((x) < 1 ? 1 : (x))
#define POOL_ENTRY_SIZE(sz) (( (sz) - ( SLAB_OVERHEAD / _MIN1(PAGE_SIZE / (sz)) ) ) & ~0xfu)
#define POOL_ENTRIES_PER_TABLE(sz) \
        _MIN1((PAGE_SIZE - SLAB_OVERHEAD) / POOL_ENTRY_SIZE(sz))

static struct slab *slab_create(unsigned int entry_size, enum mflags flags);
#define POOL_DEFINE(sz) { \
        .entry_size = POOL_ENTRY_SIZE(sz), \
        .entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
        .total_used = ATOM_DEFINE(0), \
        .page_order = ((sz) - 1) / PAGE_SIZE, \
        .empty_lock = SPIN_DEFINE, \
        .partial_lock = SPIN_DEFINE, \
        .full_lock = SPIN_DEFINE, \
        .empty_count = ATOM_DEFINE(0), \
        .partial_count = ATOM_DEFINE(0), \
        .full_count = ATOM_DEFINE(0), \
}

static inline int get_order(usize size)
static struct slab_pool slab_pools_normal[] = {
        POOL_DEFINE(32),
        POOL_DEFINE(64),
        POOL_DEFINE(128),
        POOL_DEFINE(256),
        POOL_DEFINE(512),
        POOL_DEFINE(1024),
        POOL_DEFINE(2048),
        POOL_DEFINE(4096),
        POOL_DEFINE(8192),
        POOL_DEFINE(16384),
        POOL_DEFINE(32768),
        { .entry_size = 0 } /* terminator */
};
static struct slab_pool slab_pools_dma[] = {
        POOL_DEFINE(32),
        POOL_DEFINE(64),
        POOL_DEFINE(128),
        POOL_DEFINE(256),
        POOL_DEFINE(512),
        POOL_DEFINE(1024),
        { .entry_size = 0 } /* terminator */
};
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */

static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
        [_M_ZONE_NORMAL] = slab_pools_normal,
        [_M_ZONE_DMA] = slab_pools_dma,
};

static struct slab *slab_create(struct slab_pool *pool, enum mflags flags);

void kmalloc_init(void)
{
        int order;
        usize order_size = PAGE_SIZE;
        for (int i = 0; i < MM_NR_ZONES; i++) {
                struct slab_pool *pool = slab_zone_pools[i];

                for (order = 0; order <= GET_PAGE_MAX_ORDER; order++) {
                        if (order_size >= size)
                                break;
                        order_size <<= 1;
                while (pool->entry_size != 0) {
                        clist_init(&pool->empty_list);
                        clist_init(&pool->partial_list);
                        clist_init(&pool->full_list);
                        pool++;
                }
        }

        return order;
}

void *kmalloc(usize size, enum mflags flags)
{
        if (size > SLAB_MAX_ALLOC) {
                if (flags & M_CONTIG) {
                        int order = get_order(size);
                        if (order > GET_PAGE_MAX_ORDER) {
                                slab_debug("Requested alloc size %zu too large for get_pages()\n",
                                           size);
                                return nil;
                        } else {
                                return get_pages(order, flags);
                        }
                } else {
                        slab_debug("Refusing to allocate %zu bytes as slabs\n", size);
                        return nil;
        if (size == 0)
                return nil;

#if CFG_POISON_SLABS
        size += sizeof(struct slab_poison);
#endif

        SLAB_DEBUG_BLOCK {
                if (!(flags & _M_NOWAIT) && in_irq()) {
                        slab_debug("kmalloc() called from irq without M_NOWAIT "
                                   "(caller: %p)\n", ktrace_return_addr());
                        flags |= _M_NOWAIT;
                }
        } else {
                return slab_alloc(size, flags);
        }

        SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools));
        struct slab_pool *pool = slab_zone_pools[_M_ZONE_INDEX(flags)];
        while (pool->entry_size != 0) {
                if (pool->entry_size >= size)
                        break;
                pool++;
        }

        if (pool->entry_size == 0) {
                slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n",
                           size, _M_ZONE_INDEX(flags), pool[-1].entry_size);
                return nil;
        }

        slab_debug_noisy("alloc %zu bytes from zone %d, pool size %u\n",
                         size, _M_ZONE_INDEX(flags), pool->entry_size);

        /*
         * Before locking a slab, we always remove it from its pool.
         * This is far from optimal, because if multiple CPUs allocate from the
         * same pool at the same time, we could end up creating several slabs
         * with one used entry each (not to mention the overhead of the mostly
         * unnecessary list deletions/insertions). However, it allows me to be
         * lazier when freeing unused slabs from a background thread since that
         * thread knows for sure that once it has removed a slab from free_list,
         * it can't possibly be used for allocations anymore.
         * This is probably not worth the overhead, though.
         */
        struct slab *slab = nil;

        /* try to use a slab that is already partially used first */
        register_t cpuflags = intr_disable();
        spin_lock(&pool->partial_lock);
        if (!clist_is_empty(&pool->partial_list)) {
                atom_dec(&pool->partial_count);
                slab = clist_del_first_entry(&pool->partial_list, typeof(*slab), link);
        }
        spin_unlock(&pool->partial_lock);

        if (slab == nil) {
                /* no partially used slab available, see if we have a completely free one */
                spin_lock(&pool->empty_lock);
                if (!clist_is_empty(&pool->empty_list)) {
                        atom_dec(&pool->empty_count);
                        slab = clist_del_first_entry(&pool->empty_list, typeof(*slab), link);
                }
                spin_unlock(&pool->empty_lock);

                if (slab == nil) {
                        /* we're completely out of usable slabs, allocate a new one */
                        intr_restore(cpuflags);
                        slab = slab_create(pool, flags);
                        if (slab == nil) {
                                slab_debug("kernel OOM\n");
                                return nil;
                        }
                        intr_disable();
                }
        }

        /* if we've made it to here, slab != nil and interrupts are disabled */
        spin_lock(&slab->lock);
        void *ret = slab->freelist;
        slab->freelist = *slab->freelist;
        if (--slab->free_entries == 0) {
                spin_lock(&pool->full_lock);
                clist_add(&pool->full_list, &slab->link);
                spin_unlock(&pool->full_lock);
                atom_inc(&pool->full_count);
        } else {
                spin_lock(&pool->partial_lock);
                clist_add(&pool->partial_list, &slab->link);
                spin_unlock(&pool->partial_lock);
                atom_inc(&pool->partial_count);
        }
        spin_unlock(&slab->lock);
        intr_restore(cpuflags);

        atom_inc(&pool->total_used);

#if CFG_POISON_SLABS
        struct slab_poison *poison = ret;
        poison_after_alloc(poison, size - sizeof(*poison), ktrace_return_addr());
        ret = poison->data;
#endif
        return ret;
}

void kfree(void *ptr)
{
        kprintf("kfree() is not implemented yet lmao\n");
}
        if (ptr == nil)
                return;

void slab_init(void)
{
        slab_debug("Initializing %zu cache pools (%zu~%zu bytes)\n",
                   ARRAY_SIZE(pools), SLAB_STEP, SLAB_MAX_ALLOC);
        for (int i = 0; i < ARRAY_SIZE(pools); i++)
                clist_init(&pools[i]);
}
        SLAB_ASSERT(ptr >= DMAP_START && ptr < DMAP_END);

static inline void *slab_alloc(usize size, enum mflags flags)
{
        size = align_ceil(size, SLAB_STEP);
        if (size == 0 || size > SLAB_MAX_ALLOC)
                return nil;
        vm_page_t page = vaddr2pg(ptr);
        SLAB_ASSERT(pga_slab(page));
        struct slab *slab = page->extra;
        struct slab_pool *pool = slab->pool;
#if CFG_POISON_SLABS
        struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
        poison_after_free(poison);
        ptr = poison;
#endif

        struct clist *pool = &pools[size / SLAB_STEP - 1];
        struct slab *slab = nil;
        struct slab *cursor;
        clist_foreach_entry(pool, cursor, clink) {
                if (cursor->free_entries > 0) {
                        slab = cursor;
                        break;
                }
        register_t cpuflags = intr_disable();
        spin_lock(&slab->lock);
        *(void **)ptr = slab->freelist;
        slab->freelist = (void **)ptr;
        if (++slab->free_entries == pool->entries_per_slab) {
                spin_lock(&pool->partial_lock);
                clist_del(&slab->link);
                spin_unlock(&pool->partial_lock);
                atom_dec(&pool->partial_count);

                spin_lock(&pool->empty_lock);
                clist_add(&pool->empty_list, &slab->link);
                spin_unlock(&pool->empty_lock);
                atom_inc(&pool->empty_count);
        }
        if (slab == nil) {
                slab = slab_create(size, flags);
                if (slab == nil)
                        return nil; /* OOM */
                clist_add_first(pool, &slab->clink);
        }

        /* list must have at least one entry, otherwise
         * we would have created a completely new slab */
        struct clist *ret = slab->freelist.next;
        clist_del(ret);
        slab->free_entries--;
# if CFG_POISON_HEAP
        memset(ret, 'a', size);
# endif
        return (void *)ret;
        spin_unlock(&slab->lock);
        atom_dec(&pool->total_used);
        intr_restore(cpuflags);
}

static inline void slab_free(void *ptr)
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags)
{
# if CFG_DEBUG_SLAB_ALLOCS
        if (ptr < kheap_start || ptr >= kheap_end) {
                kprintf("slab_free(%p): invalid ptr!\n", ptr);
                return;
        }
        if ((uintptr_t)ptr % SLAB_STEP) {
                kprintf("slab_free(%p): unaligned ptr!\n", ptr);
        }
# endif

        struct slab *slab = GET_SLAB(ptr);
        slab->free_entries++;

# if CFG_POISON_HEAP
        memset(ptr, 'A', slab->entry_size);
# endif

        if (slab->free_entries * slab->entry_size + slab->entry_size > SLAB_MAX_ALLOC) {
                /* none of the entries are in use, free the slab */
                slab_debug_noisy("Destroying empty cache of size %zu\n", slab->entry_size);
                free_pages(slab);
        } else {
                clist_add(&slab->freelist, (struct clist *)ptr);
        }
}

static struct slab *slab_create(unsigned int entry_size, enum mflags flags)
{
        slab_debug_noisy("Creating new cache for size %zu\n", entry_size);
        struct slab *slab = get_pages(SLAB_SIZE / PAGE_SIZE, flags);
        slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
        struct slab *slab = get_zero_pages(pool->page_order, flags);

        if (slab != nil) {
                clist_init(&slab->freelist);
                slab->free_entries = 0;
                slab->entry_size = entry_size;

                void *startptr = (void *)slab + sizeof(*slab);
                void *endptr = (void *)slab + SLAB_SIZE - entry_size;
                for (void *pos = startptr; pos <= endptr; pos += entry_size) {
                        clist_add(&slab->freelist, (struct clist *)pos);
                        slab->free_entries++;
                vm_page_t page = vaddr2pg(slab);
                /* XXX it's probably sufficient to only do this for the lowest page */
                vm_page_foreach_in_order(page) {
                        pga_set_slab(page, true);
                        page->extra = slab;
                }

                spin_init(&slab->lock);
                slab->pool = pool;
                slab->free_entries = pool->entries_per_slab;
                void *prev = nil;
                void *end = (void *)slab + (1 << (pool->page_order + PAGE_SHIFT));
                void *pos = end;
                do {
                        pos -= pool->entry_size;
                        *(void **)pos = prev;
                        prev = pos;
                } while (pos >= (void *)&slab[1] + pool->entry_size);
                slab->freelist = pos;
        }

        return slab;
}

#if CFG_POISON_SLABS
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
                                      void *alloc_source)
{
        u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
        u_long *poison_start = &poison->low_poison;

        /* slabs are zeroed out when they are newly allocated */
        if (poison->exact_size != 0) {
                for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
                        if (*pos != SLAB_POISON_FREE) {
                                kprintf("Use-after-free in %p (alloc by %p)\n",
                                        poison->data, poison->alloc_source);
                                break;
                        }
                }
        }

        /* update offset to the new size */
        offset = align_ceil(exact_size, sizeof(long)) / sizeof(long);

        poison->alloc_source = alloc_source;
        poison->exact_size = exact_size;
        for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
                *pos = SLAB_POISON_ALLOC;
}

static inline void poison_after_free(struct slab_poison *poison)
{
        u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);

        if (poison->low_poison != SLAB_POISON_ALLOC) {
                kprintf("Low out-of-bounds write to %p (alloc by %p)\n",
                        poison->data, poison->alloc_source);
        }

        if (poison->high_poison[offset] != SLAB_POISON_ALLOC) {
                kprintf("High out-of-bounds write to %p (alloc by %p)\n",
                        poison->data, poison->alloc_source);
        }

        for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
                *pos = SLAB_POISON_FREE;
}
#endif

__weak void *malloc(usize size)
{
        return kmalloc(size, M_KERN);
}

__weak void free(void *ptr)
{
        kfree(ptr);
}