From 2e32e299d26487e383156adc0b8cd271adc912b8 Mon Sep 17 00:00:00 2001
From: fef
Date: Mon, 22 Nov 2021 04:23:31 +0100
Subject: [PATCH] mm: rewrite slab allocator

This is the final part of the major mm subsystem refactor (for now).
The new and improved slab allocator can do *proper* poisoning, with
pretty accurate out-of-bounds and use-after-free detection.
vm_page_t has also been restructured; its flags and order are now
combined into one atomic field.
---
 arch/x86/sys/amd64/ktrace.c |   8 +
 arch/x86/sys/i386/ktrace.c  |   8 +
 include/gay/config.h.in     |   6 +-
 include/gay/ktrace.h        |   3 +
 include/gay/linker.h        |  12 +
 include/gay/poison.h        |  15 ++
 include/gay/vm/page.h       | 196 ++++++++++++++--
 kernel/main.c               |   3 +
 kernel/mm/page.c            |  61 +++--
 kernel/mm/slab.c            | 432 +++++++++++++++++++++++++-----------
 10 files changed, 566 insertions(+), 178 deletions(-)

diff --git a/arch/x86/sys/amd64/ktrace.c b/arch/x86/sys/amd64/ktrace.c
index 62249b3..040b32e 100644
--- a/arch/x86/sys/amd64/ktrace.c
+++ b/arch/x86/sys/amd64/ktrace.c
@@ -27,3 +27,11 @@ void ktrace_print_from(void *frame)
 		rbp = *rbp;
 	}
 }
+
+__naked void *ktrace_return_addr(void)
+{
+	__asm__ volatile(
+"	movq 8(%rbp), %rax \n"
+"	ret \n"
+	);
+}
diff --git a/arch/x86/sys/i386/ktrace.c b/arch/x86/sys/i386/ktrace.c
index 6d29cea..780af5f 100644
--- a/arch/x86/sys/i386/ktrace.c
+++ b/arch/x86/sys/i386/ktrace.c
@@ -28,3 +28,11 @@ void ktrace_print_from(void *frame)
 		rbp = *rbp;
 	}
 }
+
+__naked void *ktrace_return_addr(void)
+{
+	__asm__ volatile(
+"	movl 4(%ebp), %eax \n"
+"	ret \n"
+	);
+}
diff --git a/include/gay/config.h.in b/include/gay/config.h.in
index bd1bd99..9669998 100644
--- a/include/gay/config.h.in
+++ b/include/gay/config.h.in
@@ -28,14 +28,14 @@
 /** @brief Poison dynamic pages when allocating and freeing them */
 #cmakedefine01 CFG_POISON_PAGES
 
-/** @brief Poison heap areas after `kmalloc()` and `kfree()` */
-#cmakedefine01 CFG_POISON_HEAP
+/** @brief Poison slab allocations (`kmalloc()` and friends) */
+#cmakedefine01 CFG_POISON_SLABS
 
 /** @brief Denominator for the fraction of pages kept in emergency reserves */
 #define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@
 
 /** @brief Absolute maximum number of pages kept in emergency reserves */
-#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_THRESH@
+#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_MAX@
 
 /** @brief Enable Symmetric Multiprocessing */
 #cmakedefine01 CFG_SMP
diff --git a/include/gay/ktrace.h b/include/gay/ktrace.h
index 4d8e273..b6cb259 100644
--- a/include/gay/ktrace.h
+++ b/include/gay/ktrace.h
@@ -9,3 +9,6 @@ void ktrace_print(void);
 
 /** @brief Print a full stack trace to the kernel log, starting from `frame`. */
 __asmlink void ktrace_print_from(void *frame);
+
+/** @brief Get the address the current function call will return to. */
+void *ktrace_return_addr(void);
diff --git a/include/gay/linker.h b/include/gay/linker.h
index e5af387..49d6811 100644
--- a/include/gay/linker.h
+++ b/include/gay/linker.h
@@ -27,3 +27,15 @@ extern void _kernel_start;
 
 extern void _kernel_end;
 #define kern_end (&_kernel_end)
+
+extern void _text_start;
+#define text_start (&_text_start)
+
+extern void _text_end;
+#define text_end (&_text_end)
+
+extern void _isr_start;
+#define isr_start (&_isr_start)
+
+extern void _isr_end;
+#define isr_end (&_isr_end)
diff --git a/include/gay/poison.h b/include/gay/poison.h
index d3a3bec..6449d61 100644
--- a/include/gay/poison.h
+++ b/include/gay/poison.h
@@ -4,6 +4,8 @@
 
 #include
+#include
+
 /*
  * If possible, arch should define this value as an unmappable base address.
* For example, on the amd64, this is set to 0xdead000000000000 because the @@ -26,5 +28,18 @@ #define PAGE_POISON_ALLOC (POISON_BASE + 0x00000010ul) #define PAGE_POISON_FREE (POISON_BASE + 0x00000020ul) +#if LONG_BIT == 32 +#define SLAB_POISON_ALLOC 0x61616160ul +#define SLAB_POISON_FREE 0x41414140ul +#elif LONG_BIT == 64 +#define SLAB_POISON_ALLOC 0x6161616161616161ul +#define SLAB_POISON_FREE 0x4141414141414141ul +#elif LONG_BIT == 128 +#define SLAB_POISON_ALLOC 0x61616161616161616161616161616160ul +#define SLAB_POISON_FREE 0x41414141414141414141414141414140ul +#else +#error "Unsupported long size" +#endif + #define CLIST_POISON_PREV (POISON_BASE + 0x000000c4ul) #define CLIST_POISON_NEXT (POISON_BASE + 0x000000c8ul) diff --git a/include/gay/vm/page.h b/include/gay/vm/page.h index 72eb52b..09f4386 100644 --- a/include/gay/vm/page.h +++ b/include/gay/vm/page.h @@ -2,6 +2,7 @@ #pragma once +#include #include #include @@ -22,17 +23,43 @@ * a significant difference in memory management overhead. */ +union vm_page_attr { + int _val; + struct { + unsigned order:8; /**< @brief Index into `mm_zones[zone].pools` */ + bool lock:1; /**< @brief Page is locked */ + bool rsvd:1; /**< @brief Page is reserved and must never be touched */ + bool pcpu:1; /**< @brief Page is in a per-cpu cache */ + bool slab:1; /**< @brief Page is used by the slab allocator */ + unsigned zone:2; /**< @brief Index into `mm_zones` */ + }; +}; +#define _PGA_ORDER_SHIFT 0 +#define _PGA_ORDER_MASK (0xf << _PGA_ORDER_SHIFT) +#define _PGA_LOCK_SHIFT 8 +#define _PGA_LOCK_MASK (1 << _PGA_LOCK_SHIFT) +#define _PGA_RSVD_SHIFT 9 +#define _PGA_RSVD_MASK (1 << _PGA_RSVD_SHIFT) +#define _PGA_PCPU_SHIFT 10 +#define _PGA_PCPU_MASK (1 << _PGA_PCPU_SHIFT) +#define _PGA_SLAB_SHIFT 11 +#define _PGA_SLAB_MASK (1 << _PGA_SLAB_SHIFT) +#define _PGA_ZONE_SHIFT 12 +#define _PGA_ZONE_MASK (3 << _PGA_ZONE_SHIFT) + +typedef union vm_page_attr vm_page_attr_t; + /** * @brief Stores information about a single page in physical memory. * There is exactly one of these for every physical page, no matter what that * page is used for or whether it is usable at all. */ struct vm_page { - /** @brief Reference count (0 = unused) */ + /** @brief Reference count (0 = unused, < 0 = locked) */ atom_t count; - unsigned order:8; - /** @brief Various flags describing how and for what the page is used, see below */ - unsigned flags:24; + /** @brief Page attributes, use the macros below to access this */ + atom_t attr; + /** @brief If the page is free, this is its freelist. */ struct clist link; /** * @brief Optional extra data pointer, reserved for private use. @@ -45,17 +72,6 @@ struct vm_page { typedef struct vm_page *vm_page_t; -/* values for struct vm_page::flags */ - -/** @brief Page must never be accessed */ -#define PG_RESERVED (1 << 0) -/** @brief Page is in a per-cpu cache */ -#define PG_PCPU (1 << 1) -/** @brief Page is used by the slab allocator */ -#define PG_SLAB (1 << 2) -/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */ -#define PG_DMA (1u << 3) - /** @brief Array of every single page in physical memory, indexed by page frame number. 
*/ extern struct vm_page *const vm_page_array; @@ -66,16 +82,130 @@ extern vm_page_t _vm_page_array_end; #define PGADDR_ASSERT(x) ({}) #endif -static inline bool page_get(vm_page_t page) +static inline u8 pga_order(vm_page_t page) +{ + union vm_page_attr attr = { ._val = atom_read(&page->attr) }; + return attr.order; +} + +static inline bool pga_rsvd(vm_page_t page) +{ + union vm_page_attr attr = { ._val = atom_read(&page->attr) }; + return attr.rsvd; +} + +static inline bool pga_pcpu(vm_page_t page) +{ + union vm_page_attr attr = { ._val = atom_read(&page->attr) }; + return attr.pcpu; +} + +static inline bool pga_slab(vm_page_t page) +{ + union vm_page_attr attr = { ._val = atom_read(&page->attr) }; + return attr.slab; +} + +static inline enum mm_zone_type pga_zone(vm_page_t page) +{ + union vm_page_attr attr = { ._val = atom_read(&page->attr) }; + return attr.zone; +} + +static inline u8 pga_set_order(vm_page_t page, u8 order) +{ + spin_loop { + union vm_page_attr old = { ._val = atom_read(&page->attr) }; + union vm_page_attr new = old; + new.order = order; + if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val) + return old.order; + } +} + +static inline bool pga_set_pcpu(vm_page_t page, bool pcpu) +{ + if (pcpu) + return atom_set_bit(&page->attr, _PGA_PCPU_SHIFT); + else + return atom_clr_bit(&page->attr, _PGA_PCPU_SHIFT); +} + +static inline bool pga_set_slab(vm_page_t page, bool slab) +{ + if (slab) + return atom_set_bit(&page->attr, _PGA_SLAB_SHIFT); + else + return atom_clr_bit(&page->attr, _PGA_SLAB_SHIFT); +} + +static inline enum mm_zone_type pga_set_zone(vm_page_t page, enum mm_zone_type zone) +{ + spin_loop { + union vm_page_attr old = { ._val = atom_read(&page->attr) }; + union vm_page_attr new = old; + new.zone = zone; + if (atom_cmp_xchg(&page->attr, old._val, new._val) == old._val) + return old.zone; + } +} + +static __always_inline bool page_get(vm_page_t page) { return atom_inc(&page->count); } -static inline bool page_put(vm_page_t page) +static __always_inline bool page_put(vm_page_t page) { return atom_dec(&page->count); } +/* XXX we should probably use a wait queue for these rather than a spinlock like thing */ + +static inline void page_lock(vm_page_t page) +{ + spin_loop { + if (atom_set_bit(&page->attr, _PGA_LOCK_SHIFT)) + break; + } +} + +static __always_inline void page_unlock(vm_page_t page) +{ + atom_clr_bit(&page->attr, _PGA_LOCK_SHIFT); +} + +static __always_inline bool page_trylock(vm_page_t page) +{ + return atom_set_bit(&page->attr, _PGA_LOCK_SHIFT); +} + +static inline void __page_set_flag(vm_page_t page, unsigned flag) +{ + atom_or(&page->attr, (int)flag); +} + +static inline void __page_clr_flag(vm_page_t page, unsigned mask) +{ + atom_and(&page->attr, (int)~mask); +} + +static __always_inline void page_attr_load(vm_page_attr_t *attr, vm_page_t page) +{ + attr->_val = atom_read(&page->attr); +} + +static __always_inline void page_attr_copy(vm_page_attr_t *dest, const vm_page_attr_t *src) +{ + dest->_val = src->_val; +} + +static __always_inline bool page_attr_cmp_xchg(vm_page_t page, const vm_page_attr_t *cmp, + const vm_page_attr_t *val) +{ + return atom_cmp_xchg(&page->attr, cmp->_val, val->_val); +} + /** @brief Get the page frame number of a page. */ __pure2 static inline u_long pg2pfn(vm_page_t page) @@ -84,6 +214,11 @@ static inline u_long pg2pfn(vm_page_t page) return page - vm_page_array; } +/** + * @brief Get the page that a virtual address points to. + * The address must point to the DMAP region (i.e. 
an address that is returned + * by either `get_pages()` and friends, or `kmalloc()` and friends). + */ __pure2 static inline vm_page_t vaddr2pg(void *vaddr) { @@ -92,6 +227,11 @@ static inline vm_page_t vaddr2pg(void *vaddr) return &vm_page_array[offset >> PAGE_SHIFT]; } +/** + * @brief Get the page frame number for a virtual address. + * The address must point to the DMAP region (i.e. an address that is returned + * by either `get_pages()` and friends, or `kmalloc()` and friends). + */ __pure2 static inline u_long vaddr2pfn(void *vaddr) { @@ -100,6 +240,7 @@ static inline u_long vaddr2pfn(void *vaddr) return pfn; } +/** @brief Get the page frame number for a physical address. */ __pure2 static inline u_long paddr2pfn(vm_paddr_t paddr) { @@ -107,6 +248,7 @@ static inline u_long paddr2pfn(vm_paddr_t paddr) return paddr >> PAGE_SHIFT; } +/** @brief Get the page that a physical address belongs to. */ __pure2 static inline vm_page_t paddr2pg(vm_paddr_t paddr) { @@ -115,9 +257,29 @@ static inline vm_page_t paddr2pg(vm_paddr_t paddr) return page; } +/** + * @brief Translate a page frame number to its corresponding virtual address + * in the DMAP region. + */ __pure2 static inline void *pfn2vaddr(u_long pfn) { PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end); return DMAP_START + (pfn << PAGE_SHIFT); } + +/* + * We have to be careful in this macro, because only the first page in the + * order group has the correct order set. So we can only read it once at + * the beginning of the loop, since the page pointer is being updated. + */ + +/** + * @brief Iterate over every page in its order group. + * + * @param page The first `vm_page_t` in the group. + */ +#define vm_page_foreach_in_order(page) \ + for (int __i = 1 << pga_order(page); \ + __i >= 0; \ + __i = ({ ++(page); --__i; })) diff --git a/kernel/main.c b/kernel/main.c index 088feec..2841a2e 100644 --- a/kernel/main.c +++ b/kernel/main.c @@ -1,6 +1,7 @@ /* Copyright (C) 2021 fef . All rights reserved. */ #include +#include #include /** @@ -18,6 +19,8 @@ int main(int argc, char *argv[]) { int err; + kmalloc_init(); + irq_init(); err = sched_init(); diff --git a/kernel/mm/page.c b/kernel/mm/page.c index 25522d1..de44c32 100644 --- a/kernel/mm/page.c +++ b/kernel/mm/page.c @@ -66,7 +66,7 @@ static inline u_int paddr_find_order(vm_paddr_t addr) } /** @brief Claim all free pages in one of the memory areas from the boot allocator. */ -static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area) +static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area) { vm_paddr_t start = area->start; vm_paddr_t end = area->end; @@ -98,19 +98,18 @@ static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *are * the freelist, but all of them need to be initialized */ for (u_int i = 0; i < (1 << order); i++) { atom_init(&page[i].count, 0); - page[i].flags = 0; - page[i].order = 0; + atom_init(&page[i].attr, 0); } /* * order * ^ - * | _________ < MM_MAX_ORDER - * | / | - * start | / \ < end order - * order > |/ - * |--------------|----> pos - * start end + * | ._____._____. < MM_MAX_ORDER + * | .___| | + * start |._| |_. + * order > .| |. < end order + * |---------------------|----> pos + * start end */ pos += ORDER_SIZE(order); page += (1 << order); @@ -194,7 +193,7 @@ void paging_init(vm_paddr_t phys_end) * two buddies can be coalesced into one. In reality, the * reference count is invalid because the page is reserved. 
 		 */
 		atom_init(&vm_page_array[i].count, 1);
-		vm_page_array[i].flags = PG_RESERVED;
+		atom_init(&vm_page_array[i].attr, _PGA_RSVD_MASK);
 	}
 	/*
 	 *
@@ -204,7 +203,7 @@ void paging_init(vm_paddr_t phys_end)
 		struct mm_zone *zone = &mm_zones[i];
 		struct _bmem_area *area, *tmp;
 		clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
-			claim_bmem_pages(zone, area);
+			claim_bmem_area(zone, area);
 		}
 		zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
 		if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
@@ -283,7 +282,7 @@ static void *__get_pages(u_int order, enum mflags flags)
 		page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
 		struct mm_pool *pool = &zone->pools[page_order];
 		vm_page_t buddy = page + (1 << page_order);
-		buddy->order = page_order;
+		pga_set_order(buddy, page_order);
 		pg_flip_bit(zone, pfn + (1 << page_order), page_order);
 
 		disable_intr();
@@ -294,7 +293,7 @@
 		intr_restore(cpuflags);
 	}
 
-	page->order = order;
+	pga_set_order(page, order);
 
 	void *vaddr = pfn2vaddr(pfn);
 	return vaddr;
@@ -370,10 +369,10 @@ static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
 {
 	bool merge = (atom_read(&buddy->count) == 0);
 
-	/* we know that `page` doesn't have PG_RESERVED set,
-	 * because we check that flag before anything else */
-	const unsigned mask = PG_RESERVED | PG_DMA;
-	merge &= (page->flags & mask) == (buddy->flags & mask);
+	/* we know that `page' is not reserved, because we
+	 * check that flag before we even attempt coalescing */
+	const unsigned mask = _PGA_RSVD_MASK | _PGA_ZONE_MASK;
+	merge &= (atom_read(&page->attr) & mask) == (atom_read(&buddy->attr) & mask);
 
 	return merge;
 }
@@ -389,9 +388,9 @@ void free_pages(void *ptr)
 	register_t cpuflags = read_flags();
 
 	vm_page_t page = vaddr2pg(ptr);
-	panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);
+	panic_if(pga_rsvd(page), "tried to free reserved page %p", ptr);
 
-	u_int order = page->order;
+	u_int order = pga_order(page);
 	PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
 
 	u_long pfn = vaddr2pfn(ptr);
@@ -400,19 +399,17 @@
 #endif
 
 	int old_count = atom_sub(&page->count, 1);
-	if (old_count != 1) {
-		if (old_count == 0)
-			panic("double free of page %p", ptr);
-		else
-			panic("attempted to free page %p with references", ptr);
+	PAGE_DEBUG_BLOCK {
+		if (old_count != 1) {
+			if (old_count == 0)
+				page_debug("double free of page %p", ptr);
+			else
+				page_debug("attempted to free page %p with references", ptr);
+			return;
+		}
 	}
 
-	struct mm_zone *zone;
-	if (page->flags & PG_DMA)
-		zone = &mm_zones[MM_ZONE_DMA];
-	else
-		zone = &mm_zones[MM_ZONE_NORMAL];
-
+	struct mm_zone *zone = &mm_zones[pga_zone(page)];
 	latom_add(&zone->free_count, (1 << order));
 
 	/* try to coalesce free buddy blocks until we've reached the highest order */
@@ -434,8 +431,8 @@
 		if (can_merge(page, buddy)) {
 			clist_del(&buddy->link);
 			current_order_pool->free_entries--;
-			buddy->order = order + 1;
-			page->order = order + 1;
+			pga_set_order(buddy, order + 1);
+			pga_set_order(page, order + 1);
 			clist_add(&next_order_pool->freelist, &low->link);
 			next_order_pool->free_entries++;
 		} else {
diff --git a/kernel/mm/slab.c b/kernel/mm/slab.c
index 375f8ee..f61bdd5 100644
--- a/kernel/mm/slab.c
+++ b/kernel/mm/slab.c
@@ -1,24 +1,48 @@
 /* Copyright (C) 2021 fef . All rights reserved.
*/ +#include +#include #include #include #include #include #include +#include #include +#include +#include #include +#include -#include +/* + * XXX this implementation is still missing object caches + */ + +#if CFG_POISON_SLABS +struct slab_poison { + void *_pad; /**< @brief That's where the freelist pointer is stored */ + void *alloc_source; /**< @brief Code address that made the alloc call */ + u_long exact_size; + u_long low_poison; + u8 data[0]; + u_long high_poison[1]; +}; + +static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, void *alloc_source); +static void poison_after_free(struct slab_poison *poison); +#endif /** * @brief This header sits at the beginning of each slab. * The individual entries follow immediately after the struct itself. */ struct slab { - struct clist clink; /* -> pools[entry_size / SLAB_STEP - 1] (see below) */ - /** @brief The individual clist nodes sit at the beginning of each free entry */ - struct clist freelist; + struct clist link; + void **freelist; + struct slab_pool *pool; + /** @brief For `link` */ + spin_t lock; /** * @brief Number of free entries. * The slabs are sorted within their pool by this value, so that we @@ -30,181 +54,337 @@ struct slab { * `PAGE_SIZE`), so this saves a couple of bytes on systems where `int` * is smaller than `usize`. */ - unsigned int free_entries; - /** - * @brief Size of a single slab entry in bytes. - * Sizes must always be an integral multiple of `sizeof(void *)` and - * at least `sizeof(struct clist)`, because that's the data structure - * used for tracking what entries are free (`freelist`). - * - * Like `free_entries`, this is intentionally not a `usize`. - */ - unsigned int entry_size; - - /* here would come the individual entries */ + u_int free_entries; }; -/** @brief All slabs currently have the same size of one full page. */ -#define SLAB_SIZE PAGE_SIZE -/** - * @brief All slab entry sizes are an integral multiple of this. - * When allocating memory, the requested size gets rounded upwards. - */ -#define SLAB_STEP (sizeof(struct clist)) - #define SLAB_OVERHEAD (sizeof(struct slab)) -#define SLAB_MAX_ALLOC (SLAB_SIZE - SLAB_OVERHEAD) -/* slabs are always aligned ... */ -#define SLAB_PTR_MASK (~(SLAB_SIZE - 1)) -/* ... so we can do this */ -#define GET_SLAB(ptr) ( (struct slab *)((uintptr_t)(ptr) & SLAB_PTR_MASK) ) #if CFG_DEBUG_SLAB_ALLOCS # define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__) +# define SLAB_DEBUG_BLOCK +# define SLAB_ASSERT KASSERT # if CFG_DEBUG_SLAB_ALLOCS_NOISY # define slab_debug_noisy(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__) # else # define slab_debug_noisy(msg, ...) ({}) # endif #else +# define SLAB_DEBUG_BLOCK if (0) +# define SLAB_ASSERT(x) ({}) # define slab_debug(msg, ...) ({}) # define slab_debug_noisy(msg, ...) 
({}) #endif -/** @brief All slabs grouped by entry_size, indexed by `entry_size / SLAB_STEP - 1` */ -struct clist pools[SLAB_MAX_ALLOC / SLAB_STEP]; +struct slab_pool { + const u_int entry_size; /**< @brief Size of one entry in bytes */ + const int entries_per_slab; /**< @brief Max number of entries per slab */ + atom_t total_used; /**< @brief Total allocated entries */ + const u_int page_order; /**< @brief Order passed to `get_pages()` */ + struct clist empty_list; /* -> struct slab::link */ + struct clist partial_list; /* -> struct slab::link */ + struct clist full_list; /* -> struct slab::link */ + spin_t empty_lock; /**< @brief Lock for `empty_list` */ + spin_t partial_lock; /**< @brief Lock for `partial_list` */ + spin_t full_lock; /**< @brief Lock for `full_list` */ + atom_t empty_count; /**< @brief Number of empty slabs */ + atom_t partial_count; /**< @brief Number of partially empty slabs */ + atom_t full_count; /**< @brief Number of full slabs */ +}; -static void *slab_alloc(usize size, enum mflags flags); -static void slab_free(void *ptr); +/* + * Fun size calculations because the slab header takes up some overhead at the + * beginning of each page. We should ideally try to cram all the info we need + * into struct vm_page, because the individual slab entry sizes could be even + * powers of two and perfectly aligned then. + */ +#define _MIN1(x) ((x) < 1 ? 1 : (x)) +#define POOL_ENTRY_SIZE(sz) (( (sz) - ( SLAB_OVERHEAD / _MIN1(PAGE_SIZE / (sz)) ) ) & ~0xfu) +#define POOL_ENTRIES_PER_TABLE(sz) \ + _MIN1((PAGE_SIZE - SLAB_OVERHEAD) / POOL_ENTRY_SIZE(sz)) -static struct slab *slab_create(unsigned int entry_size, enum mflags flags); +#define POOL_DEFINE(sz) { \ + .entry_size = POOL_ENTRY_SIZE(sz), \ + .entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \ + .total_used = ATOM_DEFINE(0), \ + .page_order = ((sz) - 1) / PAGE_SIZE, \ + .empty_lock = SPIN_DEFINE, \ + .partial_lock = SPIN_DEFINE, \ + .full_lock = SPIN_DEFINE, \ + .empty_count = ATOM_DEFINE(0), \ + .partial_count = ATOM_DEFINE(0), \ + .full_count = ATOM_DEFINE(0), \ +} -static inline int get_order(usize size) +static struct slab_pool slab_pools_normal[] = { + POOL_DEFINE(32), + POOL_DEFINE(64), + POOL_DEFINE(128), + POOL_DEFINE(256), + POOL_DEFINE(512), + POOL_DEFINE(1024), + POOL_DEFINE(2048), + POOL_DEFINE(4096), + POOL_DEFINE(8192), + POOL_DEFINE(16384), + POOL_DEFINE(32768), + { .entry_size = 0 } /* terminator */ +}; +static struct slab_pool slab_pools_dma[] = { + POOL_DEFINE(32), + POOL_DEFINE(64), + POOL_DEFINE(128), + POOL_DEFINE(256), + POOL_DEFINE(512), + POOL_DEFINE(1024), + { .entry_size = 0 } /* terminator */ +}; +#undef _MIN1 /* we don't wanna end up using this in actual code, do we? 
*/ + +static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = { + [_M_ZONE_NORMAL] = slab_pools_normal, + [_M_ZONE_DMA] = slab_pools_dma, +}; + +static struct slab *slab_create(struct slab_pool *pool, enum mflags flags); + +void kmalloc_init(void) { - int order; - usize order_size = PAGE_SIZE; + for (int i = 0; i < MM_NR_ZONES; i++) { + struct slab_pool *pool = slab_zone_pools[i]; - for (order = 0; order <= GET_PAGE_MAX_ORDER; order++) { - if (order_size >= size) - break; - order_size <<= 1; + while (pool->entry_size != 0) { + clist_init(&pool->empty_list); + clist_init(&pool->partial_list); + clist_init(&pool->full_list); + pool++; + } } - - return order; } void *kmalloc(usize size, enum mflags flags) { - if (size > SLAB_MAX_ALLOC) { - if (flags & M_CONTIG) { - int order = get_order(size); - if (order > GET_PAGE_MAX_ORDER) { - slab_debug("Requested alloc size %zu too large for get_pages()\n", - size); + if (size == 0) + return nil; + +#if CFG_POISON_SLABS + size += sizeof(struct slab_poison); +#endif + + SLAB_DEBUG_BLOCK { + if (!(flags & _M_NOWAIT) && in_irq()) { + slab_debug("kmalloc() called from irq without M_NOWAIT " + "(caller: %p)\n", ktrace_return_addr()); + flags |= _M_NOWAIT; + } + } + + SLAB_ASSERT(_M_ZONE_INDEX(flags) < ARRAY_SIZE(slab_zone_pools)); + struct slab_pool *pool = slab_zone_pools[_M_ZONE_INDEX(flags)]; + while (pool->entry_size != 0) { + if (pool->entry_size >= size) + break; + pool++; + } + + if (pool->entry_size == 0) { + slab_debug("Refusing to allocate %zu bytes in zone %d (limit is %u)\n", + size, _M_ZONE_INDEX(flags), pool[-1].entry_size); + return nil; + } + + slab_debug_noisy("alloc %zu bytes from zone %d, pool size %u\n", + size, _M_ZONE_INDEX(flags), pool->entry_size); + + /* + * Before locking a slab, we always remove it from its pool. + * This is far from optimal, because if multiple CPUs allocate from the + * same pool at the same time, we could end up creating several slabs + * with one used entry each (not to mention the overhead of the mostly + * unnecessary list deletions/insertions). However, it allows me to be + * lazier when freeing unused slabs from a background thread since that + * thread knows for sure that once it has removed a slab from free_list, + * it can't possibly be used for allocations anymore. + * This is probably not worth the overhead, though. 
+ */ + struct slab *slab = nil; + + /* try to use a slab that is already partially used first */ + register_t cpuflags = intr_disable(); + spin_lock(&pool->partial_lock); + if (!clist_is_empty(&pool->partial_list)) { + atom_dec(&pool->partial_count); + slab = clist_del_first_entry(&pool->partial_list, typeof(*slab), link); + } + spin_unlock(&pool->partial_lock); + + if (slab == nil) { + /* no partially used slab available, see if we have a completely free one */ + spin_lock(&pool->empty_lock); + if (!clist_is_empty(&pool->empty_list)) { + atom_dec(&pool->empty_count); + slab = clist_del_first_entry(&pool->empty_list, typeof(*slab), link); + } + spin_unlock(&pool->empty_lock); + + if (slab == nil) { + /* we're completely out of usable slabs, allocate a new one */ + intr_restore(cpuflags); + slab = slab_create(pool, flags); + if (slab == nil) { + slab_debug("kernel OOM\n"); return nil; - } else { - return get_pages(order, flags); } - } else { - slab_debug("Refusing to allocate %zu bytes as slabs\n", size); - return nil; + intr_disable(); } + } + + /* if we've made it to here, slab != nil and interrupts are disabled */ + spin_lock(&slab->lock); + void *ret = slab->freelist; + slab->freelist = *slab->freelist; + if (--slab->free_entries == 0) { + spin_lock(&pool->full_lock); + clist_add(&pool->full_list, &slab->link); + spin_unlock(&pool->full_lock); + atom_inc(&pool->full_count); } else { - return slab_alloc(size, flags); + spin_lock(&pool->partial_lock); + clist_add(&pool->partial_list, &slab->link); + spin_unlock(&pool->partial_lock); + atom_inc(&pool->partial_count); } + spin_unlock(&slab->lock); + intr_restore(cpuflags); + + atom_inc(&pool->total_used); + +#if CFG_POISON_SLABS + struct slab_poison *poison = ret; + poison_after_alloc(poison, size - sizeof(*poison), ktrace_return_addr()); + ret = poison->data; +#endif + return ret; } void kfree(void *ptr) { - kprintf("kfree() is not implemented yet lmao\n"); -} + if (ptr == nil) + return; -void slab_init(void) -{ - slab_debug("Initializing %zu cache pools (%zu~%zu bytes)\n", - ARRAY_SIZE(pools), SLAB_STEP, SLAB_MAX_ALLOC); - for (int i = 0; i < ARRAY_SIZE(pools); i++) - clist_init(&pools[i]); + SLAB_ASSERT(ptr >= DMAP_START && ptr < DMAP_END); + + vm_page_t page = vaddr2pg(ptr); + SLAB_ASSERT(pga_slab(page)); + struct slab *slab = page->extra; + struct slab_pool *pool = slab->pool; +#if CFG_POISON_SLABS + struct slab_poison *poison = container_of(ptr, typeof(*poison), data); + poison_after_free(poison); + ptr = poison; +#endif + + register_t cpuflags = intr_disable(); + spin_lock(&slab->lock); + *(void **)ptr = slab->freelist; + slab->freelist = (void **)ptr; + if (++slab->free_entries == pool->entries_per_slab) { + spin_lock(&pool->partial_lock); + clist_del(&slab->link); + spin_unlock(&pool->partial_lock); + atom_dec(&pool->partial_count); + + spin_lock(&pool->empty_lock); + clist_add(&pool->empty_list, &slab->link); + spin_unlock(&pool->empty_lock); + atom_inc(&pool->empty_count); + } + spin_unlock(&slab->lock); + atom_dec(&pool->total_used); + intr_restore(cpuflags); } -static inline void *slab_alloc(usize size, enum mflags flags) +static struct slab *slab_create(struct slab_pool *pool, enum mflags flags) { - size = align_ceil(size, SLAB_STEP); - if (size == 0 || size > SLAB_MAX_ALLOC) - return nil; + slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size); + struct slab *slab = get_zero_pages(pool->page_order, flags); - struct clist *pool = &pools[size / SLAB_STEP - 1]; - struct slab *slab = nil; - struct slab 
*cursor; - clist_foreach_entry(pool, cursor, clink) { - if (cursor->free_entries > 0) { - slab = cursor; - break; + if (slab != nil) { + vm_page_t page = vaddr2pg(slab); + /* XXX it's probably sufficient to only do this for the lowest page */ + vm_page_foreach_in_order(page) { + pga_set_slab(page, true); + page->extra = slab; } - } - if (slab == nil) { - slab = slab_create(size, flags); - if (slab == nil) - return nil; /* OOM */ - clist_add_first(pool, &slab->clink); + + spin_init(&slab->lock); + slab->pool = pool; + slab->free_entries = pool->entries_per_slab; + void *prev = nil; + void *end = (void *)slab + (1 << (pool->page_order + PAGE_SHIFT)); + void *pos = end; + do { + pos -= pool->entry_size; + *(void **)pos = prev; + prev = pos; + } while (pos >= (void *)&slab[1] + pool->entry_size); + slab->freelist = pos; } - /* list must have at least one entry, otherwise - * we would have created a completely new slab */ - struct clist *ret = slab->freelist.next; - clist_del(ret); - slab->free_entries--; -# if CFG_POISON_HEAP - memset(ret, 'a', size); -# endif - return (void *)ret; + return slab; } -static inline void slab_free(void *ptr) +#if CFG_POISON_SLABS +static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size, + void *alloc_source) { -# if CFG_DEBUG_SLAB_ALLOCS - if (ptr < kheap_start || ptr >= kheap_end) { - kprintf("slab_free(%p): invalid ptr!\n", ptr); - return; - } - if ((uintptr_t)ptr % SLAB_STEP) { - kprintf("slab_free(%p): unaligned ptr!\n", ptr); - } -# endif + u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long); + u_long *poison_start = &poison->low_poison; - struct slab *slab = GET_SLAB(ptr); - slab->free_entries++; + /* slabs are zeroed out when they are newly allocated */ + if (poison->exact_size != 0) { + for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) { + if (*pos != SLAB_POISON_FREE) { + kprintf("Use-after-free in %p (alloc by %p)\n", + poison->data, poison->alloc_source); + break; + } + } + } -# if CFG_POISON_HEAP - memset(ptr, 'A', slab->entry_size); -# endif + /* update offset to the new size */ + offset = align_ceil(exact_size, sizeof(long)) / sizeof(long); - if (slab->free_entries * slab->entry_size + slab->entry_size > SLAB_MAX_ALLOC) { - /* none of the entries are in use, free the slab */ - slab_debug_noisy("Destroying empty cache of size %zu\n", slab->entry_size); - free_pages(slab); - } else { - clist_add(&slab->freelist, (struct clist *)ptr); - } + poison->alloc_source = alloc_source; + poison->exact_size = exact_size; + for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++) + *pos = SLAB_POISON_ALLOC; } -static struct slab *slab_create(unsigned int entry_size, enum mflags flags) +static inline void poison_after_free(struct slab_poison *poison) { - slab_debug_noisy("Creating new cache for size %zu\n", entry_size); - struct slab *slab = get_pages(SLAB_SIZE / PAGE_SIZE, flags); + u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long); - if (slab != nil) { - clist_init(&slab->freelist); - slab->free_entries = 0; - slab->entry_size = entry_size; - - void *startptr = (void *)slab + sizeof(*slab); - void *endptr = (void *)slab + SLAB_SIZE - entry_size; - for (void *pos = startptr; pos <= endptr; pos += entry_size) { - clist_add(&slab->freelist, (struct clist *)pos); - slab->free_entries++; - } + if (poison->low_poison != SLAB_POISON_ALLOC) { + kprintf("Low out-of-bounds write to %p (alloc by %p)\n", + poison->data, poison->alloc_source); } - 
return slab; + if (poison->high_poison[offset] != SLAB_POISON_ALLOC) { + kprintf("High out-of-bounds write to %p (alloc by %p)\n", + poison->data, poison->alloc_source); + } + + for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++) + *pos = SLAB_POISON_FREE; +} +#endif + +__weak void *malloc(usize size) +{ + return kmalloc(size, M_KERN); +} + +__weak void free(void *ptr) +{ + kfree(ptr); }
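
The allocation fast path above boils down to an intrusive freelist: every free
entry of a slab stores the address of the next free entry in its own first
word, slab_create() threads that list through a freshly allocated page,
kmalloc() pops the list head and kfree() pushes the entry back.  A stand-alone
sketch of the same technique, reduced to user-space C; the demo_* names, the
fixed 64-byte entry size and the static buffer are made up for the example and
are not part of the patch (no locking, no slab header, no pool lists):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define ENTRY_SIZE 64	/* one size class, like a single slab_pool */
#define NENTRIES   8

static alignas(void *) char slab_mem[ENTRY_SIZE * NENTRIES];
static void **freelist;	/* head of the list, like struct slab::freelist */

static void demo_slab_init(void)
{
	/* thread the list through the buffer back to front, similar to
	 * what slab_create() does with a freshly allocated page */
	void *prev = NULL;
	for (int i = NENTRIES - 1; i >= 0; i--) {
		void **entry = (void **)(slab_mem + i * ENTRY_SIZE);
		*entry = prev;		/* first word -> next free entry */
		prev = entry;
	}
	freelist = prev;
}

static void *demo_slab_alloc(void)
{
	void **entry = freelist;
	if (entry != NULL)
		freelist = *entry;	/* pop the head, as kmalloc() does */
	return entry;
}

static void demo_slab_free(void *ptr)
{
	*(void **)ptr = freelist;	/* push onto the head, as kfree() does */
	freelist = ptr;
}

int main(void)
{
	demo_slab_init();
	void *a = demo_slab_alloc();
	void *b = demo_slab_alloc();
	printf("a = %p, b = %p\n", a, b);
	demo_slab_free(a);
	demo_slab_free(b);
	return 0;
}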
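
With CFG_POISON_SLABS enabled, each allocation is wrapped in a struct
slab_poison: one guard word sits directly below and one directly above the
user data, poison_after_alloc() fills the whole region with SLAB_POISON_ALLOC,
and poison_after_free() checks the guards (out-of-bounds detection) before
refilling everything with SLAB_POISON_FREE so that the next allocation of the
same entry can tell whether freed memory was written to (use-after-free
detection).  A reduced user-space sketch of that scheme; the demo_* names and
the single fixed-size slot are invented for the example, and the kernel's
extra header fields (freelist pointer, alloc_source, exact_size) are left out:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 64-bit variants of the patterns from include/gay/poison.h */
#define POISON_ALLOC 0x6161616161616161ull
#define POISON_FREE  0x4141414141414141ull
#define DATA_WORDS   8	/* 64 bytes of user data per slot */

struct demo_slot {
	uint64_t low_poison;		/* guard directly below the data */
	uint64_t data[DATA_WORDS];	/* what the caller gets to use */
	uint64_t high_poison;		/* guard directly above the data */
};

static struct demo_slot slot;	/* stands in for one slab entry */
static int slot_was_freed;	/* like the exact_size != 0 check in the patch */

static void *demo_alloc(void)
{
	/* use-after-free check: since the last demo_free(), every word
	 * must still hold the free pattern */
	if (slot_was_freed) {
		for (int i = 0; i < DATA_WORDS; i++) {
			if (slot.data[i] != POISON_FREE) {
				printf("use-after-free write in %p\n", (void *)slot.data);
				break;
			}
		}
	}

	/* fill guards and data with the alloc pattern */
	slot.low_poison = POISON_ALLOC;
	slot.high_poison = POISON_ALLOC;
	for (int i = 0; i < DATA_WORDS; i++)
		slot.data[i] = POISON_ALLOC;
	return slot.data;
}

static void demo_free(void *ptr)
{
	(void)ptr;
	/* out-of-bounds check: the guard words must be untouched */
	if (slot.low_poison != POISON_ALLOC)
		printf("low out-of-bounds write to %p\n", (void *)slot.data);
	if (slot.high_poison != POISON_ALLOC)
		printf("high out-of-bounds write to %p\n", (void *)slot.data);

	/* refill everything with the free pattern for the next alloc */
	slot.low_poison = POISON_FREE;
	slot.high_poison = POISON_FREE;
	for (int i = 0; i < DATA_WORDS; i++)
		slot.data[i] = POISON_FREE;
	slot_was_freed = 1;
}

int main(void)
{
	char *p = demo_alloc();
	memset(p, 0, sizeof(slot.data));
	p[sizeof(slot.data)] = 'x';	/* one byte past the end of the data */
	demo_free(p);			/* reports a high out-of-bounds write */

	p[0] = 'y';			/* write to freed memory */
	demo_alloc();			/* reports a use-after-free write */
	return 0;
}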
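
union vm_page_attr in include/gay/vm/page.h packs the page order, the
lock/rsvd/pcpu/slab bits and the zone index into a single int, and helpers
like pga_set_order() and pga_set_zone() update one field through a
read/modify/compare-exchange loop so that concurrent updates to the other bits
are not lost.  The same pattern, written against C11 <stdatomic.h> rather than
the kernel's atom_* primitives; the demo_* names and the trimmed-down field
layout are only for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* same idea as union vm_page_attr: several small fields share one int */
union demo_attr {
	int _val;
	struct {
		unsigned order:8;
		bool slab:1;
		unsigned zone:2;
	};
};

static _Atomic int page_attr;	/* stands in for struct vm_page::attr */

/* returns the previous order, analogous to pga_set_order() above */
static unsigned demo_set_order(unsigned order)
{
	for (;;) {
		union demo_attr old = { ._val = atomic_load(&page_attr) };
		union demo_attr new = old;
		new.order = order;
		/* publish the new value only if nothing else changed the
		 * word in the meantime; otherwise reload and try again */
		if (atomic_compare_exchange_weak(&page_attr, &old._val, new._val))
			return old.order;
	}
}

int main(void)
{
	demo_set_order(3);
	printf("previous order: %u\n", demo_set_order(5));	/* prints 3 */
	return 0;
}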