From 385af1b7efad63dbfd426e8a0232613e008859ca Mon Sep 17 00:00:00 2001 From: fef Date: Sat, 20 Nov 2021 22:49:05 +0100 Subject: [PATCH] mm: refactor page frame allocator This is part 3 of the mm subsystem overhaul. The allocator doesn't rely on mutexes anymore and uses individual per-order spinlocks instead. Also, it is aware of multiple memory zones (normal and DMA) as well as emergency reserves. Page bitmaps take up 50 % less overhead now. --- arch/x86/include/amd64/latom.h | 2 +- arch/x86/mm/amd64/init.c | 112 +++--- arch/x86/mm/amd64/page.c | 45 --- cmake/config.cmake | 6 + include/gay/config.h.in | 9 + include/gay/mm.h | 149 +++----- include/gay/vm/page.h | 98 ++++-- kernel/mm/CMakeLists.txt | 1 - kernel/mm/boot.c | 5 +- kernel/mm/kmalloc.c | 74 ---- kernel/mm/page.c | 626 ++++++++++++++++++--------------- 11 files changed, 546 insertions(+), 581 deletions(-) delete mode 100644 kernel/mm/kmalloc.c diff --git a/arch/x86/include/amd64/latom.h b/arch/x86/include/amd64/latom.h index 5fa72ab..72c2163 100644 --- a/arch/x86/include/amd64/latom.h +++ b/arch/x86/include/amd64/latom.h @@ -202,7 +202,7 @@ static inline bool latom_flip_bit(latom_t *latom, int pos) __asm__ volatile( X86_LOCK_PREFIX -" btcq %1, (%2) \n" +" btcq %q1, (%2) \n" " setc %b0 \n" : "+r"(ret) : "r"(pos), "r"(&latom->_value) diff --git a/arch/x86/mm/amd64/init.c b/arch/x86/mm/amd64/init.c index 336d848..da86304 100644 --- a/arch/x86/mm/amd64/init.c +++ b/arch/x86/mm/amd64/init.c @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -15,7 +14,7 @@ #include struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET; -#ifdef DEBUG +#if CFG_DEBUG_PGADDRS /* this gets updated in x86_setup_paging() once we know how big the array is */ vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH); #endif @@ -41,6 +40,51 @@ static void register_area(struct mb2_mmap_entry *entry) } } +/** + * @brief Map the entire physical memory to `DMAP_OFFSET`. + * + * This may overshoot up to 1 GB because we only use gigapages, but considering + * the fact that mapping literally the entire physical RAM is probably the + * bigger problem here i'd say it's fine. + * + * @param end End of physical memory + */ +static void map_direct_area(vm_paddr_t end) +{ + vm_paddr_t ppos = 0; + void *vpos = __v(0); + void *const vend = __v(end); + /* This assertion fails if > 4 TB of physical memory are available. + * Sorry gamers, we don't support enough RAM for all your Chrome tabs. 
*/ + KASSERT(vend < DMAP_END); + + while (vpos < vend) { + x86_pml4te_t *pml4te = X86_PML4TE(vpos); + vm_paddr_t pdpt_phys = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL); + panic_if(pdpt_phys == BOOT_PMALLOC_ERR, + "cannot allocate memory for direct mapping"); + + __boot_clear_page(pdpt_phys); + pml4te->val = pdpt_phys | __P_PRESENT | __P_RW | __P_NOEXEC; + vm_flush(); + + for (int pdpti = 0; pdpti < 512; pdpti++) { + x86_pdpte_t *pdpte = X86_PDPTE(vpos); + pdpte->val = ppos | __P_PRESENT | __P_RW | __P_GLOBAL + | __P_HUGE | __P_NOEXEC; + + ppos += GIGAPAGE_SIZE; + vpos += GIGAPAGE_SIZE; + if (vpos >= vend) + break; + } + + pml4te->flags.global = 1; + } + + vm_flush(); +} + /* * "Oh cool another deeply nested 100-liner that nobody understands" */ @@ -68,16 +112,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) * (this is gonna be a long one) */ struct vm_page *vm_page_array_end = vm_page_array + (end >> PAGE_SHIFT); -#ifdef DEBUG +#if CFG_DEBUG_PGADDRS _vm_page_array_end = vm_page_array_end; #endif void *map_pos = vm_page_array; - usize remaining_size = (void *)vm_page_array_end - (void *)vm_page_array; - remaining_size = align_ceil(remaining_size, PAGE_SIZE); - kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size); + void *map_end = map_pos + ((void *)vm_page_array_end - (void *)vm_page_array); + kprintf("Mapping %zu bytes for vm_page_array\n", map_end - map_pos); /* PML4T loop */ - while (remaining_size != 0) { + while (map_pos < map_end) { /* Is vm_page_array so huge that it spans almost the entire 2 TB * kernel region? If that's the case, something has gone terribly * wrong, unless we somehow happen to have about an Exabyte of RAM @@ -85,7 +128,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) KASSERT(map_pos < (void *)KERNBASE); x86_pml4te_t *pml4te = X86_PML4TE(map_pos); - vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT); + vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL); panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array"); __boot_clear_page(pml4te_val); pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC; @@ -98,8 +141,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) vm_paddr_t pdpte_val; /* try allocating a 1 GB gigapage first */ - if (remaining_size >= 1 << X86_PDPT_SHIFT) { - pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT); + if (map_end - map_pos > GIGAPAGE_SIZE) { + pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT, MM_ZONE_NORMAL); /* CLion is warning about this condition being always true, but * that is not the case. I've checked the disassembly with -O2, * and clang is emitting the check. So it's fine, i guess. 
*/ @@ -107,16 +150,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE | __P_GLOBAL | __P_NOEXEC; pdpte->val = pdpte_val; - remaining_size -= 1 << X86_PDPT_SHIFT; - map_pos += 1 << X86_PDPT_SHIFT; - if (remaining_size == 0) + map_pos += GIGAPAGE_SIZE; + if (map_pos >= map_end) goto map_done; continue; } } /* couldn't use a gigapage, continue in hugepage steps */ - pdpte_val = __boot_pmalloc(PAGE_SHIFT); + pdpte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL); panic_if(pdpte_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array"); __boot_clear_page(pdpte_val); @@ -130,22 +172,21 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) vm_paddr_t pdte_val; /* try allocating a 2 MB hugepage first */ - if (remaining_size >= (1 << X86_PDT_SHIFT)) { - pdte_val = __boot_pmalloc(X86_PDT_SHIFT); + if (map_end - map_pos >= HUGEPAGE_SIZE) { + pdte_val = __boot_pmalloc(X86_PDT_SHIFT, MM_ZONE_NORMAL); if (pdte_val != BOOT_PMALLOC_ERR) { pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_HUGE | __P_NOEXEC; pdte->val = pdte_val; - remaining_size -= 1 << X86_PDT_SHIFT; - map_pos += 1 << X86_PDT_SHIFT; - if (remaining_size == 0) + map_pos += HUGEPAGE_SIZE; + if (map_pos >= map_end) goto map_done; continue; } } /* couldn't use a hugepage, continue in page steps */ - pdte_val = __boot_pmalloc(PAGE_SHIFT); + pdte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL); panic_if(pdte_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array"); __boot_clear_page(pdpte_val); @@ -156,15 +197,14 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) /* PT loop */ for (int pt_index = 0; pt_index < 512; pt_index++) { x86_pte_t *pte = X86_PTE(map_pos); - vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT); + vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT, MM_ZONE_NORMAL); panic_if(pte_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array"); pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC; pte->val = pte_val; - remaining_size -= 1 << X86_PT_SHIFT; - map_pos += 1 << X86_PT_SHIFT; - if (remaining_size == 0) + map_pos += PAGE_SIZE; + if (map_pos >= map_end) goto map_done; } /* end of PT loop */ } /* end of PDT loop */ @@ -172,26 +212,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap) } /* end of PML4T loop */ map_done: - vm_flush(); -} - -static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags) -{ - KASSERT(start <= end); - vm_page_t cursor = vm_page_array + (start >> PAGE_SHIFT); - usize count = (end - start) >> PAGE_SHIFT; - - if (flags == 0) { - memset(cursor, 0, count * sizeof(*cursor)); - } else { - while (count--) { - atom_init(&cursor->count, 0); - cursor->flags = flags; - cursor->try_free = nil; - cursor->extra = nil; - cursor++; - } - } + map_direct_area(end); + paging_init(end); } /* @@ -199,7 +221,7 @@ static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags) * a page table, yet also need to reference it in the page table structures * (thereby mapping it into virtual memory) before we can zero it out. * This little hack temporarily maps the area at one PDP entry before KERNBASE - * (meaning index 1022 of _pdp0), zeroes the area, and then unmaps it again. + * (meaning index 510 of _pdp0), zeroes the area, and then unmaps it again. 
*/ void __boot_clear_page(vm_paddr_t paddr) { diff --git a/arch/x86/mm/amd64/page.c b/arch/x86/mm/amd64/page.c index ce5b18d..ca69144 100644 --- a/arch/x86/mm/amd64/page.c +++ b/arch/x86/mm/amd64/page.c @@ -4,15 +4,10 @@ #include #include -#include -#include #include -#include #include #include -#include - /* * Initial Page Directory Pointer Table and Page Map Level 4 Table for the * assembly startup routine (see setup64.S). Used for statically mapping the @@ -21,46 +16,6 @@ __asmlink x86_pdpt_t _pdpt0; __asmlink x86_pml4t_t _pml4t; -int map_page(uintptr_t phys, void *virt, enum pflags flags) -{ - flags |= P_PRESENT; - x86_pml4te_t *pml4e = X86_PML4TE(virt); - if (!pml4e->flags.present) { - void *page = get_pages(0, M_ATOMIC); - if (page == nil) - return -ENOMEM; - pml4e->val = __p(page) | P_PRESENT | P_RW; - } - - return 0; -} - -/* - * The only difference between this and map_page() is that we can't allocate - * new pages using get_pages() but have to use __early_get_page() instead here. - * So, all we need to do is ensure that map_page() doesn't need to allocate new - * pages when we call it, which it only does if pflags does not have P_HUGE - * set and the page table doesn't exist (present bit in the page directory is - * clear). Therefore, we just need to make sure that, if P_HUGE is *not* - * set, the page table is already allocated and marked as present in the page - * directory. - */ -void __early_map_page(uintptr_t phys, void *virt, enum pflags pflags) -{ -} - -uintptr_t unmap_page(void *virt) -{ -} - -enum pflags get_pflags(void *page) -{ -} - -int set_pflags(void *page, enum pflags pflags) -{ -} - void x86_isr_page_fault(trap_frame_t *frame, u32 error_code) { void *address; diff --git a/cmake/config.cmake b/cmake/config.cmake index 8a3b411..3aaf223 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -14,6 +14,10 @@ option(CFG_POISON_PAGES "Poison pages after allocate and free" ON) option(CFG_POISON_HEAP "Poison heap memory after kmalloc() and kfree()" ON) +set(CFG_PAGE_EMERG_DENOM "16" CACHE STRING "Denominator for the fraction of pages kept in emergency reserves") + +set(CFG_PAGE_EMERG_MAX "1024" CACHE STRING "Absolute maximum number of pages kept in emergency reserves") + option(CFG_SMP "Enable Symmetric Multiprocessing" ON) set(CFG_MAX_CPU "64" CACHE STRING "Maximum number of logical processors") @@ -28,6 +32,8 @@ option(CFG_DEBUG_PAGE_ALLOCS "Debug page frame allocations" OFF) option(CFG_DEBUG_PAGE_ALLOCS_NOISY "Debug page frame allocations in full detail (VERY noisy)" OFF) +option(CFG_DEBUG_PGADDRS "Sanitize page frame addresses" OFF) + option(CFG_DEBUG_SLAB_ALLOCS "Debug slab allocations" OFF) option(CFG_DEBUG_SLAB_ALLOCS_NOISY "Debug slab allocations in full detail (VERY noisy)" OFF) diff --git a/include/gay/config.h.in b/include/gay/config.h.in index 9f98752..bd1bd99 100644 --- a/include/gay/config.h.in +++ b/include/gay/config.h.in @@ -31,6 +31,12 @@ /** @brief Poison heap areas after `kmalloc()` and `kfree()` */ #cmakedefine01 CFG_POISON_HEAP +/** @brief Denominator for the fraction of pages kept in emergency reserves */ +#define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@ + +/** @brief Absolute maximum number of pages kept in emergency reserves */ +#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_THRESH@ + /** @brief Enable Symmetric Multiprocessing */ #cmakedefine01 CFG_SMP @@ -52,6 +58,9 @@ /** @brief Spit out the full details of page allocations */ #cmakedefine01 CFG_DEBUG_PAGE_ALLOCS_NOISY +/** @brief Sanitize page frame addresses */ +#cmakedefine01 
CFG_DEBUG_PGADDRS + /** @brief Debug slab allocations */ #cmakedefine01 CFG_DEBUG_SLAB_ALLOCS diff --git a/include/gay/mm.h b/include/gay/mm.h index 886e143..d32eb5d 100644 --- a/include/gay/mm.h +++ b/include/gay/mm.h @@ -16,6 +16,13 @@ * bigger areas of memory that are not physically contiguous (for regular user * allocations). The entire physical memory is mapped statically in the range * `DMAP_START - DMAP_END`. + * + * Memory is split up into (currently) two zones: `MM_ZONE_NORMAL` and + * `MM_ZONE_DMA`. As their names suggest, the former is for general purpose + * allocations and the latter for getting memory suitable for DMA transfers. + * Zones are further divided into pools, each of which hold a list of groups of + * free pages. The size of these page groups is determined by the pool's order, + * where the pool of order `n` holds groups of `1 << n` pages. */ #ifdef _KERNEL @@ -23,10 +30,14 @@ #include #include +#include #include #include +#include #include +#include + #define _M_ZONE_NORMAL 0 #define _M_ZONE_DMA 1 #define _M_ZONE_INDEX(flags) ((flags) & 1) @@ -40,15 +51,34 @@ enum mm_zone_type { MM_NR_ZONES }; +/** @brief Boot memory area. */ struct _bmem_area { struct clist link; /* -> struct mm_zone::_bmem_areas */ vm_paddr_t start; vm_paddr_t end; }; +struct mm_pool { + struct clist freelist; /* -> vm_page_t::link */ + /** @brief Number of items in `freelist`. */ + usize free_entries; + /** @brief One bit per buddy *pair*, 1 if exactly one is allocated. */ + latom_t *bitmap; + spin_t lock; +}; + +#define MM_NR_ORDERS 10 +#define MM_MAX_ORDER (MM_NR_ORDERS - 1) + struct mm_zone { - patom_t freelist; /* -> struct vm_page */ - usize length; + /** @brief Current number of free pages in all pools */ + latom_t free_count; + /** @brief Thresholds for OOM behavior */ + struct { + /** @brief Minimum number of pages reserved for emergency allocations */ + u_long emerg; + } thrsh; + struct mm_pool pools[MM_NR_ORDERS]; struct clist _bmem_areas; /* -> struct _bmem_area */ }; @@ -59,7 +89,7 @@ struct mm_zone { * The mm subsystem isn't NUMA aware, because it's not really a thing on desktop * grade machines anyway and would only complicate things unnecessarily. */ -extern struct mm_zone mm_zones[MM_NR_ZONES]; +extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */ /** * @brief Memory allocation flags passed to `kmalloc()`. @@ -122,108 +152,33 @@ enum pflags { #endif }; -/* - * Terrible hack that allows us to map pages before the page frame allocator is - * set up. Don't ever use these anywhere, because they *will* break everything. - */ -void __early_map_page(uintptr_t phys, void *virt, enum pflags flags); -/* This just shrinks phys_end by PAGE_SIZE and returns the page */ -uintptr_t __early_get_page(void); - -/** - * @brief Map a page in physical memory to a virtual address. - * Remember that if `vm` is the memory map currently in use, you will most - * likely need to call `vm_update()` when you've finished mapping everything - * to flush the TLB. - * - * @param phys Physical address of the page - * @param virt Virtual address to map the page to - * @param flags Flags to apply to the page - * @returns 0 on success, or `-ENOMEM` if OOM (for allocating new page tables) - */ -int map_page(uintptr_t phys, void *virt, enum pflags flags); - -/** - * @brief Remove a page mapping. 
- * - * @param virt Virtual address the page is mapped to, must be page aligned - * @returns The physical page address that was being mapped - */ -uintptr_t unmap_page(void *virt); - -/** - * @brief Get a page's flags in the page tables. - * - * @param page Page to get the flags of (if the page is in a hugepage area, - * the flags for that hugepage will be returned with `P_HUGE = 1`) - * @return The flags, as currently stored in the page table structures - * (but not necessarily applied if they have been modified and `vm_flush()` - * has not been called yet!) - */ -enum pflags get_pflags(void *page); - -/** - * @brief Update a page's flags in the page tables. - * You should always use this in conjunction with `get_pflags()`, as in getting - * the flags first, then toggling the flags you need to, and then setting them - * in the tables again. This is because this method will clear *any* previous - * flags. - * - * @param page Page to set flags for (if flags has `P_HUGE` set, must be - * `HUGEPAGE_SIZE` aligned, otherwise `PAGE_SIZE` aligned) - * @param flags Flags to set - * @return 0 on success, or a negative value if either a page table allocation - * failed or - */ -int set_pflags(void *page, enum pflags flags); - -/** - * @brief Initialize the memory allocator. - * - * This can only be called once, from the early `_boot()` routine. - * - * @param _phys_start Physical start address of the page area - * @param _phys_end Physical end address of the page area - * @returns 0 on success, or -1 if the pointers were garbage - */ -int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end); - -/** @brief Start of the mapped, physically contiguous kernel heap */ -extern void *kheap_start; -/** @brief End of the mapped, physically contiguous kernel heap */ -extern void *kheap_end; - -/** @brief Start of the kernel heap in physical memory */ -extern uintptr_t phys_start; -/** @brief End of the kernel heap in physical memory */ -extern uintptr_t phys_end; - /** * @brief Initialize the buddy page frame allocator. - * This is only called once, internally from `kmalloc_init()`. - * - * @return 0 on success, or -1 if it messed up + * This is only called once, from the arch dependent counterpart after it has + * reserved memory for and mapped `vm_page_array`, as well as mapped the direct + * area. */ -int pages_init(void); +void paging_init(vm_paddr_t phys_end); /** * @brief Allocate a contiguous region in physical memory. * The returned region will be `(1 << order) * PAGE_SIZE` bytes long. * - * @param order Order of magnitude (as in `1 << order`) for the region size - * @param flags How to allocate (`order` must be 0 if `M_NOWAIT` is specified) + * **The pages are not initialized.** + * If you want zeroed pages, use `get_zero_pages()`. 
+ * + * @param order Order of magnitude (as in `1 << order` pages) + * @param flags How to allocate * @return A pointer to the beginning of the region in the direct mapping area, * or `nil` if the allocation failed */ -void *get_pages(int order, enum mflags flags) __malloc_like; -#ifdef __HAVE_HUGEPAGES -#define GET_PAGE_ORDERS (HUGEPAGE_SHIFT - PAGE_SHIFT + 1) -#else -#define GET_PAGE_ORDERS 10 -#endif -#define GET_PAGE_MAX_ORDER (GET_PAGE_ORDERS - 1) +void *get_pages(u_int order, enum mflags flags) __malloc_like; +void *get_page(enum mflags flags) __malloc_like; +void *get_zero_pages(u_int order, enum mflags flags) __malloc_like; +void *get_zero_page(enum mflags flags) __malloc_like; void free_pages(void *ptr); +#define free_page(ptr) free_pages(ptr) /** * @brief Initialize the slab caches. @@ -240,14 +195,8 @@ void slab_init(void); * @param phys Physical address * @return Virtual address */ -static inline void *__v(uintptr_t phys) +static inline void *__v(vm_paddr_t phys) { -# ifdef DEBUG - if (phys > phys_end) { - kprintf("__v(%p): phys ptr out of range!\n", (void *)phys); - return nil; - } -# endif return (void *)phys + DMAP_OFFSET; } @@ -262,7 +211,7 @@ static inline void *__v(uintptr_t phys) * @return The physical address, i.e. `virt - DMAP_OFFSET` * @see vtophys() */ -static inline uintptr_t __p(void *virt) +static inline vm_paddr_t __p(void *virt) { # ifdef DEBUG if (virt < DMAP_START || virt >= DMAP_END) { diff --git a/include/gay/vm/page.h b/include/gay/vm/page.h index d7245be..72eb52b 100644 --- a/include/gay/vm/page.h +++ b/include/gay/vm/page.h @@ -5,9 +5,23 @@ #include #include +#include +#include #include #include +/* + * I'm trying really hard to keep the size of struct vm_page a power of two + * on LP64 systems, because that way we can quickly get to the page frame number + * by shifting the byte offset of the vm_page_t in vm_page_array to the right + * rather than doing a costly divide instruction (or store the page frame number + * within the structure itself, which takes up precious space). + * + * There is insane pressure on the size of this structure, because a typical + * system will have millions of instances of it. Every additional byte makes + * a significant difference in memory management overhead. + */ + /** * @brief Stores information about a single page in physical memory. * There is exactly one of these for every physical page, no matter what that @@ -16,66 +30,94 @@ struct vm_page { /** @brief Reference count (0 = unused) */ atom_t count; + unsigned order:8; /** @brief Various flags describing how and for what the page is used, see below */ - u_int flags; - /** @brief Singly linked list, if the page is free */ - patom_t next; - /** - * @brief Request this page to be freed if possible. - * This callback may be `nil` unless the `PG_FREEABLE` bit in `flags` - * is set. The presence of this bit does *not* guarantee that the page - * is actually reclaimable, it's merely a performance optimization to - * avoid having to call this function on pages that can never be - * reclaimed anyway. - * - * @param page Pointer to the page itself - * @return 0 if the page could be reclaimed and is now free - */ - int (*try_free)(struct vm_page *page); + unsigned flags:24; + struct clist link; /** * @brief Optional extra data pointer, reserved for private use. * The current owner of the page may use this to track the underlying * object in memory (or pretty much anything else), for example the * `struct slab` if this page is currently used by the slab allocator. 
- * Useful for implementing the `try_free()` callback. */ void *extra; }; typedef struct vm_page *vm_page_t; -/* values for struct page::flags */ +/* values for struct vm_page::flags */ + /** @brief Page must never be accessed */ #define PG_RESERVED (1 << 0) -/** @brief Page is in an atomic per-cpu cache */ -#define PG_ATOMIC (1 << 1) +/** @brief Page is in a per-cpu cache */ +#define PG_PCPU (1 << 1) /** @brief Page is used by the slab allocator */ #define PG_SLAB (1 << 2) -/** @brief It **might** be possible to reclaim this page using `try_free()` */ -#define PG_FREEABLE (1 << 3) +/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */ +#define PG_DMA (1u << 3) /** @brief Array of every single page in physical memory, indexed by page frame number. */ extern struct vm_page *const vm_page_array; -#ifdef DEBUG + +#if CFG_DEBUG_PGADDRS extern vm_page_t _vm_page_array_end; +#define PGADDR_ASSERT(x) KASSERT(x) +#else +#define PGADDR_ASSERT(x) ({}) #endif +static inline bool page_get(vm_page_t page) +{ + return atom_inc(&page->count); +} + +static inline bool page_put(vm_page_t page) +{ + return atom_dec(&page->count); +} + /** @brief Get the page frame number of a page. */ -__pure2 static inline u_long pg2pfn(vm_page_t page) +__pure2 +static inline u_long pg2pfn(vm_page_t page) { - KASSERT(page < _vm_page_array_end); + PGADDR_ASSERT(page < _vm_page_array_end); return page - vm_page_array; } -__pure2 static inline u_long paddr2pfn(vm_paddr_t paddr) +__pure2 +static inline vm_page_t vaddr2pg(void *vaddr) +{ + PGADDR_ASSERT(vaddr >= DMAP_START && vaddr < (void *)_vm_page_array_end); + uintptr_t offset = (uintptr_t)vaddr - DMAP_OFFSET; + return &vm_page_array[offset >> PAGE_SHIFT]; +} + +__pure2 +static inline u_long vaddr2pfn(void *vaddr) +{ + u_long pfn = ((uintptr_t)vaddr - DMAP_OFFSET) >> PAGE_SHIFT; + PGADDR_ASSERT(vaddr >= DMAP_START && &vm_page_array[pfn] < _vm_page_array_end); + return pfn; +} + +__pure2 +static inline u_long paddr2pfn(vm_paddr_t paddr) { - KASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end); + PGADDR_ASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end); return paddr >> PAGE_SHIFT; } -__pure2 static inline vm_page_t paddr2pg(vm_paddr_t paddr) +__pure2 +static inline vm_page_t paddr2pg(vm_paddr_t paddr) { vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT); - KASSERT(page < _vm_page_array_end); + PGADDR_ASSERT(page < _vm_page_array_end); return page; } + +__pure2 +static inline void *pfn2vaddr(u_long pfn) +{ + PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end); + return DMAP_START + (pfn << PAGE_SHIFT); +} diff --git a/kernel/mm/CMakeLists.txt b/kernel/mm/CMakeLists.txt index 3be2e46..f91f9b5 100644 --- a/kernel/mm/CMakeLists.txt +++ b/kernel/mm/CMakeLists.txt @@ -2,7 +2,6 @@ target_sources(gay_kernel PRIVATE boot.c - kmalloc.c page.c slab.c ) diff --git a/kernel/mm/boot.c b/kernel/mm/boot.c index 819fd31..e3ee82f 100644 --- a/kernel/mm/boot.c +++ b/kernel/mm/boot.c @@ -13,7 +13,7 @@ static CLIST(bmem_area_freelist); #ifdef DEBUG #define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; }) -#define debug_get_bmem_area(area) KASSERT((area)->start != ~(vm_paddr_t)0) +#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0) #else #define debug_free_bmem_area(area) ({}) #define debug_get_bmem_area(area) ({}) @@ -62,6 +62,9 @@ void __boot_pmalloc_init(void) debug_free_bmem_area(area); clist_add(&bmem_area_freelist, &area->link); } + + for (int i = 0; i < MM_NR_ZONES; i++) + 
clist_init(&mm_zones[i]._bmem_areas); } void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type) diff --git a/kernel/mm/kmalloc.c b/kernel/mm/kmalloc.c deleted file mode 100644 index ae1c798..0000000 --- a/kernel/mm/kmalloc.c +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (C) 2021 fef . All rights reserved. */ - -#include -#include -#include -#include - -extern void _image_start_phys; -extern void _image_end_phys; - -/* these are initialized by pages_init() */ -void *kheap_start; -void *kheap_end; - -int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end) -{ - phys_start = _phys_start; - phys_end = _phys_end; - - /* - * The kernel image is very likely gonna be within the physical memory - * range, so we're gonna need to do some cropping in order to not hand - * out pages that actually contain kernel code. - * Furthermore, somebody should probably clean up this mess somehow. - */ - uintptr_t image_start_phys = (uintptr_t)&_image_start_phys; - uintptr_t image_end_phys = (uintptr_t)&_image_end_phys; - if (phys_start < image_start_phys && phys_end > image_start_phys) { - if (image_start_phys - phys_start > phys_end - image_start_phys) - phys_end = image_start_phys; - else - phys_start = image_end_phys; - } - if (phys_start < image_end_phys && _phys_end > image_end_phys) { - if (image_end_phys - phys_start > phys_end - image_end_phys) - phys_end = image_start_phys; - else - phys_start = image_end_phys; - } - - phys_start = align_ceil(phys_start, HUGEPAGE_SIZE); - /* - * This is intentionally not aligned to hugepages, because __early_get_page() - * shrinks it in single PAGE_SIZE steps whenever it is called anyway. - * I know, this is a terrible hack, but it will be aligned to a hugepage - * from within pages_init(), right after the entire physical memory has - * been mapped to the direct area (which is the only reason we need to - * be able to allocate pages before the page frame allocator is set up - * in the first place). - */ - phys_end = align_floor(phys_end, PAGE_SIZE); - - int err = pages_init(); - if (err) - return err; - - slab_init(); - return 0; -} - -__weak void *malloc(usize size) -{ - return kmalloc(size, M_KERN); -} - -__weak void free(void *ptr) -{ - kfree(ptr); -} - -/* - * Looking for kmalloc() and kfree()? - * Those two are in slab.c for purely organizational reasons. - */ diff --git a/kernel/mm/page.c b/kernel/mm/page.c index caf76d5..25522d1 100644 --- a/kernel/mm/page.c +++ b/kernel/mm/page.c @@ -1,32 +1,25 @@ /* Copyright (C) 2021 fef . All rights reserved. */ +#include #include -#include #include #include #include #include #include +#include #include #include #include +#include #include #include +#include -#ifndef __HAVE_HUGEPAGES -#error "Systems without huge pages are currently unsupported because i'm a dumb bitch" -#endif - -#if DMAP_OFFSET % HUGEPAGE_SIZE != 0 -#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE" -#endif - -/* this should be impossible because arch/page.h must also define PAGE_SHIFT - * and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */ -#if HUGEPAGE_SIZE % PAGE_SIZE != 0 -#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE" +#if DMAP_OFFSET % PAGE_SIZE != 0 +#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE" #endif #if PAGE_SIZE % LONG_BIT != 0 @@ -40,6 +33,7 @@ #if CFG_DEBUG_PAGE_ALLOCS # define PAGE_ASSERT(x) KASSERT(x) # define page_debug(msg, ...) 
kprintf("[page] " msg, ##__VA_ARGS__) +# define PAGE_DEBUG_BLOCK # if CFG_DEBUG_PAGE_ALLOCS_NOISY # define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__) # else @@ -47,359 +41,419 @@ # endif #else # define PAGE_ASSERT(x) ({}) +# define PAGE_DEBUG_BLOCK if (0) # define page_debug(msg, ...) ({}) # define page_debug_noisy(msg, ...) ({}) #endif -/** - * We have cache levels for areas ranging from a single page up to a huge page - * on a logarithmic scale. Every level covers double the pages per entry than - * the one below it, starting at one page per entry. The effective result is - * that a single entry in the cache on level L covers `(1 << L)` pages. - */ -#define CACHE_ORDERS GET_PAGE_ORDERS - #define ORDER_SHIFT(order) (PAGE_SHIFT + (order)) +#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order)) -/** @brief There is one of this for every cache order. */ -struct cache_pool { - /** - * @brief List of free blocks on this order of granularity. - * The individual entries sit right at the beginning of each free block, - * and are always aligned to `entry_size` bytes. - */ - struct clist freelist; - /** - * @brief Bitmap that stores the allocated status of each entry. - * 1 means allocated, 0 means not. - */ - unsigned long *bitmap; - /** @brief Number of items in `freelist`. */ - usize free_entries; -}; -static struct cache_pool caches[CACHE_ORDERS]; -static MTX(caches_lock); - -/* these get set in kmalloc_init() */ -uintptr_t phys_start; -uintptr_t phys_end; - -uintptr_t __early_get_page(void) -{ - phys_end -= PAGE_SIZE; - return phys_end; -} +/* this should be the same as LONG_BIT because latom_t is really just a + * long wrapped in a struct, but my trust in compilers is exactly zero */ +#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT) + +struct mm_zone mm_zones[MM_NR_ZONES]; -static int sanity_check(void) +static inline u_int paddr_find_order(vm_paddr_t addr) { - KASSERT(phys_start < phys_end); - KASSERT(phys_start == HUGEPAGE_ALIGN(phys_start)); - /* phys_end is only page aligned, see kmalloc_init() */ - KASSERT(phys_end == PAGE_ALIGN(phys_end)); - - if ((phys_end - phys_start) < (32 * 1024 * 1024)) { - kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n"); - return 1; - } + int bit = ffsll((long long)addr) - 1; + if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER)) + bit = ORDER_SHIFT(MM_MAX_ORDER); - return 0; + KASSERT(bit >= PAGE_SHIFT); + return bit - PAGE_SHIFT; } -/* - * Map the entire physical memory into the direct contiguous area. - * __early_map_page() might call __early_get_page() in order to allocate - * new page table structures, which in turn shrinks the physical memory - * size (see above). - */ -static inline void map_direct_area(void) +/** @brief Claim all free pages in one of the memory areas from the boot allocator. 
*/ +static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area) { -#ifdef __HAVE_HUGEPAGES - const usize step = HUGEPAGE_SIZE; - const enum pflags flags = P_PRESENT | P_RW | P_HUGE; -#else - const usize step = PAGE_SIZE; - const enum pflags flags = P_PRESENT | P_RW; -#endif + vm_paddr_t start = area->start; + vm_paddr_t end = area->end; + vm_paddr_t pos = start; + vm_size_t nr_pages = end - start / PAGE_SIZE; + latom_add(&zone->free_count, (long)nr_pages); + + struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT]; + u_int order = paddr_find_order(start); + /* make sure the boot memory allocator cannot under any circumstances hand + * out pages from this area anymore, even though that should be unnecessary */ + clist_del(&area->link); /* - * It might be necessary to use a volatile pointer to phys_end for this - * loop in case clang does The Optimization and caches its value for - * whatever reason, even though at least for x86 this is not the case - * (and i don't even thing the C standard allows it when calling - * external functions in between, but still, Never Trust The Compiler). + * We want to insert pages at the highest possible order. However, the + * start and end pointers of the area are only guaranteed to be page + * aligned. Therefore, we start with the highest possible order based + * on the start address, and then increment the order in every loop + * iteration (up to MM_MAX_ORDER). We do this until we have reached + * the end which, again, is only guaranteed to be page aligned, and + * subsequently lower the order again. */ - for (uintptr_t pos = phys_start; pos <= phys_end - step; pos += step) - __early_map_page(pos, __v(pos), flags); + while (pos < end) { + struct mm_pool *pool = &zone->pools[order]; + clist_add(&pool->freelist, &page->link); + pool->free_entries++; + + /* only the first page in the order group is inserted into + * the freelist, but all of them need to be initialized */ + for (u_int i = 0; i < (1 << order); i++) { + atom_init(&page[i].count, 0); + page[i].flags = 0; + page[i].order = 0; + } - vm_flush(); + /* + * order + * ^ + * | _________ < MM_MAX_ORDER + * | / | + * start | / \ < end order + * order > |/ + * |--------------|----> pos + * start end + */ + pos += ORDER_SIZE(order); + page += (1 << order); + if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) { + /* this makes the rising part of the graph */ + order++; + } else if (order > 0 && pos > end) { + /* we have overshot, lower the order */ + pos -= ORDER_SIZE(order); + page -= (1 << order); + /* this makes the abrupt downwards jump at the end of the graph */ + while (--order) { + if (pos + ORDER_SIZE(order) <= end) { + pos += ORDER_SIZE(order); + page += (1 << order); + break; + } + } + } + } } -/* - * This function maps the entire physical memory into the direct region - * (DMAP_START - DMAP_END) and sets up the caches. - * The bitmaps are stored one after another at the end of physical memory, and - * - */ -int pages_init(void) +void paging_init(vm_paddr_t phys_end) { - if (sanity_check() != 0) - return 1; - - map_direct_area(); + /* Sizes of the individual bitmaps per order, rounded up to the + * next full longword. We use the same bitmaps in all zones. 
*/ + usize bitmap_sizes[MM_NR_ORDERS]; + /* size of all bitmaps combined */ + usize bitmap_total_size = 0; + + for (int order = 0; order < MM_NR_ORDERS; order++) { + usize pages = phys_end >> ORDER_SHIFT(order + 1); + pages = align_ceil(pages, LATOM_BIT * 2); + usize bytes = pages / (CHAR_BIT * 2); + bitmap_sizes[order] = bytes; + bitmap_total_size += bytes; + } - /* phys_end gets aligned, as promised by the comment in kmalloc_init() */ - phys_end = align_floor(phys_end, HUGEPAGE_SIZE); - usize phys_size = phys_end - phys_start; + page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size); /* - * calculate the size of each bitmap, as well as their combined size + * allocate memory for the bitmaps and zero them out */ - usize bitmap_bytes = 0; - for (int i = 0; i < CACHE_ORDERS; i++) { - usize bits = phys_size >> ORDER_SHIFT(i); - bits = align_ceil(bits, LONG_BIT); - bitmap_bytes += bits / 8; - } - - page_debug("Page frame overhead = %zu bytes, %zu bytes total\n", bitmap_bytes, phys_size); + u_int bitmap_size_log2 = flsl((long)bitmap_total_size); + KASSERT(bitmap_size_log2 != 0); + bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */ + if (bitmap_total_size ^ (1ul << bitmap_size_log2)) + bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */ + uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL); + panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR, + "cannot allocate memory for the page bitmaps"); + memset(__v(bitmap_start_phys), 0, bitmap_total_size); /* - * zero out all bitmaps + * initialize the pools */ - uintptr_t bitmap_start_phys = phys_end - bitmap_bytes; - unsigned long *bitmap_start = __v(bitmap_start_phys); - memset(bitmap_start, 0, bitmap_bytes); + for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) { + struct mm_zone *zone = &mm_zones[zone_index]; + latom_t *bitmap_pos = __v(bitmap_start_phys); + for (int order = 0; order < MM_NR_ORDERS; order++) { + zone->pools[order].bitmap = bitmap_pos; + clist_init(&zone->pools[order].freelist); + zone->pools[order].free_entries = 0; + latom_init(&zone->free_count, 0); + + bitmap_pos += bitmap_sizes[order]; + } + } /* - * populate the remaining members of the cache_pool structures and - * preallocate entries that can't be handed out (i.e. the cache bitmaps) + * mark *all* pages as reserved first + * + * XXX this is totally unnecessary and i'm only doing it because i'm + * too tired to work out an algorithm that finds all pages that are + * not in the _bmem_areas lists of the mm_zones + * + * if the reserved bit is set, all other fields in the page are invalid. */ - unsigned long *bitmap_pos = bitmap_start; - for (int i = 0; i < CACHE_ORDERS; i++) { - /* total amount of entries on this level */ - usize total_bits = phys_size >> ORDER_SHIFT(i); - /* number of entries on this level that the bitmap itself takes up */ - usize wasted_bits = bitmap_bytes >> ORDER_SHIFT(i); - if (wasted_bits == 0) - wasted_bits = 1; - bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits); - - caches[i].bitmap = bitmap_pos; - bitmap_pos += total_bits / LONG_BIT; - - clist_init(&caches[i].freelist); - caches[i].free_entries = 0; + for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) { + /* This is merely an optimization to simplify checking whether + * two buddies can be coalesced into one. In reality, the + * reference count is invalid because the page is reserved. 
*/ + atom_init(&vm_page_array[i].count, 1); + vm_page_array[i].flags = PG_RESERVED; } - /* kheap_start and kheap_end are globals */ - kheap_start = __v(phys_start); - kheap_end = align_floor(bitmap_start, HUGEPAGE_SIZE); - /* - * populate the freelist on the highest order, all orders beneath it - * stay empty until one of the large blocks gets split up + * populate the freelists */ - struct cache_pool *high_pool = &caches[CACHE_ORDERS - 1]; - usize step = 1 << ORDER_SHIFT(CACHE_ORDERS - 1); - for (void *pos = kheap_start; pos < kheap_end; pos += step) { - struct clist *entry = pos; - clist_add(&high_pool->freelist, entry); - high_pool->free_entries++; + for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) { + struct mm_zone *zone = &mm_zones[i]; + struct _bmem_area *area, *tmp; + clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) { + claim_bmem_pages(zone, area); + } + zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM; + if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX) + zone->thrsh.emerg = CFG_PAGE_EMERG_MAX; } - - return 0; } -/** - * @brief Split a block and return the lower half. - * The block is assumed to already have been removed from its freelist. - * The high half (i.e. the block that is *not* returned) is inserted into the - * freelist one level below `level`. - * - * @param ptr Pointer to the block - * @param level Current level of the block - * (`ptr` must be aligned to `1 << level` pages) - */ -static void *split_buddy(void *ptr, int level); - -/** - * @brief Attempt to coalesce a block with its buddy. - * If coalition is possible, the buddy is removed from its freelist at `order`. - * - * @param ptr Pointer to the block - * @param order Cache order, must be less than `CACHE_ORDERS - 1` (because you - * can't join blocks at the highest cache order) - * @return The joined block, or `nil` if coalition was not possible - */ -static void *try_join_buddy(void *ptr, int order); - -static inline usize get_bit_number(void *ptr, int order) +static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order) { - return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> ORDER_SHIFT(order); + usize bit = pfn >> (order + 1); + latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT]; + return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT)); } -void *get_pages(int order, enum mflags flags) +__malloc_like +static void *__get_pages(u_int order, enum mflags flags) { PAGE_ASSERT(order >= 0); + struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)]; - if (order >= GET_PAGE_ORDERS) { + if (order > MM_MAX_ORDER) { page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags); return nil; } - if (flags & M_NOWAIT) { - kprintf("get_pages(): M_NOWAIT requested, this is not implemented yet :(\n"); - return nil; + u_long count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order); + if (count_after < zone->thrsh.emerg) { + if (count_after < 0 || !(flags & _M_EMERG)) { + latom_add(&zone->free_count, (1 << order)); + return nil; + } } - mtx_lock(&caches_lock); - struct clist *entry = nil; - int entry_order; - for (entry_order = order; entry_order < CACHE_ORDERS; entry_order++) { - if (caches[entry_order].free_entries > 0) { - entry = caches[entry_order].freelist.next; - break; + register_t cpuflags = read_flags(); + + /* + * Search for a free page. Start looking at the freelist for the + * requested order, and if it's empty, go over to the next higher order. + * Repeat until we found a page, or we've reached the highest order. 
+ */ + vm_page_t page = nil; + u_int page_order = order; + while (page == nil && page_order < MM_NR_ORDERS) { + struct mm_pool *pool = &zone->pools[page_order]; + + disable_intr(); + spin_lock(&pool->lock); + if (pool->free_entries > 0) { + page = clist_del_first_entry(&pool->freelist, typeof(*page), link); + /* increment the reference count while we hold the lock on the pool, + * so that no other processor can try to coalesce this block if its + * buddy is being freed (coalition is only possible if the buddy + * has a reference count of zero, and while holding the pool lock) */ + page_get(page); + pool->free_entries--; + } else { + page_order++; } + spin_unlock(&pool->lock); + intr_restore(cpuflags); } - if (entry_order != CACHE_ORDERS) { - clist_del(entry); - caches[entry_order].free_entries--; - - usize bit_number = get_bit_number(entry, entry_order); - while (entry_order > order) { - entry = split_buddy(entry, entry_order); - bit_set(caches[entry_order].bitmap, bit_number); - entry_order--; - bit_number <<= 1; + /* + * if we found a page, check if we need to split it up + * (which is the case if we took one from a higher order freelist) + */ + if (page != nil) { + usize pfn = pg2pfn(page); + page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n", + order, pfn, page_order); + pg_flip_bit(zone, pfn, page_order); + + /* split the page and insert the upper halves into the + * respective freelist until we reach the requested order */ + while (page_order-- > order) { + page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order); + struct mm_pool *pool = &zone->pools[page_order]; + vm_page_t buddy = page + (1 << page_order); + buddy->order = page_order; + pg_flip_bit(zone, pfn + (1 << page_order), page_order); + + disable_intr(); + spin_lock(&pool->lock); + clist_add_first(&pool->freelist, &buddy->link); + pool->free_entries++; + spin_unlock(&pool->lock); + intr_restore(cpuflags); } - bit_set(caches[order].bitmap, bit_number); -# if CFG_POISON_PAGES - memset(entry, 'a', 1 << ORDER_SHIFT(order)); -# endif + page->order = order; + void *vaddr = pfn2vaddr(pfn); + + return vaddr; + } else { + return nil; } +} - mtx_unlock(&caches_lock); - return (void *)entry; +/* faster memset for whole pages */ +static inline void init_pages(u_long *start, u_long val, u_int order) +{ + u_long *end = start + (ORDER_SIZE(order) / sizeof(*start)); + do { + *start++ = val; + } while (start != end); } -void free_pages(void *ptr) +void *get_pages(u_int order, enum mflags flags) { -# if CFG_DEBUG_PAGE_ALLOCS - if ((uintptr_t)ptr % PAGE_SIZE) { - kprintf("free_pages(%p): unaligned ptr!\n", ptr); - return; - } -# endif + void *pages = __get_pages(order, flags); - if (sus_nil(ptr)) { - page_debug("free_pages(%p): tried to free NULL!\n", ptr); - return; - } +#if CFG_POISON_PAGES + if (pages != nil) + init_pages(pages, PAGE_POISON_ALLOC, order); +#endif - int order = 0; - usize bit_number = get_bit_number(ptr, order); - for (; order < CACHE_ORDERS; order++) { - if (bit_tst(caches[order].bitmap, bit_number)) - break; - bit_number >>= 1; - } + return pages; +} - if (order == CACHE_ORDERS) { - page_debug("free_pages(%p): double free!\n", ptr); - return; - } - int original_order = order; +void *get_page(enum mflags flags) +{ + void *pages = __get_pages(0, flags); - mtx_lock(&caches_lock); +#if CFG_POISON_PAGES + if (pages != nil) + init_pages(pages, PAGE_POISON_ALLOC, 0); +#endif - while (order < CACHE_ORDERS - 1) { - bit_clr(caches[order].bitmap, bit_number); + return pages; +} - void *tmp = 
try_join_buddy(ptr, order); - if (tmp == nil) - break; +void *get_zero_pages(u_int order, enum mflags flags) +{ + void *pages = __get_pages(order, flags); - ptr = tmp; - order++; - bit_number >>= 1; - } + if (pages != nil) + init_pages(pages, 0, order); - if (order == CACHE_ORDERS - 1 && original_order != CACHE_ORDERS - 1) - set_pflags(HUGEPAGE_ALIGN(ptr), P_HUGE | P_RW); + return pages; +} -#if CFG_POISON_PAGES - memset(ptr, 'A', 1 << ORDER_SHIFT(order)); -#endif +void *get_zero_page(enum mflags flags) +{ + void *page = __get_pages(0, flags); - clist_add(&caches[order].freelist, (struct clist *)ptr); - caches[order].free_entries++; + if (page != nil) + init_pages(page, 0, 0); - mtx_unlock(&caches_lock); + return page; } -static inline void *split_buddy(void *ptr, int level) +/* + * Two buddies can be merged if: + * - you currently hold the lock for the pool + * - they both have a reference count of zero + * - they are in the same zone + * - neither of them is reserved + * + * This is only called from within the critical section of free_pages(), + * so execution speed is prioritized over anything else. + */ +static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy) { -# if CFG_DEBUG_PAGE_ALLOCS - if ((uintptr_t)ptr % (1 << ORDER_SHIFT(level))) { - kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level); - return nil; - } - if (level < 1 || level >= CACHE_ORDERS) { - kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level); - return nil; - } -# endif + bool merge = (atom_read(&buddy->count) == 0); - struct clist *high_buddy = ptr + (1 << ORDER_SHIFT(level - 1)); - clist_add(&caches[level - 1].freelist, high_buddy); - caches[level - 1].free_entries++; + /* we know that `page` doesn't have PG_RESERVED set, + * because we check that flag before anything else */ + const unsigned mask = PG_RESERVED | PG_DMA; + merge &= (page->flags & mask) == (buddy->flags & mask); - page_debug_noisy("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level); - - return ptr; + return merge; } -static void *try_join_buddy(void *ptr, int order) +void free_pages(void *ptr) { - const usize entry_size = 1 << ORDER_SHIFT(order); - -# if CFG_DEBUG_PAGE_ALLOCS - if ((uintptr_t)ptr % entry_size) { - kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, order); - return nil; + PAGE_DEBUG_BLOCK { + if (ptr < DMAP_START || ptr >= DMAP_END) { + panic("free_pages(%p): not in DMAP region\n", ptr); } - /* order must be < CACHE_ORDERS - 1 because you - * can't join blocks on the topmost order */ - if (order >= CACHE_ORDERS - 1) { - kprintf("try_join_buddy(%p, %d): order >= CACHE_ORDERS - 1!\n", ptr, order); - return nil; - } -# endif + } - /* - * Test whether the buddy block is allocated and return nil if it is. - * entry_size is a power of 2, so we can quickly get to the buddy block - * with a cheap XOR of the address and the entry size without the need - * for any if branches. - */ - uintptr_t buddy = (uintptr_t)ptr ^ entry_size; - usize buddy_bitnum = get_bit_number((void *)buddy, order); - if (bit_tst(caches[order].bitmap, buddy_bitnum)) - return nil; + register_t cpuflags = read_flags(); - page_debug_noisy("join (%p:%p), order=%d\n", ptr, (void *)buddy, order); + vm_page_t page = vaddr2pg(ptr); + panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr); - /* If the buddy is free, we remove it from the freelist ... 
*/ - clist_del((struct clist *)buddy); - caches[order].free_entries--; + u_int order = page->order; + PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0); + u_long pfn = vaddr2pfn(ptr); - /* - * ... and return a pointer to the coalesced block. - * We use the same trick as above to get to the even (lower) block, just - * that this time we're zeroing the bit out rather than flipping it. - */ - uintptr_t even = (uintptr_t)ptr & ~entry_size; - return (void *)even; +#if CFG_POISON_PAGES + init_pages(ptr, PAGE_POISON_FREE, order); +#endif + + int old_count = atom_sub(&page->count, 1); + if (old_count != 1) { + if (old_count == 0) + panic("double free of page %p", ptr); + else + panic("attempted to free page %p with references", ptr); + } + + struct mm_zone *zone; + if (page->flags & PG_DMA) + zone = &mm_zones[MM_ZONE_DMA]; + else + zone = &mm_zones[MM_ZONE_NORMAL]; + + latom_add(&zone->free_count, (1 << order)); + + /* try to coalesce free buddy blocks until we're reached the highest order */ + while (order < MM_MAX_ORDER) { + if (pg_flip_bit(zone, pfn, order)) + break; + + page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order); + + /* precompute all values we need inside the critical section + * to avoid blocking other CPUs for longer than necessary */ + vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)]; + vm_page_t low = &vm_page_array[pfn & ~(1ul << order)]; + struct mm_pool *current_order_pool = &zone->pools[order]; + struct mm_pool *next_order_pool = &zone->pools[order + 1]; + + disable_intr(); + spin_lock(&zone->pools[order].lock); + if (can_merge(page, buddy)) { + clist_del(&buddy->link); + current_order_pool->free_entries--; + buddy->order = order + 1; + page->order = order + 1; + clist_add(&next_order_pool->freelist, &low->link); + next_order_pool->free_entries++; + } else { + order = MM_MAX_ORDER; /* break out of the loop */ + } + spin_unlock(&zone->pools[order].lock); + intr_restore(cpuflags); + + page = low; + order++; + } + + /* finally, we need to insert the page at its freelist */ + struct mm_pool *pool = &zone->pools[order]; + disable_intr(); + spin_lock(&pool->lock); + clist_add(&pool->freelist, &page->link); + pool->free_entries++; + spin_unlock(&zone->pools[order].lock); + intr_restore(cpuflags); }
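
For reviewers, a short usage sketch of the new allocator interface follows. It is illustrative only and not part of the commit; it relies on the declarations added to include/gay/mm.h above, while the M_KERN flag, the <gay/mm.h> include path, and the page_alloc_example() name are assumptions taken from the rest of the tree rather than from these hunks.

/*
 * Illustrative sketch only -- not part of this patch.
 * Assumes M_KERN selects MM_ZONE_NORMAL without _M_EMERG set
 * (the public mflags values are defined outside the hunks shown here).
 */
#include <gay/mm.h>	/* get_pages(), get_zero_page(), free_pages(), __p() */

static void *page_alloc_example(void)
{
	/* order 2 = 4 physically contiguous pages; contents are NOT initialized */
	void *buf = get_pages(2, M_KERN);
	if (buf == nil)
		return nil;	/* zone is at or below its emergency threshold */

	/* a single zeroed page, e.g. for a descriptor table */
	void *desc = get_zero_page(M_KERN);
	if (desc == nil) {
		free_pages(buf);	/* the order is recovered from vm_page_array */
		return nil;
	}

	/* both pointers live in the direct mapping, so the physical address
	 * can be handed straight to a device if needed */
	kprintf("buf at phys %p\n", (void *)__p(buf));

	free_page(desc);	/* alias for free_pages() */
	return buf;		/* caller releases it later with free_pages(buf) */
}

Two properties of the new interface are worth noting from the sketch: callers no longer pass the order when freeing, because free_pages() reads it back from the page's vm_page entry, and allocations without _M_EMERG fail with nil instead of blocking once a zone's free_count drops to its emergency reserve.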