mm: refactor page frame allocator

This is part 3 of the mm subsystem overhaul.
The allocator no longer relies on a mutex and
instead uses individual per-order spinlocks.
It is also aware of multiple memory zones (normal
and DMA) as well as emergency reserves.
Page bitmap overhead is reduced by 50 %.
main
anna 2 years ago
parent 825a981d67
commit 385af1b7ef
Signed by: fef
GPG Key ID: EC22E476DC2D3D84

@ -202,7 +202,7 @@ static inline bool latom_flip_bit(latom_t *latom, int pos)
__asm__ volatile(
X86_LOCK_PREFIX
" btcq %1, (%2) \n"
" btcq %q1, (%2) \n"
" setc %b0 \n"
: "+r"(ret)
: "r"(pos), "r"(&latom->_value)

@ -5,7 +5,6 @@
#include <arch/multiboot.h>
#include <arch/vmparam.h>
#include <gay/linker.h>
#include <gay/mm.h>
#include <gay/vm/page.h>
#include <gay/systm.h>
@ -15,7 +14,7 @@
#include <string.h>
struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET;
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
/* this gets updated in x86_setup_paging() once we know how big the array is */
vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH);
#endif
@ -41,6 +40,51 @@ static void register_area(struct mb2_mmap_entry *entry)
}
}
/**
* @brief Map the entire physical memory to `DMAP_OFFSET`.
*
* This may overshoot by up to 1 GB because we only use gigapages, but considering
* the fact that mapping literally the entire physical RAM is probably the
* bigger problem here, I'd say it's fine.
*
* @param end End of physical memory
*/
static void map_direct_area(vm_paddr_t end)
{
vm_paddr_t ppos = 0;
void *vpos = __v(0);
void *const vend = __v(end);
/* This assertion fails if > 4 TB of physical memory are available.
* Sorry gamers, we don't support enough RAM for all your Chrome tabs. */
KASSERT(vend < DMAP_END);
while (vpos < vend) {
x86_pml4te_t *pml4te = X86_PML4TE(vpos);
vm_paddr_t pdpt_phys = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdpt_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for direct mapping");
__boot_clear_page(pdpt_phys);
pml4te->val = pdpt_phys | __P_PRESENT | __P_RW | __P_NOEXEC;
vm_flush();
for (int pdpti = 0; pdpti < 512; pdpti++) {
x86_pdpte_t *pdpte = X86_PDPTE(vpos);
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
ppos += GIGAPAGE_SIZE;
vpos += GIGAPAGE_SIZE;
if (vpos >= vend)
break;
}
pml4te->flags.global = 1;
}
vm_flush();
}
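For context, a minimal sketch of what the direct map established above buys us; __v() and __p() are the helpers from gay/mm.h shown further down in this commit, the wrapper function and the physical address are purely illustrative.
static void dmap_example(void)
{
/* illustrative only: after map_direct_area(), every physical address
 * has a fixed virtual alias in the direct mapping region */
vm_paddr_t pa = 0x1000;    /* arbitrary example physical address */
u32 *alias = __v(pa);      /* == (void *)pa + DMAP_OFFSET */
KASSERT(__p(alias) == pa); /* __p() is the inverse of __v() */
}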
/*
* "Oh cool another deeply nested 100-liner that nobody understands"
*/
@ -68,16 +112,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
* (this is gonna be a long one)
*/
struct vm_page *vm_page_array_end = vm_page_array + (end >> PAGE_SHIFT);
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
_vm_page_array_end = vm_page_array_end;
#endif
void *map_pos = vm_page_array;
usize remaining_size = (void *)vm_page_array_end - (void *)vm_page_array;
remaining_size = align_ceil(remaining_size, PAGE_SIZE);
kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size);
void *map_end = map_pos + ((void *)vm_page_array_end - (void *)vm_page_array);
kprintf("Mapping %zu bytes for vm_page_array\n", map_end - map_pos);
/* PML4T loop */
while (remaining_size != 0) {
while (map_pos < map_end) {
/* Is vm_page_array so huge that it spans almost the entire 2 TB
* kernel region? If that's the case, something has gone terribly
* wrong, unless we somehow happen to have about an Exabyte of RAM
@ -85,7 +128,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
KASSERT(map_pos < (void *)KERNBASE);
x86_pml4te_t *pml4te = X86_PML4TE(map_pos);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT);
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array");
__boot_clear_page(pml4te_val);
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
@ -98,8 +141,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
vm_paddr_t pdpte_val;
/* try allocating a 1 GB gigapage first */
if (remaining_size >= 1 << X86_PDPT_SHIFT) {
pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT);
if (map_end - map_pos >= GIGAPAGE_SIZE) {
pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT, MM_ZONE_NORMAL);
/* CLion is warning about this condition being always true, but
* that is not the case. I've checked the disassembly with -O2,
* and clang is emitting the check. So it's fine, I guess. */
@ -107,16 +150,15 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
| __P_GLOBAL | __P_NOEXEC;
pdpte->val = pdpte_val;
remaining_size -= 1 << X86_PDPT_SHIFT;
map_pos += 1 << X86_PDPT_SHIFT;
if (remaining_size == 0)
map_pos += GIGAPAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
continue;
}
}
/* couldn't use a gigapage, continue in hugepage steps */
pdpte_val = __boot_pmalloc(PAGE_SHIFT);
pdpte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdpte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdpte_val);
@ -130,22 +172,21 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
vm_paddr_t pdte_val;
/* try allocating a 2 MB hugepage first */
if (remaining_size >= (1 << X86_PDT_SHIFT)) {
pdte_val = __boot_pmalloc(X86_PDT_SHIFT);
if (map_end - map_pos >= HUGEPAGE_SIZE) {
pdte_val = __boot_pmalloc(X86_PDT_SHIFT, MM_ZONE_NORMAL);
if (pdte_val != BOOT_PMALLOC_ERR) {
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
| __P_HUGE | __P_NOEXEC;
pdte->val = pdte_val;
remaining_size -= 1 << X86_PDT_SHIFT;
map_pos += 1 << X86_PDT_SHIFT;
if (remaining_size == 0)
map_pos += HUGEPAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
continue;
}
}
/* couldn't use a hugepage, continue in page steps */
pdte_val = __boot_pmalloc(PAGE_SHIFT);
pdte_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
panic_if(pdte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
__boot_clear_page(pdte_val);
@ -156,15 +197,14 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
/* PT loop */
for (int pt_index = 0; pt_index < 512; pt_index++) {
x86_pte_t *pte = X86_PTE(map_pos);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT);
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT, MM_ZONE_NORMAL);
panic_if(pte_val == BOOT_PMALLOC_ERR,
"cannot reserve memory for vm_page_array");
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
pte->val = pte_val;
remaining_size -= 1 << X86_PT_SHIFT;
map_pos += 1 << X86_PT_SHIFT;
if (remaining_size == 0)
map_pos += PAGE_SIZE;
if (map_pos >= map_end)
goto map_done;
} /* end of PT loop */
} /* end of PDT loop */
@ -172,26 +212,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
} /* end of PML4T loop */
map_done:
vm_flush();
}
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
{
KASSERT(start <= end);
vm_page_t cursor = vm_page_array + (start >> PAGE_SHIFT);
usize count = (end - start) >> PAGE_SHIFT;
if (flags == 0) {
memset(cursor, 0, count * sizeof(*cursor));
} else {
while (count--) {
atom_init(&cursor->count, 0);
cursor->flags = flags;
cursor->try_free = nil;
cursor->extra = nil;
cursor++;
}
}
map_direct_area(end);
paging_init(end);
}
/*
@ -199,7 +221,7 @@ static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
* a page table, yet also need to reference it in the page table structures
* (thereby mapping it into virtual memory) before we can zero it out.
* This little hack temporarily maps the area at one PDP entry before KERNBASE
* (meaning index 1022 of _pdp0), zeroes the area, and then unmaps it again.
* (meaning index 510 of _pdp0), zeroes the area, and then unmaps it again.
*/
void __boot_clear_page(vm_paddr_t paddr)
{

@ -4,15 +4,10 @@
#include <arch/trap.h>
#include <gay/cdefs.h>
#include <gay/config.h>
#include <gay/errno.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <string.h>
/*
* Initial Page Directory Pointer Table and Page Map Level 4 Table for the
* assembly startup routine (see setup64.S). Used for statically mapping the
@ -21,46 +16,6 @@
__asmlink x86_pdpt_t _pdpt0;
__asmlink x86_pml4t_t _pml4t;
int map_page(uintptr_t phys, void *virt, enum pflags flags)
{
flags |= P_PRESENT;
x86_pml4te_t *pml4e = X86_PML4TE(virt);
if (!pml4e->flags.present) {
void *page = get_pages(0, M_ATOMIC);
if (page == nil)
return -ENOMEM;
pml4e->val = __p(page) | P_PRESENT | P_RW;
}
return 0;
}
/*
* The only difference between this and map_page() is that we can't allocate
* new pages using get_pages() but have to use __early_get_page() instead here.
* So, all we need to do is ensure that map_page() doesn't need to allocate new
* pages when we call it, which it only does if pflags does not have P_HUGE
* set and the page table doesn't exist (present bit in the page directory is
* clear). Therefore, we just need to make sure that, if P_HUGE is *not*
* set, the page table is already allocated and marked as present in the page
* directory.
*/
void __early_map_page(uintptr_t phys, void *virt, enum pflags pflags)
{
}
uintptr_t unmap_page(void *virt)
{
}
enum pflags get_pflags(void *page)
{
}
int set_pflags(void *page, enum pflags pflags)
{
}
void x86_isr_page_fault(trap_frame_t *frame, u32 error_code)
{
void *address;

@ -14,6 +14,10 @@ option(CFG_POISON_PAGES "Poison pages after allocate and free" ON)
option(CFG_POISON_HEAP "Poison heap memory after kmalloc() and kfree()" ON)
set(CFG_PAGE_EMERG_DENOM "16" CACHE STRING "Denominator for the fraction of pages kept in emergency reserves")
set(CFG_PAGE_EMERG_MAX "1024" CACHE STRING "Absolute maximum number of pages kept in emergency reserves")
option(CFG_SMP "Enable Symmetric Multiprocessing" ON)
set(CFG_MAX_CPU "64" CACHE STRING "Maximum number of logical processors")
@ -28,6 +32,8 @@ option(CFG_DEBUG_PAGE_ALLOCS "Debug page frame allocations" OFF)
option(CFG_DEBUG_PAGE_ALLOCS_NOISY "Debug page frame allocations in full detail (VERY noisy)" OFF)
option(CFG_DEBUG_PGADDRS "Sanitize page frame addresses" OFF)
option(CFG_DEBUG_SLAB_ALLOCS "Debug slab allocations" OFF)
option(CFG_DEBUG_SLAB_ALLOCS_NOISY "Debug slab allocations in full detail (VERY noisy)" OFF)

@ -31,6 +31,12 @@
/** @brief Poison heap areas after `kmalloc()` and `kfree()` */
#cmakedefine01 CFG_POISON_HEAP
/** @brief Denominator for the fraction of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_DENOM @CFG_PAGE_EMERG_DENOM@
/** @brief Absolute maximum number of pages kept in emergency reserves */
#define CFG_PAGE_EMERG_MAX @CFG_PAGE_EMERG_MAX@
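A rough worked example of how these two values interact, based on the zone->thrsh.emerg calculation further down in kernel/mm/page.c (the RAM size is made up):
/* Example: a zone with 262144 free 4 KiB pages (1 GiB) and the defaults above:
 * emerg = 262144 / CFG_PAGE_EMERG_DENOM = 16384 pages, which exceeds
 * CFG_PAGE_EMERG_MAX, so the reserve is capped at 1024 pages (4 MiB). */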
/** @brief Enable Symmetric Multiprocessing */
#cmakedefine01 CFG_SMP
@ -52,6 +58,9 @@
/** @brief Spit out the full details of page allocations */
#cmakedefine01 CFG_DEBUG_PAGE_ALLOCS_NOISY
/** @brief Sanitize page frame addresses */
#cmakedefine01 CFG_DEBUG_PGADDRS
/** @brief Debug slab allocations */
#cmakedefine01 CFG_DEBUG_SLAB_ALLOCS

@ -16,6 +16,13 @@
* bigger areas of memory that are not physically contiguous (for regular user
* allocations). The entire physical memory is mapped statically in the range
* `DMAP_START - DMAP_END`.
*
* Memory is split up into (currently) two zones: `MM_ZONE_NORMAL` and
* `MM_ZONE_DMA`. As their names suggest, the former is for general purpose
* allocations and the latter for getting memory suitable for DMA transfers.
* Zones are further divided into pools, each of which hold a list of groups of
* free pages. The size of these page groups is determined by the pool's order,
* where the pool of order `n` holds groups of `1 << n` pages.
*/
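To make the order/pool relationship above concrete (assuming 4 KiB pages):
/* order 0 -> groups of 1 page    (4 KiB)
 * order 3 -> groups of 8 pages   (32 KiB)
 * order 9 -> groups of 512 pages (2 MiB), the largest order with MM_NR_ORDERS == 10 */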
#ifdef _KERNEL
@ -23,10 +30,14 @@
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mutex.h>
#include <gay/types.h>
#include <string.h>
#define _M_ZONE_NORMAL 0
#define _M_ZONE_DMA 1
#define _M_ZONE_INDEX(flags) ((flags) & 1)
@ -40,15 +51,34 @@ enum mm_zone_type {
MM_NR_ZONES
};
/** @brief Boot memory area. */
struct _bmem_area {
struct clist link; /* -> struct mm_zone::_bmem_areas */
vm_paddr_t start;
vm_paddr_t end;
};
struct mm_pool {
struct clist freelist; /* -> vm_page_t::link */
/** @brief Number of items in `freelist`. */
usize free_entries;
/** @brief One bit per buddy *pair*, 1 if exactly one is allocated. */
latom_t *bitmap;
spin_t lock;
};
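A short sketch of how the pair bitmap is meant to be read, mirroring pg_flip_bit() and free_pages() later in this commit (the page frame numbers are examples):
/* Pages 6 and 7 are order-0 buddies and share bit (6 >> 1) = 3 in pools[0].bitmap.
 * Freeing page 6 flips that bit: if it becomes 1, page 7 is still allocated and no
 * coalescing happens; if it becomes 0, both buddies are free and can be merged
 * into a single order-1 group. */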
#define MM_NR_ORDERS 10
#define MM_MAX_ORDER (MM_NR_ORDERS - 1)
struct mm_zone {
patom_t freelist; /* -> struct vm_page */
usize length;
/** @brief Current number of free pages in all pools */
latom_t free_count;
/** @brief Thresholds for OOM behavior */
struct {
/** @brief Minimum number of pages reserved for emergency allocations */
u_long emerg;
} thrsh;
struct mm_pool pools[MM_NR_ORDERS];
struct clist _bmem_areas; /* -> struct _bmem_area */
};
@ -59,7 +89,7 @@ struct mm_zone {
* The mm subsystem isn't NUMA aware, because it's not really a thing on desktop
* grade machines anyway and would only complicate things unnecessarily.
*/
extern struct mm_zone mm_zones[MM_NR_ZONES];
extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
/**
* @brief Memory allocation flags passed to `kmalloc()`.
@ -122,108 +152,33 @@ enum pflags {
#endif
};
/*
* Terrible hack that allows us to map pages before the page frame allocator is
* set up. Don't ever use these anywhere, because they *will* break everything.
*/
void __early_map_page(uintptr_t phys, void *virt, enum pflags flags);
/* This just shrinks phys_end by PAGE_SIZE and returns the page */
uintptr_t __early_get_page(void);
/**
* @brief Map a page in physical memory to a virtual address.
* Remember that if `vm` is the memory map currently in use, you will most
* likely need to call `vm_update()` when you've finished mapping everything
* to flush the TLB.
*
* @param phys Physical address of the page
* @param virt Virtual address to map the page to
* @param flags Flags to apply to the page
* @returns 0 on success, or `-ENOMEM` if OOM (for allocating new page tables)
*/
int map_page(uintptr_t phys, void *virt, enum pflags flags);
/**
* @brief Remove a page mapping.
*
* @param virt Virtual address the page is mapped to, must be page aligned
* @returns The physical page address that was being mapped
*/
uintptr_t unmap_page(void *virt);
/**
* @brief Get a page's flags in the page tables.
*
* @param page Page to get the flags of (if the page is in a hugepage area,
* the flags for that hugepage will be returned with `P_HUGE = 1`)
* @return The flags, as currently stored in the page table structures
* (but not necessarily applied if they have been modified and `vm_flush()`
* has not been called yet!)
*/
enum pflags get_pflags(void *page);
/**
* @brief Update a page's flags in the page tables.
* You should always use this in conjunction with `get_pflags()`, as in getting
* the flags first, then toggling the flags you need to, and then setting them
* in the tables again. This is because this method will clear *any* previous
* flags.
*
* @param page Page to set flags for (if flags has `P_HUGE` set, must be
* `HUGEPAGE_SIZE` aligned, otherwise `PAGE_SIZE` aligned)
* @param flags Flags to set
* @return 0 on success, or a negative value if either a page table allocation
* failed or
*/
int set_pflags(void *page, enum pflags flags);
/**
* @brief Initialize the memory allocator.
*
* This can only be called once, from the early `_boot()` routine.
*
* @param _phys_start Physical start address of the page area
* @param _phys_end Physical end address of the page area
* @returns 0 on success, or -1 if the pointers were garbage
*/
int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end);
/** @brief Start of the mapped, physically contiguous kernel heap */
extern void *kheap_start;
/** @brief End of the mapped, physically contiguous kernel heap */
extern void *kheap_end;
/** @brief Start of the kernel heap in physical memory */
extern uintptr_t phys_start;
/** @brief End of the kernel heap in physical memory */
extern uintptr_t phys_end;
/**
* @brief Initialize the buddy page frame allocator.
* This is only called once, internally from `kmalloc_init()`.
*
* @return 0 on success, or -1 if it messed up
* This is only called once, from the arch-dependent counterpart after it has
* reserved memory for and mapped `vm_page_array`, as well as mapped the direct
* area.
*/
int pages_init(void);
void paging_init(vm_paddr_t phys_end);
/**
* @brief Allocate a contiguous region in physical memory.
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long.
*
* @param order Order of magnitude (as in `1 << order`) for the region size
* @param flags How to allocate (`order` must be 0 if `M_NOWAIT` is specified)
* **The pages are not initialized.**
* If you want zeroed pages, use `get_zero_pages()`.
*
* @param order Order of magnitude (as in `1 << order` pages)
* @param flags How to allocate
* @return A pointer to the beginning of the region in the direct mapping area,
* or `nil` if the allocation failed
*/
void *get_pages(int order, enum mflags flags) __malloc_like;
#ifdef __HAVE_HUGEPAGES
#define GET_PAGE_ORDERS (HUGEPAGE_SHIFT - PAGE_SHIFT + 1)
#else
#define GET_PAGE_ORDERS 10
#endif
#define GET_PAGE_MAX_ORDER (GET_PAGE_ORDERS - 1)
void *get_pages(u_int order, enum mflags flags) __malloc_like;
void *get_page(enum mflags flags) __malloc_like;
void *get_zero_pages(u_int order, enum mflags flags) __malloc_like;
void *get_zero_page(enum mflags flags) __malloc_like;
void free_pages(void *ptr);
#define free_page(ptr) free_pages(ptr)
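A hedged usage sketch of the allocation API declared above; the wrapper function is only for illustration, and M_KERN is the flag used elsewhere in the tree.
static void get_pages_example(void)
{
/* allocate a physically contiguous, zeroed 4-page (16 KiB) region and release it */
void *buf = get_zero_pages(2, M_KERN);
if (buf != nil)
free_pages(buf);
}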
/**
* @brief Initialize the slab caches.
@ -240,14 +195,8 @@ void slab_init(void);
* @param phys Physical address
* @return Virtual address
*/
static inline void *__v(uintptr_t phys)
static inline void *__v(vm_paddr_t phys)
{
# ifdef DEBUG
if (phys > phys_end) {
kprintf("__v(%p): phys ptr out of range!\n", (void *)phys);
return nil;
}
# endif
return (void *)phys + DMAP_OFFSET;
}
@ -262,7 +211,7 @@ static inline void *__v(uintptr_t phys)
* @return The physical address, i.e. `virt - DMAP_OFFSET`
* @see vtophys()
*/
static inline uintptr_t __p(void *virt)
static inline vm_paddr_t __p(void *virt)
{
# ifdef DEBUG
if (virt < DMAP_START || virt >= DMAP_END) {

@ -5,9 +5,23 @@
#include <arch/page.h>
#include <gay/cdefs.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/systm.h>
#include <gay/types.h>
/*
* I'm trying really hard to keep the size of struct vm_page a power of two
* on LP64 systems, because that way we can quickly get to the page frame number
* by shifting the byte offset of the vm_page_t in vm_page_array to the right
* rather than doing a costly divide instruction (or store the page frame number
* within the structure itself, which takes up precious space).
*
* There is insane pressure on the size of this structure, because a typical
* system will have millions of instances of it. Every additional byte makes
* a significant difference in memory management overhead.
*/
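To make the shift trick above concrete (the 32-byte figure is an assumption derived from the members shown below, not something this commit asserts):
/* If sizeof(struct vm_page) == 32, a power of two, then
 * pfn = ((uintptr_t)page - (uintptr_t)vm_page_array) >> 5,
 * i.e. the pointer subtraction in pg2pfn() compiles to a shift instead of a divide. */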
/**
* @brief Stores information about a single page in physical memory.
* There is exactly one of these for every physical page, no matter what that
@ -16,66 +30,94 @@
struct vm_page {
/** @brief Reference count (0 = unused) */
atom_t count;
unsigned order:8;
/** @brief Various flags describing how and for what the page is used, see below */
u_int flags;
/** @brief Singly linked list, if the page is free */
patom_t next;
/**
* @brief Request this page to be freed if possible.
* This callback may be `nil` unless the `PG_FREEABLE` bit in `flags`
* is set. The presence of this bit does *not* guarantee that the page
* is actually reclaimable, it's merely a performance optimization to
* avoid having to call this function on pages that can never be
* reclaimed anyway.
*
* @param page Pointer to the page itself
* @return 0 if the page could be reclaimed and is now free
*/
int (*try_free)(struct vm_page *page);
unsigned flags:24;
struct clist link;
/**
* @brief Optional extra data pointer, reserved for private use.
* The current owner of the page may use this to track the underlying
* object in memory (or pretty much anything else), for example the
* `struct slab` if this page is currently used by the slab allocator.
* Useful for implementing the `try_free()` callback.
*/
void *extra;
};
typedef struct vm_page *vm_page_t;
/* values for struct page::flags */
/* values for struct vm_page::flags */
/** @brief Page must never be accessed */
#define PG_RESERVED (1 << 0)
/** @brief Page is in an atomic per-cpu cache */
#define PG_ATOMIC (1 << 1)
/** @brief Page is in a per-cpu cache */
#define PG_PCPU (1 << 1)
/** @brief Page is used by the slab allocator */
#define PG_SLAB (1 << 2)
/** @brief It **might** be possible to reclaim this page using `try_free()` */
#define PG_FREEABLE (1 << 3)
/** @brief Page is in `MM_ZONE_DMA`, rather than `MM_ZONE_NORMAL` */
#define PG_DMA (1u << 3)
/** @brief Array of every single page in physical memory, indexed by page frame number. */
extern struct vm_page *const vm_page_array;
#ifdef DEBUG
#if CFG_DEBUG_PGADDRS
extern vm_page_t _vm_page_array_end;
#define PGADDR_ASSERT(x) KASSERT(x)
#else
#define PGADDR_ASSERT(x) ({})
#endif
static inline bool page_get(vm_page_t page)
{
return atom_inc(&page->count);
}
static inline bool page_put(vm_page_t page)
{
return atom_dec(&page->count);
}
/** @brief Get the page frame number of a page. */
__pure2 static inline u_long pg2pfn(vm_page_t page)
__pure2
static inline u_long pg2pfn(vm_page_t page)
{
KASSERT(page < _vm_page_array_end);
PGADDR_ASSERT(page < _vm_page_array_end);
return page - vm_page_array;
}
__pure2 static inline u_long paddr2pfn(vm_paddr_t paddr)
__pure2
static inline vm_page_t vaddr2pg(void *vaddr)
{
PGADDR_ASSERT(vaddr >= DMAP_START && vaddr < (void *)_vm_page_array_end);
uintptr_t offset = (uintptr_t)vaddr - DMAP_OFFSET;
return &vm_page_array[offset >> PAGE_SHIFT];
}
__pure2
static inline u_long vaddr2pfn(void *vaddr)
{
u_long pfn = ((uintptr_t)vaddr - DMAP_OFFSET) >> PAGE_SHIFT;
PGADDR_ASSERT(vaddr >= DMAP_START && &vm_page_array[pfn] < _vm_page_array_end);
return pfn;
}
__pure2
static inline u_long paddr2pfn(vm_paddr_t paddr)
{
KASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end);
PGADDR_ASSERT(&vm_page_array[paddr >> PAGE_SHIFT] < _vm_page_array_end);
return paddr >> PAGE_SHIFT;
}
__pure2 static inline vm_page_t paddr2pg(vm_paddr_t paddr)
__pure2
static inline vm_page_t paddr2pg(vm_paddr_t paddr)
{
vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT);
KASSERT(page < _vm_page_array_end);
PGADDR_ASSERT(page < _vm_page_array_end);
return page;
}
__pure2
static inline void *pfn2vaddr(u_long pfn)
{
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
return DMAP_START + (pfn << PAGE_SHIFT);
}
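A small consistency sketch for the helpers above, assuming DMAP_START is the virtual address corresponding to DMAP_OFFSET (which the conversions imply); the wrapper function and page frame number are illustrative.
static void pfn_roundtrip_example(void)
{
void *va = pfn2vaddr(42);                    /* DMAP_START + 42 * PAGE_SIZE */
KASSERT(vaddr2pg(va) == &vm_page_array[42]); /* assumes DMAP_START == (void *)DMAP_OFFSET */
KASSERT(paddr2pfn((vm_paddr_t)42 << PAGE_SHIFT) == 42);
}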

@ -2,7 +2,6 @@
target_sources(gay_kernel PRIVATE
boot.c
kmalloc.c
page.c
slab.c
)

@ -13,7 +13,7 @@ static CLIST(bmem_area_freelist);
#ifdef DEBUG
#define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; })
#define debug_get_bmem_area(area) KASSERT((area)->start != ~(vm_paddr_t)0)
#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0)
#else
#define debug_free_bmem_area(area) ({})
#define debug_get_bmem_area(area) ({})
@ -62,6 +62,9 @@ void __boot_pmalloc_init(void)
debug_free_bmem_area(area);
clist_add(&bmem_area_freelist, &area->link);
}
for (int i = 0; i < MM_NR_ZONES; i++)
clist_init(&mm_zones[i]._bmem_areas);
}
void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)

@ -1,74 +0,0 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/types.h>
#include <gay/util.h>
extern void _image_start_phys;
extern void _image_end_phys;
/* these are initialized by pages_init() */
void *kheap_start;
void *kheap_end;
int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end)
{
phys_start = _phys_start;
phys_end = _phys_end;
/*
* The kernel image is very likely gonna be within the physical memory
* range, so we're gonna need to do some cropping in order to not hand
* out pages that actually contain kernel code.
* Furthermore, somebody should probably clean up this mess somehow.
*/
uintptr_t image_start_phys = (uintptr_t)&_image_start_phys;
uintptr_t image_end_phys = (uintptr_t)&_image_end_phys;
if (phys_start < image_start_phys && phys_end > image_start_phys) {
if (image_start_phys - phys_start > phys_end - image_start_phys)
phys_end = image_start_phys;
else
phys_start = image_end_phys;
}
if (phys_start < image_end_phys && _phys_end > image_end_phys) {
if (image_end_phys - phys_start > phys_end - image_end_phys)
phys_end = image_start_phys;
else
phys_start = image_end_phys;
}
phys_start = align_ceil(phys_start, HUGEPAGE_SIZE);
/*
* This is intentionally not aligned to hugepages, because __early_get_page()
* shrinks it in single PAGE_SIZE steps whenever it is called anyway.
* I know, this is a terrible hack, but it will be aligned to a hugepage
* from within pages_init(), right after the entire physical memory has
* been mapped to the direct area (which is the only reason we need to
* be able to allocate pages before the page frame allocator is set up
* in the first place).
*/
phys_end = align_floor(phys_end, PAGE_SIZE);
int err = pages_init();
if (err)
return err;
slab_init();
return 0;
}
__weak void *malloc(usize size)
{
return kmalloc(size, M_KERN);
}
__weak void free(void *ptr)
{
kfree(ptr);
}
/*
* Looking for kmalloc() and kfree()?
* Those two are in slab.c for purely organizational reasons.
*/

@ -1,32 +1,25 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/bits.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/util.h>
#include <gay/vm/page.h>
#include <limits.h>
#include <string.h>
#include <strings.h>
#ifndef __HAVE_HUGEPAGES
#error "Systems without huge pages are currently unsupported because i'm a dumb bitch"
#endif
#if DMAP_OFFSET % HUGEPAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE"
#endif
/* this should be impossible because arch/page.h must also define PAGE_SHIFT
* and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */
#if HUGEPAGE_SIZE % PAGE_SIZE != 0
#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE"
#if DMAP_OFFSET % PAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE"
#endif
#if PAGE_SIZE % LONG_BIT != 0
@ -40,6 +33,7 @@
#if CFG_DEBUG_PAGE_ALLOCS
# define PAGE_ASSERT(x) KASSERT(x)
# define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# define PAGE_DEBUG_BLOCK
# if CFG_DEBUG_PAGE_ALLOCS_NOISY
# define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# else
@ -47,359 +41,419 @@
# endif
#else
# define PAGE_ASSERT(x) ({})
# define PAGE_DEBUG_BLOCK if (0)
# define page_debug(msg, ...) ({})
# define page_debug_noisy(msg, ...) ({})
#endif
/**
* We have cache levels for areas ranging from a single page up to a huge page
* on a logarithmic scale. Every level covers double the pages per entry than
* the one below it, starting at one page per entry. The effective result is
* that a single entry in the cache on level L covers `(1 << L)` pages.
*/
#define CACHE_ORDERS GET_PAGE_ORDERS
#define ORDER_SHIFT(order) (PAGE_SHIFT + (order))
#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order))
/** @brief There is one of this for every cache order. */
struct cache_pool {
/**
* @brief List of free blocks on this order of granularity.
* The individual entries sit right at the beginning of each free block,
* and are always aligned to `entry_size` bytes.
*/
struct clist freelist;
/**
* @brief Bitmap that stores the allocated status of each entry.
* 1 means allocated, 0 means not.
*/
unsigned long *bitmap;
/** @brief Number of items in `freelist`. */
usize free_entries;
};
static struct cache_pool caches[CACHE_ORDERS];
static MTX(caches_lock);
/* these get set in kmalloc_init() */
uintptr_t phys_start;
uintptr_t phys_end;
uintptr_t __early_get_page(void)
{
phys_end -= PAGE_SIZE;
return phys_end;
}
/* this should be the same as LONG_BIT because latom_t is really just a
* long wrapped in a struct, but my trust in compilers is exactly zero */
#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT)
struct mm_zone mm_zones[MM_NR_ZONES];
static int sanity_check(void)
static inline u_int paddr_find_order(vm_paddr_t addr)
{
KASSERT(phys_start < phys_end);
KASSERT(phys_start == HUGEPAGE_ALIGN(phys_start));
/* phys_end is only page aligned, see kmalloc_init() */
KASSERT(phys_end == PAGE_ALIGN(phys_end));
if ((phys_end - phys_start) < (32 * 1024 * 1024)) {
kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n");
return 1;
}
int bit = ffsll((long long)addr) - 1;
if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER))
bit = ORDER_SHIFT(MM_MAX_ORDER);
return 0;
KASSERT(bit >= PAGE_SHIFT);
return bit - PAGE_SHIFT;
}
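A couple of concrete inputs for paddr_find_order(), assuming PAGE_SHIFT == 12 and MM_MAX_ORDER == 9:
/* paddr_find_order(0x1000)  -> lowest set bit 12 -> order 0 (page aligned only)
 * paddr_find_order(0x40000) -> lowest set bit 18 -> order 6 (aligned to 64 pages)
 * paddr_find_order(0x0)     -> ffsll() returns 0  -> clamped to MM_MAX_ORDER (9) */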
/*
* Map the entire physical memory into the direct contiguous area.
* __early_map_page() might call __early_get_page() in order to allocate
* new page table structures, which in turn shrinks the physical memory
* size (see above).
*/
static inline void map_direct_area(void)
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
{
#ifdef __HAVE_HUGEPAGES
const usize step = HUGEPAGE_SIZE;
const enum pflags flags = P_PRESENT | P_RW | P_HUGE;
#else
const usize step = PAGE_SIZE;
const enum pflags flags = P_PRESENT | P_RW;
#endif
vm_paddr_t start = area->start;
vm_paddr_t end = area->end;
vm_paddr_t pos = start;
vm_size_t nr_pages = (end - start) / PAGE_SIZE;
latom_add(&zone->free_count, (long)nr_pages);
struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
u_int order = paddr_find_order(start);
/* make sure the boot memory allocator cannot under any circumstances hand
* out pages from this area anymore, even though that should be unnecessary */
clist_del(&area->link);
/*
* It might be necessary to use a volatile pointer to phys_end for this
* loop in case clang does The Optimization and caches its value for
* whatever reason, even though at least for x86 this is not the case
* (and i don't even thing the C standard allows it when calling
* external functions in between, but still, Never Trust The Compiler).
* We want to insert pages at the highest possible order. However, the
* start and end pointers of the area are only guaranteed to be page
* aligned. Therefore, we start with the highest possible order based
* on the start address, and then increment the order in every loop
* iteration (up to MM_MAX_ORDER). We do this until we approach the
* end which, again, is only guaranteed to be page aligned, and then
* lower the order again as needed.
*/
for (uintptr_t pos = phys_start; pos <= phys_end - step; pos += step)
__early_map_page(pos, __v(pos), flags);
while (pos < end) {
struct mm_pool *pool = &zone->pools[order];
clist_add(&pool->freelist, &page->link);
pool->free_entries++;
/* only the first page in the order group is inserted into
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1 << order); i++) {
atom_init(&page[i].count, 0);
page[i].flags = 0;
page[i].order = 0;
}
vm_flush();
/*
* order
* ^
* | _________ < MM_MAX_ORDER
* | / |
* start | / \ < end order
* order > |/
* |--------------|----> pos
* start end
*/
pos += ORDER_SIZE(order);
page += (1 << order);
if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
/* this makes the rising part of the graph */
order++;
} else if (order > 0 && pos > end) {
/* we have overshot, lower the order */
pos -= ORDER_SIZE(order);
page -= (1 << order);
/* this makes the abrupt downwards jump at the end of the graph */
while (--order) {
if (pos + ORDER_SIZE(order) <= end) {
pos += ORDER_SIZE(order);
page += (1 << order);
break;
}
}
}
}
}
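As a simple illustration of the loop above for a well-aligned area (4 KiB pages and MM_MAX_ORDER == 9 assumed):
/* An area covering [0x0, 0x1000000) is 16 MiB. paddr_find_order(0) clamps to
 * MM_MAX_ORDER, so the loop inserts eight 512-page (2 MiB) groups into pools[9]
 * and never needs the falling edge, because the end is 2 MiB aligned as well. */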
/*
* This function maps the entire physical memory into the direct region
* (DMAP_START - DMAP_END) and sets up the caches.
* The bitmaps are stored one after another at the end of physical memory, and
*
*/
int pages_init(void)
void paging_init(vm_paddr_t phys_end)
{
if (sanity_check() != 0)
return 1;
map_direct_area();
/* Sizes of the individual bitmaps per order, rounded up to the
* next full longword. We use the same bitmaps in all zones. */
usize bitmap_sizes[MM_NR_ORDERS];
/* size of all bitmaps combined */
usize bitmap_total_size = 0;
for (int order = 0; order < MM_NR_ORDERS; order++) {
usize pages = phys_end >> ORDER_SHIFT(order + 1);
pages = align_ceil(pages, LATOM_BIT * 2);
usize bytes = pages / (CHAR_BIT * 2);
bitmap_sizes[order] = bytes;
bitmap_total_size += bytes;
}
/* phys_end gets aligned, as promised by the comment in kmalloc_init() */
phys_end = align_floor(phys_end, HUGEPAGE_SIZE);
usize phys_size = phys_end - phys_start;
page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size);
/*
* calculate the size of each bitmap, as well as their combined size
* allocate memory for the bitmaps and zero them out
*/
usize bitmap_bytes = 0;
for (int i = 0; i < CACHE_ORDERS; i++) {
usize bits = phys_size >> ORDER_SHIFT(i);
bits = align_ceil(bits, LONG_BIT);
bitmap_bytes += bits / 8;
}
page_debug("Page frame overhead = %zu bytes, %zu bytes total\n", bitmap_bytes, phys_size);
u_int bitmap_size_log2 = flsl((long)bitmap_total_size);
KASSERT(bitmap_size_log2 != 0);
bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
if (bitmap_total_size ^ (1ul << bitmap_size_log2))
bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for the page bitmaps");
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
/*
* zero out all bitmaps
* initialize the pools
*/
uintptr_t bitmap_start_phys = phys_end - bitmap_bytes;
unsigned long *bitmap_start = __v(bitmap_start_phys);
memset(bitmap_start, 0, bitmap_bytes);
for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
struct mm_zone *zone = &mm_zones[zone_index];
latom_t *bitmap_pos = __v(bitmap_start_phys);
for (int order = 0; order < MM_NR_ORDERS; order++) {
zone->pools[order].bitmap = bitmap_pos;
clist_init(&zone->pools[order].freelist);
zone->pools[order].free_entries = 0;
latom_init(&zone->free_count, 0);
bitmap_pos += bitmap_sizes[order];
}
}
/*
* populate the remaining members of the cache_pool structures and
* preallocate entries that can't be handed out (i.e. the cache bitmaps)
* mark *all* pages as reserved first
*
* XXX this is totally unnecessary and I'm only doing it because I'm
* too tired to work out an algorithm that finds all pages that are
* not in the _bmem_areas lists of the mm_zones
*
* if the reserved bit is set, all other fields in the page are invalid.
*/
unsigned long *bitmap_pos = bitmap_start;
for (int i = 0; i < CACHE_ORDERS; i++) {
/* total amount of entries on this level */
usize total_bits = phys_size >> ORDER_SHIFT(i);
/* number of entries on this level that the bitmap itself takes up */
usize wasted_bits = bitmap_bytes >> ORDER_SHIFT(i);
if (wasted_bits == 0)
wasted_bits = 1;
bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits);
caches[i].bitmap = bitmap_pos;
bitmap_pos += total_bits / LONG_BIT;
clist_init(&caches[i].freelist);
caches[i].free_entries = 0;
for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
/* This is merely an optimization to simplify checking whether
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[i].count, 1);
vm_page_array[i].flags = PG_RESERVED;
}
/* kheap_start and kheap_end are globals */
kheap_start = __v(phys_start);
kheap_end = align_floor(bitmap_start, HUGEPAGE_SIZE);
/*
* populate the freelist on the highest order, all orders beneath it
* stay empty until one of the large blocks gets split up
* populate the freelists
*/
struct cache_pool *high_pool = &caches[CACHE_ORDERS - 1];
usize step = 1 << ORDER_SHIFT(CACHE_ORDERS - 1);
for (void *pos = kheap_start; pos < kheap_end; pos += step) {
struct clist *entry = pos;
clist_add(&high_pool->freelist, entry);
high_pool->free_entries++;
for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) {
struct mm_zone *zone = &mm_zones[i];
struct _bmem_area *area, *tmp;
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
claim_bmem_pages(zone, area);
}
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
return 0;
}
/**
* @brief Split a block and return the lower half.
* The block is assumed to already have been removed from its freelist.
* The high half (i.e. the block that is *not* returned) is inserted into the
* freelist one level below `level`.
*
* @param ptr Pointer to the block
* @param level Current level of the block
* (`ptr` must be aligned to `1 << level` pages)
*/
static void *split_buddy(void *ptr, int level);
/**
* @brief Attempt to coalesce a block with its buddy.
* If coalition is possible, the buddy is removed from its freelist at `order`.
*
* @param ptr Pointer to the block
* @param order Cache order, must be less than `CACHE_ORDERS - 1` (because you
* can't join blocks at the highest cache order)
* @return The joined block, or `nil` if coalition was not possible
*/
static void *try_join_buddy(void *ptr, int order);
static inline usize get_bit_number(void *ptr, int order)
static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
{
return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> ORDER_SHIFT(order);
usize bit = pfn >> (order + 1);
latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT];
return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
}
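A quick index calculation for pg_flip_bit() on a 64-bit system (the page frame number is an example):
/* pfn = 1028 at order 2: bit = 1028 >> 3 = 128, which lives in bitmap word
 * 128 / LATOM_BIT = 2, at bit 128 % LATOM_BIT = 0. Its buddy (pfn 1028 ^ 4 = 1024)
 * maps to the same bit, which is what makes the flip-and-test in free_pages() work. */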
void *get_pages(int order, enum mflags flags)
__malloc_like
static void *__get_pages(u_int order, enum mflags flags)
{
PAGE_ASSERT(order >= 0);
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
if (order >= GET_PAGE_ORDERS) {
if (order > MM_MAX_ORDER) {
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
return nil;
}
if (flags & M_NOWAIT) {
kprintf("get_pages(): M_NOWAIT requested, this is not implemented yet :(\n");
return nil;
long count_after = (long)latom_sub(&zone->free_count, (1 << order)) - (1 << order);
if (count_after < (long)zone->thrsh.emerg) {
if (count_after < 0 || !(flags & _M_EMERG)) {
latom_add(&zone->free_count, (1 << order));
return nil;
}
}
mtx_lock(&caches_lock);
struct clist *entry = nil;
int entry_order;
for (entry_order = order; entry_order < CACHE_ORDERS; entry_order++) {
if (caches[entry_order].free_entries > 0) {
entry = caches[entry_order].freelist.next;
break;
register_t cpuflags = read_flags();
/*
* Search for a free page. Start looking at the freelist for the
* requested order, and if it's empty, go over to the next higher order.
* Repeat until we found a page, or we've reached the highest order.
*/
vm_page_t page = nil;
u_int page_order = order;
while (page == nil && page_order < MM_NR_ORDERS) {
struct mm_pool *pool = &zone->pools[page_order];
disable_intr();
spin_lock(&pool->lock);
if (pool->free_entries > 0) {
page = clist_del_first_entry(&pool->freelist, typeof(*page), link);
/* increment the reference count while we hold the lock on the pool,
* so that no other processor can try to coalesce this block if its
* buddy is being freed (coalition is only possible if the buddy
* has a reference count of zero, and while holding the pool lock) */
page_get(page);
pool->free_entries--;
} else {
page_order++;
}
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
if (entry_order != CACHE_ORDERS) {
clist_del(entry);
caches[entry_order].free_entries--;
usize bit_number = get_bit_number(entry, entry_order);
while (entry_order > order) {
entry = split_buddy(entry, entry_order);
bit_set(caches[entry_order].bitmap, bit_number);
entry_order--;
bit_number <<= 1;
/*
* if we found a page, check if we need to split it up
* (which is the case if we took one from a higher order freelist)
*/
if (page != nil) {
usize pfn = pg2pfn(page);
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
order, pfn, page_order);
pg_flip_bit(zone, pfn, page_order);
/* split the page and insert the upper halves into the
* respective freelist until we reach the requested order */
while (page_order-- > order) {
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
struct mm_pool *pool = &zone->pools[page_order];
vm_page_t buddy = page + (1 << page_order);
buddy->order = page_order;
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
disable_intr();
spin_lock(&pool->lock);
clist_add_first(&pool->freelist, &buddy->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
bit_set(caches[order].bitmap, bit_number);
# if CFG_POISON_PAGES
memset(entry, 'a', 1 << ORDER_SHIFT(order));
# endif
page->order = order;
void *vaddr = pfn2vaddr(pfn);
return vaddr;
} else {
return nil;
}
}
mtx_unlock(&caches_lock);
return (void *)entry;
/* faster memset for whole pages */
static inline void init_pages(u_long *start, u_long val, u_int order)
{
u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
do {
*start++ = val;
} while (start != end);
}
void free_pages(void *ptr)
void *get_pages(u_int order, enum mflags flags)
{
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % PAGE_SIZE) {
kprintf("free_pages(%p): unaligned ptr!\n", ptr);
return;
}
# endif
void *pages = __get_pages(order, flags);
if (sus_nil(ptr)) {
page_debug("free_pages(%p): tried to free NULL!\n", ptr);
return;
}
#if CFG_POISON_PAGES
if (pages != nil)
init_pages(pages, PAGE_POISON_ALLOC, order);
#endif
int order = 0;
usize bit_number = get_bit_number(ptr, order);
for (; order < CACHE_ORDERS; order++) {
if (bit_tst(caches[order].bitmap, bit_number))
break;
bit_number >>= 1;
}
return pages;
}
if (order == CACHE_ORDERS) {
page_debug("free_pages(%p): double free!\n", ptr);
return;
}
int original_order = order;
void *get_page(enum mflags flags)
{
void *pages = __get_pages(0, flags);
mtx_lock(&caches_lock);
#if CFG_POISON_PAGES
if (pages != nil)
init_pages(pages, PAGE_POISON_ALLOC, 0);
#endif
while (order < CACHE_ORDERS - 1) {
bit_clr(caches[order].bitmap, bit_number);
return pages;
}
void *tmp = try_join_buddy(ptr, order);
if (tmp == nil)
break;
void *get_zero_pages(u_int order, enum mflags flags)
{
void *pages = __get_pages(order, flags);
ptr = tmp;
order++;
bit_number >>= 1;
}
if (pages != nil)
init_pages(pages, 0, order);
if (order == CACHE_ORDERS - 1 && original_order != CACHE_ORDERS - 1)
set_pflags(HUGEPAGE_ALIGN(ptr), P_HUGE | P_RW);
return pages;
}
#if CFG_POISON_PAGES
memset(ptr, 'A', 1 << ORDER_SHIFT(order));
#endif
void *get_zero_page(enum mflags flags)
{
void *page = __get_pages(0, flags);
clist_add(&caches[order].freelist, (struct clist *)ptr);
caches[order].free_entries++;
if (page != nil)
init_pages(page, 0, 0);
mtx_unlock(&caches_lock);
return page;
}
static inline void *split_buddy(void *ptr, int level)
/*
* Two buddies can be merged if:
* - you currently hold the lock for the pool
* - they both have a reference count of zero
* - they are in the same zone
* - neither of them is reserved
*
* This is only called from within the critical section of free_pages(),
* so execution speed is prioritized over anything else.
*/
static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % (1 << ORDER_SHIFT(level))) {
kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level);
return nil;
}
if (level < 1 || level >= CACHE_ORDERS) {
kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level);
return nil;
}
# endif
bool merge = (atom_read(&buddy->count) == 0);
struct clist *high_buddy = ptr + (1 << ORDER_SHIFT(level - 1));
clist_add(&caches[level - 1].freelist, high_buddy);
caches[level - 1].free_entries++;
/* we know that `page` doesn't have PG_RESERVED set,
* because we check that flag before anything else */
const unsigned mask = PG_RESERVED | PG_DMA;
merge &= (page->flags & mask) == (buddy->flags & mask);
page_debug_noisy("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level);
return ptr;
return merge;
}
static void *try_join_buddy(void *ptr, int order)
void free_pages(void *ptr)
{
const usize entry_size = 1 << ORDER_SHIFT(order);
# if CFG_DEBUG_PAGE_ALLOCS
if ((uintptr_t)ptr % entry_size) {
kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, order);
return nil;
PAGE_DEBUG_BLOCK {
if (ptr < DMAP_START || ptr >= DMAP_END) {
panic("free_pages(%p): not in DMAP region\n", ptr);
}
/* order must be < CACHE_ORDERS - 1 because you
* can't join blocks on the topmost order */
if (order >= CACHE_ORDERS - 1) {
kprintf("try_join_buddy(%p, %d): order >= CACHE_ORDERS - 1!\n", ptr, order);
return nil;
}
# endif
}
/*
* Test whether the buddy block is allocated and return nil if it is.
* entry_size is a power of 2, so we can quickly get to the buddy block
* with a cheap XOR of the address and the entry size without the need
* for any if branches.
*/
uintptr_t buddy = (uintptr_t)ptr ^ entry_size;
usize buddy_bitnum = get_bit_number((void *)buddy, order);
if (bit_tst(caches[order].bitmap, buddy_bitnum))
return nil;
register_t cpuflags = read_flags();
page_debug_noisy("join (%p:%p), order=%d\n", ptr, (void *)buddy, order);
vm_page_t page = vaddr2pg(ptr);
panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);
/* If the buddy is free, we remove it from the freelist ... */
clist_del((struct clist *)buddy);
caches[order].free_entries--;
u_int order = page->order;
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
u_long pfn = vaddr2pfn(ptr);
/*
* ... and return a pointer to the coalesced block.
* We use the same trick as above to get to the even (lower) block, just
* that this time we're zeroing the bit out rather than flipping it.
*/
uintptr_t even = (uintptr_t)ptr & ~entry_size;
return (void *)even;
#if CFG_POISON_PAGES
init_pages(ptr, PAGE_POISON_FREE, order);
#endif
int old_count = atom_sub(&page->count, 1);
if (old_count != 1) {
if (old_count == 0)
panic("double free of page %p", ptr);
else
panic("attempted to free page %p with references", ptr);
}
struct mm_zone *zone;
if (page->flags & PG_DMA)
zone = &mm_zones[MM_ZONE_DMA];
else
zone = &mm_zones[MM_ZONE_NORMAL];
latom_add(&zone->free_count, (1 << order));
/* try to coalesce free buddy blocks until we've reached the highest order */
while (order < MM_MAX_ORDER) {
if (pg_flip_bit(zone, pfn, order))
break;
page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order);
/* precompute all values we need inside the critical section
* to avoid blocking other CPUs for longer than necessary */
vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
struct mm_pool *current_order_pool = &zone->pools[order];
disable_intr();
spin_lock(&current_order_pool->lock);
bool merged = can_merge(page, buddy);
if (merged) {
/* detach the buddy; the coalesced block is inserted into the
* freelist of its final order after the loop */
clist_del(&buddy->link);
current_order_pool->free_entries--;
buddy->order = order + 1;
page->order = order + 1;
}
spin_unlock(&current_order_pool->lock);
intr_restore(cpuflags);
if (!merged)
break;
/* continue with the coalesced (lower) block */
page = low;
pfn &= ~(1ul << order);
order++;
}
/* finally, we need to insert the page at its freelist */
struct mm_pool *pool = &zone->pools[order];
disable_intr();
spin_lock(&pool->lock);
clist_add(&pool->freelist, &page->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
