mm: refactor page frame allocator
This is part 3 of the mm subsystem overhaul. The page frame allocator no longer relies on a single global mutex; every per-order pool is protected by its own spinlock instead. The allocator is also aware of multiple memory zones (normal and DMA) and keeps an emergency reserve per zone. Page bitmap overhead is down by 50 % because only one bit is stored per buddy pair.
parent 825a981d67
commit 385af1b7ef

11 changed files with 566 additions and 601 deletions
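Reviewer note: the halved bitmap overhead comes from keeping one bit per *pair* of buddies per order and flipping it on every allocation and free at that order, which is what `pg_flip_bit()` in the diff below is used for. The snippet here is a minimal standalone sketch of that invariant, not kernel code; the names, the toy bitmap sizes, and the convention that the helper returns the post-flip bit value are assumptions made for the example only.

```c
/* Standalone illustration of the one-bit-per-buddy-pair scheme.
 * Build with: cc -std=c11 -o pairdemo pairdemo.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LONG_BITS (sizeof(unsigned long) * 8)

/* one demo bitmap per order, sized for a tiny 1024-page toy memory */
static _Atomic unsigned long pair_bitmap[10][1024 / LONG_BITS];

/* Flip the bit covering the buddy pair that `pfn` belongs to at `order` and
 * return its new value. Because the bit is flipped on every alloc *and* free
 * at this order, it always equals "exactly one buddy is allocated". So after
 * a free, 1 means the buddy is still busy and 0 means both are free, i.e. the
 * pair can be coalesced. */
static bool flip_pair_bit(unsigned long pfn, unsigned int order)
{
	unsigned long pair = pfn >> (order + 1);  /* one bit per buddy pair */
	unsigned long mask = 1ul << (pair % LONG_BITS);
	unsigned long old = atomic_fetch_xor(&pair_bitmap[order][pair / LONG_BITS], mask);
	return ((old ^ mask) & mask) != 0;        /* post-flip value of the bit */
}

int main(void)
{
	flip_pair_bit(0, 0);  /* allocate pfn 0 */
	flip_pair_bit(1, 0);  /* allocate pfn 1 (its buddy) */

	/* free pfn 0: bit becomes 1, buddy still allocated, no coalescing */
	printf("after freeing pfn 0: buddy busy = %d\n", flip_pair_bit(0, 0));
	/* free pfn 1: bit becomes 0, both buddies free, coalesce to order 1 */
	printf("after freeing pfn 1: buddy busy = %d\n", flip_pair_bit(1, 0));
	return 0;
}
```

Compared to one bit per block, one bit per pair is all the coalescing decision actually needs, which is where the 50 % figure in the message comes from.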
@@ -2,7 +2,6 @@
target_sources(gay_kernel PRIVATE
	boot.c
	kmalloc.c
	page.c
	slab.c
)
@@ -13,7 +13,7 @@ static CLIST(bmem_area_freelist);
#ifdef DEBUG
#define debug_free_bmem_area(area) ({ (area)->start = ~(vm_paddr_t)0; })
#define debug_get_bmem_area(area) KASSERT((area)->start != ~(vm_paddr_t)0)
#define debug_get_bmem_area(area) KASSERT((area)->start == ~(vm_paddr_t)0)
#else
#define debug_free_bmem_area(area) ({})
#define debug_get_bmem_area(area) ({})

@@ -62,6 +62,9 @@ void __boot_pmalloc_init(void)
		debug_free_bmem_area(area);
		clist_add(&bmem_area_freelist, &area->link);
	}

	for (int i = 0; i < MM_NR_ZONES; i++)
		clist_init(&mm_zones[i]._bmem_areas);
}

void __boot_register_mem_area(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
@@ -1,74 +0,0 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/types.h>
#include <gay/util.h>

extern void _image_start_phys;
extern void _image_end_phys;

/* these are initialized by pages_init() */
void *kheap_start;
void *kheap_end;

int kmalloc_init(uintptr_t _phys_start, uintptr_t _phys_end)
{
	phys_start = _phys_start;
	phys_end = _phys_end;

	/*
	 * The kernel image is very likely gonna be within the physical memory
	 * range, so we're gonna need to do some cropping in order to not hand
	 * out pages that actually contain kernel code.
	 * Furthermore, somebody should probably clean up this mess somehow.
	 */
	uintptr_t image_start_phys = (uintptr_t)&_image_start_phys;
	uintptr_t image_end_phys = (uintptr_t)&_image_end_phys;
	if (phys_start < image_start_phys && phys_end > image_start_phys) {
		if (image_start_phys - phys_start > phys_end - image_start_phys)
			phys_end = image_start_phys;
		else
			phys_start = image_end_phys;
	}
	if (phys_start < image_end_phys && _phys_end > image_end_phys) {
		if (image_end_phys - phys_start > phys_end - image_end_phys)
			phys_end = image_start_phys;
		else
			phys_start = image_end_phys;
	}

	phys_start = align_ceil(phys_start, HUGEPAGE_SIZE);
	/*
	 * This is intentionally not aligned to hugepages, because __early_get_page()
	 * shrinks it in single PAGE_SIZE steps whenever it is called anyway.
	 * I know, this is a terrible hack, but it will be aligned to a hugepage
	 * from within pages_init(), right after the entire physical memory has
	 * been mapped to the direct area (which is the only reason we need to
	 * be able to allocate pages before the page frame allocator is set up
	 * in the first place).
	 */
	phys_end = align_floor(phys_end, PAGE_SIZE);

	int err = pages_init();
	if (err)
		return err;

	slab_init();
	return 0;
}

__weak void *malloc(usize size)
{
	return kmalloc(size, M_KERN);
}

__weak void free(void *ptr)
{
	kfree(ptr);
}

/*
 * Looking for kmalloc() and kfree()?
 * Those two are in slab.c for purely organizational reasons.
 */

kernel/mm/page.c

@@ -1,32 +1,25 @@
/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <arch/cpufunc.h>
#include <arch/page.h>

#include <gay/bits.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/util.h>
#include <gay/vm/page.h>

#include <limits.h>
#include <string.h>
#include <strings.h>

#ifndef __HAVE_HUGEPAGES
#error "Systems without huge pages are currently unsupported because i'm a dumb bitch"
#endif

#if DMAP_OFFSET % HUGEPAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE"
#endif

/* this should be impossible because arch/page.h must also define PAGE_SHIFT
 * and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */
#if HUGEPAGE_SIZE % PAGE_SIZE != 0
#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE"
#if DMAP_OFFSET % PAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE"
#endif

#if PAGE_SIZE % LONG_BIT != 0
@@ -40,6 +33,7 @@
#if CFG_DEBUG_PAGE_ALLOCS
# define PAGE_ASSERT(x) KASSERT(x)
# define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# define PAGE_DEBUG_BLOCK
# if CFG_DEBUG_PAGE_ALLOCS_NOISY
# define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# else
@@ -47,359 +41,419 @@
# endif
#else
# define PAGE_ASSERT(x) ({})
# define PAGE_DEBUG_BLOCK if (0)
# define page_debug(msg, ...) ({})
# define page_debug_noisy(msg, ...) ({})
#endif

/**
 * We have cache levels for areas ranging from a single page up to a huge page
 * on a logarithmic scale. Every level covers double the pages per entry than
 * the one below it, starting at one page per entry. The effective result is
 * that a single entry in the cache on level L covers `(1 << L)` pages.
 */
#define CACHE_ORDERS GET_PAGE_ORDERS

#define ORDER_SHIFT(order) (PAGE_SHIFT + (order))
#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order))

/** @brief There is one of these for every cache order. */
struct cache_pool {
	/**
	 * @brief List of free blocks on this order of granularity.
	 * The individual entries sit right at the beginning of each free block,
	 * and are always aligned to `entry_size` bytes.
	 */
	struct clist freelist;
	/**
	 * @brief Bitmap that stores the allocated status of each entry.
	 * 1 means allocated, 0 means not.
	 */
	unsigned long *bitmap;
	/** @brief Number of items in `freelist`. */
	usize free_entries;
};
static struct cache_pool caches[CACHE_ORDERS];
static MTX(caches_lock);
/* this should be the same as LONG_BIT because latom_t is really just a
 * long wrapped in a struct, but my trust in compilers is exactly zero */
#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT)

/* these get set in kmalloc_init() */
uintptr_t phys_start;
uintptr_t phys_end;
struct mm_zone mm_zones[MM_NR_ZONES];

uintptr_t __early_get_page(void)
static inline u_int paddr_find_order(vm_paddr_t addr)
{
	phys_end -= PAGE_SIZE;
	return phys_end;
	int bit = ffsll((long long)addr) - 1;
	if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER))
		bit = ORDER_SHIFT(MM_MAX_ORDER);

	KASSERT(bit >= PAGE_SHIFT);
	return bit - PAGE_SHIFT;
}

static int sanity_check(void)
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_pages(struct mm_zone *zone, struct _bmem_area *area)
{
	KASSERT(phys_start < phys_end);
	KASSERT(phys_start == HUGEPAGE_ALIGN(phys_start));
	/* phys_end is only page aligned, see kmalloc_init() */
	KASSERT(phys_end == PAGE_ALIGN(phys_end));
	vm_paddr_t start = area->start;
	vm_paddr_t end = area->end;
	vm_paddr_t pos = start;
	vm_size_t nr_pages = (end - start) / PAGE_SIZE;
	latom_add(&zone->free_count, (long)nr_pages);

	if ((phys_end - phys_start) < (32 * 1024 * 1024)) {
		kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n");
		return 1;
	struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
	u_int order = paddr_find_order(start);
	/* make sure the boot memory allocator cannot under any circumstances hand
	 * out pages from this area anymore, even though that should be unnecessary */
	clist_del(&area->link);

	/*
	 * We want to insert pages at the highest possible order. However, the
	 * start and end pointers of the area are only guaranteed to be page
	 * aligned. Therefore, we start with the highest possible order based
	 * on the start address, and then increment the order in every loop
	 * iteration (up to MM_MAX_ORDER). We do this until we have reached
	 * the end which, again, is only guaranteed to be page aligned, and
	 * subsequently lower the order again.
	 */
	while (pos < end) {
		struct mm_pool *pool = &zone->pools[order];
		clist_add(&pool->freelist, &page->link);
		pool->free_entries++;

		/* only the first page in the order group is inserted into
		 * the freelist, but all of them need to be initialized */
		for (u_int i = 0; i < (1 << order); i++) {
			atom_init(&page[i].count, 0);
			page[i].flags = 0;
			page[i].order = 0;
		}

		/*
		 * order
		 *   ^
		 *   |       _________   < MM_MAX_ORDER
		 *   |      /         |
		 * start   | /         \  < end order
		 * order > |/
		 *   |--------------|----> pos
		 *  start          end
		 */
		pos += ORDER_SIZE(order);
		page += (1 << order);
		if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
			/* this makes the rising part of the graph */
			order++;
		} else if (order > 0 && pos > end) {
			/* we have overshot, lower the order */
			pos -= ORDER_SIZE(order);
			page -= (1 << order);
			/* this makes the abrupt downwards jump at the end of the graph */
			while (--order) {
				if (pos + ORDER_SIZE(order) <= end) {
					pos += ORDER_SIZE(order);
					page += (1 << order);
					break;
				}
			}
		}
	}
}

void paging_init(vm_paddr_t phys_end)
{
	/* Sizes of the individual bitmaps per order, rounded up to the
	 * next full longword. We use the same bitmaps in all zones. */
	usize bitmap_sizes[MM_NR_ORDERS];
	/* size of all bitmaps combined */
	usize bitmap_total_size = 0;

	for (int order = 0; order < MM_NR_ORDERS; order++) {
		usize pages = phys_end >> ORDER_SHIFT(order + 1);
		pages = align_ceil(pages, LATOM_BIT * 2);
		usize bytes = pages / (CHAR_BIT * 2);
		bitmap_sizes[order] = bytes;
		bitmap_total_size += bytes;
	}

	return 0;
}

/*
 * Map the entire physical memory into the direct contiguous area.
 * __early_map_page() might call __early_get_page() in order to allocate
 * new page table structures, which in turn shrinks the physical memory
 * size (see above).
 */
static inline void map_direct_area(void)
{
#ifdef __HAVE_HUGEPAGES
	const usize step = HUGEPAGE_SIZE;
	const enum pflags flags = P_PRESENT | P_RW | P_HUGE;
#else
	const usize step = PAGE_SIZE;
	const enum pflags flags = P_PRESENT | P_RW;
#endif
	page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size);

	/*
	 * It might be necessary to use a volatile pointer to phys_end for this
	 * loop in case clang does The Optimization and caches its value for
	 * whatever reason, even though at least for x86 this is not the case
	 * (and i don't even think the C standard allows it when calling
	 * external functions in between, but still, Never Trust The Compiler).
	 * allocate memory for the bitmaps and zero them out
	 */
	for (uintptr_t pos = phys_start; pos <= phys_end - step; pos += step)
		__early_map_page(pos, __v(pos), flags);

	vm_flush();
}

/*
 * This function maps the entire physical memory into the direct region
 * (DMAP_START - DMAP_END) and sets up the caches.
 * The bitmaps are stored one after another at the end of physical memory, and
 *
 */
int pages_init(void)
{
	if (sanity_check() != 0)
		return 1;

	map_direct_area();

	/* phys_end gets aligned, as promised by the comment in kmalloc_init() */
	phys_end = align_floor(phys_end, HUGEPAGE_SIZE);
	usize phys_size = phys_end - phys_start;
	u_int bitmap_size_log2 = flsl((long)bitmap_total_size);
	KASSERT(bitmap_size_log2 != 0);
	bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
	if (bitmap_total_size ^ (1ul << bitmap_size_log2))
		bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
	uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
	panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
		 "cannot allocate memory for the page bitmaps");
	memset(__v(bitmap_start_phys), 0, bitmap_total_size);

	/*
	 * calculate the size of each bitmap, as well as their combined size
	 * initialize the pools
	 */
	usize bitmap_bytes = 0;
	for (int i = 0; i < CACHE_ORDERS; i++) {
		usize bits = phys_size >> ORDER_SHIFT(i);
		bits = align_ceil(bits, LONG_BIT);
		bitmap_bytes += bits / 8;
	for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
		struct mm_zone *zone = &mm_zones[zone_index];
		latom_t *bitmap_pos = __v(bitmap_start_phys);
		for (int order = 0; order < MM_NR_ORDERS; order++) {
			zone->pools[order].bitmap = bitmap_pos;
			clist_init(&zone->pools[order].freelist);
			zone->pools[order].free_entries = 0;
			latom_init(&zone->free_count, 0);

			bitmap_pos += bitmap_sizes[order];
		}
	}

	page_debug("Page frame overhead = %zu bytes, %zu bytes total\n", bitmap_bytes, phys_size);

	/*
	 * zero out all bitmaps
	 * mark *all* pages as reserved first
	 *
	 * XXX this is totally unnecessary and i'm only doing it because i'm
	 * too tired to work out an algorithm that finds all pages that are
	 * not in the _bmem_areas lists of the mm_zones
	 *
	 * if the reserved bit is set, all other fields in the page are invalid.
	 */
	uintptr_t bitmap_start_phys = phys_end - bitmap_bytes;
	unsigned long *bitmap_start = __v(bitmap_start_phys);
	memset(bitmap_start, 0, bitmap_bytes);

	/*
	 * populate the remaining members of the cache_pool structures and
	 * preallocate entries that can't be handed out (i.e. the cache bitmaps)
	 */
	unsigned long *bitmap_pos = bitmap_start;
	for (int i = 0; i < CACHE_ORDERS; i++) {
		/* total amount of entries on this level */
		usize total_bits = phys_size >> ORDER_SHIFT(i);
		/* number of entries on this level that the bitmap itself takes up */
		usize wasted_bits = bitmap_bytes >> ORDER_SHIFT(i);
		if (wasted_bits == 0)
			wasted_bits = 1;
		bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits);

		caches[i].bitmap = bitmap_pos;
		bitmap_pos += total_bits / LONG_BIT;

		clist_init(&caches[i].freelist);
		caches[i].free_entries = 0;
	for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
		/* This is merely an optimization to simplify checking whether
		 * two buddies can be coalesced into one. In reality, the
		 * reference count is invalid because the page is reserved. */
		atom_init(&vm_page_array[i].count, 1);
		vm_page_array[i].flags = PG_RESERVED;
	}

	/* kheap_start and kheap_end are globals */
	kheap_start = __v(phys_start);
	kheap_end = align_floor(bitmap_start, HUGEPAGE_SIZE);

	/*
	 * populate the freelist on the highest order, all orders beneath it
	 * stay empty until one of the large blocks gets split up
	 * populate the freelists
	 */
	struct cache_pool *high_pool = &caches[CACHE_ORDERS - 1];
	usize step = 1 << ORDER_SHIFT(CACHE_ORDERS - 1);
	for (void *pos = kheap_start; pos < kheap_end; pos += step) {
		struct clist *entry = pos;
		clist_add(&high_pool->freelist, entry);
		high_pool->free_entries++;
	for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) {
		struct mm_zone *zone = &mm_zones[i];
		struct _bmem_area *area, *tmp;
		clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
			claim_bmem_pages(zone, area);
		}
		zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
		if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
			zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
	}

	return 0;
}

/**
 * @brief Split a block and return the lower half.
 * The block is assumed to already have been removed from its freelist.
 * The high half (i.e. the block that is *not* returned) is inserted into the
 * freelist one level below `level`.
 *
 * @param ptr Pointer to the block
 * @param level Current level of the block
 *        (`ptr` must be aligned to `1 << level` pages)
 */
static void *split_buddy(void *ptr, int level);

/**
 * @brief Attempt to coalesce a block with its buddy.
 * If coalition is possible, the buddy is removed from its freelist at `order`.
 *
 * @param ptr Pointer to the block
 * @param order Cache order, must be less than `CACHE_ORDERS - 1` (because you
 *        can't join blocks at the highest cache order)
 * @return The joined block, or `nil` if coalition was not possible
 */
static void *try_join_buddy(void *ptr, int order);

static inline usize get_bit_number(void *ptr, int order)
static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
{
	return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> ORDER_SHIFT(order);
	usize bit = pfn >> (order + 1);
	latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT];
	return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
}

void *get_pages(int order, enum mflags flags)
__malloc_like
static void *__get_pages(u_int order, enum mflags flags)
{
	PAGE_ASSERT(order >= 0);
	struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];

	if (order >= GET_PAGE_ORDERS) {
	if (order > MM_MAX_ORDER) {
		page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
		return nil;
	}

	if (flags & M_NOWAIT) {
		kprintf("get_pages(): M_NOWAIT requested, this is not implemented yet :(\n");
	u_long count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
	if (count_after < zone->thrsh.emerg) {
		if (count_after < 0 || !(flags & _M_EMERG)) {
			latom_add(&zone->free_count, (1 << order));
			return nil;
		}
	}

	register_t cpuflags = read_flags();

	/*
	 * Search for a free page. Start looking at the freelist for the
	 * requested order, and if it's empty, go over to the next higher order.
	 * Repeat until we found a page, or we've reached the highest order.
	 */
	vm_page_t page = nil;
	u_int page_order = order;
	while (page == nil && page_order < MM_NR_ORDERS) {
		struct mm_pool *pool = &zone->pools[page_order];

		disable_intr();
		spin_lock(&pool->lock);
		if (pool->free_entries > 0) {
			page = clist_del_first_entry(&pool->freelist, typeof(*page), link);
			/* increment the reference count while we hold the lock on the pool,
			 * so that no other processor can try to coalesce this block if its
			 * buddy is being freed (coalition is only possible if the buddy
			 * has a reference count of zero, and while holding the pool lock) */
			page_get(page);
			pool->free_entries--;
		} else {
			page_order++;
		}
		spin_unlock(&pool->lock);
		intr_restore(cpuflags);
	}

	/*
	 * if we found a page, check if we need to split it up
	 * (which is the case if we took one from a higher order freelist)
	 */
	if (page != nil) {
		usize pfn = pg2pfn(page);
		page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
				 order, pfn, page_order);
		pg_flip_bit(zone, pfn, page_order);

		/* split the page and insert the upper halves into the
		 * respective freelist until we reach the requested order */
		while (page_order-- > order) {
			page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
			struct mm_pool *pool = &zone->pools[page_order];
			vm_page_t buddy = page + (1 << page_order);
			buddy->order = page_order;
			pg_flip_bit(zone, pfn + (1 << page_order), page_order);

			disable_intr();
			spin_lock(&pool->lock);
			clist_add_first(&pool->freelist, &buddy->link);
			pool->free_entries++;
			spin_unlock(&pool->lock);
			intr_restore(cpuflags);
		}

		page->order = order;
		void *vaddr = pfn2vaddr(pfn);

		return vaddr;
	} else {
		return nil;
	}
	mtx_lock(&caches_lock);
}

	struct clist *entry = nil;
	int entry_order;
	for (entry_order = order; entry_order < CACHE_ORDERS; entry_order++) {
		if (caches[entry_order].free_entries > 0) {
			entry = caches[entry_order].freelist.next;
			break;
		}
	}
/* faster memset for whole pages */
static inline void init_pages(u_long *start, u_long val, u_int order)
{
	u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
	do {
		*start++ = val;
	} while (start != end);
}

	if (entry_order != CACHE_ORDERS) {
		clist_del(entry);
		caches[entry_order].free_entries--;
void *get_pages(u_int order, enum mflags flags)
{
	void *pages = __get_pages(order, flags);

		usize bit_number = get_bit_number(entry, entry_order);
		while (entry_order > order) {
			entry = split_buddy(entry, entry_order);
			bit_set(caches[entry_order].bitmap, bit_number);
			entry_order--;
			bit_number <<= 1;
		}
		bit_set(caches[order].bitmap, bit_number);
#if CFG_POISON_PAGES
	if (pages != nil)
		init_pages(pages, PAGE_POISON_ALLOC, order);
#endif

# if CFG_POISON_PAGES
		memset(entry, 'a', 1 << ORDER_SHIFT(order));
# endif
	}
	return pages;
}

	mtx_unlock(&caches_lock);
	return (void *)entry;
void *get_page(enum mflags flags)
{
	void *pages = __get_pages(0, flags);

#if CFG_POISON_PAGES
	if (pages != nil)
		init_pages(pages, PAGE_POISON_ALLOC, 0);
#endif

	return pages;
}

void *get_zero_pages(u_int order, enum mflags flags)
{
	void *pages = __get_pages(order, flags);

	if (pages != nil)
		init_pages(pages, 0, order);

	return pages;
}

void *get_zero_page(enum mflags flags)
{
	void *page = __get_pages(0, flags);

	if (page != nil)
		init_pages(page, 0, 0);

	return page;
}

/*
 * Two buddies can be merged if:
 * - you currently hold the lock for the pool
 * - they both have a reference count of zero
 * - they are in the same zone
 * - neither of them is reserved
 *
 * This is only called from within the critical section of free_pages(),
 * so execution speed is prioritized over anything else.
 */
static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
	bool merge = (atom_read(&buddy->count) == 0);

	/* we know that `page` doesn't have PG_RESERVED set,
	 * because we check that flag before anything else */
	const unsigned mask = PG_RESERVED | PG_DMA;
	merge &= (page->flags & mask) == (buddy->flags & mask);

	return merge;
}

void free_pages(void *ptr)
{
# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % PAGE_SIZE) {
		kprintf("free_pages(%p): unaligned ptr!\n", ptr);
		return;
	PAGE_DEBUG_BLOCK {
		if (ptr < DMAP_START || ptr >= DMAP_END) {
			panic("free_pages(%p): not in DMAP region\n", ptr);
		}
# endif

	if (sus_nil(ptr)) {
		page_debug("free_pages(%p): tried to free NULL!\n", ptr);
		return;
	}

	int order = 0;
	usize bit_number = get_bit_number(ptr, order);
	for (; order < CACHE_ORDERS; order++) {
		if (bit_tst(caches[order].bitmap, bit_number))
			break;
		bit_number >>= 1;
	}
	register_t cpuflags = read_flags();

	if (order == CACHE_ORDERS) {
		page_debug("free_pages(%p): double free!\n", ptr);
		return;
	}
	int original_order = order;
	vm_page_t page = vaddr2pg(ptr);
	panic_if(page->flags & PG_RESERVED, "tried to free reserved page %p", ptr);

	mtx_lock(&caches_lock);

	while (order < CACHE_ORDERS - 1) {
		bit_clr(caches[order].bitmap, bit_number);

		void *tmp = try_join_buddy(ptr, order);
		if (tmp == nil)
			break;

		ptr = tmp;
		order++;
		bit_number >>= 1;
	}

	if (order == CACHE_ORDERS - 1 && original_order != CACHE_ORDERS - 1)
		set_pflags(HUGEPAGE_ALIGN(ptr), P_HUGE | P_RW);
	u_int order = page->order;
	PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
	u_long pfn = vaddr2pfn(ptr);

#if CFG_POISON_PAGES
	memset(ptr, 'A', 1 << ORDER_SHIFT(order));
	init_pages(ptr, PAGE_POISON_FREE, order);
#endif

	clist_add(&caches[order].freelist, (struct clist *)ptr);
	caches[order].free_entries++;
	int old_count = atom_sub(&page->count, 1);
	if (old_count != 1) {
		if (old_count == 0)
			panic("double free of page %p", ptr);
		else
			panic("attempted to free page %p with references", ptr);
	}

	mtx_unlock(&caches_lock);
}

static inline void *split_buddy(void *ptr, int level)
{
# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % (1 << ORDER_SHIFT(level))) {
		kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level);
		return nil;
	}
	if (level < 1 || level >= CACHE_ORDERS) {
		kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level);
		return nil;
	}
# endif

	struct clist *high_buddy = ptr + (1 << ORDER_SHIFT(level - 1));
	clist_add(&caches[level - 1].freelist, high_buddy);
	caches[level - 1].free_entries++;

	page_debug_noisy("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level);

	return ptr;
}

static void *try_join_buddy(void *ptr, int order)
{
	const usize entry_size = 1 << ORDER_SHIFT(order);

# if CFG_DEBUG_PAGE_ALLOCS
	if ((uintptr_t)ptr % entry_size) {
		kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, order);
		return nil;
	}
	/* order must be < CACHE_ORDERS - 1 because you
	 * can't join blocks on the topmost order */
	if (order >= CACHE_ORDERS - 1) {
		kprintf("try_join_buddy(%p, %d): order >= CACHE_ORDERS - 1!\n", ptr, order);
		return nil;
	}
# endif

	/*
	 * Test whether the buddy block is allocated and return nil if it is.
	 * entry_size is a power of 2, so we can quickly get to the buddy block
	 * with a cheap XOR of the address and the entry size without the need
	 * for any if branches.
	 */
	uintptr_t buddy = (uintptr_t)ptr ^ entry_size;
	usize buddy_bitnum = get_bit_number((void *)buddy, order);
	if (bit_tst(caches[order].bitmap, buddy_bitnum))
		return nil;

	page_debug_noisy("join (%p:%p), order=%d\n", ptr, (void *)buddy, order);

	/* If the buddy is free, we remove it from the freelist ... */
	clist_del((struct clist *)buddy);
	caches[order].free_entries--;

	/*
	 * ... and return a pointer to the coalesced block.
	 * We use the same trick as above to get to the even (lower) block, just
	 * that this time we're zeroing the bit out rather than flipping it.
	 */
	uintptr_t even = (uintptr_t)ptr & ~entry_size;
	return (void *)even;
	struct mm_zone *zone;
	if (page->flags & PG_DMA)
		zone = &mm_zones[MM_ZONE_DMA];
	else
		zone = &mm_zones[MM_ZONE_NORMAL];

	latom_add(&zone->free_count, (1 << order));

	/* try to coalesce free buddy blocks until we've reached the highest order */
	while (order < MM_MAX_ORDER) {
		if (pg_flip_bit(zone, pfn, order))
			break;

		page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order);

		/* precompute all values we need inside the critical section
		 * to avoid blocking other CPUs for longer than necessary */
		vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
		vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
		struct mm_pool *current_order_pool = &zone->pools[order];
		struct mm_pool *next_order_pool = &zone->pools[order + 1];

		disable_intr();
		spin_lock(&zone->pools[order].lock);
		if (can_merge(page, buddy)) {
			clist_del(&buddy->link);
			current_order_pool->free_entries--;
			buddy->order = order + 1;
			page->order = order + 1;
			clist_add(&next_order_pool->freelist, &low->link);
			next_order_pool->free_entries++;
		} else {
			order = MM_MAX_ORDER; /* break out of the loop */
		}
		spin_unlock(&zone->pools[order].lock);
		intr_restore(cpuflags);

		page = low;
		order++;
	}

	/* finally, we need to insert the page at its freelist */
	struct mm_pool *pool = &zone->pools[order];
	disable_intr();
	spin_lock(&pool->lock);
	clist_add(&pool->freelist, &page->link);
	pool->free_entries++;
	spin_unlock(&zone->pools[order].lock);
	intr_restore(cpuflags);
}
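The other mechanism worth calling out for reviewers is the per-zone emergency reserve: `__get_pages()` charges `zone->free_count` up front with `latom_sub()` and rolls the charge back if the result would dip into `zone->thrsh.emerg` (unless the caller passed an emergency flag). Below is a minimal standalone sketch of that pattern only; the struct, function, and threshold names are made up for the example, and a signed counter is assumed so that underflow past zero is detectable.

```c
/* Standalone sketch of the watermark check done before touching any freelist.
 * Build with: cc -std=c11 -o reservedemo reservedemo.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_zone {
	_Atomic long free_count;  /* number of free pages in the zone */
	long emerg_threshold;     /* pages held back for emergency allocations */
};

/* Optimistically charge 2^order pages to the zone. If that dips into the
 * emergency reserve (or below zero), undo the charge and fail, unless the
 * caller is explicitly allowed to tap the reserve. */
static bool charge_pages(struct demo_zone *zone, unsigned int order, bool emerg)
{
	long nr = 1l << order;
	long count_after = atomic_fetch_sub(&zone->free_count, nr) - nr;

	if (count_after < zone->emerg_threshold) {
		if (count_after < 0 || !emerg) {
			atomic_fetch_add(&zone->free_count, nr); /* roll back */
			return false;
		}
	}
	return true;
}

int main(void)
{
	struct demo_zone zone = { .free_count = 40, .emerg_threshold = 32 };

	printf("order-3 normal alloc: %d\n", charge_pages(&zone, 3, false)); /* 1: ok, 32 left */
	printf("order-3 normal alloc: %d\n", charge_pages(&zone, 3, false)); /* 0: would eat reserve */
	printf("order-3 emerg alloc:  %d\n", charge_pages(&zone, 3, true));  /* 1: reserve allowed */
	return 0;
}
```

Charging the counter before taking any pool lock keeps the reserve decision entirely lock-free; only the freelist manipulation itself needs the per-order spinlock.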