/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
#include <arch/cpufunc.h>
#include <arch/page.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/poison.h>
#include <gay/systm.h>
#include <gay/types.h>
#include <gay/util.h>
#include <gay/vm/page.h>
#include <limits.h>
#include <string.h>
#include <strings.h>
#if DMAP_OFFSET % PAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of PAGE_SIZE"
#endif
#if PAGE_SIZE % LONG_BIT != 0
#error "PAGE_SIZE must be an integral multiple of LONG_BIT"
#endif
#if __SIZEOF_POINTER__ != __SIZEOF_LONG__
#error "long must be as wide as a pointer"
#endif
#if CFG_DEBUG_PAGE_ALLOCS
# define PAGE_ASSERT(x) KASSERT(x)
# define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# define PAGE_DEBUG_BLOCK if (1)
# if CFG_DEBUG_PAGE_ALLOCS_NOISY
# define page_debug_noisy(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
# else
# define page_debug_noisy(msg, ...) ({})
# endif
#else
# define PAGE_ASSERT(x) ({})
# define PAGE_DEBUG_BLOCK if (0)
# define page_debug(msg, ...) ({})
# define page_debug_noisy(msg, ...) ({})
#endif
#define ORDER_SHIFT(order) (PAGE_SHIFT + (order))
#define ORDER_SIZE(order) (1 << ORDER_SHIFT(order))
/* this should be the same as LONG_BIT because latom_t is really just a
* long wrapped in a struct, but my trust in compilers is exactly zero */
#define LATOM_BIT (sizeof(latom_t) * CHAR_BIT)
struct mm_zone mm_zones[MM_NR_ZONES];
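/** @brief Get the highest buddy order that a physical address is aligned to, capped at MM_MAX_ORDER. */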
static inline u_int paddr_find_order(vm_paddr_t addr)
{
int bit = ffsll((long long)addr) - 1;
if (bit == -1 || bit > ORDER_SHIFT(MM_MAX_ORDER))
bit = ORDER_SHIFT(MM_MAX_ORDER);
KASSERT(bit >= PAGE_SHIFT);
return bit - PAGE_SHIFT;
}
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
static inline void claim_bmem_area(struct mm_zone *zone, const struct _bmem_area *area)
{
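	/* start with the largest order that is naturally aligned to the
	 * area's start address and still fits between start and end */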
u_int order = paddr_find_order(area->start);
while (area->start + ORDER_SIZE(order) > area->end)
order--;
struct vm_page *const start = paddr2pg(area->start);
struct vm_page *const end = paddr2pg(area->end);
struct vm_page *pos = start;
const vm_size_t nr_pages = end->pfn - start->pfn;
latom_add(&zone->free_count, (long)nr_pages);
/*
* We want to insert pages at the highest possible order. However, the
* start and end pointers of the area are only guaranteed to be page
* aligned. Therefore, we start with the highest possible order based
* on the start address, and then increment the order in every loop
* iteration (up to MM_MAX_ORDER). We do this until we have reached
* the end which, again, is only guaranteed to be page aligned, and
* subsequently lower the order again.
*/
while (pos < end) {
struct mm_pool *const pool = &zone->pools[order];
clist_add(&pool->freelist, &pos->link);
pool->free_entries++;
/* only the first page in the order group is inserted into
* the freelist, but all of them need to be initialized */
for (u_int i = 0; i < (1u << order); i++) {
if (pos >= end)
panic("page %p out of range", pos);
if (atom_read(&pos->count) != 420)
panic("page %p double initialized\n", pos);
atom_init(&pos->count, 0);
atom_init(&pos->attr, 0);
pos++;
}
/*
* order
* ^
* | ._____._____. < MM_MAX_ORDER
* | .___| |
* start |._| |_.
* order > .| |. < end order
* |---------------------|----> pos
* start end
*/
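		/* for example, claiming [0x3000, 0x20000) with 4 KiB pages and
		 * MM_MAX_ORDER >= 3 yields blocks of order 0, 1, 2, 3, 3, 2, 1 */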
if (order < MM_MAX_ORDER && pos + (1 << (order + 1)) <= end) {
/* this makes the rising part of the graph */
order++;
} else if (order > 0 && pos + (1 << order) > end) {
/* this makes the abrupt downwards jump at the end of the graph */
while (--order) {
if (pos + (1 << order) <= end)
break;
}
}
}
}
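/**
 * @brief Initialize the buddy page frame allocator.
 * This sets up the per order bitmaps, marks every page as reserved, and
 * then claims all free areas from the boot memory allocator.
 */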
void paging_init(vm_paddr_t phys_end)
{
/* Sizes of the individual bitmaps per order, rounded up to the
* next full longword. We use the same bitmaps in all zones. */
usize bitmap_sizes[MM_NR_ORDERS];
/* size of all bitmaps combined */
usize bitmap_total_size = 0;
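	/* each order needs one bit per pair of buddies, i.e.
	 * (phys_end >> ORDER_SHIFT(order)) / 2 bits; rounding the block count
	 * up to LATOM_BIT * 2 keeps every bitmap a whole number of latom_t's */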
for (int order = 0; order < MM_NR_ORDERS; order++) {
usize pages = phys_end >> ORDER_SHIFT(order);
pages = align_ceil(pages, LATOM_BIT * 2);
usize bytes = pages / (CHAR_BIT * 2);
bitmap_sizes[order] = bytes;
bitmap_total_size += bytes;
}
page_debug("Reserving %zu bytes for page bitmaps\n", bitmap_total_size);
/*
* allocate memory for the bitmaps and zero them out
*/
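	/* __boot_pmalloc() apparently takes a log2 of the allocation size,
	 * so round the total bitmap size up to the next power of two */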
u_int bitmap_size_log2 = flsl((long)bitmap_total_size);
KASSERT(bitmap_size_log2 != 0);
bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
	if (bitmap_total_size != (1ul << bitmap_size_log2))
		bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
vm_paddr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
"cannot allocate memory for the page bitmaps");
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
/*
* initialize the pools
*/
for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
struct mm_zone *zone = &mm_zones[zone_index];
latom_init(&zone->free_count, 0);
/* we use the same bitmaps for all zones */
latom_t *bitmap_pos = __v(bitmap_start_phys);
for (int order = 0; order < MM_NR_ORDERS; order++) {
struct mm_pool *pool = &zone->pools[order];
pool->bitmap = bitmap_pos;
pool->free_entries = 0;
clist_init(&pool->freelist);
spin_init(&pool->lock);
			/* bitmap_sizes[] is in bytes, but bitmap_pos is a latom_t pointer */
			bitmap_pos += bitmap_sizes[order] / sizeof(latom_t);
}
}
/*
* mark *all* pages as reserved first
*
* XXX this is totally unnecessary and i'm only doing it because i'm
* too tired to work out an algorithm that finds all pages that are
* not in the _bmem_areas lists of the mm_zones
*
* if the reserved bit is set, all other fields in the page are invalid.
*/
for (u_long pfn = 0; pfn < phys_end >> PAGE_SHIFT; pfn++) {
/* This is merely an optimization to simplify checking whether
* two buddies can be coalesced into one. In reality, the
* reference count is invalid because the page is reserved. */
atom_init(&vm_page_array[pfn].count, 420);
atom_init(&vm_page_array[pfn].attr, _PGA_RSVD_MASK);
vm_page_array[pfn].pfn = pfn;
}
/*
* populate the freelists
*/
for (int i = 0; i < ARRAY_SIZE(mm_zones); i++) {
struct mm_zone *zone = &mm_zones[i];
struct _bmem_area *area, *tmp;
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
/* make sure the boot memory allocator cannot under any circumstances hand
* out pages from this area anymore, even though that should be unnecessary */
clist_del(&area->link);
claim_bmem_area(zone, area);
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
}
}
}
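/*
 * Toggle the buddy pair bit for pfn at the given order.  Each pair of
 * order-n buddies shares a single bit in the order-n bitmap; the return
 * value is presumably the bit's previous state (that is how page_free()
 * uses it to decide whether the buddy is still allocated).
 */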
static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
{
usize bit = pfn >> (order + 1);
latom_t *bitmap = &zone->pools[order].bitmap[bit / LATOM_BIT];
return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
}
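/**
 * @brief Allocate a block of (1 << order) contiguous pages.
 * Falls back to the next lower memory zone if the requested one is
 * exhausted, and returns nil if that fails as well.
 */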
vm_page_t page_alloc(u_int order, enum mflags flags)
{
if (order > MM_MAX_ORDER) {
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
return nil;
}
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
long count_after;
try_next_zone:
count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
if (count_after < zone->thrsh.emerg) {
if (count_after < 0 || !(flags & _M_EMERG)) {
latom_add(&zone->free_count, (1 << order));
/* if we can't allocate from ZONE_NORMAL, fall back to ZONE_DMA */
if (zone > &mm_zones[0]) {
zone--;
goto try_next_zone;
} else {
return nil;
}
}
}
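	/* interrupt delivery stays off while a pool lock is held, presumably so
	 * the pools can also be used from interrupt context without deadlocking */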
register_t cpuflags = read_flags();
/*
* Search for a free page. Start looking at the freelist for the
* requested order, and if it's empty, go over to the next higher order.
	 * Repeat until we find a page, or we've reached the highest order.
*/
vm_page_t page = nil;
u_int page_order = order;
while (page == nil && page_order < MM_NR_ORDERS) {
struct mm_pool *pool = &zone->pools[page_order];
disable_intr();
spin_lock(&pool->lock);
if (pool->free_entries > 0) {
page = clist_del_first_entry(&pool->freelist, typeof(*page), link);
/* increment the reference count while we hold the lock on the pool,
* so that no other processor can try to coalesce this block if its
			 * buddy is being freed (coalescing is only possible if the buddy
* has a reference count of zero, and while holding the pool lock) */
page_get(page);
pool->free_entries--;
} else {
page_order++;
}
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
if (page == nil) {
if (zone > &mm_zones[0]) {
/*
* If we reach this, the current zone technically had enough free
* pages for the allocation, but those pages were split up into
* smaller chunks rather than a contiguous area. However, we don't
* give up quite yet: If possible, we fall back to a lower memory
* zone (ZONE_NORMAL -> ZONE_DMA) and start over from the top.
*/
zone--;
goto try_next_zone;
} else {
return nil;
}
}
/*
* if we found a page, check if we need to split it up
* (which is the case if we took one from a higher order freelist)
*/
usize pfn = pg2pfn(page);
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
order, pfn, page_order);
pg_flip_bit(zone, pfn, page_order);
/* split the page and insert the upper halves into the
* respective freelist until we reach the requested order */
while (page_order-- > order) {
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
struct mm_pool *pool = &zone->pools[page_order];
vm_page_t buddy = page + (1 << page_order);
pga_set_order(buddy, page_order);
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
disable_intr();
spin_lock(&pool->lock);
clist_add_first(&pool->freelist, &buddy->link);
pool->free_entries++;
spin_unlock(&pool->lock);
intr_restore(cpuflags);
}
for (u_int i = 0; i < (1 << order); i++)
pga_set_order(&page[i], order);
page_clear(page);
return page;
}
/*
* XXX get_page() and get_pages() shouldn't depend on the direct map
*
* XXX Do we need these at all? I don't think so.
*/
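/** @brief Allocate (1 << order) contiguous pages and return their direct map address, or nil on failure. */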
void *get_pages(u_int order, enum mflags flags)
{
vm_page_t page = page_alloc(order, flags);
if (page)
return pfn2vaddr(pg2pfn(page));
else
return nil;
}
void *get_page(enum mflags flags)
{
vm_page_t page = page_alloc(0, flags);
if (page)
return pfn2vaddr(pg2pfn(page));
else
return nil;
}
/*
* Two buddies can be merged if:
* - you currently hold the lock for the pool
* - they both have a reference count of zero
* - they are in the same zone
* - neither of them is reserved
*
* This is only called from within the critical section of free_pages(),
* so execution speed is prioritized over anything else.
*/
static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
{
bool merge = (atom_read(&buddy->count) == 0);
/* we know that `page' is not reserved, because we
	 * check that flag before we even attempt to coalesce */
const unsigned mask = _PGA_RSVD_MASK | _PGA_ZONE_MASK;
merge &= (atom_read(&page->attr) & mask) == (atom_read(&buddy->attr) & mask);
return merge;
}
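/**
 * @brief Give a block of pages back to its zone, coalescing it with free
 * buddy blocks to the highest possible order before reinserting it.
 */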
void page_free(vm_page_t page)
{
register_t cpuflags = read_flags();
	u_int order = pga_order(page);
	u_long pfn = pg2pfn(page);
	PAGE_ASSERT(pfn % (1ul << order) == 0);
	PAGE_DEBUG_BLOCK {
		int old_count = atom_sub(&page->count, 1);
		if (old_count != 1) {
			if (old_count == 0)
				page_debug("double free of page %p\n", page);
			else
				page_debug("attempted to free page %p with references\n", page);
return;
}
} else {
atom_dec(&page->count);
}
struct mm_zone *zone = &mm_zones[pga_zone(page)];
latom_add(&zone->free_count, (1 << order));
	/* try to coalesce free buddy blocks until we've reached the highest order */
	while (order < MM_MAX_ORDER) {
		if (pg_flip_bit(zone, pfn, order))
			break;
		page_debug_noisy("join %p (order = %u)\n", pfn2vaddr(pfn), order);
		/* precompute all values we need inside the critical section
		 * to avoid blocking other CPUs for longer than necessary */
		vm_page_t buddy = &vm_page_array[pfn ^ (1ul << order)];
		vm_page_t low = &vm_page_array[pfn & ~(1ul << order)];
		struct mm_pool *current_order_pool = &zone->pools[order];
		disable_intr();
		spin_lock(&current_order_pool->lock);
		bool merged = can_merge(page, buddy);
		if (merged) {
			/* the buddy leaves its freelist; the merged block is
			 * inserted only once, after this loop has finished */
			clist_del(&buddy->link);
			current_order_pool->free_entries--;
			pga_set_order(buddy, order + 1);
			pga_set_order(page, order + 1);
		}
		spin_unlock(&current_order_pool->lock);
		intr_restore(cpuflags);
		if (!merged)
			break;
		/* continue coalescing from the lower buddy, which is the
		 * head of the merged block */
		page = low;
		pfn &= ~(1ul << order);
		order++;
	}
	/* finally, insert the (possibly coalesced) block into its freelist */
	struct mm_pool *pool = &zone->pools[order];
	disable_intr();
	spin_lock(&pool->lock);
	clist_add(&pool->freelist, &page->link);
	pool->free_entries++;
	spin_unlock(&pool->lock);
	intr_restore(cpuflags);
}