/* See the end of this file for copyright and license terms. */

#include <arch/page.h>

#include <gay/bits.h>
#include <gay/clist.h>
#include <gay/config.h>
#include <gay/kprintf.h>
#include <gay/mm.h>
#include <gay/mutex.h>
#include <gay/types.h>
#include <gay/util.h>

#include <limits.h>
#include <string.h>

#ifndef __HAVE_HUGEPAGES
#error "Systems without huge pages are currently unsupported because i'm a dumb bitch"
#endif

#if DMAP_OFFSET % HUGEPAGE_SIZE != 0
#error "DMAP_OFFSET must be an integral multiple of HUGEPAGE_SIZE"
#endif

/* this should be impossible because arch/page.h must also define PAGE_SHIFT
 * and HUGEPAGE_SHIFT, meaning the two are definitively powers of 2 */
#if HUGEPAGE_SIZE % PAGE_SIZE != 0
#error "HUGEPAGE_SIZE must be an integral multiple of PAGE_SIZE"
#endif

#if PAGE_SIZE % LONG_BIT != 0
#error "PAGE_SIZE must be an integral multiple of LONG_BIT"
#endif

#if CFG_DEBUG_PAGE_ALLOCS
#define page_debug(msg, ...) kprintf("[page] " msg, ##__VA_ARGS__)
#else
#define page_debug(msg, ...)
#endif

/**
 * We have cache levels for areas ranging from a single page up to a huge page
 * on a logarithmic scale.  Every level covers twice as many pages per entry as
 * the one below it, starting at one page per entry.  The effective result is
 * that a single entry in the cache on level L covers `(1 << L)` pages.
 */
#define CACHE_LEVELS (HUGEPAGE_SHIFT - PAGE_SHIFT + 1)
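/*
 * Illustrative compile-time checks (a sketch, assuming a C11 toolchain so
 * that _Static_assert is available in this freestanding build).  They only
 * restate the level math from the comment above: an entry on cache level L
 * covers (1 << L) pages, so the topmost level covers exactly one huge page.
 */
_Static_assert((1UL << PAGE_SHIFT) == PAGE_SIZE,
               "PAGE_SIZE must match PAGE_SHIFT");
_Static_assert(((usize)PAGE_SIZE << (CACHE_LEVELS - 1)) == HUGEPAGE_SIZE,
               "the topmost cache level must cover exactly one huge page");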
/** @brief There is one of these for every cache level. */
struct cache_pool {
        /**
         * @brief List of free blocks on this level of granularity.
         * The individual entries sit right at the beginning of each free block,
         * and are always aligned to `entry_size` bytes.
         */
        struct clist freelist;
        /**
         * @brief Bitmap that stores the allocated status of each entry.
         * 1 means allocated, 0 means not.
         */
        unsigned long *bitmap;
        /** @brief Number of items in `freelist`. */
        usize free_entries;
};

static struct cache_pool caches[CACHE_LEVELS];
MTX(caches_lock);

#define LONG_BIT_MASK (~(LONG_BIT - 1))

uintptr_t phys_start;
uintptr_t phys_end;

static int sanity_check(void)
{
        if (phys_end != HUGEPAGE_ALIGN(phys_end) || phys_start != HUGEPAGE_ALIGN(phys_start)) {
                kprintf("Unaligned memory, this should never be possible\n");
                return 1;
        }

        if ((phys_end - phys_start) < (32 * 1024 * 1024)) {
                kprintf("Less than 32 MB of usable RAM, this wouldn't go well\n");
                return 1;
        }

        if (phys_start > phys_end) {
                kprintf("Hey, this is funny.  pages_init() was called with parameters "
                        "such that phys_start > phys_end (%p > %p), which "
                        "should absolutely never be possible.  I can't really continue "
                        "like this, so have a nice day.\n",
                        (void *)phys_start, (void *)phys_end);
                return 1;
        }

        return 0;
}

/*
 * This function maps the entire physical memory into the direct region
 * (DMAP_START - DMAP_END) and sets up the caches.
 * The bitmaps are stored one after another at the end of physical memory,
 * and the remaining space below them becomes the kernel heap
 * (kheap_start - kheap_end).
 */
int pages_init(void)
{
        usize phys_size = phys_end - phys_start;

        if (sanity_check() != 0)
                return 1;

        /*
         * map the entire physical memory into the direct contiguous area
         */
        for (uintptr_t physptr = phys_start; physptr < phys_end; physptr += HUGEPAGE_SIZE) {
                const enum mm_page_flags pflags = MM_PAGE_HUGE | MM_PAGE_RW | MM_PAGE_GLOBAL;
                map_page(physptr, (void *)(physptr + DMAP_OFFSET), pflags);
        }
        vm_flush();

        /*
         * calculate the size of each bitmap, as well as their combined size
         */
        usize bitmap_bytes = 0;
        for (int i = 0; i < CACHE_LEVELS; i++) {
                usize bits = phys_size >> (PAGE_SHIFT + i);
                /* round up to the next full long */
                if (bits & ~LONG_BIT_MASK) {
                        bits &= LONG_BIT_MASK;
                        bits += LONG_BIT;
                }
                bitmap_bytes += bits / 8;
        }
        page_debug("Page frame overhead = %zu bytes\n", bitmap_bytes);

        /*
         * zero out all bitmaps
         */
        uintptr_t bitmap_start_phys = phys_end - bitmap_bytes;
        unsigned long *bitmap_start = __v(bitmap_start_phys);
        memset(bitmap_start, 0, bitmap_bytes);

        /*
         * populate the remaining members of the cache_pool structures and
         * preallocate entries that can't be handed out (i.e. the cache bitmaps)
         */
        unsigned long *bitmap_pos = bitmap_start;
        for (int i = 0; i < CACHE_LEVELS; i++) {
                /* total amount of entries on this level */
                usize total_bits = phys_size >> (PAGE_SHIFT + i);
                /* number of entries on this level that the bitmap itself takes up */
                usize wasted_bits = bitmap_bytes >> (PAGE_SHIFT + i);
                if (wasted_bits == 0)
                        wasted_bits = 1;
                bit_set_range(bitmap_pos, total_bits - wasted_bits, wasted_bits);

                caches[i].bitmap = bitmap_pos;
                /* advance by the rounded-up number of longs, matching the
                 * bitmap_bytes calculation above so the bitmaps never overlap */
                bitmap_pos += (total_bits + LONG_BIT - 1) / LONG_BIT;

                clist_init(&caches[i].freelist);
                caches[i].free_entries = 0;
        }

        /* kheap_start and kheap_end are globals */
        kheap_start = __v(phys_start);
        kheap_end = ptr_align(bitmap_start, -HUGEPAGE_SHIFT);

        /*
         * populate the freelist on the highest level, all levels beneath it
         * stay empty until one of the large blocks gets split up
         */
        struct cache_pool *high_pool = &caches[CACHE_LEVELS - 1];
        usize step = 1 << (PAGE_SHIFT + CACHE_LEVELS - 1);
        for (void *pos = kheap_start; pos < kheap_end; pos += step) {
                struct clist *entry = pos;
                clist_add(&high_pool->freelist, entry);
                high_pool->free_entries++;
        }

        return 0;
}
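/*
 * Helper sketch (the name is ours, it is not called by the allocator): this
 * is the same per-level rounding that the bitmap_bytes loop in pages_init()
 * performs, written out with plain modulo arithmetic.  For example, assuming
 * 4 KiB pages and 1 GiB of physical memory, level 0 has 262144 entries, i.e.
 * 262144 bits = 32768 bytes of bitmap.
 */
static inline usize level_bitmap_bytes(usize phys_size, int level)
{
        usize bits = phys_size >> (PAGE_SHIFT + level);

        /* round up to the next full long so each bitmap stays long aligned */
        if (bits % LONG_BIT != 0)
                bits += LONG_BIT - bits % LONG_BIT;

        return bits / 8;
}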
/**
 * @brief Split a block and return the lower half.
 * The block is assumed to already have been removed from its freelist.
 * The high half (i.e. the block that is *not* returned) is inserted into the
 * freelist one level below `level`.
 *
 * @param ptr Pointer to the block
 * @param level Current level of the block
 *	(`ptr` must be aligned to `1 << level` pages)
 */
static void *split_buddy(void *ptr, int level);

/**
 * @brief Attempt to coalesce a block with its buddy.
 * If coalescing is possible, the buddy is removed from its freelist at
 * `level` and the merged block is inserted at `level + 1`.
 *
 * @param ptr Pointer to the block
 * @param level Cache level, must be less than `CACHE_LEVELS - 1` (because you
 *	can't join blocks at the highest cache level)
 * @return The joined block, or `nil` if coalescing was not possible
 */
static void *try_join_buddy(void *ptr, int level);

static usize get_bit_number(void *ptr, int level);

static int get_level(usize count)
{
        int level;
        for (level = 0; level < CACHE_LEVELS; level++) {
                if ((1 << level) >= count)
                        break;
        }
        return level;
}

void *get_pages(usize count, enum mm_flags flags)
{
        int level = get_level(count);
        if (level == CACHE_LEVELS) {
                page_debug("get_pages(%zu, %08x): count too large!\n", count, flags);
                return nil;
        }

        if (flags & MM_NOSLEEP) {
                kprintf("get_pages(): MM_NOSLEEP requested, this is not implemented yet :(\n");
                return nil;
        }

        mtx_lock(&caches_lock);

        struct clist *entry;
        int entry_level;
        for (entry_level = level; entry_level < CACHE_LEVELS; entry_level++) {
                if (caches[entry_level].free_entries > 0) {
                        entry = caches[entry_level].freelist.next;
                        break;
                }
        }
        if (entry_level == CACHE_LEVELS) {
                mtx_unlock(&caches_lock);
                return nil;
        }

        clist_del(entry);
        caches[entry_level].free_entries--;

        usize bit_number = get_bit_number(entry, entry_level);
        while (entry_level > level) {
                entry = split_buddy(entry, entry_level);
                bit_set(caches[entry_level].bitmap, bit_number);
                entry_level--;
                bit_number <<= 1;
        }
        bit_set(caches[level].bitmap, bit_number);

        mtx_unlock(&caches_lock);
        return (void *)entry;
}

void free_pages(void *ptr, usize count)
{
        int level = get_level(count);
        if (level == CACHE_LEVELS) {
                page_debug("free_pages(%p, %zu): count too large!\n", ptr, count);
                return;
        }

        mtx_lock(&caches_lock);

        usize bit_number = get_bit_number(ptr, level);
#	if CFG_DEBUG_PAGE_ALLOCS
        if (!bit_tst(caches[level].bitmap, bit_number)) {
                kprintf("free_pages(%p, %zu): double free!\n", ptr, count);
                mtx_unlock(&caches_lock);
                return;
        }
#	endif

        bit_clr(caches[level].bitmap, bit_number);
        while (level < CACHE_LEVELS - 1) {
                ptr = try_join_buddy(ptr, level);
                if (ptr == nil)
                        break;
                level++;
                bit_number >>= 1;
                /* the coalesced block is free, so clear its bit one level up
                 * (it was set when the block got split in get_pages) */
                bit_clr(caches[level].bitmap, bit_number);
        }

        mtx_unlock(&caches_lock);
}

static inline usize get_bit_number(void *ptr, int level)
{
        return ((uintptr_t)ptr - (uintptr_t)kheap_start) >> (PAGE_SHIFT + level);
}

static inline void *split_buddy(void *ptr, int level)
{
#	if CFG_DEBUG_PAGE_ALLOCS
        if ((uintptr_t)ptr % (1 << (PAGE_SHIFT + level))) {
                kprintf("split_buddy(ptr = %p, level = %d): unaligned ptr!\n", ptr, level);
                return nil;
        }
        if (level < 1 || level >= CACHE_LEVELS) {
                kprintf("split_buddy(ptr = %p, level = %d): invalid level!\n", ptr, level);
                return nil;
        }
#	endif

        struct clist *high_buddy = ptr + (1 << (PAGE_SHIFT + level - 1));
        clist_add(&caches[level - 1].freelist, high_buddy);
        caches[level - 1].free_entries++;

        page_debug("split (%p:%p), lvl=%d\n", ptr, (void *)high_buddy, level);

        return ptr;
}
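/*
 * Illustrative helper (the name is ours, the allocator does not use it): the
 * buddy of a block on `level` is found by XORing its address with the entry
 * size on that level, which is exactly the trick try_join_buddy() below
 * relies on.  For example, assuming 4 KiB pages, the level 0 buddy of
 * kheap_start + 0x1000 is kheap_start itself, and vice versa, because
 * kheap_start is huge page aligned.
 */
static inline uintptr_t buddy_addr(uintptr_t ptr, int level)
{
        const usize entry_size = 1 << (PAGE_SHIFT + level);
        return ptr ^ entry_size;
}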
static void *try_join_buddy(void *ptr, int level)
{
        const usize entry_size = 1 << (PAGE_SHIFT + level);

#	if CFG_DEBUG_PAGE_ALLOCS
        if ((uintptr_t)ptr % entry_size) {
                kprintf("try_join_buddy(%p, %d): unaligned ptr!\n", ptr, level);
                return nil;
        }
        /* level must be < CACHE_LEVELS - 1 because you
         * can't join blocks on the topmost level */
        if (level >= CACHE_LEVELS - 1) {
                kprintf("try_join_buddy(%p, %d): level >= CACHE_LEVELS - 1!\n", ptr, level);
                return nil;
        }
#	endif

        /*
         * Test whether the buddy block is allocated and return nil if it is.
         * entry_size is a power of 2, so we can quickly get to the buddy block
         * with a cheap XOR of the address and the entry size without the need
         * for any if branches.
         */
        uintptr_t buddy = (uintptr_t)ptr ^ entry_size;
        usize buddy_bitnum = get_bit_number((void *)buddy, level);
        if (bit_tst(caches[level].bitmap, buddy_bitnum))
                return nil;

        page_debug("join (%p:%p), lvl=%d\n", ptr, (void *)buddy, level);

        /* If the buddy is free, we remove it from the freelist ... */
        clist_del((struct clist *)buddy);
        caches[level].free_entries--;

        /*
         * ... and add the coalesced block to the freelist one level above.
         * We use the same trick as above to get to the even (lower) block, just
         * that this time we're zeroing the bit out rather than flipping it.
         */
        uintptr_t even = (uintptr_t)ptr & ~entry_size;
        clist_add(&caches[level + 1].freelist, (struct clist *)even);
        caches[level + 1].free_entries++;

        return (void *)even;
}

/*
 * This file is part of GayBSD.
 * Copyright (c) 2021 fef.
 *
 * GayBSD is nonviolent software: you may only use, redistribute, and/or
 * modify it under the terms of the Cooperative Nonviolent Public License
 * (CNPL) as found in the LICENSE file in the source code root directory
 * or at ; either version 7
 * of the license, or (at your option) any later version.
 *
 * GayBSD comes with ABSOLUTELY NO WARRANTY, to the extent
 * permitted by applicable law.  See the CNPL for details.
 */