/* Copyright (C) 2021 fef <owo@fef.moe>. All rights reserved. */

#include <arch/atom.h>
#include <arch/multiboot.h>
#include <arch/vmparam.h>

#include <gay/linker.h>
#include <gay/mm.h>
#include <gay/vm/page.h>
#include <gay/systm.h>
#include <gay/util.h>

#include <inttypes.h>
#include <string.h>

/*
 * This file is funny.
 * Our job here seems simple at first glance: initialize the vm_page_array.
 * The catch is that we can't use the regular kernel memory allocators for
 * doing so, because those depend on vm_page_array. Classic chicken/egg stuff.
 * So, how do we allocate (and map!) memory for the array? Simple, by using a
 * completely separate page frame allocator that is so basic that it can't even
 * free pages again. That's not a problem though, because it doesn't need to.
 * Memory maps are created manually, which is very painful, but doable.
 * HOWEVER! This boot page frame allocator needs to allocate memory for keeping
 * track of which memory areas were already allocated and which ones are still
 * free, too. Areas might also have to be split, if the region we want to
 * allocate is not the exact size of the physical area. Therefore, we have
 * *another* allocator, which is basically the most primitive slab allocator in
 * existence. It uses a fixed-size "slab" (the `free_areas` array below), and
 * keeps track of which free areas are available.
 *
 * To sum up:
 * - The boot "slab" allocator hands out `struct free_area`s to ...
 * - the boot page frame allocator, which is used to set up ...
 * - the buddy page frame allocator, which serves as a backend to ...
 * - the kernel slab allocator.
 *
 * XXX the boot memory allocator could probably be moved to an architecture
 * independent file, because it is not really specific to the x86.
 */

struct vm_page *const vm_page_array = (vm_page_t)VM_PAGE_ARRAY_OFFSET;
#ifdef DEBUG
/* this gets updated in x86_paging_init() once we know how big the array is */
vm_page_t _vm_page_array_end = (vm_page_t)(VM_PAGE_ARRAY_OFFSET + VM_PAGE_ARRAY_LENGTH);
#endif

/**
 * @brief Memory area information for the boot page frame allocator.
 * The multiboot bootloader gives us an array of memory areas, and tells us
 * which ones are available and which aren't. We insert all available areas
 * into a circular list (`free_area_list`), and the boot page frame allocator
 * iterates over that list for getting memory.
 *
 * Also, this is probably one of the most unfortunately named structures in the
 * entire system, because instances of this structure need to be allocated and,
 * well, freed.
 */
struct free_area {
	struct clist link;
	vm_paddr_t start;
	vm_paddr_t end;
};
/** @brief This is essentially a very basic slab. */
static struct free_area free_areas[16];
/** @brief List of all free memory areas, ordered by ascending address */
static CLIST(free_area_list);
/**
 * @brief List of all the unused members in `free_areas`.
 * This is essentially a very basic slab freelist.
 */
static CLIST(free_area_freelist);

/**
 * @brief VERY early page frame allocator.
 *
 * Allocates `1 << log2` bytes of memory, aligned to at least its own size.
 *
 * @param log2 Binary logarithm of the allocation size. Must be at least `PAGE_SHIFT`.
 * @returns Physical address of the allocated region, or `BOOT_PMALLOC_ERR` on failure
 */
static vm_paddr_t __boot_pmalloc(u_int log2);
#define BOOT_PMALLOC_ERR (~0ul)
/** @brief Zero out a single page (required for page tables) */
static void __boot_clear_page(vm_paddr_t paddr);

/** @brief Initialize the members of `vm_page_array` within the given range. */
static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags);
/** @brief Add a new entry to the list of free memory areas. */
static void insert_free_area(struct mb2_mmap_entry *entry);
static void init_free_area_freelist(void);
static void print_mem_area(struct mb2_mmap_entry *entry);

/*
 * "Oh cool another deeply nested 100-liner that nobody understands"
 */
void x86_paging_init(struct mb2_tag_mmap *mmap)
{
	init_free_area_freelist();

	/*
	 * insert all free areas and find the end of physical memory
	 */
	struct mb2_mmap_entry *entry = mmap->entries;
	vm_paddr_t end = 0;
	kprintf("Memory map:\n");
	while ((void *)entry - (void *)mmap < mmap->tag.size) {
		vm_paddr_t entry_end = entry->addr + entry->len;
		end = max(end, entry_end);
		print_mem_area(entry);
		if (entry->type == MB2_MEMORY_AVAILABLE)
			insert_free_area(entry);
		entry = (void *)entry + mmap->entry_size;
	}

	/*
	 * allocate and map vm_page_array into virtual memory at VM_PAGE_ARRAY_OFFSET
	 * (this is gonna be a long one)
	 */
	struct vm_page *vm_page_array_end = vm_page_array + (end >> PAGE_SHIFT);
#ifdef DEBUG
	_vm_page_array_end = vm_page_array_end;
#endif
	void *map_pos = vm_page_array;
	usize remaining_size = (void *)vm_page_array_end - (void *)vm_page_array;
	remaining_size = align_ceil(remaining_size, PAGE_SIZE);
	kprintf("Mapping %zu bytes for vm_page_array\n", remaining_size);

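	/*
	 * Each iteration of the outermost loop fills in one PML4 entry; the
	 * nested loops then walk down the PDPT, PDT and PT levels for the
	 * virtual range covered by that entry. At the PDPT and PDT levels we
	 * first try to back the array with a 1 GB gigapage or a 2 MB hugepage
	 * respectively, and only fall back to the next smaller granularity
	 * when too little of the array remains or the allocation fails.
	 */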
	while (remaining_size != 0) {
		x86_pml4te_t *pml4te = X86_PML4TE(map_pos);
		vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT);
		KASSERT(pml4te_val != BOOT_PMALLOC_ERR);
		__boot_clear_page(pml4te_val);
		pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
		pml4te->val = pml4te_val;
		vm_flush();

		for (int pdpt_index = 0; pdpt_index < 512; pdpt_index++) {
			x86_pdpte_t *pdpte = X86_PDPTE(map_pos);
			vm_paddr_t pdpte_val;

			/* try allocating a 1 GB gigapage first */
			if (remaining_size >= 1 << X86_PDPT_SHIFT) {
				pdpte_val = __boot_pmalloc(X86_PDPT_SHIFT);
				/* CLion is warning about this condition being always true, but
				 * that is not the case. I've checked the disassembly with -O2,
				 * and clang is emitting the check. So it's fine, i guess. */
				if (pdpte_val != BOOT_PMALLOC_ERR) {
					pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
						   | __P_GLOBAL | __P_NOEXEC;
					pdpte->val = pdpte_val;
					remaining_size -= 1 << X86_PDPT_SHIFT;
					map_pos += 1 << X86_PDPT_SHIFT;
					if (remaining_size == 0)
						goto map_done;
					continue;
				}
			}

			/* couldn't use a gigapage, continue in hugepage steps */
			pdpte_val = __boot_pmalloc(PAGE_SHIFT);
			KASSERT(pdpte_val != BOOT_PMALLOC_ERR);
			__boot_clear_page(pdpte_val);
			pdpte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
			pdpte->val = pdpte_val;
			vm_flush();

			for (int pdt_index = 0; pdt_index < 512; pdt_index++) {
				x86_pdte_t *pdte = X86_PDTE(map_pos);
				vm_paddr_t pdte_val;

				/* try allocating a 2 MB hugepage first */
				if (remaining_size >= (1 << X86_PDT_SHIFT)) {
					pdte_val = __boot_pmalloc(X86_PDT_SHIFT);
					if (pdte_val != BOOT_PMALLOC_ERR) {
						pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
							  | __P_HUGE | __P_NOEXEC;
						pdte->val = pdte_val;
						remaining_size -= 1 << X86_PDT_SHIFT;
						map_pos += 1 << X86_PDT_SHIFT;
						if (remaining_size == 0)
							goto map_done;
						continue;
					}
				}

				/* couldn't use a hugepage, continue in page steps */
				pdte_val = __boot_pmalloc(PAGE_SHIFT);
				KASSERT(pdte_val != BOOT_PMALLOC_ERR);
				__boot_clear_page(pdte_val);
				pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
				pdte->val = pdte_val;
				vm_flush();

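				/*
				 * These 4 KiB leaf pages back vm_page_array itself rather
				 * than serving as page tables, so they don't need to go
				 * through __boot_clear_page().
				 */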
				for (int pt_index = 0; pt_index < 512; pt_index++) {
					x86_pte_t *pte = X86_PTE(map_pos);
					vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT);
					KASSERT(pte_val != BOOT_PMALLOC_ERR);
					pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
					pte->val = pte_val;

					remaining_size -= 1 << X86_PT_SHIFT;
					map_pos += 1 << X86_PT_SHIFT;
					if (remaining_size == 0)
						goto map_done;
				} /* end of PT loop */
			} /* end of PD loop */
		} /* end of PDP loop */
	} /* end of PML4 loop */

map_done:
	vm_flush();

	/*
	 * initialize the individual pages and calculate the usable RAM size
	 */
	vm_paddr_t prev_end = 0;
	vm_size_t available_ram = 0;
	struct free_area *cursor;
	clist_foreach_entry(&free_area_list, cursor, link) {
		/* list should have been ordered by ascending address */
		KASSERT(cursor->start >= prev_end);

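		/* mark any gap between the previous area and this one as reserved */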
		if (cursor->start != prev_end) {
			vm_paddr_t reserved_start = prev_end;
			vm_paddr_t reserved_end = cursor->start;
			init_page_range(reserved_start, reserved_end, PG_RESERVED);
		}

		init_page_range(cursor->start, cursor->end, 0);
		prev_end = cursor->end;
		available_ram += cursor->end - cursor->start;
	}

	kprintf("Available RAM: %"PRIdVM_SIZE" bytes\n", available_ram);
}

static struct free_area *alloc_free_area_entry(void)
{
	/* XXX this should pretty much never happen, but it would still be nice to
	 * have at least some sort of error recovery rather than giving up */
	if (clist_is_empty(&free_area_freelist))
		panic("Boot memory allocator has run out of free_areas");
	return clist_del_first_entry(&free_area_freelist, struct free_area, link);
}

static void free_free_area_entry(struct free_area *area)
{
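	/* poison the fields in debug builds so stale references are easier to spot */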
#ifdef DEBUG
	area->start = ~0ul;
	area->end = ~0ul;
#endif
	clist_add(&free_area_freelist, &area->link);
}

static void init_free_area_freelist(void)
{
	for (u_int i = 0; i < ARRAY_SIZE(free_areas); i++)
		clist_add(&free_area_freelist, &free_areas[i].link);
}

static void insert_free_area(struct mb2_mmap_entry *entry)
{
	vm_paddr_t start = align_ceil(entry->addr, PAGE_SIZE);
	vm_paddr_t end = align_floor(entry->addr + entry->len, PAGE_SIZE);
	if (start <= image_start_phys && end >= image_end_phys) {
		/*
		 * This is the area that the kernel image is loaded in, which we need
		 * to treat differently than all the others because it gets split up
		 * into two usable areas. Illustration (addresses are examples only):
		 *
		 * 0x01000000 ---------------------- end (high_end)
		 *     :        <free real estate>
		 * 0x00500000 ---------------------- image_end_phys (high_start)
		 *     :        <kernel code & data>
		 * 0x00400000 ---------------------- image_start_phys (low_end)
		 *     :        <free real estate>
		 * 0x00100000 ---------------------- start (low_start)
		 *
		 * (we silently assert that the image always spans only one region)
		 */
		vm_paddr_t low_start = start;
		vm_paddr_t low_end = align_floor(image_start_phys, PAGE_SIZE);
		if (low_start < low_end) {
			struct free_area *area = alloc_free_area_entry();
			area->start = low_start;
			area->end = low_end;
			clist_add(&free_area_list, &area->link);
		}

		vm_paddr_t high_start = align_ceil(image_end_phys, PAGE_SIZE);
		vm_paddr_t high_end = end;
		if (high_start < high_end) {
			struct free_area *area = alloc_free_area_entry();
			area->start = high_start;
			area->end = high_end;
			clist_add(&free_area_list, &area->link);
		}
	} else {
		struct free_area *area = alloc_free_area_entry();
		area->start = start;
		area->end = end;
		clist_add(&free_area_list, &area->link);
	}
}

static void init_page_range(vm_paddr_t start, vm_paddr_t end, u_int flags)
{
	KASSERT(start <= end);
	vm_page_t cursor = vm_page_array + (start >> PAGE_SHIFT);
	usize count = (end - start) >> PAGE_SHIFT;

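	/*
	 * flags == 0 requests the default state, in which every field of
	 * struct vm_page is zero, so we can take the memset() fast path
	 * instead of initializing each page individually.
	 */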
	if (flags == 0) {
		memset(cursor, 0, count * sizeof(*cursor));
	} else {
		while (count--) {
			atom_init(&cursor->count, 0);
			cursor->flags = flags;
			cursor->try_free = nil;
			cursor->extra = nil;
			cursor++;
		}
	}
}

/*
 * This works relatively simply, actually.
 * We iterate over the list of `struct free_area`s in reverse order because the
 * list is sorted by ascending physical address and i've decided that we prefer
 * using higher physical addresses for the page array. The first fit wins, and
 * all that's left is to split up the area and insert the top and bottom
 * remainder back into the list, if applicable.
 */
static vm_paddr_t __boot_pmalloc(u_int log2)
{
	const usize alloc_size = 1 << log2;
	KASSERT(log2 >= PAGE_SHIFT); /* never hand out less than a full page */

	struct free_area *cursor;
	clist_foreach_entry_rev(&free_area_list, cursor, link) {
		vm_paddr_t area_start = cursor->start;
		vm_paddr_t area_end = cursor->end;
		KASSERT(area_start < area_end);
		/* the areas tend to be aligned to greater sizes at their beginning */
		vm_paddr_t alloc_start = align_ceil(area_start, alloc_size);
		vm_paddr_t alloc_end = alloc_start + alloc_size;

		if (alloc_start >= area_start && alloc_end <= area_end) {
			/*
			 * Example with log2 == 21 (alloc_size == 0x00200000):
			 *
			 * 0x00500000 ------------------- area_end (not aligned)
			 *     :        <high_rest>
			 * 0x00400000 ------------------- alloc_end (aligned to alloc_size)
			 *     :        <allocated block>
			 * 0x00200000 ------------------- alloc_start (aligned to alloc_size)
			 *     :        <low_rest>
			 * 0x00100000 ------------------- area_start (not aligned)
			 */

			if (alloc_start > area_start) {
				struct free_area *low_rest = alloc_free_area_entry();
				low_rest->start = area_start;
				low_rest->end = alloc_start;
				clist_add(&cursor->link, &low_rest->link);
			}

			if (alloc_end < area_end) {
				struct free_area *high_rest = alloc_free_area_entry();
				high_rest->start = alloc_end;
				high_rest->end = area_end;
				clist_add_first(&cursor->link, &high_rest->link);
			}

			clist_del(&cursor->link);
			free_free_area_entry(cursor);
			return alloc_start;
		}
	}

	return BOOT_PMALLOC_ERR;
}

/*
 * It's really unfortunate that we have to zero a page before we can use it as
 * a page table, yet also need to reference it in the page table structures
 * (thereby mapping it into virtual memory) before we can zero it out.
 * This little hack temporarily maps the area at one PDP entry before KERNBASE
 * (meaning index 1022 of _pdp0), zeroes the area, and then unmaps it again.
 */
static void __boot_clear_page(vm_paddr_t paddr)
{
	vm_paddr_t pbase = align_floor(paddr, 1 << X86_PDPT_SHIFT);
	vm_offset_t offset = paddr - pbase;
	void *vbase = (void *)KERNBASE - (1 << X86_PDPT_SHIFT);
	x86_pdpte_t *pdpe = X86_PDPTE(vbase);
	pdpe->val = pbase | __P_PRESENT | __P_RW | __P_HUGE | __P_NOEXEC;
	vm_flush();
	memset(vbase + offset, 0, PAGE_SIZE);
	pdpe->flags.present = false;
	vm_flush();
}

static void print_mem_area(struct mb2_mmap_entry *entry)
{
	const char *name;
	switch (entry->type) {
	case MB2_MEMORY_AVAILABLE:
		name = "Available";
		break;
	case MB2_MEMORY_RESERVED:
		name = "Reserved";
		break;
	case MB2_MEMORY_ACPI_RECLAIMABLE:
		name = "ACPI (reclaimable)";
		break;
	case MB2_MEMORY_NVS:
		name = "Non-Volatile Storage";
		break;
	case MB2_MEMORY_BADRAM:
		name = "Bad RAM";
		break;
	default:
		/* avoid passing an uninitialized pointer to kprintf() below */
		name = "Unknown";
		break;
	}

	kprintf(" [0x%016"PRIxVM_PADDR"-0x%016"PRIxVM_PADDR"] %s\n",
		entry->addr, entry->addr + entry->len - 1, name);
}