mm: refactor page allocator
This is hopefully the last time in a while that something in the mm subsystem needs a refactor this large. There are two main changes:

- The page frame allocator returns a vm_page_t rather than a virtual address.
- Data for the slab allocator is now stored in struct vm_page, which means there is no overhead in the slab itself so the space is used in a more efficient manner.
parent f8a85a1541
commit b4ed811920
16 changed files with 369 additions and 344 deletions
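For orientation, a minimal sketch of how the reworked interface fits together, using only names that appear in the diff below (`page_alloc()`, `pg2pfn()`, `pfn2vaddr()`, `page_free()`, `nil`, and the two headers included by the changed files); the flag name `M_KERN` is an assumption and may not match the actual tree:

```c
/* Sketch only -- not code from this commit; M_KERN is an assumed flag name. */
#include <gay/mm.h>      /* page_alloc(), page_free(), enum mflags */
#include <gay/vm/page.h> /* vm_page_t, pg2pfn(), pfn2vaddr() */

static void example_alloc(void)
{
	/* order 1 requests a physically contiguous group of (1 << 1) = 2 pages;
	 * the allocator now returns a vm_page_t instead of a virtual address */
	vm_page_t page = page_alloc(1, M_KERN);
	if (!page)            /* "evaluates false on failure" per the new doc comment */
		return;

	/* a virtual address can still be derived through the direct map */
	void *vaddr = pfn2vaddr(pg2pfn(page));
	(void)vaddr;          /* ... use the zero-initialized pages ... */

	page_free(page);      /* freeing also goes through the vm_page_t now */
}
```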
|
@ -116,7 +116,7 @@ ENTRY(_setup)
|
|||
* because the page directory is being interpreted as a page table.
|
||||
* This allows us to manipulate the table while we are in virtual memory.
|
||||
*/
|
||||
movl $(PADDR(pd0) + 0x003), PADDR(pd0) + 1023 * 4 /* 0xffc00000 */
|
||||
movl $(PADDR(pd0) + 0x013), PADDR(pd0) + 1023 * 4 /* 0xffc00000 */
|
||||
|
||||
/* set the Page Size Extensions (4) and Page Global Enable (7) bits in cr4 */
|
||||
mov %cr4, %ecx
|
||||
|
|
|
@ -160,11 +160,11 @@ ENTRY(_setup)
|
|||
movl $0x00000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE))
|
||||
movl $0x40000083, PADDR(_pdpt0 + PDPT_OFFSET(KERNBASE + 0x40000000))
|
||||
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1), huge (7) */
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t) /* present (0), write (1) */
|
||||
movl $PADDR(_pdpt0 + 0x003), PADDR(_pml4t + PML4T_OFFSET(KERNBASE))
|
||||
|
||||
/* map the PML4 to itself */
|
||||
movl $PADDR(_pml4t + 0x003), PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET))
|
||||
/* map the PML4 to itself (set the cache disable bit (4)) */
|
||||
movl $PADDR(_pml4t + 0x013), PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET))
|
||||
movb $0x80, PADDR(_pml4t + PML4T_OFFSET(X86_PMAP_OFFSET) + 7) /* NX bit */
|
||||
|
||||
/*
|
||||
|
|
|
@ -14,10 +14,6 @@
|
|||
.code32
|
||||
.section .multiboot.text, "ax", @progbits
|
||||
|
||||
/*
|
||||
* miscellaneous utility routines
|
||||
*/
|
||||
|
||||
/* void _x86_write_tss_base(u64 *gdt_entry, struct x86_tss *tss) */
|
||||
ENTRY(_x86_write_tss_base)
|
||||
movl 4(%esp), %edi
|
||||
|
|
|
@ -68,8 +68,8 @@ static inline void vm_flush(void)
|
|||
{
|
||||
register_t tmp;
|
||||
__asm__ volatile(
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
: "=r"(tmp)
|
||||
:
|
||||
: "memory"
|
||||
|
|
arch/x86/include/arch/string.h | 5 (new file)
|
@ -0,0 +1,5 @@
|
|||
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <arch/string/memset.h>
|
|
@ -27,14 +27,33 @@ static void register_area(struct mb2_mmap_entry *entry)
|
|||
vm_paddr_t end = start + entry->len;
|
||||
|
||||
if (start >= DMA_LIMIT) {
|
||||
/*
|
||||
* --------------------- end
|
||||
* MM_ZONE_NORMAL
|
||||
* --------------------- start
|
||||
* <not part of entry>
|
||||
* --------------------- DMA_LIMIT
|
||||
*/
|
||||
__boot_register_mem_area(start, end, MM_ZONE_NORMAL);
|
||||
} else if (start < DMA_LIMIT && end > DMA_LIMIT) {
|
||||
} else if (end > DMA_LIMIT) {
|
||||
/*
|
||||
* ----------------- end
|
||||
* MM_ZONE_NORMAL
|
||||
* ----------------- DMA_LIMIT
|
||||
* MM_ZONE_DMA
|
||||
* ----------------- start
|
||||
*/
|
||||
__boot_register_mem_area(start, DMA_LIMIT, MM_ZONE_DMA);
|
||||
__boot_register_mem_area(DMA_LIMIT, end, MM_ZONE_NORMAL);
|
||||
} else if (start < DMA_LIMIT && end <= DMA_LIMIT) {
|
||||
__boot_register_mem_area(start, end, MM_ZONE_DMA);
|
||||
} else {
|
||||
panic("congratulations, you reached an unreachable branch");
|
||||
/*
|
||||
* --------------------- DMA_LIMIT
|
||||
* <not part of entry>
|
||||
* --------------------- end
|
||||
* MM_ZONE_DMA
|
||||
* --------------------- start
|
||||
*/
|
||||
__boot_register_mem_area(start, end, MM_ZONE_DMA);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,8 +87,8 @@ static void map_direct_area(vm_paddr_t end)
|
|||
|
||||
for (int pdpti = 0; pdpti < 512; pdpti++) {
|
||||
x86_pdpte_t *pdpte = X86_PDPTE(vpos);
|
||||
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_GLOBAL
|
||||
| __P_HUGE | __P_NOEXEC;
|
||||
pdpte->val = ppos | __P_PRESENT | __P_RW | __P_NOCACHE | __P_WRITE_THROUGH
|
||||
| __P_GLOBAL | __P_HUGE | __P_NOEXEC;
|
||||
|
||||
ppos += GIGAPAGE_SIZE;
|
||||
vpos += GIGAPAGE_SIZE;
|
||||
|
@ -129,7 +148,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
vm_paddr_t pml4te_val = __boot_pmalloc(PAGE_SHIFT, MM_ZONE_NORMAL);
|
||||
panic_if(pml4te_val == BOOT_PMALLOC_ERR, "cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pml4te_val);
|
||||
pml4te_val |= __P_PRESENT | __P_RW | __P_NOCACHE | __P_GLOBAL | __P_NOEXEC;
|
||||
pml4te_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pml4te->val = pml4te_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -145,8 +164,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
* that is not the case. I've checked the disassembly with -O2,
|
||||
* and clang is emitting the check. So it's fine, i guess. */
|
||||
if (pdpte_val != BOOT_PMALLOC_ERR) {
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_HUGE | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_HUGE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte->val = pdpte_val;
|
||||
map_pos += GIGAPAGE_SIZE;
|
||||
if (map_pos >= map_end)
|
||||
|
@ -160,7 +179,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
panic_if(pdpte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pdpte_val);
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_NOCACHE | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pdpte->val = pdpte_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -173,8 +192,8 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
if (map_end - map_pos >= HUGEPAGE_SIZE) {
|
||||
pdte_val = __boot_pmalloc(X86_PDT_SHIFT, MM_ZONE_NORMAL);
|
||||
if (pdte_val != BOOT_PMALLOC_ERR) {
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_HUGE | __P_NOEXEC;
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL
|
||||
| __P_HUGE | __P_NOEXEC;
|
||||
pdte->val = pdte_val;
|
||||
map_pos += HUGEPAGE_SIZE;
|
||||
if (map_pos >= map_end)
|
||||
|
@ -188,8 +207,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
panic_if(pdte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
__boot_clear_page(pdpte_val);
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pdte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pdte->val = pdte_val;
|
||||
vm_flush();
|
||||
|
||||
|
@ -199,8 +217,7 @@ void x86_paging_init(struct mb2_tag_mmap *mmap)
|
|||
vm_paddr_t pte_val = __boot_pmalloc(X86_PT_SHIFT, MM_ZONE_NORMAL);
|
||||
panic_if(pte_val == BOOT_PMALLOC_ERR,
|
||||
"cannot reserve memory for vm_page_array");
|
||||
pte_val |= __P_PRESENT | __P_RW | __P_NOCACHE
|
||||
| __P_GLOBAL | __P_NOEXEC;
|
||||
pte_val |= __P_PRESENT | __P_RW | __P_GLOBAL | __P_NOEXEC;
|
||||
pte->val = pte_val;
|
||||
|
||||
map_pos += PAGE_SIZE;
|
||||
|
@ -228,8 +245,10 @@ void __boot_clear_page(vm_paddr_t paddr)
|
|||
vm_paddr_t pbase = align_floor(paddr, 1 << X86_PDPT_SHIFT);
|
||||
vm_offset_t offset = paddr - pbase;
|
||||
void *vbase = (void *)KERNBASE - (1 << X86_PDPT_SHIFT);
|
||||
x86_pdpte_t *pdpe = X86_PDPTE(vbase);
|
||||
pdpe->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
|
||||
x86_pdpte_t *pdpte = X86_PDPTE(vbase);
|
||||
x86_pdpte_t old_pdpte = *pdpte;
|
||||
old_pdpte.val = pdpte->val;
|
||||
pdpte->val = pbase | __P_PRESENT | __P_RW | __P_NOCACHE | __P_HUGE | __P_NOEXEC;
|
||||
vm_flush();
|
||||
memset64(vbase + offset, 0, PAGE_SIZE);
|
||||
pdpe->val = 0;
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
/* Copyright (C) 2021,2022 fef <owo@fef.moe>. All rights reserved. */
|
||||
|
||||
#include <arch/cpufunc.h>
|
||||
#include <arch/page.h>
|
||||
#include <arch/segment.h>
|
||||
#include <arch/trap.h>
|
||||
|
||||
#include <gay/cdefs.h>
|
||||
#include <gay/kprintf.h>
|
||||
#include <gay/ktrace.h>
|
||||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
#include <gay/vm/page.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Initial Page Directory Pointer Table and Page Map Level 4 Table for the
|
||||
|
@ -48,7 +54,10 @@ void x86_isr_page_fault(trap_frame_t *frame, u32 error_code)
|
|||
kprintf("\n########## B O N K ##########\n");
|
||||
kprintf("Illegal %s %s%s address %p!\n", space, rwx, present, address);
|
||||
print_regs(frame);
|
||||
panic("Page fault");
|
||||
/* print a stack trace if this came from kernel space */
|
||||
if (frame->hw_frame.cs == X86_64_KERN_CS)
|
||||
ktrace_print_from((void *)frame->rbp);
|
||||
panic_notrace("Page fault");
|
||||
}
|
||||
|
||||
vm_paddr_t vtophys(void *virt)
|
||||
|
@ -79,3 +88,14 @@ vm_paddr_t vtophys(void *virt)
|
|||
vm_paddr_t phys_base = pte->val & X86_PMAP_MASK;
|
||||
return phys_base + ((vm_paddr_t)virt % (1 << X86_PT_SHIFT));
|
||||
}
|
||||
|
||||
void page_clear(vm_page_t page)
|
||||
{
|
||||
register_t cpuflags = intr_disable();
|
||||
page_lock(page);
|
||||
u64 *dest = DMAP_START + (pg2pfn(page) << PAGE_SHIFT);
|
||||
usize nbyte = (usize)1 << (pga_order(page) + PAGE_SHIFT);
|
||||
memset64(dest, 0, nbyte);
|
||||
page_unlock(page);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
* address `0xfffff000-0xffffffff`, then points to the page directory itself.
|
||||
*/
|
||||
|
||||
#include <arch/cpufunc.h>
|
||||
#include <arch/page.h>
|
||||
#include <arch/trap.h>
|
||||
|
||||
|
@ -19,6 +20,7 @@
|
|||
#include <gay/mm.h>
|
||||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
#include <gay/vm/page.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
|
@ -275,14 +277,12 @@ uintptr_t vtophys(void *virt)
|
|||
return phys;
|
||||
}
|
||||
|
||||
void vm_flush(void)
|
||||
void page_clear(vm_page_t page)
|
||||
{
|
||||
register_t tmp;
|
||||
__asm__ volatile(
|
||||
" mov %%cr3, %0 \n"
|
||||
" mov %0, %%cr3 \n"
|
||||
: "=r"(tmp)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
register_t cpuflags = intr_disable();
|
||||
page_lock(page);
|
||||
u32 *dest = DMAP_START + (pg2pfn(page) << PAGE_SHIFT);
|
||||
usize nbyte = (usize)1 << (pga_order(page) + PAGE_SHIFT);
|
||||
memset32(dest, 0, nbyte);
|
||||
page_unlock(page);
|
||||
}
|
||||
|
|
|
@ -31,20 +31,24 @@ It also kind of makes you appreciate the sheer vastness of 64-bit address space.
|
|||
|
||||
Kernel space addresses start at `0xffff800000000000` because the MMU "only"
|
||||
supports 48-bit linear addresses.
|
||||
The way i've understood it, the Intel spec says the 17 MSBs of virtual
|
||||
addresses must be all the same, but other than that are ignored.
|
||||
The way i've understood it, the Intel spec says bits 63:48 of virtual
|
||||
addresses must be copies of bit 47, but other than that are ignored.
|
||||
So, as far as the MMU is concerned, the huge hole doesn't even exist:
|
||||
Userspace ranges from `0x000000000000~0x7fffffffffff`,
|
||||
and everything belonging to the kernel from `0x800000000000~0xffffffffffff`
|
||||
(note how the leading 0's/f's are missing, these are 48-bit values).
|
||||
|
||||
The linear physical memory is a direct mapping of physical RAM, which is
|
||||
required because `kmalloc()` needs to be able to allocate *physically*
|
||||
contiguous memory for DMA transfers.
|
||||
required because `kmalloc()` and friends need to be able to allocate
|
||||
*physically* contiguous memory for DMA transfers and i don't have the energy
|
||||
to update kernel page maps every time the kernel needs a new page.
|
||||
|
||||
The kernel image itself is loaded into physical memory at `0x00400000` by
|
||||
default, and the entire low 2 GB of physical memory are statically mapped to
|
||||
the end of virtual memory (-2 GB). That way, we can use `-mcmodel=kernel`,
|
||||
which prevents the compiler from emitting raw address loads and absolute jumps
|
||||
(this is significantly faster).
|
||||
All kernel code resides within the -2 GB region.
|
||||
All kernel code resides within the -2 GB region.
|
||||
|
||||
The `vm_page_array`, which keeps track of what each individual page is used for,
|
||||
starts directly at the beginning of the kernel area at -2 TB.
|
||||
|
|
|
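The canonical-address rule restated in the documentation hunk above (bits 63:48 must be copies of bit 47, so only 48 bits are significant) can be expressed as a tiny check. This is a generic illustration of that rule, not code from this repository:

```c
#include <stdbool.h>
#include <stdint.h>

/* x86-64 with 4-level paging: a virtual address is canonical when bits 63:47
 * (the 17 most significant bits) are either all 0 (user half) or all 1
 * (kernel half). */
static bool is_canonical(uint64_t vaddr)
{
	uint64_t upper = vaddr >> 47;
	return upper == 0 || upper == 0x1ffff;
}

/* 0x00007fffffffffff -> canonical (top of the user half)
 * 0xffff800000000000 -> canonical (start of the kernel half)
 * 0x0000800000000000 -> not canonical (inside the hole)      */
```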
@ -88,6 +88,9 @@
|
|||
/** @brief Mark the symbol as used, even if it really isn't. */
|
||||
#define __used __attribute__(( used ))
|
||||
|
||||
/** @brief Tell the compiler that a struct member is intentionally unused. */
|
||||
#define __unused __attribute__(( unused ))
|
||||
|
||||
/** @brief Symbol may be silently redefined. */
|
||||
#define __weak __attribute__(( weak ))
|
||||
|
||||
|
|
|
@ -6,16 +6,19 @@
|
|||
* @file include/gay/mm.h
|
||||
* @brief Header for dynamic memory management
|
||||
*
|
||||
* To avoid possible confusion (and Not break 32-bit systems, even though they
|
||||
* aren't really supported anyway), physical memory addresses always use type
|
||||
* `vm_paddr_t` and virtual ones are `void *`. This should give us at least
|
||||
* some type of compiler warning if they are accidentally mixed up.
|
||||
* To avoid possible confusion (and Not break systems where virtual addresses
|
||||
* are less wide than physical ones, like IA-32 with PAE), physical memory
|
||||
* addresses always use type `vm_paddr_t` and virtual ones are `void *`.
|
||||
* This should give us at least some type of compiler warning if they are
|
||||
* accidentally mixed up.
|
||||
*
|
||||
* GayBSD uses a classic slab algorithm for its own data structures, which is
|
||||
* backed by a buddy page frame allocator. The latter is also used for getting
|
||||
* bigger areas of memory that are not physically contiguous (for regular user
|
||||
* allocations). The entire physical memory is mapped statically in the range
|
||||
* `DMAP_START - DMAP_END`.
|
||||
* `DMAP_START - DMAP_END` in order to make clearing pages without a specific
|
||||
* mapping easier, even though regular code outside the mm subsystem should be
|
||||
* completely oblivious to this fact.
|
||||
*
|
||||
* Memory is split up into (currently) two zones: `MM_ZONE_NORMAL` and
|
||||
* `MM_ZONE_DMA`. As their names suggest, the former is for general purpose
|
||||
|
@ -23,6 +26,10 @@
|
|||
* Zones are further divided into pools, each of which hold a list of groups of
|
||||
* free pages. The size of these page groups is determined by the pool's order,
|
||||
* where the pool of order `n` holds groups of `1 << n` pages.
|
||||
*
|
||||
* The mm subsystem needs to allocate memory for initializing itself.
|
||||
* Therefore, there is an additional boot page frame allocator, which gets the
|
||||
* free areas from architecture dependent code (`arch/mm/.../init.c`).
|
||||
*/
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
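To put numbers on the pool orders described in the header comment above: the pool of order `n` holds groups of `1 << n` pages, so the block sizes scale as below. `ORDER_BYTES` is a made-up name for illustration only; `PAGE_SHIFT` is the macro used elsewhere in this diff, and 4 KiB base pages are an assumption.

```c
/* illustrative only; the real tree uses its own ORDER_SIZE()/ORDER_SHIFT() macros */
#define ORDER_BYTES(order) ((usize)1 << ((order) + PAGE_SHIFT))
/* with 4 KiB pages: ORDER_BYTES(0) == 4 KiB, ORDER_BYTES(3) == 32 KiB, ORDER_BYTES(10) == 4 MiB */
```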
@ -38,17 +45,24 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define _M_ZONE_NORMAL 0
|
||||
#define _M_ZONE_DMA 1
|
||||
#define _M_ZONE_INDEX(flags) ((flags) & 1)
|
||||
#define _M_ZONE_DMA 0
|
||||
#define _M_ZONE_NORMAL 1
|
||||
/* we use 2 bits because there are likely gonna be additional zones in the future */
|
||||
#define _M_ZONE_INDEX(flags) ((flags) & 3)
|
||||
|
||||
#define _M_EMERG (1 << 1)
|
||||
#define _M_NOWAIT (1 << 2)
|
||||
#define _M_EMERG (1 << 2)
|
||||
#define _M_NOWAIT (1 << 3)
|
||||
|
||||
#ifndef _HAVE_VM_PAGE_T
|
||||
#define _HAVE_VM_PAGE_T 1
|
||||
struct vm_page;
|
||||
typedef struct vm_page *vm_page_t;
|
||||
#endif
|
||||
|
||||
enum mm_zone_type {
|
||||
MM_ZONE_NORMAL = _M_ZONE_NORMAL,
|
||||
MM_ZONE_DMA = _M_ZONE_DMA,
|
||||
MM_NR_ZONES
|
||||
MM_ZONE_NORMAL = _M_ZONE_NORMAL,
|
||||
MM_NR_ZONES = 2
|
||||
};
|
||||
|
||||
/** @brief Boot memory area. */
|
||||
|
@ -76,7 +90,7 @@ struct mm_zone {
|
|||
/** @brief Thresholds for OOM behavior */
|
||||
struct {
|
||||
/** @brief Minimum number of pages reserved for emergency allocations */
|
||||
u_long emerg;
|
||||
long emerg;
|
||||
} thrsh;
|
||||
struct mm_pool pools[MM_NR_ORDERS];
|
||||
struct clist _bmem_areas; /* -> struct _bmem_area */
|
||||
|
@ -92,7 +106,9 @@ struct mm_zone {
|
|||
extern struct mm_zone mm_zones[MM_NR_ZONES]; /* kernel/mm/page.c */
|
||||
|
||||
/**
|
||||
* @brief Memory allocation flags passed to `kmalloc()`.
|
||||
* @brief Memory allocation flags commonly used by all allocators.
|
||||
* All of them are eventually passed down to `page_alloc()`, the physical page
|
||||
* frame allocator,
|
||||
*/
|
||||
enum mflags {
|
||||
/** @brief Use emergency memory reserves if necessary */
|
||||
|
@ -107,6 +123,9 @@ enum mflags {
|
|||
M_DMA = _M_ZONE_DMA,
|
||||
};
|
||||
|
||||
/** @brief Initialize the slab allocator. */
|
||||
void kmalloc_init(void);
|
||||
|
||||
/**
|
||||
* @brief Allocate memory.
|
||||
*
|
||||
|
@ -125,33 +144,6 @@ void *kmalloc(size_t size, enum mflags flags) __malloc_like __alloc_size(1);
|
|||
*/
|
||||
void kfree(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Flags for the paging structures.
|
||||
*
|
||||
* The macros with two underscores in front of them are defined in `arch/page.h`
|
||||
* and match the respective bit positions in the platform's native hardware
|
||||
* layout for better performance (no shifting around required).
|
||||
*/
|
||||
enum pflags {
|
||||
P_PRESENT = __P_PRESENT, /**< @brief Page exists */
|
||||
P_RW = __P_RW, /**< @brief Page is writable */
|
||||
P_USER = __P_USER, /**< @brief Page is accessible from ring 3 */
|
||||
P_ACCESSED = __P_ACCESSED, /**< @brief Page has been accessed */
|
||||
P_DIRTY = __P_DIRTY, /**< @brief Page has been written */
|
||||
P_GLOBAL = __P_GLOBAL, /**< @brief The entry survives `vm_flush()` */
|
||||
P_NOCACHE = __P_NOCACHE, /**< @brief The TLB won't cache this entry */
|
||||
P_SLAB = __P_SLAB, /**< @brief Page is used by the slab allocator */
|
||||
P_NOSLEEP = __P_ATOMIC, /**< @brief Page is atomic */
|
||||
#ifdef __HAVE_HUGEPAGES
|
||||
/** @brief This page is `HUGEPAGE_SIZE` bytes long, rather than `PAGE_SIZE` */
|
||||
P_HUGE = __P_HUGE,
|
||||
#endif
|
||||
#ifdef __HAVE_NOEXEC
|
||||
/** @brief No instructions can be fetched from this page */
|
||||
P_NOEXEC = __P_NOEXEC,
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Initialize the buddy page frame allocator.
|
||||
* This is only called once, from the arch dependent counterpart after it has
|
||||
|
@ -161,11 +153,22 @@ enum pflags {
|
|||
void paging_init(vm_paddr_t phys_end);
|
||||
|
||||
/**
|
||||
* @brief Allocate a contiguous region in physical memory.
|
||||
* @brief Allocate a physically contiguous region and initialize it with zeroes.
|
||||
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long.
|
||||
*
|
||||
* **The pages are not initialized.**
|
||||
* If you want zeroed pages, use `get_zero_pages()`.
|
||||
* @param order Order of magnitude (as in `1 << order` pages)
|
||||
* @param flags How to allocate
|
||||
* @return The page group that was allocated (evaluates false on failure)
|
||||
*/
|
||||
vm_page_t page_alloc(u_int order, enum mflags flags) __malloc_like;
|
||||
|
||||
/**
|
||||
* @brief Allocate and map a physically contiguous region in memory.
|
||||
* The returned region will be `(1 << order) * PAGE_SIZE` bytes long,
|
||||
* and initialized with zeroes.
|
||||
*
|
||||
* If filling the page with zeroes takes too much time, use `page_alloc()`.
|
||||
* But only if you're careful and it's not an allocation for user space.
|
||||
*
|
||||
* @param order Order of magnitude (as in `1 << order` pages)
|
||||
* @param flags How to allocate
|
||||
|
@ -173,12 +176,11 @@ void paging_init(vm_paddr_t phys_end);
|
|||
* or `nil` if the allocation failed
|
||||
*/
|
||||
void *get_pages(u_int order, enum mflags flags) __malloc_like;
|
||||
/** @brief Alias for `get_pages(0, flags)`. */
|
||||
void *get_page(enum mflags flags) __malloc_like;
|
||||
void *get_zero_pages(u_int order, enum mflags flags) __malloc_like;
|
||||
void *get_zero_page(enum mflags flags) __malloc_like;
|
||||
|
||||
void free_pages(void *ptr);
|
||||
#define free_page(ptr) free_pages(ptr)
|
||||
/** @brief Free a page from `page_alloc()`. */
|
||||
void page_free(vm_page_t page);
|
||||
|
||||
/**
|
||||
* @brief Initialize the slab caches.
|
||||
|
|
|
@ -11,18 +11,6 @@
|
|||
#include <gay/systm.h>
|
||||
#include <gay/types.h>
|
||||
|
||||
/*
|
||||
* I'm trying really hard to keep the size of struct vm_page a power of two
|
||||
* on LP64 systems, because that way we can quickly get to the page frame number
|
||||
* by shifting the byte offset of the vm_page_t in vm_page_array to the right
|
||||
* rather than doing a costly divide instruction (or store the page frame number
|
||||
* within the structure itself, which takes up precious space).
|
||||
*
|
||||
* There is insane pressure on the size of this structure, because a typical
|
||||
* system will have millions of instances of it. Every additional byte makes
|
||||
* a significant difference in memory management overhead.
|
||||
*/
|
||||
|
||||
union vm_page_attr {
|
||||
int _val;
|
||||
struct {
|
||||
|
@ -49,6 +37,9 @@ union vm_page_attr {
|
|||
|
||||
typedef union vm_page_attr vm_page_attr_t;
|
||||
|
||||
/* defined in kernel/mm/slab.c */
|
||||
struct slab_pool;
|
||||
|
||||
/**
|
||||
* @brief Stores information about a single page in physical memory.
|
||||
* There is exactly one of these for every physical page, no matter what that
|
||||
|
@ -59,18 +50,31 @@ struct vm_page {
|
|||
atom_t count;
|
||||
/** @brief Page attributes, use the macros below to access this */
|
||||
atom_t attr;
|
||||
/** @brief If the page is free, this is its freelist. */
|
||||
struct clist link;
|
||||
/** @brief Page frame number */
|
||||
u_long pfn;
|
||||
/**
|
||||
* @brief Optional extra data pointer, reserved for private use.
|
||||
* The current owner of the page may use this to track the underlying
|
||||
* object in memory (or pretty much anything else), for example the
|
||||
* `struct slab` if this page is currently used by the slab allocator.
|
||||
* @brief If the page is free, this is its freelist.
|
||||
* If the page is used in the slab allocator, this is the list for the
|
||||
* pool in which it currently resides.
|
||||
*/
|
||||
void *extra;
|
||||
struct clist link;
|
||||
union {
|
||||
struct {
|
||||
void **freelist;
|
||||
struct slab_pool *pool;
|
||||
u_int entry_size;
|
||||
u_int free_count;
|
||||
} slab;
|
||||
};
|
||||
};
|
||||
|
||||
#define INVALID_PAGE nil
|
||||
#define SLAB(page) (&(page)->slab)
|
||||
|
||||
#ifndef _HAVE_VM_PAGE_T
|
||||
#define _HAVE_VM_PAGE_T 1
|
||||
typedef struct vm_page *vm_page_t;
|
||||
#endif
|
||||
|
||||
/** @brief Array of every single page in physical memory, indexed by page frame number. */
|
||||
extern struct vm_page *const vm_page_array;
|
||||
|
@ -82,6 +86,9 @@ extern vm_page_t _vm_page_array_end;
|
|||
#define PGADDR_ASSERT(x) ({})
|
||||
#endif
|
||||
|
||||
/** @brief Fill a page with zeroes (size depends on the current page order). */
|
||||
void page_clear(vm_page_t page);
|
||||
|
||||
static inline u8 pga_order(vm_page_t page)
|
||||
{
|
||||
union vm_page_attr attr = { ._val = atom_read(&page->attr) };
|
||||
|
@ -211,7 +218,7 @@ __pure2
|
|||
static inline u_long pg2pfn(vm_page_t page)
|
||||
{
|
||||
PGADDR_ASSERT(page < _vm_page_array_end);
|
||||
return page - vm_page_array;
|
||||
return page->pfn;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -224,7 +231,8 @@ static inline vm_page_t vaddr2pg(void *vaddr)
|
|||
{
|
||||
PGADDR_ASSERT(vaddr >= DMAP_START && vaddr < (void *)_vm_page_array_end);
|
||||
uintptr_t offset = (uintptr_t)vaddr - DMAP_OFFSET;
|
||||
return &vm_page_array[offset >> PAGE_SHIFT];
|
||||
struct vm_page *page = &vm_page_array[offset >> PAGE_SHIFT];
|
||||
return page - page->pfn % (1 << pga_order(page));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -254,7 +262,7 @@ static inline vm_page_t paddr2pg(vm_paddr_t paddr)
|
|||
{
|
||||
vm_page_t page = vm_page_array + (paddr >> PAGE_SHIFT);
|
||||
PGADDR_ASSERT(page < _vm_page_array_end);
|
||||
return page;
|
||||
return page - page->pfn % (1 << pga_order(page));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -267,19 +275,3 @@ static inline void *pfn2vaddr(u_long pfn)
|
|||
PGADDR_ASSERT(&vm_page_array[pfn] < _vm_page_array_end);
|
||||
return DMAP_START + (pfn << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to be careful in this macro, because only the first page in the
|
||||
* order group has the correct order set. So we can only read it once at
|
||||
* the beginning of the loop, since the page pointer is being updated.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Iterate over every page in its order group.
|
||||
*
|
||||
* @param page The first `vm_page_t` in the group.
|
||||
*/
|
||||
#define vm_page_foreach_in_order(page) \
|
||||
for (int __i = 1 << pga_order(page); \
|
||||
__i >= 0; \
|
||||
__i = ({ ++(page); --__i; }))
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
#include <limits.h>
|
||||
|
||||
static struct _bmem_area _bmem_area_cache[16];
|
||||
static struct _bmem_area _bmem_area_cache[128];
|
||||
static CLIST(bmem_area_freelist);
|
||||
|
||||
#ifdef DEBUG
|
||||
|
@ -37,6 +37,7 @@ static void free_bmem_area(struct _bmem_area *area)
|
|||
clist_add(&bmem_area_freelist, &area->link);
|
||||
}
|
||||
|
||||
/* insert an area when we already know there are no intersections with reserved memory */
|
||||
static void insert_area_unsafe(vm_paddr_t start, vm_paddr_t end, enum mm_zone_type zone_type)
|
||||
{
|
||||
KASSERT((start % PAGE_SIZE) == 0);
|
||||
|
|
kernel/mm/page.c | 246
|
@ -66,19 +66,18 @@ static inline u_int paddr_find_order(vm_paddr_t addr)
|
|||
}
|
||||
|
||||
/** @brief Claim all free pages in one of the memory areas from the boot allocator. */
|
||||
static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area)
|
||||
static inline void claim_bmem_area(struct mm_zone *zone, const struct _bmem_area *area)
|
||||
{
|
||||
vm_paddr_t start = area->start;
|
||||
vm_paddr_t end = area->end;
|
||||
vm_paddr_t pos = start;
|
||||
vm_size_t nr_pages = end - start / PAGE_SIZE;
|
||||
latom_add(&zone->free_count, (long)nr_pages);
|
||||
u_int order = paddr_find_order(area->start);
|
||||
while (area->start + ORDER_SIZE(order) > area->end)
|
||||
order--;
|
||||
|
||||
struct vm_page *page = &vm_page_array[start >> PAGE_SHIFT];
|
||||
u_int order = paddr_find_order(start);
|
||||
/* make sure the boot memory allocator cannot under any circumstances hand
|
||||
* out pages from this area anymore, even though that should be unnecessary */
|
||||
clist_del(&area->link);
|
||||
struct vm_page *const start = paddr2pg(area->start);
|
||||
struct vm_page *const end = paddr2pg(area->end);
|
||||
struct vm_page *pos = start;
|
||||
|
||||
const vm_size_t nr_pages = end->pfn - start->pfn;
|
||||
latom_add(&zone->free_count, (long)nr_pages);
|
||||
|
||||
/*
|
||||
* We want to insert pages at the highest possible order. However, the
|
||||
|
@ -90,15 +89,21 @@ static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area
|
|||
* subsequently lower the order again.
|
||||
*/
|
||||
while (pos < end) {
|
||||
struct mm_pool *pool = &zone->pools[order];
|
||||
clist_add(&pool->freelist, &page->link);
|
||||
struct mm_pool *const pool = &zone->pools[order];
|
||||
clist_add(&pool->freelist, &pos->link);
|
||||
pool->free_entries++;
|
||||
|
||||
/* only the first page in the order group is inserted into
|
||||
* the freelist, but all of them need to be initialized */
|
||||
for (u_int i = 0; i < (1 << order); i++) {
|
||||
atom_init(&page[i].count, 0);
|
||||
atom_init(&page[i].attr, 0);
|
||||
for (u_int i = 0; i < (1u << order); i++) {
|
||||
if (pos >= end)
|
||||
panic("page %p out of range", pos);
|
||||
if (atom_read(&pos->count) != 420)
|
||||
panic("page %p double initialized\n", pos);
|
||||
atom_init(&pos->count, 0);
|
||||
atom_init(&pos->attr, 0);
|
||||
|
||||
pos++;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -111,22 +116,14 @@ static inline void claim_bmem_area(struct mm_zone *zone, struct _bmem_area *area
|
|||
* |---------------------|----> pos
|
||||
* start end
|
||||
*/
|
||||
pos += ORDER_SIZE(order);
|
||||
page += (1 << order);
|
||||
if (order < MM_MAX_ORDER && pos + ORDER_SIZE(order) <= end) {
|
||||
if (order < MM_MAX_ORDER && pos + (1 << (order + 1)) <= end) {
|
||||
/* this makes the rising part of the graph */
|
||||
order++;
|
||||
} else if (order > 0 && pos > end) {
|
||||
/* we have overshot, lower the order */
|
||||
pos -= ORDER_SIZE(order);
|
||||
page -= (1 << order);
|
||||
} else if (order > 0 && pos + (1 << order) > end) {
|
||||
/* this makes the abrupt downwards jump at the end of the graph */
|
||||
while (--order) {
|
||||
if (pos + ORDER_SIZE(order) <= end) {
|
||||
pos += ORDER_SIZE(order);
|
||||
page += (1 << order);
|
||||
if (pos + (1 << order) <= end)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -141,7 +138,7 @@ void paging_init(vm_paddr_t phys_end)
|
|||
usize bitmap_total_size = 0;
|
||||
|
||||
for (int order = 0; order < MM_NR_ORDERS; order++) {
|
||||
usize pages = phys_end >> ORDER_SHIFT(order + 1);
|
||||
usize pages = phys_end >> ORDER_SHIFT(order);
|
||||
pages = align_ceil(pages, LATOM_BIT * 2);
|
||||
usize bytes = pages / (CHAR_BIT * 2);
|
||||
bitmap_sizes[order] = bytes;
|
||||
|
@ -158,7 +155,7 @@ void paging_init(vm_paddr_t phys_end)
|
|||
bitmap_size_log2--; /* the bit index returned by flsl starts at 1 */
|
||||
if (bitmap_total_size ^ (1ul << bitmap_size_log2))
|
||||
bitmap_size_log2++; /* bitmap_total_size is not a power of 2, round up */
|
||||
uintptr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
|
||||
vm_paddr_t bitmap_start_phys = __boot_pmalloc(bitmap_size_log2, MM_ZONE_NORMAL);
|
||||
panic_if(bitmap_start_phys == BOOT_PMALLOC_ERR,
|
||||
"cannot allocate memory for the page bitmaps");
|
||||
memset(__v(bitmap_start_phys), 0, bitmap_total_size);
|
||||
|
@ -168,12 +165,15 @@ void paging_init(vm_paddr_t phys_end)
|
|||
*/
|
||||
for (int zone_index = 0; zone_index < ARRAY_SIZE(mm_zones); zone_index++) {
|
||||
struct mm_zone *zone = &mm_zones[zone_index];
|
||||
latom_init(&zone->free_count, 0);
|
||||
/* we use the same bitmaps for all zones */
|
||||
latom_t *bitmap_pos = __v(bitmap_start_phys);
|
||||
for (int order = 0; order < MM_NR_ORDERS; order++) {
|
||||
zone->pools[order].bitmap = bitmap_pos;
|
||||
clist_init(&zone->pools[order].freelist);
|
||||
zone->pools[order].free_entries = 0;
|
||||
latom_init(&zone->free_count, 0);
|
||||
struct mm_pool *pool = &zone->pools[order];
|
||||
pool->bitmap = bitmap_pos;
|
||||
pool->free_entries = 0;
|
||||
clist_init(&pool->freelist);
|
||||
spin_init(&pool->lock);
|
||||
|
||||
bitmap_pos += bitmap_sizes[order];
|
||||
}
|
||||
|
@ -188,12 +188,13 @@ void paging_init(vm_paddr_t phys_end)
|
|||
*
|
||||
* if the reserved bit is set, all other fields in the page are invalid.
|
||||
*/
|
||||
for (usize i = 0; i < phys_end >> PAGE_SHIFT; i++) {
|
||||
for (u_long pfn = 0; pfn < phys_end >> PAGE_SHIFT; pfn++) {
|
||||
/* This is merely an optimization to simplify checking whether
|
||||
* two buddies can be coalesced into one. In reality, the
|
||||
* reference count is invalid because the page is reserved. */
|
||||
atom_init(&vm_page_array[i].count, 1);
|
||||
atom_init(&vm_page_array[i].attr, _PGA_RSVD_MASK);
|
||||
atom_init(&vm_page_array[pfn].count, 420);
|
||||
atom_init(&vm_page_array[pfn].attr, _PGA_RSVD_MASK);
|
||||
vm_page_array[pfn].pfn = pfn;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -203,11 +204,15 @@ void paging_init(vm_paddr_t phys_end)
|
|||
struct mm_zone *zone = &mm_zones[i];
|
||||
struct _bmem_area *area, *tmp;
|
||||
clist_foreach_entry_safe(&zone->_bmem_areas, area, tmp, link) {
|
||||
/* make sure the boot memory allocator cannot under any circumstances hand
|
||||
* out pages from this area anymore, even though that should be unnecessary */
|
||||
clist_del(&area->link);
|
||||
|
||||
claim_bmem_area(zone, area);
|
||||
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
|
||||
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
|
||||
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
|
||||
}
|
||||
zone->thrsh.emerg = latom_read(&zone->free_count) / CFG_PAGE_EMERG_DENOM;
|
||||
if (zone->thrsh.emerg > CFG_PAGE_EMERG_MAX)
|
||||
zone->thrsh.emerg = CFG_PAGE_EMERG_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -218,22 +223,27 @@ static inline bool pg_flip_bit(struct mm_zone *zone, u_long pfn, u_int order)
|
|||
return latom_flip_bit(bitmap, (int)(bit % LATOM_BIT));
|
||||
}
|
||||
|
||||
__malloc_like
|
||||
static void *__get_pages(u_int order, enum mflags flags)
|
||||
vm_page_t page_alloc(u_int order, enum mflags flags)
|
||||
{
|
||||
PAGE_ASSERT(order >= 0);
|
||||
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
|
||||
|
||||
if (order > MM_MAX_ORDER) {
|
||||
page_debug("get_pages(%d, %#08x): Order too high!\n", order, flags);
|
||||
return nil;
|
||||
}
|
||||
|
||||
u_long count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
|
||||
struct mm_zone *zone = &mm_zones[_M_ZONE_INDEX(flags)];
|
||||
long count_after;
|
||||
try_next_zone:
|
||||
count_after = latom_sub(&zone->free_count, (1 << order)) - (1 << order);
|
||||
if (count_after < zone->thrsh.emerg) {
|
||||
if (count_after < 0 || !(flags & _M_EMERG)) {
|
||||
latom_add(&zone->free_count, (1 << order));
|
||||
return nil;
|
||||
/* if we can't allocate from ZONE_NORMAL, fall back to ZONE_DMA */
|
||||
if (zone > &mm_zones[0]) {
|
||||
zone--;
|
||||
goto try_next_zone;
|
||||
} else {
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -266,93 +276,76 @@ static void *__get_pages(u_int order, enum mflags flags)
|
|||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
if (page == nil) {
|
||||
if (zone > &mm_zones[0]) {
|
||||
/*
|
||||
* If we reach this, the current zone technically had enough free
|
||||
* pages for the allocation, but those pages were split up into
|
||||
* smaller chunks rather than a contiguous area. However, we don't
|
||||
* give up quite yet: If possible, we fall back to a lower memory
|
||||
* zone (ZONE_NORMAL -> ZONE_DMA) and start over from the top.
|
||||
*/
|
||||
zone--;
|
||||
goto try_next_zone;
|
||||
} else {
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if we found a page, check if we need to split it up
|
||||
* (which is the case if we took one from a higher order freelist)
|
||||
*/
|
||||
if (page != nil) {
|
||||
usize pfn = pg2pfn(page);
|
||||
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
|
||||
order, pfn, page_order);
|
||||
pg_flip_bit(zone, pfn, page_order);
|
||||
usize pfn = pg2pfn(page);
|
||||
page_debug_noisy("alloc order %u, split pfn %#lx from order %u\n",
|
||||
order, pfn, page_order);
|
||||
pg_flip_bit(zone, pfn, page_order);
|
||||
|
||||
/* split the page and insert the upper halves into the
|
||||
* respective freelist until we reach the requested order */
|
||||
while (page_order-- > order) {
|
||||
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
|
||||
struct mm_pool *pool = &zone->pools[page_order];
|
||||
vm_page_t buddy = page + (1 << page_order);
|
||||
pga_set_order(buddy, page_order);
|
||||
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
|
||||
/* split the page and insert the upper halves into the
|
||||
* respective freelist until we reach the requested order */
|
||||
while (page_order-- > order) {
|
||||
page_debug_noisy("split %p (order = %u)\n", pfn2vaddr(pfn), page_order);
|
||||
struct mm_pool *pool = &zone->pools[page_order];
|
||||
vm_page_t buddy = page + (1 << page_order);
|
||||
pga_set_order(buddy, page_order);
|
||||
pg_flip_bit(zone, pfn + (1 << page_order), page_order);
|
||||
|
||||
disable_intr();
|
||||
spin_lock(&pool->lock);
|
||||
clist_add_first(&pool->freelist, &buddy->link);
|
||||
pool->free_entries++;
|
||||
spin_unlock(&pool->lock);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
pga_set_order(page, order);
|
||||
void *vaddr = pfn2vaddr(pfn);
|
||||
|
||||
return vaddr;
|
||||
} else {
|
||||
return nil;
|
||||
disable_intr();
|
||||
spin_lock(&pool->lock);
|
||||
clist_add_first(&pool->freelist, &buddy->link);
|
||||
pool->free_entries++;
|
||||
spin_unlock(&pool->lock);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
for (u_int i = 0; i < (1 << order); i++)
|
||||
pga_set_order(&page[i], order);
|
||||
page_clear(page);
|
||||
return page;
|
||||
}
|
||||
|
||||
/* faster memset for whole pages */
|
||||
static inline void init_pages(u_long *start, u_long val, u_int order)
|
||||
{
|
||||
u_long *end = start + (ORDER_SIZE(order) / sizeof(*start));
|
||||
do {
|
||||
*start++ = val;
|
||||
} while (start != end);
|
||||
}
|
||||
/*
|
||||
* XXX get_page() and get_pages() shouldn't depend on the direct map
|
||||
*
|
||||
* XXX Do we need these at all? I don't think so.
|
||||
*/
|
||||
|
||||
void *get_pages(u_int order, enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(order, flags);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
if (pages != nil)
|
||||
init_pages(pages, PAGE_POISON_ALLOC, order);
|
||||
#endif
|
||||
|
||||
return pages;
|
||||
vm_page_t page = page_alloc(order, flags);
|
||||
if (page)
|
||||
return pfn2vaddr(pg2pfn(page));
|
||||
else
|
||||
return nil;
|
||||
}
|
||||
|
||||
void *get_page(enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(0, flags);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
if (pages != nil)
|
||||
init_pages(pages, PAGE_POISON_ALLOC, 0);
|
||||
#endif
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
void *get_zero_pages(u_int order, enum mflags flags)
|
||||
{
|
||||
void *pages = __get_pages(order, flags);
|
||||
|
||||
if (pages != nil)
|
||||
init_pages(pages, 0, order);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
void *get_zero_page(enum mflags flags)
|
||||
{
|
||||
void *page = __get_pages(0, flags);
|
||||
|
||||
if (page != nil)
|
||||
init_pages(page, 0, 0);
|
||||
|
||||
return page;
|
||||
vm_page_t page = page_alloc(0, flags);
|
||||
if (page)
|
||||
return pfn2vaddr(pg2pfn(page));
|
||||
else
|
||||
return nil;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -377,26 +370,13 @@ static __always_inline bool can_merge(vm_page_t page, vm_page_t buddy)
|
|||
return merge;
|
||||
}
|
||||
|
||||
void free_pages(void *ptr)
|
||||
void page_free(vm_page_t page)
|
||||
{
|
||||
PAGE_DEBUG_BLOCK {
|
||||
if (ptr < DMAP_START || ptr >= DMAP_END) {
|
||||
panic("free_pages(%p): not in DMAP region\n", ptr);
|
||||
}
|
||||
}
|
||||
|
||||
register_t cpuflags = read_flags();
|
||||
|
||||
vm_page_t page = vaddr2pg(ptr);
|
||||
panic_if(pga_rsvd(page), "tried to free reserved page %p", ptr);
|
||||
|
||||
u_int order = pga_order(page);
|
||||
PAGE_ASSERT((uintptr_t)ptr % ORDER_SIZE(order) == 0);
|
||||
u_long pfn = vaddr2pfn(ptr);
|
||||
|
||||
#if CFG_POISON_PAGES
|
||||
init_pages(ptr, PAGE_POISON_FREE, order);
|
||||
#endif
|
||||
u_long pfn = pg2pfn(page);
|
||||
|
||||
PAGE_DEBUG_BLOCK {
|
||||
int old_count = atom_sub(&page->count, 1);
|
||||
|
@ -407,6 +387,8 @@ void free_pages(void *ptr)
|
|||
page_debug("attempted to free %p with references", ptr);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
atom_dec(&page->count);
|
||||
}
|
||||
|
||||
struct mm_zone *zone = &mm_zones[pga_zone(page)];
|
||||
|
|
kernel/mm/slab.c | 140
|
@ -21,7 +21,7 @@
|
|||
|
||||
#if CFG_POISON_SLABS
|
||||
struct slab_poison {
|
||||
void *_pad; /**< @brief That's where the freelist pointer is stored */
|
||||
void *_pad __unused; /**< @brief That's where the freelist pointer is stored */
|
||||
void *alloc_source; /**< @brief Code address that made the alloc call */
|
||||
u_long exact_size;
|
||||
u_long low_poison;
|
||||
|
@ -33,32 +33,6 @@ static void poison_after_alloc(struct slab_poison *poison, u_int exact_size, voi
|
|||
static void poison_after_free(struct slab_poison *poison);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief This header sits at the beginning of each slab.
|
||||
* The individual entries follow immediately after the struct itself.
|
||||
*/
|
||||
struct slab {
|
||||
struct clist link;
|
||||
void **freelist;
|
||||
struct slab_pool *pool;
|
||||
/** @brief For `link` */
|
||||
spin_t lock;
|
||||
/**
|
||||
* @brief Number of free entries.
|
||||
* The slabs are sorted within their pool by this value, so that we
|
||||
* always hand out entries from the fullest slabs (increases locality
|
||||
* and thus decreases the stress on the TLB).
|
||||
*
|
||||
* This is intentionally not a `usize` because entry sizes are really
|
||||
* small anyway (we currently refuse to allocate anything bigger than
|
||||
* `PAGE_SIZE`), so this saves a couple of bytes on systems where `int`
|
||||
* is smaller than `usize`.
|
||||
*/
|
||||
u_int free_entries;
|
||||
};
|
||||
|
||||
#define SLAB_OVERHEAD (sizeof(struct slab))
|
||||
|
||||
#if CFG_DEBUG_SLAB_ALLOCS
|
||||
# define slab_debug(msg, ...) kprintf("[slab] " msg, ##__VA_ARGS__)
|
||||
# define SLAB_DEBUG_BLOCK
|
||||
|
@ -77,12 +51,12 @@ struct slab {
|
|||
|
||||
struct slab_pool {
|
||||
const u_int entry_size; /**< @brief Size of one entry in bytes */
|
||||
const int entries_per_slab; /**< @brief Max number of entries per slab */
|
||||
const u_int entries_per_slab; /**< @brief Max number of entries per slab */
|
||||
atom_t total_used; /**< @brief Total allocated entries */
|
||||
const u_int page_order; /**< @brief Order passed to `get_pages()` */
|
||||
struct clist empty_list; /* -> struct slab::link */
|
||||
struct clist partial_list; /* -> struct slab::link */
|
||||
struct clist full_list; /* -> struct slab::link */
|
||||
struct clist empty_list; /* -> struct vm_page::link */
|
||||
struct clist partial_list; /* -> struct vm_page::link */
|
||||
struct clist full_list; /* -> struct vm_page::link */
|
||||
spin_t empty_lock; /**< @brief Lock for `empty_list` */
|
||||
spin_t partial_lock; /**< @brief Lock for `partial_list` */
|
||||
spin_t full_lock; /**< @brief Lock for `full_list` */
|
||||
|
@ -98,12 +72,10 @@ struct slab_pool {
|
|||
* powers of two and perfectly aligned then.
|
||||
*/
|
||||
#define _MIN1(x) ((x) < 1 ? 1 : (x))
|
||||
#define POOL_ENTRY_SIZE(sz) (( (sz) - ( SLAB_OVERHEAD / _MIN1(PAGE_SIZE / (sz)) ) ) & ~0xfu)
|
||||
#define POOL_ENTRIES_PER_TABLE(sz) \
|
||||
_MIN1((PAGE_SIZE - SLAB_OVERHEAD) / POOL_ENTRY_SIZE(sz))
|
||||
#define POOL_ENTRIES_PER_TABLE(sz) _MIN1(PAGE_SIZE / (sz))
|
||||
|
||||
#define POOL_DEFINE(sz) { \
|
||||
.entry_size = POOL_ENTRY_SIZE(sz), \
|
||||
.entry_size = (sz), \
|
||||
.entries_per_slab = POOL_ENTRIES_PER_TABLE(sz), \
|
||||
.total_used = ATOM_DEFINE(0), \
|
||||
.page_order = ((sz) - 1) / PAGE_SIZE, \
|
||||
|
@ -127,7 +99,7 @@ static struct slab_pool slab_pools_normal[] = {
|
|||
POOL_DEFINE(8192),
|
||||
POOL_DEFINE(16384),
|
||||
POOL_DEFINE(32768),
|
||||
{ .entry_size = 0 } /* terminator */
|
||||
{ /* terminator */ }
|
||||
};
|
||||
static struct slab_pool slab_pools_dma[] = {
|
||||
POOL_DEFINE(32),
|
||||
|
@ -136,16 +108,16 @@ static struct slab_pool slab_pools_dma[] = {
|
|||
POOL_DEFINE(256),
|
||||
POOL_DEFINE(512),
|
||||
POOL_DEFINE(1024),
|
||||
{ .entry_size = 0 } /* terminator */
|
||||
{ /* terminator */ }
|
||||
};
|
||||
#undef _MIN1 /* we don't wanna end up using this in actual code, do we? */
|
||||
|
||||
static struct slab_pool *slab_zone_pools[MM_NR_ZONES] = {
|
||||
[_M_ZONE_NORMAL] = slab_pools_normal,
|
||||
[_M_ZONE_DMA] = slab_pools_dma,
|
||||
[_M_ZONE_NORMAL] = slab_pools_normal,
|
||||
};
|
||||
|
||||
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags);
|
||||
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags);
|
||||
|
||||
void kmalloc_init(void)
|
||||
{
|
||||
|
@ -206,31 +178,31 @@ void *kmalloc(usize size, enum mflags flags)
|
|||
* it can't possibly be used for allocations anymore.
|
||||
* This is probably not worth the overhead, though.
|
||||
*/
|
||||
struct slab *slab = nil;
|
||||
vm_page_t page = INVALID_PAGE;
|
||||
|
||||
/* try to use a slab that is already partially used first */
|
||||
register_t cpuflags = intr_disable();
|
||||
spin_lock(&pool->partial_lock);
|
||||
if (!clist_is_empty(&pool->partial_list)) {
|
||||
atom_dec(&pool->partial_count);
|
||||
slab = clist_del_first_entry(&pool->partial_list, typeof(*slab), link);
|
||||
page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
|
||||
}
|
||||
spin_unlock(&pool->partial_lock);
|
||||
|
||||
if (slab == nil) {
|
||||
if (!page) {
|
||||
/* no partially used slab available, see if we have a completely free one */
|
||||
spin_lock(&pool->empty_lock);
|
||||
if (!clist_is_empty(&pool->empty_list)) {
|
||||
atom_dec(&pool->empty_count);
|
||||
slab = clist_del_first_entry(&pool->empty_list, typeof(*slab), link);
|
||||
page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
|
||||
}
|
||||
spin_unlock(&pool->empty_lock);
|
||||
|
||||
if (slab == nil) {
|
||||
if (!page) {
|
||||
/* we're completely out of usable slabs, allocate a new one */
|
||||
intr_restore(cpuflags);
|
||||
slab = slab_create(pool, flags);
|
||||
if (slab == nil) {
|
||||
page = slab_create(pool, flags);
|
||||
if (!page) {
|
||||
slab_debug("kernel OOM\n");
|
||||
return nil;
|
||||
}
|
||||
|
@ -238,22 +210,22 @@ void *kmalloc(usize size, enum mflags flags)
|
|||
}
|
||||
}
|
||||
|
||||
/* if we've made it to here, slab != nil and interrupts are disabled */
|
||||
spin_lock(&slab->lock);
|
||||
void *ret = slab->freelist;
|
||||
slab->freelist = *slab->freelist;
|
||||
if (--slab->free_entries == 0) {
|
||||
/* if we've made it to here, we have a slab and interrupts are disabled */
|
||||
page_lock(page);
|
||||
void *ret = page->slab.freelist;
|
||||
SLAB(page)->freelist = *SLAB(page)->freelist;
|
||||
if (--page->slab.free_count == 0) {
|
||||
spin_lock(&pool->full_lock);
|
||||
clist_add(&pool->full_list, &slab->link);
|
||||
clist_add(&pool->full_list, &page->link);
|
||||
spin_unlock(&pool->full_lock);
|
||||
atom_inc(&pool->full_count);
|
||||
} else {
|
||||
spin_lock(&pool->partial_lock);
|
||||
clist_add(&pool->partial_list, &slab->link);
|
||||
clist_add(&pool->partial_list, &page->link);
|
||||
spin_unlock(&pool->partial_lock);
|
||||
atom_inc(&pool->partial_count);
|
||||
}
|
||||
spin_unlock(&slab->lock);
|
||||
page_unlock(page);
|
||||
intr_restore(cpuflags);
|
||||
|
||||
atom_inc(&pool->total_used);
|
||||
|
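Condensed, the kmalloc() slab path shown in the hunks above boils down to the following outline. Names are taken from this diff; locking, interrupt masking, the per-pool statistics and the retry after creating a new slab are omitted, so treat it as a sketch rather than the real function:

```c
/* Outline of kmalloc()'s slab path after this commit -- locking, interrupt
 * masking and the pool counters are intentionally left out. */
static void *kmalloc_outline(struct slab_pool *pool, enum mflags flags)
{
	vm_page_t page = INVALID_PAGE;

	/* prefer a partially used slab, then a completely empty one */
	if (!clist_is_empty(&pool->partial_list))
		page = clist_del_first_entry(&pool->partial_list, typeof(*page), link);
	else if (!clist_is_empty(&pool->empty_list))
		page = clist_del_first_entry(&pool->empty_list, typeof(*page), link);
	else
		page = slab_create(pool, flags);    /* now returns a vm_page_t */

	if (!page)
		return nil;                         /* kernel OOM */

	/* pop one entry off the freelist, which now lives in struct vm_page */
	void *ret = SLAB(page)->freelist;
	SLAB(page)->freelist = *SLAB(page)->freelist;

	/* re-file the slab page depending on how full it has become */
	if (--SLAB(page)->free_count == 0)
		clist_add(&pool->full_list, &page->link);
	else
		clist_add(&pool->partial_list, &page->link);

	return ret;
}
```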
@ -275,8 +247,7 @@ void kfree(void *ptr)
|
|||
|
||||
vm_page_t page = vaddr2pg(ptr);
|
||||
SLAB_ASSERT(pga_slab(page));
|
||||
struct slab *slab = page->extra;
|
||||
struct slab_pool *pool = slab->pool;
|
||||
struct slab_pool *pool = SLAB(page)->pool;
|
||||
#if CFG_POISON_SLABS
|
||||
struct slab_poison *poison = container_of(ptr, typeof(*poison), data);
|
||||
poison_after_free(poison);
|
||||
|
@ -284,63 +255,63 @@ void kfree(void *ptr)
|
|||
#endif
|
||||
|
||||
register_t cpuflags = intr_disable();
|
||||
spin_lock(&slab->lock);
|
||||
*(void **)ptr = slab->freelist;
|
||||
slab->freelist = (void **)ptr;
|
||||
if (++slab->free_entries == pool->entries_per_slab) {
|
||||
page_lock(page);
|
||||
*(void **)ptr = SLAB(page)->freelist;
|
||||
SLAB(page)->freelist = (void **)ptr;
|
||||
if (++SLAB(page)->free_count == pool->entries_per_slab) {
|
||||
spin_lock(&pool->partial_lock);
|
||||
clist_del(&slab->link);
|
||||
clist_del(&page->link);
|
||||
spin_unlock(&pool->partial_lock);
|
||||
atom_dec(&pool->partial_count);
|
||||
|
||||
spin_lock(&pool->empty_lock);
|
||||
clist_add(&pool->empty_list, &slab->link);
|
||||
clist_add(&pool->empty_list, &page->link);
|
||||
spin_unlock(&pool->empty_lock);
|
||||
atom_inc(&pool->empty_count);
|
||||
}
|
||||
spin_unlock(&slab->lock);
|
||||
page_unlock(page);
|
||||
atom_dec(&pool->total_used);
|
||||
intr_restore(cpuflags);
|
||||
}
|
||||
|
||||
static struct slab *slab_create(struct slab_pool *pool, enum mflags flags)
|
||||
static vm_page_t slab_create(struct slab_pool *pool, enum mflags flags)
|
||||
{
|
||||
slab_debug_noisy("Creating new cache for entry_size %u\n", pool->entry_size);
|
||||
struct slab *slab = get_zero_pages(pool->page_order, flags);
|
||||
vm_page_t page = page_alloc(pool->page_order, flags);
|
||||
|
||||
if (slab != nil) {
|
||||
vm_page_t page = vaddr2pg(slab);
|
||||
/* XXX it's probably sufficient to only do this for the lowest page */
|
||||
vm_page_foreach_in_order(page) {
|
||||
pga_set_slab(page, true);
|
||||
page->extra = slab;
|
||||
}
|
||||
|
||||
spin_init(&slab->lock);
|
||||
slab->pool = pool;
|
||||
slab->free_entries = pool->entries_per_slab;
|
||||
if (page) {
|
||||
pga_set_slab(page, true);
|
||||
SLAB(page)->pool = pool;
|
||||
SLAB(page)->free_count = pool->entries_per_slab;
|
||||
void *prev = nil;
|
||||
void *end = (void *)slab + (1 << (pool->page_order + PAGE_SHIFT));
|
||||
/* XXX this should not rely on a direct map */
|
||||
void *start = pfn2vaddr(pg2pfn(page));
|
||||
void *end = start + (1 << (pool->page_order + PAGE_SHIFT));
|
||||
void *pos = end;
|
||||
do {
|
||||
pos -= pool->entry_size;
|
||||
*(void **)pos = prev;
|
||||
prev = pos;
|
||||
} while (pos >= (void *)&slab[1] + pool->entry_size);
|
||||
slab->freelist = pos;
|
||||
} while (pos > start);
|
||||
SLAB(page)->freelist = pos;
|
||||
}
|
||||
|
||||
return slab;
|
||||
return page;
|
||||
}
|
||||
|
||||
#if CFG_POISON_SLABS
|
||||
|
||||
static inline void poison_after_alloc(struct slab_poison *poison, u_int exact_size,
|
||||
void *alloc_source)
|
||||
{
|
||||
u_int offset = align_ceil(poison->exact_size, sizeof(long)) / sizeof(long);
|
||||
u_long *poison_start = &poison->low_poison;
|
||||
|
||||
/* slabs are zeroed out when they are newly allocated */
|
||||
/*
|
||||
* page_alloc() always initializes the allocated page to zeroes.
|
||||
* Therefore, if exact_size is 0, we know this particular slab entry has
|
||||
* never been used before, and we can skip the check.
|
||||
*/
|
||||
if (poison->exact_size != 0) {
|
||||
for (u_long *pos = poison_start; pos < &poison->high_poison[offset]; pos++) {
|
||||
if (*pos != SLAB_POISON_FREE) {
|
||||
|
@ -377,7 +348,12 @@ static inline void poison_after_free(struct slab_poison *poison)
|
|||
for (u_long *pos = &poison->low_poison; pos <= &poison->high_poison[offset]; pos++)
|
||||
*pos = SLAB_POISON_FREE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CFG_POISON_SLABS */
|
||||
|
||||
/*
|
||||
* for certain libc routines
|
||||
*/
|
||||
|
||||
__weak void *malloc(usize size)
|
||||
{
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <arch/string.h>
|
||||
|
||||
#include <gay/cdefs.h>
|
||||
#include <gay/types.h>
|
||||
|
||||
|
@ -71,6 +73,7 @@ void *memcpy(void *__restrict dest, const void *__restrict src, usize n);
|
|||
*/
|
||||
__pure int memcmp(const void *s1, const void *s2, usize n);
|
||||
|
||||
#ifndef __HAVE_ARCH_MEMSET
|
||||
/**
|
||||
* @brief Starting from `ptr`, fill `n` bytes with the constant byte `c`.
|
||||
*
|
||||
|
@ -80,6 +83,28 @@ __pure int memcmp(const void *s1, const void *s2, usize n);
|
|||
* @returns A pointer to `ptr`
|
||||
*/
|
||||
void *memset(void *ptr, int c, usize n);
|
||||
#endif
|
||||
|
||||
#if _GAY_SOURCE >= 202109L
|
||||
#ifndef __HAVE_ARCH_MEMSET16
|
||||
void *memset16(u16 *dest, u16 c, usize nbyte);
|
||||
#endif
|
||||
#ifndef __HAVE_ARCH_MEMSET32
|
||||
void *memset32(u32 *dest, u32 c, usize nbyte);
|
||||
#endif
|
||||
#ifndef __HAVE_ARCH_MEMSET64
|
||||
void *memset64(u64 *dest, u64 c, usize nbyte);
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
#if LONG_BIT == 32
|
||||
#define memsetl memset32
|
||||
#elif LONG_BIT == 64
|
||||
#define memsetl memset64
|
||||
#else
|
||||
#error "Unsupported sizeof(long)"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Copy a memory area.
|
||||
|
|