Diffstat (limited to 'kernel/mm')
-rw-r--r-- | kernel/mm/memcheck.py | 158
-rw-r--r-- | kernel/mm/mobj.c | 313
-rw-r--r-- | kernel/mm/page.c | 658
-rw-r--r-- | kernel/mm/page.py | 47
-rw-r--r-- | kernel/mm/pagecache.c | 23
-rw-r--r-- | kernel/mm/pagetable.c | 873
-rw-r--r-- | kernel/mm/pagetable.gdb | 25
-rw-r--r-- | kernel/mm/pframe.c | 59
-rw-r--r-- | kernel/mm/slab.c | 550
-rw-r--r-- | kernel/mm/slab.py | 55
10 files changed, 2761 insertions, 0 deletions
diff --git a/kernel/mm/memcheck.py b/kernel/mm/memcheck.py new file mode 100644 index 0000000..49f40fd --- /dev/null +++ b/kernel/mm/memcheck.py @@ -0,0 +1,158 @@ +import gdb + +import string + +import weenix +import weenix.kmem +import weenix.stack + + +class SlabAllocation: + def __init__(self, addr, stack, allocator, initialization): + self.addr = addr + self.stack = stack + self.allocator = allocator + self.initialization = initialization + + +class PageAllocation: + def __init__(self, addr, stack, npages, slabdata, initialization): + self.addr = addr + self.stack = stack + self.npages = npages + self.slabdata = slabdata + self.initialization = initialization + + +class Memcheck: + def __init__(self): + self._slab_alloc_count = 0 + self._slab_free_count = 0 + self._slab_invalid_free = 0 + self._slab_allocated = {} + self._page_alloc_count = 0 + self._page_free_count = 0 + self._page_invalid_free = 0 + self._page_allocated = {} + self._initialized = False + weenix.Hook("slab_obj_alloc", self._slab_alloc_callback) + weenix.Hook("slab_obj_free", self._slab_free_callback) + weenix.Hook("page_alloc", self._page_alloc_callback) + weenix.Hook("page_free", self._page_free_callback) + weenix.Hook("initialized", self._initialized_callback) + weenix.Hook("shutdown", self._shutdown_callback) + + def _slab_alloc_callback(self, args): + addr = args["addr"] + if string.atol(addr, 16) == 0: + return False + stack = weenix.stack.Stack(gdb.newest_frame().older()) + allocator = weenix.kmem.SlabAllocator( + gdb.Value(string.atol(args["allocator"].split(" ")[0], 16)).cast( + gdb.lookup_type("void").pointer() + ) + ) + self._slab_allocated[addr] = SlabAllocation( + addr, stack, allocator, not self._initialized + ) + if self._initialized: + self._slab_alloc_count += 1 + return False + + def _slab_free_callback(self, args): + if not args["addr"] in self._slab_allocated: + self._slab_invalid_free += 1 + print("Invalid free of address " + args["addr"] + ":") + print(weenix.stack.Stack(gdb.newest_frame().older())) + else: + if not self._slab_allocated[args["addr"]].initialization: + self._slab_free_count += 1 + del self._slab_allocated[args["addr"]] + return False + + def _page_alloc_callback(self, args): + addr = args["addr"] + if string.atol(addr, 16) == 0: + return False + stack = weenix.stack.Stack(gdb.newest_frame().older()) + slabdata = stack.contains("_slab_allocator_grow") + self._page_allocated[addr] = PageAllocation( + addr, stack, string.atoi(args["npages"]), slabdata, not self._initialized + ) + if self._initialized and not slabdata: + self._page_alloc_count += 1 + return False + + def _page_free_callback(self, args): + if not args["addr"] in self._page_allocated: + self._page_invalid_free += 1 + print("Invalid free of address " + args["addr"] + ":") + print(weenix.stack.Stack(gdb.newest_frame().older())) + elif self._page_allocated[args["addr"]].npages != string.atoi(args["npages"]): + self._page_invalid_free += 1 + print( + "Address " + + args["addr"] + + " allocated for " + + str(self._page_allocated[args["addr"]].npages) + + " pages:" + ) + print(self._page_allocated[args["addr"]].stack) + print("but freed with " + args["npages"] + " pages:") + print(weenix.stack.Stack(gdb.newest_frame().older())) + del self._page_allocated[args["addr"]] + else: + if ( + not self._page_allocated[args["addr"]].initialization + and not self._page_allocated[args["addr"]].slabdata + ): + self._page_free_count += 1 + del self._page_allocated[args["addr"]] + return False + + def _initialized_callback(self, args): + 
self._initialized = True + return False + + def _shutdown_callback(self, args): + size = 0 + for alloc in self._slab_allocated.itervalues(): + if not alloc.initialization: + size += alloc.allocator.size() + print( + 'Leaked {0} bytes from "{1}" at {2}:'.format( + alloc.allocator.size(), alloc.allocator.name(), alloc.addr + ) + ) + print(alloc.stack) + npages = 0 + for page in self._page_allocated.itervalues(): + if not page.initialization and not page.slabdata: + npages += page.npages + print("Leaked {0} pages at {1}:".format(page.npages, page.addr)) + print(page.stack) + print( + "{0} slab allocs, {1} frees ({2} bytes leaked)".format( + self._slab_alloc_count, self._slab_free_count, size + ) + ) + print( + "{0} page allocs, {1} frees ({2} pages leaked)".format( + self._page_alloc_count, self._page_free_count, npages + ) + ) + print("{0} invalid slab frees".format(self._slab_invalid_free)) + print("{0} invalid page frees".format(self._page_invalid_free)) + return False + + +class MemcheckFlag(weenix.Flag): + def __init__(self): + weenix.Flag.__init__(self, "memcheck", gdb.COMMAND_DATA) + + def callback(self, value): + if value: + Memcheck() + + +MemcheckFlag() diff --git a/kernel/mm/mobj.c b/kernel/mm/mobj.c new file mode 100644 index 0000000..4b9c80f --- /dev/null +++ b/kernel/mm/mobj.c @@ -0,0 +1,313 @@ +#include "errno.h" + +#include "mm/mobj.h" +#include "mm/pframe.h" + +#include "util/debug.h" +#include <util/string.h> + +/* + * Initialize o according to type and ops. If ops do not specify a + * get_pframe function, set it to the default, mobj_default_get_pframe. + * Do the same with the destructor function pointer. + * + * Upon return, the refcount of the mobj should be 1. + */ +void mobj_init(mobj_t *o, long type, mobj_ops_t *ops) +{ + o->mo_type = type; + + memcpy(&o->mo_ops, ops, sizeof(mobj_ops_t)); + + if (!o->mo_ops.get_pframe) + { + o->mo_ops.get_pframe = mobj_default_get_pframe; + KASSERT(o->mo_ops.fill_pframe); + KASSERT(o->mo_ops.flush_pframe); + } + if (!o->mo_ops.destructor) + { + o->mo_ops.destructor = mobj_default_destructor; + } + + kmutex_init(&o->mo_mutex); + + o->mo_refcount = ATOMIC_INIT(1); + list_init(&o->mo_pframes); +} + +/* + * Lock the mobj's mutex + */ +inline void mobj_lock(mobj_t *o) { kmutex_lock(&o->mo_mutex); } + +/* + * Unlock the mobj's mutex + */ +inline void mobj_unlock(mobj_t *o) { kmutex_unlock(&o->mo_mutex); } + +/* + * Increment refcount + */ +void mobj_ref(mobj_t *o) +{ + atomic_inc(&o->mo_refcount); +} + +void mobj_put_locked(mobj_t **op) +{ + mobj_unlock(*op); + mobj_put(op); +} + +/* + * Decrement refcount, and set *op = NULL. + * If the refcount drop to 0, call the destructor, otherwise unlock the mobj. + */ +void mobj_put(mobj_t **op) +{ + mobj_t *o = *op; + KASSERT(o->mo_refcount); + *op = NULL; + + dbg(DBG_ERROR, "count: %d\n", o->mo_refcount); + if (atomic_dec_and_test(&o->mo_refcount)) + { + dbg(DBG_ERROR, "count: %d\n", o->mo_refcount); + + KASSERT(!kmutex_owns_mutex(&o->mo_mutex)); + o->mo_ops.destructor(o); + } + else + { + dbg(DBG_ERROR, "count: %d\n", o->mo_refcount); + } +} + +/* + * Find a pframe that already exists in the memory object's mo_pframes list. + * If a pframe is found, it must be locked upon return from this function using + * pf_mutex. 
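 *
 * A minimal sketch of the lookup/create pattern (cf. mobj_default_get_pframe()
 * below), assuming the caller already holds the mobj's mutex:
 *
 *     pframe_t *pf;
 *     mobj_find_pframe(o, pagenum, &pf);          // locks pf_mutex if found
 *     if (!pf)
 *         mobj_create_pframe(o, pagenum, 0, &pf); // creates and locks, or NULL
 *     // ... use or fill pf->pf_addr ...
 *     pframe_release(&pf);                        // unlocks and clears pf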
+ */ +void mobj_find_pframe(mobj_t *o, uint64_t pagenum, pframe_t **pfp) +{ + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + list_iterate(&o->mo_pframes, pf, pframe_t, pf_link) + { + if (pf->pf_pagenum == pagenum) + { + kmutex_lock(&pf->pf_mutex); + *pfp = pf; + return; + } + } + *pfp = NULL; +} + +/* + * Wrapper around the memory object's get_pframe function + * Assert a sane state of the world surrounding the call to get_pframe + */ +long mobj_get_pframe(mobj_t *o, uint64_t pagenum, long forwrite, + pframe_t **pfp) +{ + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + *pfp = NULL; + long ret = o->mo_ops.get_pframe(o, pagenum, forwrite, pfp); + KASSERT((!*pfp && ret) || kmutex_owns_mutex(&(*pfp)->pf_mutex)); + return ret; +} + +/* + * Create and initialize a pframe and add it to the mobj's mo_pframes list. + * Upon successful return, the pframe's pf_mutex is locked. + */ +#ifdef OLD +static void mobj_create_pframe(mobj_t *o, uint64_t pagenum, pframe_t **pfp) +#endif +void mobj_create_pframe(mobj_t *o, uint64_t pagenum, uint64_t loc, pframe_t **pfp) +{ + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + pframe_t *pf = pframe_create(); + if (pf) + { + kmutex_lock(&pf->pf_mutex); + + pf->pf_pagenum = pagenum; + pf->pf_loc = loc; + list_insert_tail(&o->mo_pframes, &pf->pf_link); + } + KASSERT(!pf || kmutex_owns_mutex(&pf->pf_mutex)); + *pfp = pf; +} + +/* + * The default get pframe that is at the center of the mobj/pframe subsystem. + * This is the routine that is used when the memory object does not have a + * get_pframe function associated with it (or called in the case of shadow objects + * when the forwrite flag is set). + * + * First, check if an pframe already exists in the mobj, creating one as + * necessary. Then, ensure that the pframe's contents are loaded: i.e. that + * pf->pf_addr is non-null. You will want to use page_alloc() and fill_pframe + * function pointer of the mobj. Finally, if forwrite is true, mark the pframe + * as dirtied. The resulting pframe should be set in *pfp. + * + * Note that upon failure, *pfp MUST be null. As always, make sure you cleanup + * properly in all error cases (especially if fill_prame fails) + * + * Upon successful return, *pfp refers to the found pframe and MUST be locked. + * + * Error cases mobj_default_get_pframe is responsible for generating: + * - ENOMEM: either cannot create the pframe or cannot allocate memory for + * the pframe's contents + */ +long mobj_default_get_pframe(mobj_t *o, uint64_t pagenum, long forwrite, + pframe_t **pfp) +{ + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + *pfp = NULL; + pframe_t *pf = NULL; + mobj_find_pframe(o, pagenum, &pf); + if (!pf) + { + mobj_create_pframe(o, pagenum, 0, &pf); // XXX is zero correct??? + } + if (!pf) + { + return -ENOMEM; + } + KASSERT(kmutex_owns_mutex(&pf->pf_mutex)); + if (!pf->pf_addr) + { + KASSERT(!pf->pf_dirty && + "dirtied page doesn't have a physical address"); + pf->pf_addr = page_alloc(); + if (!pf->pf_addr) + { + return -ENOMEM; + } + + dbg(DBG_PFRAME, "filling pframe 0x%p (mobj 0x%p page %lu)\n", pf, o, + pf->pf_pagenum); + KASSERT(o->mo_ops.fill_pframe); + long ret = o->mo_ops.fill_pframe(o, pf); + if (ret) + { + page_free(pf->pf_addr); + pf->pf_addr = NULL; + kmutex_unlock(&pf->pf_mutex); + return ret; + } + } + pf->pf_dirty |= forwrite; + *pfp = pf; + return 0; +} + +/* + * If the pframe is dirty, call the mobj's flush_pframe; if flush_pframe returns + * successfully, clear pf_dirty flag and return 0. Otherwise, return what + * flush_pframe returned. 
+ * + * Both o and pf must be locked when calling this function + */ +long mobj_flush_pframe(mobj_t *o, pframe_t *pf) +{ + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + KASSERT(kmutex_owns_mutex(&pf->pf_mutex)); + KASSERT(pf->pf_addr && "cannot flush a frame not in memory!"); + dbg(DBG_PFRAME, "pf 0x%p, mobj 0x%p, page %lu\n", pf, o, pf->pf_pagenum); + if (pf->pf_dirty) + { + KASSERT(o->mo_ops.flush_pframe); + long ret = o->mo_ops.flush_pframe(o, pf); + if (ret) + return ret; + pf->pf_dirty = 0; + } + KASSERT(!pf->pf_dirty); + return 0; +} + +/* + * Iterate through the pframes of the mobj and try to flush each one. + * If any of them fail, let that reflect in the return value. + * + * The mobj o must be locked when calling this function + */ +long mobj_flush(mobj_t *o) +{ + long ret = 0; + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + list_iterate(&o->mo_pframes, pf, pframe_t, pf_link) + { + kmutex_lock(&pf->pf_mutex); // get the pframe (lock it) + if (pf->pf_addr) + { + ret |= mobj_flush_pframe(o, pf); + } + pframe_release(&pf); + } + return ret; +} + +/* + * Attempt to flush the pframe. If the flush succeeds, then free the pframe's + * contents (pf->pf_addr) using page_free, remove the pframe from the mobj's + * list and call pframe_free. + * + * Upon successful return, *pfp MUST be null. If the function returns an error + * code, *pfp must be unchanged. + */ +long mobj_free_pframe(mobj_t *o, pframe_t **pfp) +{ + pframe_t *pf = *pfp; + + if (pf->pf_addr) + { + long ret = mobj_flush_pframe(o, pf); + if (ret) + return ret; + + // [+] TODO REMOVE THIS SECTION WHEN FLUSH DOES IT (I.E. WHEN WE HAVE + // SUPPORT FOR FREEING PFRAME'S IN USE BY UNMAPPING THEM FROM PAGE + // TABLES THAT USE THEM) + if (pf->pf_addr) + { + page_free(pf->pf_addr); + pf->pf_addr = NULL; + } + } + *pfp = NULL; + list_remove(&pf->pf_link); + pframe_free(&pf); + return 0; +} + +/* + * Simply flush the memory object + */ +void mobj_default_destructor(mobj_t *o) +{ + mobj_lock(o); + KASSERT(kmutex_owns_mutex(&o->mo_mutex)); + + long ret = 0; + list_iterate(&o->mo_pframes, pf, pframe_t, pf_link) + { + kmutex_lock(&pf->pf_mutex); // get the pframe (lock it) + ret |= mobj_free_pframe(o, &pf); + } + + if (ret) + { + dbg(DBG_MM, + "WARNING: flushing pframes in mobj destructor failed for one or " + "more frames\n" + "This means the memory for the pframe will be leaked!"); + } + + KASSERT(!kmutex_has_waiters(&o->mo_mutex)); + mobj_unlock(o); +} diff --git a/kernel/mm/page.c b/kernel/mm/page.c new file mode 100644 index 0000000..b42dca4 --- /dev/null +++ b/kernel/mm/page.c @@ -0,0 +1,658 @@ +// SMP.1 + SMP.3 +// spinlocks + mask interrupts +#include "kernel.h" +#include "types.h" +#include <boot/multiboot_macros.h> + +#include "boot/config.h" + +#include "mm/mm.h" +#include "mm/page.h" + +#include "util/debug.h" +#include "util/gdb.h" +#include "util/string.h" + +#include "multiboot.h" + +// BTREE === Binary Tree (not an actual B-Tree) + +// Algorithmic optimization ideas +// have a "min free idx" pointer for each order (have a "count free" at each +// order) delay cascading availability bits up the tree until needed; would +// prevent state "thrashing" +// can do this with a cascaded_order flag that equals the highest order +// which we have cascaed up to. 
For a given allocation, if the required +// order is > cascaded_order, then we cascade up to the required order + +// get ready for bit manipulation heaven :) + +typedef uintptr_t btree_word; + +#define BTREE_ROW_START_INDEX(order) \ + (((uintptr_t)1 << (max_order - (order))) - 1) +#define BTREE_ROW_END_INDEX(order) ((BTREE_ROW_START_INDEX(order) << 1) | 1) +#define BTREE_INDEX_TO_ADDR(idx, order) \ + (((1 << (order)) * ((idx)-BTREE_ROW_START_INDEX(order))) << PAGE_SHIFT) +#define BTREE_ADDR_TO_INDEX(addr, order) \ + (BTREE_ROW_START_INDEX(order) + \ + ((((uintptr_t)(addr)) >> PAGE_SHIFT) / (1 << (order)))) + +#define BTREE_LEAF_START_INDEX BTREE_ROW_START_INDEX(0) +#define BTREE_ADDR_TO_LEAF_INDEX(addr) BTREE_ADDR_TO_INDEX(addr, 0) +#define BTREE_LEAF_INDEX_TO_ADDR(idx) BTREE_INDEX_TO_ADDR(idx, 0) + +#define BTREE_NUM_BITS (sizeof(btree_word) << 3) +#define BTREE_WORD_POS(idx) ((idx) / BTREE_NUM_BITS) +#define BTREE_BIT_POS(idx) ((idx) & (BTREE_NUM_BITS - 1)) +#define BTREE_AVAILABILITY_MASK(idx) \ + ((uintptr_t)1 << (BTREE_NUM_BITS - 1 - BTREE_BIT_POS(idx))) + +// we really don't want branching here (predictor would be quite bad and +// branches are slowwwww) +#define BTREE_SIBLING(idx) ((idx)-1 + (((idx)&1) << 1)) +// uintptr_t btree_sibling(uintptr_t idx) { +// // in a 0-indexed binary tree, a sibling of an odd node is its right +// neighbor --> add 1 +// // and the sibling of an even node is its left neighbor --> subtract 1 +// // so we need: (idx % 2) ? (idx + 1) : (idx - 1); +// uintptr_t odd_addend = idx & 1; // 1 if odd, 0 if even +// uintptr_t even_addend = (uintptr_t) -1 + odd_addend; // 0 if odd, -1 if +// even return idx + odd_addend + even_addend; return idx + (idx & 1) + +// ((uintptr_t) -1 + (idx & 1)); return idx - 1 + ((idx & 1) << 1); +// // now it looks like: always subtract 1, add 2 if odd. which works :) +// } + +// get the left sibling (odd) of a pair; idx may already be the left sibling or +// may be the right sibling (even) subtract 1 from idx if it's even --> subtract +// 1 from LSB and add it back in +#define BTREE_LEFT_SIBLING(idx) ((idx) + (((idx)&1) - 1)) + +#define BTREE_PARENT(idx) (((idx)-1) >> 1) +#define BTREE_LEFT_CHILD(idx) (((idx) << 1) + 1) +#define BTREE_RIGHT_CHILD(idx) (((idx) + 1) << 1) +#define BTREE_IS_LEFT_CHILD(idx) ((idx)&1) +#define BTREE_IS_RIGHT_CHILD(idx) (!BTREE_IS_LEFT_CHILD(idx)) + +#define BTREE_IS_AVAILABLE(idx) \ + (btree[BTREE_WORD_POS(idx)] & BTREE_AVAILABILITY_MASK(idx)) +#define BTREE_MARK_AVAILABLE(idx) \ + (btree[BTREE_WORD_POS(idx)] |= BTREE_AVAILABILITY_MASK(idx)) +#define BTREE_MARK_UNAVAILABLE(idx) \ + (btree[BTREE_WORD_POS(idx)] &= ~BTREE_AVAILABILITY_MASK(idx)) + +// potential optimization: use these when clearing pairs. 
something about the +// following is apparently buggy though (causes fault) #define +// BTREE_SIBLING_AVAILABILITY_MASK(idx) (BTREE_AVAILABILITY_MASK(idx) | +// BTREE_IS_AVAILABLE(BTREE_SIBLING(idx))) #define +// BTREE_MARK_SIBLINGS_AVAILABLE(idx) (btree[BTREE_WORD_POS(idx)] |= +// BTREE_SIBLING_AVAILABILITY_MASK(idx)) #define +// BTREE_MARK_SIBLINGS_UNAVAILABLE(idx) (btree[BTREE_WORD_POS(idx)] &= +// ~BTREE_SIBLING_AVAILABILITY_MASK(idx)) + +GDB_DEFINE_HOOK(page_alloc, void *addr, size_t npages) + +GDB_DEFINE_HOOK(page_free, void *addr, size_t npages) + +static size_t page_freecount; + +// if you rename these variables, update them in the macros above +static size_t + max_pages; // max number of pages as determined by RAM, NOT max_order +static size_t max_order; // max depth of binary tree + +static btree_word *btree; +static uintptr_t *min_available_idx_by_order; +static size_t *count_available_by_order; + +static char *type_strings[] = {"ERROR: type = 0", "Available", "Reserved", + "ACPI Reclaimable", "ACPI NVS", "GRUB Bad Ram"}; +static size_t type_count = sizeof(type_strings) / sizeof(type_strings[0]); + +inline void *physmap_start() { return (void *)PHYS_OFFSET; } + +inline void *physmap_end() +{ + return (void *)(PHYS_OFFSET + (max_pages << PAGE_SHIFT)); +} + +#undef DEBUG_PHYSICAL_PAGING + +static inline void _btree_expensive_sanity_check() +{ +#ifdef DEBUG_PHYSICAL_PAGING + size_t available = 0; + for (unsigned order = 0; order <= max_order; order++) + { + long checked_first = 0; + unsigned order_count = 0; + uintptr_t max = BTREE_ROW_END_INDEX(order); + + for (uintptr_t idx = BTREE_ROW_START_INDEX(order); idx < max; idx++) + { + if (BTREE_IS_AVAILABLE(idx)) + { + if (!checked_first) + { + KASSERT(min_available_idx_by_order[order] == idx); + checked_first = 1; + } + available += (1 << order); + order_count++; + KASSERT(BTREE_INDEX_TO_ADDR(idx + 1, order) <= physmap_end()); + } + } + if (!checked_first) + { + KASSERT(min_available_idx_by_order[order] == max); + } + KASSERT(count_available_by_order[order] == order_count); + } + KASSERT(available == page_freecount); +#endif +} + +void page_init() +{ + uintptr_t ram = 0; + uintptr_t memory_available_for_use = 0; + + // detect amount of RAM and memory available for use immediately after + // kernel before any reserved region + + KASSERT(PAGE_ALIGNED(mb_tag) && (uintptr_t)mb_tag == KERNEL_PHYS_END); + + for (struct multiboot_tag *tag = mb_tag + 1; + tag->type != MULTIBOOT_TAG_TYPE_END; tag += TAG_SIZE(tag->size)) + { + if (tag->type != MULTIBOOT_TAG_TYPE_MMAP) + { + continue; + } + struct multiboot_tag_mmap *mmap = (struct multiboot_tag_mmap *)tag; + dbg(DBG_PAGEALLOC, "Physical memory map (%d entries):\n", + mmap->size / mmap->entry_size); + for (unsigned i = 0; i < mmap->size / mmap->entry_size; i++) + { + struct multiboot_mmap_entry *entry = &mmap->entries[i]; + dbgq(DBG_MM, "\t[0x%p-0x%p) (%llu bytes): %s\n", + (void *)entry->addr, (void *)(entry->addr + entry->len), + entry->len, + entry->type < type_count ? 
type_strings[entry->type] + : "Unknown"); + if (entry->type != MULTIBOOT_MEMORY_AVAILABLE) + { + continue; + } + + if (entry->addr < KERNEL_PHYS_END && + entry->addr + entry->len > KERNEL_PHYS_END) + { + memory_available_for_use = + entry->addr + entry->len - KERNEL_PHYS_END; + } + + if (entry->addr + entry->len > ram) + { + ram = entry->addr + entry->len; + } + } + } + + // check we have enough space available following the kernel to map in all + // of RAM detected + max_pages = ram >> PAGE_SHIFT; + max_order = 0; + size_t npages = max_pages; + while (npages) + { + max_order++; + npages >>= 1; + } + + // we may have too much RAM than we can map in with the single memory holy + // after the kernel keep shrinking the maximum order until we find a size + // that fits (this can obviously be done more intelligently, but this also + // works) + size_t btree_size; + size_t metadata_size; + while (max_order) + { + // we need 2^(max_order+1) pages, and one byte maps 8 pages, so we need + // 2^(max_order-2) bytes for the binary tree + btree_size = 1UL << (max_order - 2); + metadata_size = sizeof(uintptr_t) * (max_order + 1) + + sizeof(size_t) * (max_order + 1); + + if (memory_available_for_use >= btree_size + metadata_size) + { + break; + } + if (max_pages == + (ram >> PAGE_SHIFT)) + { // only print first time we shrink + dbg(DBG_PAGEALLOC, + "Warning! Need 0x%p B of memory to map in 0x%p B of RAM, but " + "only have 0x%p available!", + (void *)(btree_size + metadata_size), (void *)ram, + (void *)memory_available_for_use); + } + max_order--; + max_pages = 1UL << max_order; + } + if (max_pages != + (ram >> PAGE_SHIFT)) + { // only print if we shrank available RAM + dbg(DBG_PAGEALLOC, "Supporting only up to 0x%p B of RAM!", + (void *)(max_pages << PAGE_SHIFT)); + } + + btree = (btree_word + *)(KERNEL_PHYS_END + + PAGE_SIZE); // 1 page padding for the multiboot information + memset(btree, 0, btree_size); + + min_available_idx_by_order = (uintptr_t *)((uintptr_t)btree + btree_size); + for (unsigned order = 0; order <= max_order; order++) + { + min_available_idx_by_order[order] = BTREE_ROW_END_INDEX(order); + } + + count_available_by_order = + min_available_idx_by_order + sizeof(uintptr_t) * (max_order + 1); + memset(count_available_by_order, 0, sizeof(size_t) * (max_order + 1)); + + page_freecount = 0; + + uintptr_t reserved_ram_start = KERNEL_PHYS_BASE; + uintptr_t reserved_ram_end = + KERNEL_PHYS_END + PAGE_SIZE + btree_size + metadata_size; + + for (struct multiboot_tag *tag = mb_tag + 1; + tag->type != MULTIBOOT_TAG_TYPE_END; tag += TAG_SIZE(tag->size)) + { + if (tag->type != MULTIBOOT_TAG_TYPE_MMAP) + { + continue; + } + struct multiboot_tag_mmap *mmap = (struct multiboot_tag_mmap *)tag; + for (unsigned i = 0; i < mmap->size / mmap->entry_size; i++) + { + struct multiboot_mmap_entry *entry = &mmap->entries[i]; + if (entry->type != MULTIBOOT_MEMORY_AVAILABLE) + { + continue; + } + uintptr_t addr = entry->addr; + uintptr_t len = entry->len; + + if (addr >= reserved_ram_start && addr < reserved_ram_end) + { + if (len <= reserved_ram_end - addr) + { + continue; + } + len -= reserved_ram_end - addr; + addr = reserved_ram_end; + } + if (addr < reserved_ram_start && addr + len > reserved_ram_start) + { + len = reserved_ram_start - addr; + } + + // TODO [+] see why removing this crashes SMP + if (addr < reserved_ram_start) + { + continue; + } + + page_add_range((void *)addr, (void *)(addr + len)); + } + } + + page_mark_reserved(0); // don't allocate the first page of memory + + size_t bytes = 
page_freecount << PAGE_SHIFT; + size_t gigabytes = (bytes >> 30); + bytes -= (gigabytes << 30); + size_t megabytes = (bytes >> 20); + bytes -= (megabytes << 20); + size_t kilobytes = (bytes >> 10); + bytes -= (kilobytes << 10); + KASSERT(bytes == 0); + + dbg(DBG_PAGEALLOC, + "Amount of physical memory available for use: %lu GB, %lu MB, and %lu " + "KB; [0x%p, 0x%p)\n", + gigabytes, megabytes, kilobytes, physmap_start(), physmap_end()); + _btree_expensive_sanity_check(); +} + +void page_init_finish() +{ + btree = (btree_word *)((uintptr_t)btree + PHYS_OFFSET); + min_available_idx_by_order = + (uintptr_t *)((uintptr_t)min_available_idx_by_order + PHYS_OFFSET); + count_available_by_order = + (uintptr_t *)((uintptr_t)count_available_by_order + PHYS_OFFSET); +} + +static void _btree_update_metadata_after_removal(size_t order, size_t idx) +{ + // [+] TODO Intel-specific optimizations, see BSF, BSR, REPE CMPS/SCAS + if (count_available_by_order[order]) + { + if (idx == min_available_idx_by_order[order]) + { + uintptr_t word_idx = BTREE_WORD_POS(idx); + if (btree[word_idx] && + word_idx == BTREE_WORD_POS(BTREE_ROW_START_INDEX(order))) + { + // mask off bits to the left of BTREE_BIT_POS(idx); i.e. + // consider only positions > than BTREE_BIT_POS(idx) in + // btree[word_idx] when idx is the old index of the first + // available node for the given order. This is to avoid setting + // min available for an order x to be an index that actually + // belongs to order (x + 1) (in the row above). + btree_word copy = + btree[word_idx] & + ((1UL << (BTREE_NUM_BITS - BTREE_BIT_POS(idx))) - 1); + unsigned bit_idx = BTREE_NUM_BITS; + while (copy != 0 && bit_idx > BTREE_BIT_POS(idx)) + { + bit_idx--; + copy = copy >> 1; + } + if (BTREE_IS_AVAILABLE(word_idx * BTREE_NUM_BITS + bit_idx)) + { + min_available_idx_by_order[order] = + word_idx * BTREE_NUM_BITS + bit_idx; + return; + } + word_idx++; + } + while (!btree[word_idx]) + word_idx++; + btree_word copy = btree[word_idx]; + unsigned bit_idx = BTREE_NUM_BITS; + while (copy != 0) + { + bit_idx--; + copy = copy >> 1; + } + uintptr_t min_available = word_idx * BTREE_NUM_BITS + bit_idx; + if (min_available > BTREE_ROW_END_INDEX(order)) + { + min_available = BTREE_ROW_END_INDEX(order); + } + min_available_idx_by_order[order] = min_available; + } + } + else + { + min_available_idx_by_order[order] = BTREE_ROW_END_INDEX(order); + } +} + +static void _btree_mark_available(uintptr_t idx, size_t order) +{ + KASSERT(!BTREE_IS_AVAILABLE(idx)); + BTREE_MARK_AVAILABLE(idx); + + uintptr_t start = BTREE_INDEX_TO_ADDR(idx, order); + uintptr_t end = BTREE_INDEX_TO_ADDR(idx + 1, order); + dbg(DBG_MM, "marking available (0x%p, 0x%p)\n", (void *)start, (void *)end); + KASSERT(!(0xb1000 >= start && 0xb1000 < end)); + + count_available_by_order[order]++; + if (idx < min_available_idx_by_order[order]) + { + min_available_idx_by_order[order] = idx; + } + + while (idx > 0 && BTREE_IS_AVAILABLE(BTREE_SIBLING(idx))) + { + BTREE_MARK_UNAVAILABLE(idx); + BTREE_MARK_UNAVAILABLE(BTREE_SIBLING(idx)); + + count_available_by_order[order] -= 2; + _btree_update_metadata_after_removal(order, BTREE_LEFT_SIBLING(idx)); + + idx = BTREE_PARENT(idx); + order++; + BTREE_MARK_AVAILABLE(idx); + count_available_by_order[order]++; + if (idx < min_available_idx_by_order[order]) + { + min_available_idx_by_order[order] = idx; + } + } +} + +static void _btree_mark_range_available(uintptr_t leaf_idx, size_t npages) +{ + // coult be optimized further so that we don't need to keep traversing fromm + // leaf to max 
order. can instead jump to parent's (right) sibling when + // we are a right child, and by jumping to left child while npages > what + // would be allocated but for now, this works and is fast enough it seems... + // TODO potential optimization + while (npages) + { + uintptr_t idx = leaf_idx; + size_t order = 0; + while (BTREE_IS_LEFT_CHILD(idx) && (2UL << order) <= npages) + { + idx = BTREE_PARENT(idx); + order++; + } + _btree_mark_available(idx, order); + npages -= 1 << order; + leaf_idx += 1 << order; + } +} + +void page_add_range(void *start, void *end) +{ + dbg(DBG_MM, "Page system adding range [0x%p, 0x%p)\n", start, end); + KASSERT(end > start); + if (start == 0) + { + start = PAGE_ALIGN_UP(1); + if (end <= start) + { + return; + } + } + start = PAGE_ALIGN_UP(start); + end = PAGE_ALIGN_DOWN(end); + size_t npages = ((uintptr_t)end - (uintptr_t)start) >> PAGE_SHIFT; + _btree_mark_range_available(BTREE_ADDR_TO_LEAF_INDEX(start), npages); + page_freecount += npages; + _btree_expensive_sanity_check(); +} + +void *page_alloc() { return page_alloc_n(1); } + +void *page_alloc_bounded(void *max_paddr) +{ + return page_alloc_n_bounded(1, max_paddr); +} + +void page_free(void *addr) { page_free_n(addr, 1); } + +static void *_btree_alloc(size_t npages, uintptr_t idx, size_t smallest_order, + size_t actual_order) +{ + while (actual_order != smallest_order) + { + BTREE_MARK_UNAVAILABLE(idx); + count_available_by_order[actual_order]--; + _btree_update_metadata_after_removal(actual_order, idx); + + idx = BTREE_LEFT_CHILD(idx); + BTREE_MARK_AVAILABLE(idx); + BTREE_MARK_AVAILABLE(BTREE_SIBLING(idx)); + actual_order--; + + count_available_by_order[actual_order] += 2; + if (idx < min_available_idx_by_order[actual_order]) + { + min_available_idx_by_order[actual_order] = idx; + } + _btree_expensive_sanity_check(); + } + + // actually allocate the 2^smallest_order pages by marking them unavailable + BTREE_MARK_UNAVAILABLE(idx); + count_available_by_order[actual_order]--; + _btree_update_metadata_after_removal(actual_order, idx); + + uintptr_t allocated_idx = idx; + size_t allocated_order = actual_order; + while (allocated_order-- > 0) + allocated_idx = BTREE_LEFT_CHILD(allocated_idx); + + KASSERT(BTREE_LEAF_INDEX_TO_ADDR(allocated_idx)); + + // we allocated some 2^smallest_order of pages; it is possible they asked + // for fewer than 2^smallest_order pages; make sure we mark as available the + // remaining (2^smallest_order - npages) pages. 
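    // Illustrative example: a request for npages = 3 rounds up to
    // smallest_order = 2, i.e. a 4-page block. After the walk above,
    // allocated_idx names the first of those 4 leaf pages, and the call below
    // hands the single surplus page (allocated_idx + 3) back to the order-0
    // free list, so exactly 3 pages remain allocated.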
+ _btree_mark_range_available(allocated_idx + npages, + (1 << smallest_order) - npages); + + // while (over_allocated > 0 && (1 << reclaimed_order) < over_allocated + // && next_leaf_to_reclaim < max_reclaim_idx) { + // BTREE_MARK_AVAILABLE(idx); + // count_available_by_order[reclaimed_order]++; + // if (idx < min_available_idx_by_order[reclaimed_order]) { + // min_available_idx_by_order[reclaimed_order] = idx; + // } + // over_allocated -= (1 << reclaimed_order); + // next_leaf_to_reclaim += (2 << reclaimed_order); + // idx = BTREE_SIBLING(BTREE_PARENT(idx)); + // reclaimed_order++; + // } + + page_freecount -= npages; + + uintptr_t addr = BTREE_LEAF_INDEX_TO_ADDR(allocated_idx); + dbgq(DBG_MM, "page_alloc_n(%lu): [0x%p, 0x%p)\t\t%lu pages remain\n", + npages, (void *)(PHYS_OFFSET + addr), + (void *)(PHYS_OFFSET + addr + (npages << PAGE_SHIFT)), page_freecount); + _btree_expensive_sanity_check(); + return (void *)(addr + PHYS_OFFSET); +} + +void *page_alloc_n(size_t npages) +{ + return page_alloc_n_bounded(npages, (void *)~0UL); +} + +// this is really only used for setting up initial page tables +// this memory will be immediately overriden, so no need to poison the memory +void *page_alloc_n_bounded(size_t npages, void *max_paddr) +{ + KASSERT(npages > 0 && npages <= (1UL << max_order)); + if (npages > page_freecount) + { + return 0; + } + // a note on max_pages: so long as we never mark a page that is beyond our + // RAM as available, we will never allocate it. So put all those checks at + // the free and map functions + + // find the smallest order that will fit npages + uintptr_t max_page_number = + ((uintptr_t)max_paddr >> PAGE_SHIFT) - npages + 1; + + // [+] TODO intel-specific optimization possible here? + size_t smallest_order = 0; + while ((1UL << smallest_order) < npages) + smallest_order++; + + for (size_t actual_order = smallest_order; actual_order <= max_order; + actual_order++) + { + if (!count_available_by_order[actual_order]) + { + continue; + } + uintptr_t idx = min_available_idx_by_order[actual_order]; + KASSERT(idx >= BTREE_ROW_START_INDEX(actual_order) && + idx < BTREE_ROW_END_INDEX(actual_order)); + if ((idx - BTREE_ROW_START_INDEX(actual_order)) * (1 << actual_order) < + max_page_number) + { + KASSERT((idx - BTREE_ROW_START_INDEX(actual_order)) * + (1 << actual_order) < + max_pages); + + void *ret = _btree_alloc(npages, idx, smallest_order, actual_order); + KASSERT(((uintptr_t)ret + (npages << PAGE_SHIFT)) <= + (uintptr_t)physmap_end()); + return ret; + } + } + return 0; +} + +void page_free_n(void *addr, size_t npages) +{ + dbgq(DBG_MM, "page_free_n(%lu): [0x%p, 0x%p)\t\t%lu pages remain\n", npages, + addr, (void *)((uintptr_t)addr + (npages << PAGE_SHIFT)), + page_freecount); + GDB_CALL_HOOK(page_free, addr, npages); + KASSERT(npages > 0 && npages <= (1UL << max_order) && PAGE_ALIGNED(addr)); + uintptr_t idx = BTREE_ADDR_TO_LEAF_INDEX((uintptr_t)addr - PHYS_OFFSET); + KASSERT(idx + npages - BTREE_LEAF_START_INDEX <= max_pages); + _btree_mark_range_available(idx, npages); + page_freecount += npages; + _btree_expensive_sanity_check(); +} + +void page_mark_reserved(void *paddr) +{ + if ((uintptr_t)paddr > (max_pages << PAGE_SHIFT)) + return; + + dbgq(DBG_MM, "page_mark_reserved(0x%p): [0x%p, 0x%p)\n", + (void *)((uintptr_t)paddr + PHYS_OFFSET), + (void *)((uintptr_t)paddr + PHYS_OFFSET), + (void *)((uintptr_t)paddr + PHYS_OFFSET + PAGE_SIZE)); + + KASSERT(PAGE_ALIGNED(paddr)); + uintptr_t idx = BTREE_ADDR_TO_LEAF_INDEX(paddr); + size_t order = 0; + while (idx && 
!BTREE_IS_AVAILABLE(idx)) + { + idx = BTREE_PARENT(idx); + order++; + } + if (!BTREE_IS_AVAILABLE(idx)) + { + return; // can sometimes be a part of reserved RAM anyway + } + + BTREE_MARK_UNAVAILABLE(idx); + count_available_by_order[order]--; + _btree_update_metadata_after_removal(order, idx); + + uintptr_t unavailable_leaf_idx = BTREE_ADDR_TO_LEAF_INDEX(paddr); + uintptr_t still_available_leaf_idx_start = + BTREE_ADDR_TO_LEAF_INDEX(BTREE_INDEX_TO_ADDR(idx, order)); + uintptr_t still_available_leaf_idx_end = + BTREE_ADDR_TO_LEAF_INDEX(BTREE_INDEX_TO_ADDR(idx + 1, order)); + + _btree_mark_range_available( + still_available_leaf_idx_start, + unavailable_leaf_idx - still_available_leaf_idx_start); + _btree_mark_range_available( + unavailable_leaf_idx + 1, + still_available_leaf_idx_end - unavailable_leaf_idx - 1); + + page_freecount--; + + _btree_expensive_sanity_check(); +} + +size_t page_free_count() { return page_freecount; } diff --git a/kernel/mm/page.py b/kernel/mm/page.py new file mode 100644 index 0000000..9dfedf0 --- /dev/null +++ b/kernel/mm/page.py @@ -0,0 +1,47 @@ +import gdb + +import weenix +import weenix.kmem + + +class PageCommand(weenix.Command): + def __init__(self): + weenix.Command.__init__(self, "page", gdb.COMMAND_DATA, gdb.COMPLETE_NONE) + + def invoke(self, args, tty): + total = 0 + print("pagesize: {0}".format(weenix.kmem.pagesize())) + + names = list() + blobs = list() + pages = list() + bytes = list() + + for order, count in weenix.kmem.freepages().items(): + pcount = count * (1 << order) + bcount = pcount * weenix.kmem.pagesize() + names.append("freepages[{0}]:".format(order)) + blobs.append("{0} blob{1}".format(count, " " if (count == 1) else "s")) + pages.append("{0} page{1}".format(pcount, " " if (pcount == 1) else "s")) + bytes.append("{0} byte{1}".format(bcount, " " if (bcount == 1) else "s")) + total += count * (1 << order) + + names.append("total:") + blobs.append("") + pages.append("{0} page{1}".format(total, " " if (total == 1) else "s")) + bytes.append("{0} bytes".format(total * weenix.kmem.pagesize())) + + namewidth = max(list(map(lambda x: len(x), names))) + blobwidth = max(list(map(lambda x: len(x), blobs))) + pagewidth = max(list(map(lambda x: len(x), pages))) + bytewidth = max(list(map(lambda x: len(x), bytes))) + + for name, blob, page, byte in zip(names, blobs, pages, bytes): + print( + "{1:<{0}} {3:>{2}} {5:>{4}} {7:>{6}}".format( + namewidth, name, blobwidth, blob, pagewidth, page, bytewidth, byte + ) + ) + + +PageCommand() diff --git a/kernel/mm/pagecache.c b/kernel/mm/pagecache.c new file mode 100644 index 0000000..b1763ba --- /dev/null +++ b/kernel/mm/pagecache.c @@ -0,0 +1,23 @@ +#include "errno.h" +#include "globals.h" +#include "kernel.h" +#include "util/debug.h" + +#include "mm/pframe.h" + +long pagecache_get_page(pframe_t *pf) { + if (pf->pf_addr) { + // all set + return 1; + } + //Somehow load the page + KASSERT(0 && "page not in pagecache"); + return 0; +} + +#ifdef NO +void pagecache_newsource(pframe_t pf, blockdev_t *dev, long loc) { + pf->pf_srcdev.pf_dev = dev; + pf->pf_loc = loc; +} +#endif
\ No newline at end of file diff --git a/kernel/mm/pagetable.c b/kernel/mm/pagetable.c new file mode 100644 index 0000000..daf49ef --- /dev/null +++ b/kernel/mm/pagetable.c @@ -0,0 +1,873 @@ +#include "errno.h" +#include "globals.h" +#include "kernel.h" +#include "types.h" + +#include "mm/mm.h" +#include "mm/pframe.h" +#include "mm/mobj.h" + +#include "util/debug.h" +#include "util/string.h" + +#include "vm/pagefault.h" + +typedef enum +{ + UNMAPPED, + PAGE_4KB, + PAGE_2MB, + PAGE_1GB +} vaddr_map_status; + +static pml4_t *global_kernel_only_pml4; + +void pt_set(pml4_t *pml4) +{ + KASSERT((void *)pml4 >= physmap_start()); + uintptr_t phys_addr = pt_virt_to_phys((uintptr_t)pml4); + __asm__ volatile("movq %0, %%cr3" ::"r"(phys_addr) + : "memory"); +} + +/* + * Don't use this for proc_create. You want each new proc to have a copy + * of the current page table (see pt_create). + * + * Returns a pointer to the current pagetable (a virtual address). + */ +inline pml4_t *pt_get() +{ + uintptr_t pml4; + __asm__ volatile("movq %%cr3, %0" + : "=r"(pml4)); + return (pml4_t *)(pml4 + PHYS_OFFSET); +} + +vaddr_map_status _vaddr_status(pml4_t *pml4, uintptr_t vaddr) +{ + uint64_t idx; + pml4_t *table = pml4; + + idx = PML4E(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return UNMAPPED; + } + table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PDP (1GB pages) + idx = PDPE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return UNMAPPED; + } + if (IS_1GB_PAGE(table->phys[idx])) + { + return PAGE_1GB; + } + table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PD (2MB pages) + idx = PDE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return UNMAPPED; + } + if (IS_2MB_PAGE(table->phys[idx])) + { + return PAGE_2MB; + } + table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PT (4KB pages) + idx = PTE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return UNMAPPED; + } + return PAGE_4KB; +} + +uintptr_t pt_virt_to_phys_helper(pml4_t *table, uintptr_t vaddr) +{ + if (vaddr >= (uintptr_t)physmap_start() && + vaddr < (uintptr_t)physmap_end()) + { + return vaddr - PHYS_OFFSET; + } + + uint64_t idx; + + // PML4 + idx = PML4E(vaddr); + KASSERT(IS_PRESENT(table->phys[idx])); + table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PDP (1GB pages) + idx = PDPE(vaddr); + KASSERT(IS_PRESENT(table->phys[idx])); + if (USE_1GB_PAGES && IS_1GB_PAGE(table->phys[idx])) + { + return PAGE_ALIGN_DOWN_1GB(table->phys[idx]) + PAGE_OFFSET_1GB(vaddr); + } + table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PD (2MB pages) + idx = PDE(vaddr); + KASSERT(IS_PRESENT(table->phys[idx])); + if (USE_2MB_PAGES && IS_2MB_PAGE(table->phys[idx])) + { + return PAGE_ALIGN_DOWN_2MB(table->phys[idx]) + PAGE_OFFSET_2MB(vaddr); + } + table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PT (4KB pages) + idx = PTE(vaddr); + + KASSERT(IS_PRESENT(table->phys[idx])); + + return (uintptr_t)PAGE_ALIGN_DOWN(table->phys[idx]) + PAGE_OFFSET(vaddr); +} + +uintptr_t pt_virt_to_phys(uintptr_t vaddr) +{ + if (vaddr >= (uintptr_t)physmap_start() && + vaddr < (uintptr_t)physmap_end()) + { + // if the address is within the PHYS_MAP region, then subtract the + // PHYS_OFFSET to get the physical address. There is a one-to-one mapping + // between virtual and physical addresses in this region. 
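        // For example (illustrative values only): if PHYS_OFFSET were
        // 0xffff800000000000, the physmap virtual address 0xffff800000042000
        // would translate to physical address 0x42000 by this subtraction
        // alone, with no page-table walk.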
+ return vaddr - PHYS_OFFSET; + } + return pt_virt_to_phys_helper(pt_get(), vaddr); +} + +void _fill_pt(pt_t *pt, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax) +{ + for (uintptr_t idx = PTE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax; + idx++, paddr += PAGE_SIZE, vaddr += PAGE_SIZE) + { + pt->phys[idx] = (uintptr_t)paddr | PT_PRESENT | PT_WRITE; + } +} + +long _fill_pd(pd_t *pd, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax, + uintptr_t max_paddr) +{ + for (uintptr_t idx = PDE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax; + idx++, paddr += PT_VADDR_SIZE, vaddr += PT_VADDR_SIZE) + { + KASSERT(!IS_PRESENT(pd->phys[idx])); +#if USE_2MB_PAGES + if (vmax - vaddr >= PT_VADDR_SIZE) + { + pd->phys[idx] = paddr | PT_PRESENT | PT_WRITE | PT_SIZE; + continue; + } +#endif + + uintptr_t pt = (uintptr_t)page_alloc_bounded((void *)max_paddr); + if (!pt) + { + return 1; + } + pt -= PHYS_OFFSET; + + memset((void *)pt, 0, PAGE_SIZE); + pd->phys[idx] = pt | PT_PRESENT | PT_WRITE; + _fill_pt((pt_t *)pt, paddr, vaddr, vmax); + } + return 0; +} + +long _fill_pdp(pdp_t *pdp, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax, + uintptr_t max_paddr) +{ + for (uintptr_t idx = PDPE(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax; + idx++, paddr += PD_VADDR_SIZE, vaddr += PD_VADDR_SIZE) + { + KASSERT(!IS_PRESENT(pdp->phys[idx])); +#if USE_1GB_PAGES + if (vmax - vaddr >= PD_VADDR_SIZE) + { + pdp->phys[idx] = paddr | PT_PRESENT | PT_WRITE | PT_SIZE; + continue; + } +#endif + + uintptr_t pd = (uintptr_t)page_alloc_bounded((void *)max_paddr); + if (!pd) + { + return 1; + } + pd -= PHYS_OFFSET; + + memset((void *)pd, 0, PAGE_SIZE); + pdp->phys[idx] = pd | PT_PRESENT | PT_WRITE; + if (_fill_pd((pd_t *)pd, paddr, vaddr, vmax, max_paddr)) + { + return 1; + } + } + return 0; +} + +long _fill_pml4(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr, uintptr_t vmax, + uintptr_t max_paddr) +{ + for (uintptr_t idx = PML4E(vaddr); idx < PT_ENTRY_COUNT && vaddr < vmax; + idx++, paddr += PDP_VADDR_SIZE, vaddr += PDP_VADDR_SIZE) + { + KASSERT(!IS_PRESENT(pml4->phys[idx])); + + uintptr_t pdp = (uintptr_t)page_alloc_bounded((void *)max_paddr); + if (!pdp) + { + return 1; + } + pdp -= PHYS_OFFSET; + + memset((void *)pdp, 0, PAGE_SIZE); + pml4->phys[idx] = pdp | PT_PRESENT | PT_WRITE; + if (_fill_pdp((pdp_t *)pdp, paddr, vaddr, vmax, max_paddr)) + { + return 1; + } + } + return 0; +} + +long pt_map(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr, uint32_t pdflags, + uint32_t ptflags) +{ + return pt_map_range(pml4, paddr, vaddr, vaddr + PAGE_SIZE, pdflags, + ptflags); +} + +long pt_map_range(pml4_t *pml4, uintptr_t paddr, uintptr_t vaddr, + uintptr_t vmax, uint32_t pdflags, uint32_t ptflags) +{ + dbg(DBG_PGTBL, "[0x%p, 0x%p) mapped to 0x%p; pml4: 0x%p\n", (void *)vaddr, + (void *)vmax, (void *)paddr, pml4); + KASSERT(PAGE_ALIGNED(paddr) && PAGE_ALIGNED(vaddr) && PAGE_ALIGNED(vmax)); + KASSERT(vmax > vaddr && (ptflags & PAGE_MASK) == 0 && + (pdflags & PAGE_MASK) == 0); + KASSERT((pdflags & PT_USER) == (ptflags & PT_USER)); + KASSERT(!(pdflags & PT_SIZE) && !(ptflags & PT_SIZE)); + + while (vaddr < vmax) + { + uint64_t size = vmax - vaddr; + + uint64_t idx = PML4E(vaddr); + pml4_t *table = pml4; + + if (!IS_PRESENT(table->phys[idx])) + { + uintptr_t page = (uintptr_t)page_alloc(); + if (!page) + { + return -ENOMEM; + } + memset((void *)page, 0, PAGE_SIZE); + KASSERT(pt_virt_to_phys(page) == page - PHYS_OFFSET); + KASSERT(*(uintptr_t *)page == 0); + table->phys[idx] = (page - PHYS_OFFSET) | pdflags; + } + else + { + // can't split up if 
control flags don't match, so liberally include + // all of them + table->phys[idx] |= pdflags; + } + table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PDP (1GB pages) + idx = PDPE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { +#if USE_1GB_PAGES + if (PAGE_ALIGNED_1GB(vaddr) && size > PAGE_SIZE_1GB) + { + table->phys[idx] = (uintptr_t)paddr | ptflags | PT_SIZE; + paddr += PAGE_SIZE_1GB; + vaddr += PAGE_SIZE_1GB; + continue; + } +#endif + uintptr_t page = (uintptr_t)page_alloc(); + if (!page) + { + return -ENOMEM; + } + memset((void *)page, 0, PAGE_SIZE); + table->phys[idx] = (page - PHYS_OFFSET) | pdflags; + } + else if (IS_1GB_PAGE(table->phys[idx])) + { + if (PAGE_SAME_1GB(table->phys[idx], paddr) && + PAGE_OFFSET_1GB(paddr) == PAGE_OFFSET_1GB(vaddr) && + PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE == pdflags) + { + vaddr = PAGE_ALIGN_UP_1GB(vaddr + 1); + continue; + } + pd_t *pd = page_alloc(); + if (!pd) + { + return -ENOMEM; + } + for (unsigned i = 0; i < PT_ENTRY_COUNT; i++) + { + pd->phys[i] = + table->phys[idx] + + i * PAGE_SIZE_2MB; // keeps all flags, including PT_SIZE + } + table->phys[idx] = + ((uintptr_t)pd - PHYS_OFFSET) | + pdflags; // overwrite flags as well for particular entry + } + else + { + table->phys[idx] |= pdflags; + } + table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PD (2MB pages) + idx = PDE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { +#if USE_2MB_PAGES + if (PAGE_ALIGNED_2MB(vaddr) && size > PAGE_SIZE_2MB) + { + table->phys[idx] = (uintptr_t)paddr | ptflags | PT_SIZE; + paddr += PAGE_SIZE_2MB; + vaddr += PAGE_SIZE_2MB; + continue; + } +#endif + uintptr_t page = (uintptr_t)page_alloc(); + if (!page) + { + return -ENOMEM; + } + memset((void *)page, 0, PAGE_SIZE); + table->phys[idx] = (page - PHYS_OFFSET) | pdflags; + } + else if (IS_2MB_PAGE(table->phys[idx])) + { + if (PAGE_SAME_2MB(table->phys[idx], paddr) && + PAGE_OFFSET_2MB(paddr) == PAGE_OFFSET_2MB(vaddr) && + PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE == ptflags) + { + vaddr = PAGE_ALIGN_UP_2MB(vaddr + 1); + continue; + } + pt_t *pt = page_alloc(); + if (!pt) + { + return -ENOMEM; + } + for (unsigned i = 0; i < PT_ENTRY_COUNT; i++) + { + pt->phys[i] = table->phys[idx] + i * PAGE_SIZE - + PT_SIZE; // remove PT_SIZE flag + } + table->phys[idx] = + ((uintptr_t)pt - PHYS_OFFSET) | pdflags; // overwrite flags + } + else + { + table->phys[idx] |= pdflags; + } + table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PT (4KB pages) + + idx = PTE(vaddr); + table->phys[idx] = (uintptr_t)paddr | ptflags; + + KASSERT(IS_PRESENT(table->phys[idx])); + + paddr += PAGE_SIZE; + vaddr += PAGE_SIZE; + } + + return 0; +} + +static long _pt_fault_handler(regs_t *regs) +{ + uintptr_t vaddr; + /* Get the address where the fault occurred */ + __asm__ volatile("movq %%cr2, %0" + : "=r"(vaddr)); + uintptr_t cause = regs->r_err; + + /* Check if pagefault was in user space (otherwise, BAD!) 
*/ + if (cause & FAULT_USER) + { + handle_pagefault(vaddr, cause); + } + else + { + dump_registers(regs); + panic("\nKernel page fault at vaddr 0x%p\n", (void *)vaddr); + } + return 0; +} + +void pt_init() +{ + static long inited = 0; + if (!inited) + { + inited = 1; + // allocate a page to set up the new page table structure + // important caveat: we have not mapped in the physmap region, which + // is where the addresses from page_alloc come, so we use the actual + // physical addrses of the page, which we request to be in the + // first 4MB of RAM, as they are identity-mapped by the boot-time + // page tables + uintptr_t max_paddr = (1UL << 22); + pml4_t *pml4 = page_alloc_bounded((void *)max_paddr); + if (!pml4) + panic("ran out of memory in pt_init"); + pml4 = (pml4_t *)((uintptr_t)pml4 - PHYS_OFFSET); + KASSERT((uintptr_t)pml4 < max_paddr); + memset(pml4, 0, PAGE_SIZE); + + // map the kernel in to it's expected virtual memory address + if (_fill_pml4(pml4, KERNEL_PHYS_BASE, KERNEL_VMA + KERNEL_PHYS_BASE, + KERNEL_VMA + KERNEL_PHYS_END, max_paddr)) + panic("ran out of memory in pt_init"); + + // map in physmap + if (_fill_pml4(pml4, 0, (uintptr_t)physmap_start(), + (uintptr_t)physmap_end(), max_paddr)) + panic("ran out of memory in pt_init"); + + page_init_finish(); + + // use the kernel memory address synonym instead of the physical address + // identity map for pml4 make the MMU use the new pml4 + pt_set((pml4_t *)((uintptr_t)pml4 + PHYS_OFFSET)); + global_kernel_only_pml4 = (pml4_t *)((uintptr_t)pml4 + PHYS_OFFSET); + // pt_unmap_range(global_kernel_only_pml4, USER_MEM_LOW, USER_MEM_HIGH); + intr_register(INTR_PAGE_FAULT, _pt_fault_handler); + } + pt_set(global_kernel_only_pml4); +} + +pt_t *clone_pt(pt_t *pt) +{ + pt_t *clone = page_alloc(); + dbg(DBG_PRINT, "cloning pt at 0x%p to 0x%p\n", pt, clone); + if (clone) + { + memcpy(clone, pt, PAGE_SIZE); + } + return clone; +} + +pd_t *clone_pd(pd_t *pd) +{ + pd_t *clone = page_alloc(); + dbg(DBG_PRINT, "cloning pd at 0x%p to 0x%p\n", pd, clone); + if (!clone) + { + return NULL; + } + memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what + // we have allocated + for (unsigned i = 0; i < PT_ENTRY_COUNT; i++) + { + // dbg(DBG_PRINT, "checking pd i = %u\n", i); + if (pd->phys[i]) + { + if (IS_2MB_PAGE(pd->phys[i])) + { + clone->phys[i] = pd->phys[i]; + continue; + } + pt_t *cloned_pt = + clone_pt((pt_t *)((pd->phys[i] & PAGE_MASK) + PHYS_OFFSET)); + if (!cloned_pt) + { + return NULL; + } + clone->phys[i] = (((uintptr_t)cloned_pt) - PHYS_OFFSET) | + PAGE_FLAGS(pd->phys[i]); + } + else + { + clone->phys[i] = 0; + } + } + return clone; +} + +pdp_t *clone_pdp(pdp_t *pdp) +{ + pdp_t *clone = page_alloc(); + dbg(DBG_PRINT, "cloning pdp at 0x%p to 0x%p\n", pdp, clone); + if (!clone) + { + return NULL; + } + memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what + // we have allocated + for (unsigned i = 0; i < PT_ENTRY_COUNT; i++) + { + // dbg(DBG_PRINT, "checking pdp i = %u\n", i); + if (pdp->phys[i]) + { + if (IS_1GB_PAGE(pdp->phys[i])) + { + clone->phys[i] = pdp->phys[i]; + continue; + } + pd_t *cloned_pd = + clone_pd((pd_t *)((pdp->phys[i] & PAGE_MASK) + PHYS_OFFSET)); + if (!cloned_pd) + { + return NULL; + } + clone->phys[i] = (((uintptr_t)cloned_pd) - PHYS_OFFSET) | + PAGE_FLAGS(pdp->phys[i]); + } + else + { + clone->phys[i] = 0; + } + } + return clone; +} + +pml4_t *clone_pml4(pml4_t *pml4, long include_user_mappings) +{ + pml4_t *clone = page_alloc(); + dbg(DBG_PRINT, "cloning pml4 at 0x%p to 
0x%p\n", pml4, clone); + if (!clone) + { + return NULL; + } + memset(clone, 0, PAGE_SIZE); // in case the clone fails, need to know what + // we have allocated + for (uintptr_t i = include_user_mappings ? 0 : PT_ENTRY_COUNT / 2; + i < PT_ENTRY_COUNT; i++) + { + // dbg(DBG_PRINT, "checking pml4 i = %u\n", i); + if (pml4->phys[i]) + { + pdp_t *cloned_pdp = + clone_pdp((pdp_t *)((pml4->phys[i] & PAGE_MASK) + PHYS_OFFSET)); + if (!cloned_pdp) + { + pt_destroy(clone); + return NULL; + } + clone->phys[i] = (((uintptr_t)cloned_pdp) - PHYS_OFFSET) | + PAGE_FLAGS(pml4->phys[i]); + } + else + { + clone->phys[i] = 0; + } + } + return clone; +} + +pml4_t *pt_create() { return clone_pml4(pt_get(), 0); } + +void pt_destroy_helper(pt_t *pt, long depth) +{ + // 4 = pml4, 3 = pdp, 2 = pd, 1 = pt + if (depth != 1) + { + for (uintptr_t i = 0; i < PT_ENTRY_COUNT; i++) + { + if (!pt->phys[i] || (PT_SIZE & pt->phys[i])) + { + continue; + } + KASSERT(IS_PRESENT(pt->phys[i]) && (pt->phys[i] & PAGE_MASK)); + pt_destroy_helper((pt_t *)((pt->phys[i] & PAGE_MASK) + PHYS_OFFSET), + depth - 1); + pt->phys[i] = 0; + } + } + page_free(pt); +} + +void pt_destroy(pml4_t *pml4) { pt_destroy_helper(pml4, 4); } + +void pt_unmap(pml4_t *pml4, uintptr_t vaddr) +{ + pt_unmap_range(pml4, vaddr, vaddr + PAGE_SIZE); +} + +void pt_unmap_range(pml4_t *pml4, uintptr_t vaddr, uintptr_t vmax) +{ + // TODO reclaim pages on-the-fly? + + dbg(DBG_PGTBL, "virt[0x%p, 0x%p); pml4: 0x%p\n", (void *)vaddr, + (void *)vmax, pml4); + KASSERT(PAGE_ALIGNED(vaddr) && PAGE_ALIGNED(vmax) && vmax > vaddr); + + uintptr_t vaddr_start = vaddr; + + while (vaddr < vmax) + { + uint64_t size = vmax - vaddr; + + uint64_t idx = PML4E(vaddr); + pml4_t *table = pml4; + + if (!IS_PRESENT(table->phys[idx])) + { + vaddr = PAGE_ALIGN_UP_512GB(vaddr + 1); + continue; + } + table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PDP (1GB pages) + idx = PDPE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + vaddr = PAGE_ALIGN_UP_1GB(vaddr + 1); + ; + continue; + } + if (IS_1GB_PAGE(table->phys[idx])) + { + if (PAGE_ALIGNED_1GB(vaddr) && size >= PAGE_SIZE_1GB) + { + table->phys[idx] = 0; + vaddr += PAGE_SIZE_1GB; + } + else + { + pd_t *pd = page_alloc(); + if (!pd) + { + panic( + "Ran out of memory during pt_unmap_range; recovery " + "from this situation has not yet been implemented!"); + } + uint64_t unmap_start = PDE(vaddr); + uint64_t unmap_end = + PAGE_SAME_1GB(vaddr, vmax) ? 
PDE(vmax) : 512; + for (unsigned i = 0; i < unmap_start; i++) + { + pd->phys[i] = table->phys[idx] + + i * PAGE_SIZE_2MB; // keeps all flags, + // including PT_SIZE + } + memset(&pd->phys[unmap_start], 0, + sizeof(uint64_t) * (unmap_end - unmap_start)); + vaddr += (unmap_end - unmap_start) * PAGE_SIZE_2MB; + for (uintptr_t i = unmap_end; unmap_end < PT_ENTRY_COUNT; i++) + { + pd->phys[i] = table->phys[idx] + + i * PAGE_SIZE_2MB; // keeps all flags, + // including PT_SIZE + } + table->phys[idx] = ((uintptr_t)pd - PHYS_OFFSET) | + PAGE_CONTROL_FLAGS(table->phys[idx]); + } + continue; + } + table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PD (2MB pages) + idx = PDE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + vaddr = PAGE_ALIGN_UP_2MB(vaddr + 1); + continue; + } + if (IS_2MB_PAGE(table->phys[idx])) + { + if (PAGE_ALIGNED_2MB(vaddr) && size >= PAGE_SIZE_2MB) + { + table->phys[idx] = 0; + vaddr += PAGE_SIZE_2MB; + } + else + { + pt_t *pt = page_alloc(); + if (!pt) + { + panic( + "Ran out of memory during pt_unmap_range; recovery " + "from this situation has not yet been implemented!"); + } + uint64_t unmap_start = PTE(vaddr); + uint64_t unmap_end = + PAGE_SAME_2MB(vaddr, vmax) ? PTE(vmax) : 512; + for (unsigned i = 0; i < unmap_start; i++) + { + pt->phys[i] = table->phys[idx] + i * PAGE_SIZE - + PT_SIZE; // remove PT_SIZE flag + } + memset(&pt->phys[unmap_start], 0, + sizeof(uint64_t) * (unmap_end - unmap_start)); + vaddr += (unmap_end - unmap_start) * PAGE_SIZE; + for (uintptr_t i = unmap_end; unmap_end < PT_ENTRY_COUNT; i++) + { + pt->phys[i] = table->phys[idx] + i * PAGE_SIZE - + PT_SIZE; // remove PT_SIZE flag + } + table->phys[idx] = + ((uintptr_t)pt - PHYS_OFFSET) | + (PAGE_CONTROL_FLAGS(table->phys[idx]) - PT_SIZE); + } + continue; + } + table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PT (4KB pages) + idx = PTE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + vaddr += PAGE_SIZE; + continue; + } + table->phys[idx] = 0; + + vaddr += PAGE_SIZE; + } + KASSERT(_vaddr_status(pml4, vaddr_start) == UNMAPPED); +} + +static char *entry_strings[] = { + "4KB", + "2MB", + "1GB", + "512GB", +}; + +inline long _vaddr_status_detailed(pml4_t *pml4, uintptr_t vaddr) +{ + uintptr_t idx; + pml4_t *table = pml4; + + idx = PML4E(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return -4; + } + table = (pdp_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PDP (1GB pages) + idx = PDPE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return -3; + } + if (IS_1GB_PAGE(table->phys[idx])) + { + return 3; + } + table = (pd_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PD (2MB pages) + idx = PDE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return -2; + } + if (IS_2MB_PAGE(table->phys[idx])) + { + return 2; + } + table = (pt_t *)((table->phys[idx] & PAGE_MASK) + PHYS_OFFSET); + + // PT (4KB pages) + idx = PTE(vaddr); + if (!IS_PRESENT(table->phys[idx])) + { + return -1; + } + return 1; +} + +void check_invalid_mappings(pml4_t *pml4, vmmap_t *vmmap, char *prompt) +{ + // checks that anything that is mapped in pml4 actually should be according + // to vmmap + + uintptr_t vaddr = USER_MEM_LOW; + while (vaddr < USER_MEM_HIGH) + { + long state = _vaddr_status_detailed(pml4, vaddr); + if (state > 0) + { + uintptr_t paddr = pt_virt_to_phys_helper(pml4, vaddr); + + vmarea_t *vma = vmmap_lookup(vmmap, ADDR_TO_PN(vaddr)); + if (!vma) + { + dbg(DBG_PGTBL, + "[+] %s: pml4 0x%p, 0x%p (paddr: 0x%p) cannot be found in " + "vmmap!\n", + prompt, pml4, 
(void *)vaddr, (void *)paddr); + pt_unmap(pml4, vaddr); + } + else + { + pframe_t *pf = NULL; + uintptr_t pagenum = + vma->vma_off + (ADDR_TO_PN(vaddr) - vma->vma_start); + + mobj_lock(vma->vma_obj); + long ret = mobj_get_pframe(vma->vma_obj, pagenum, 0, &pf); + mobj_unlock(vma->vma_obj); + if (ret) + { + dbg(DBG_PGTBL, + "[+] %s: pml4 0x%p, the page frame for virtual address " + "0x%p (mapping to 0x%p) could not be found!\n", + prompt, pml4, (void *)vaddr, (void *)paddr); + pt_unmap(pml4, vaddr); + } + else + { + uintptr_t pf_paddr = + pt_virt_to_phys_helper(pml4, (uintptr_t)pf->pf_addr); + if (pf_paddr != paddr) + { + dbg(DBG_PGTBL, + "[+] %s: pml4 0x%p, 0x%p (paddr: 0x%p) supposed to " + "be 0x%p (obj: 0x%p, %lu)\n", + prompt, pml4, (void *)vaddr, (void *)paddr, + (void *)pf_paddr, vma->vma_obj, pf->pf_pagenum); + pt_unmap(pml4, vaddr); + } + } + } + } + switch (state) + { + case 1: + case -1: + vaddr = (uintptr_t)PAGE_ALIGN_UP(vaddr + 1); + break; + case -2: + vaddr = (uintptr_t)PAGE_ALIGN_UP_2MB(vaddr + 1); + break; + case -3: + vaddr = (uintptr_t)PAGE_ALIGN_UP_1GB(vaddr + 1); + break; + case -4: + vaddr = (uintptr_t)PAGE_ALIGN_UP_512GB(vaddr + 1); + break; + case 2: + case 3: + default: + panic("should not get here!"); + } + } +} diff --git a/kernel/mm/pagetable.gdb b/kernel/mm/pagetable.gdb new file mode 100644 index 0000000..b145804 --- /dev/null +++ b/kernel/mm/pagetable.gdb @@ -0,0 +1,25 @@ +define pagetable + if $argc > 0 + set $proc = proc_lookup($arg0) + if $proc != NULL + printf "Process %i (%s):\n", $proc->p_pid, $proc->p_name + set $pagedir = $proc->p_pml4 + else + printf "No process with PID %i exists\n", $arg0 + set $pagedir = NULL + end + else + printf "Current mappings:\n" + set $pagedir = current_pagedir + end + + if $pagedir != NULL + kinfo pt_mapping_info current_pagedir + end +end +document pagetable +Without arguments displays current page table mappings in the form +"[vstart, vend) => [pstart, pend)". Takes an optional integer argument +to specify the PID of a process whose page table mappings should be +printed instead. +end diff --git a/kernel/mm/pframe.c b/kernel/mm/pframe.c new file mode 100644 index 0000000..6eff123 --- /dev/null +++ b/kernel/mm/pframe.c @@ -0,0 +1,59 @@ +#include "globals.h" + +#include "mm/pframe.h" +#include "mm/slab.h" + +#include "util/debug.h" +#include "util/string.h" + +static slab_allocator_t *pframe_allocator; + +void pframe_init() +{ + pframe_allocator = slab_allocator_create("pframe", sizeof(pframe_t)); + KASSERT(pframe_allocator); +} + +/* + * Create a pframe and initialize its members appropriately. + */ +pframe_t *pframe_create() +{ + pframe_t *pf = slab_obj_alloc(pframe_allocator); + if (!pf) + { + return NULL; + } + memset(pf, 0, sizeof(pframe_t)); + kmutex_init(&pf->pf_mutex); + list_link_init(&pf->pf_link); + return pf; +} + +/* + * Free the pframe (don't forget to unlock the mutex) and set *pfp = NULL + * + * The pframe must be locked, its contents not in memory (pf->pf_addr == NULL), + * have a pincount of 0, and not be linked into a memory object's list. 
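 *
 * A minimal sketch of the expected teardown sequence (mirroring
 * mobj_free_pframe() in mobj.c), assuming pf is locked and already flushed:
 *
 *     page_free(pf->pf_addr);       // release the frame's contents
 *     pf->pf_addr = NULL;
 *     list_remove(&pf->pf_link);    // unlink from the mobj's pframe list
 *     pframe_free(&pf);             // unlocks pf_mutex and sets pf = NULL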
+ */ +void pframe_free(pframe_t **pfp) +{ + KASSERT(kmutex_owns_mutex(&(*pfp)->pf_mutex)); + KASSERT(!(*pfp)->pf_addr); + KASSERT(!(*pfp)->pf_dirty); + KASSERT(!list_link_is_linked(&(*pfp)->pf_link)); + kmutex_unlock(&(*pfp)->pf_mutex); + slab_obj_free(pframe_allocator, *pfp); + *pfp = NULL; +} + +/* + * Unlock the pframe and set *pfp = NULL + */ +void pframe_release(pframe_t **pfp) +{ + pframe_t *pf = *pfp; + KASSERT(kmutex_owns_mutex(&pf->pf_mutex)); + *pfp = NULL; + kmutex_unlock(&pf->pf_mutex); +} diff --git a/kernel/mm/slab.c b/kernel/mm/slab.c new file mode 100644 index 0000000..bec70d1 --- /dev/null +++ b/kernel/mm/slab.c @@ -0,0 +1,550 @@ +// SMP.1 + SMP.3 +// spinlocks + mask interrupts +/* + * slab_alloc.c - Kernel memory allocator + * Jason Lango <jal@cs.brown.edu> + * + * This implementation is based on the description of slab allocation + * (used in Solaris and Linux) from UNIX Internals: The New Frontiers, + * by Uresh Vahalia. + * + * Note that there is no need for locking in allocation and deallocation because + * it never blocks nor is used by an interrupt handler. Hurray for non + * preemptible kernels! + * + * darmanio: ^ lol, look at me now :D + */ + +#include "types.h" + +#include "mm/mm.h" +#include "mm/page.h" +#include "mm/slab.h" + +#include "proc/spinlock.h" + +#include "util/debug.h" +#include "util/gdb.h" +#include "util/string.h" + +#ifdef SLAB_REDZONE +#define front_rz(obj) (*(uintptr_t *)(obj)) +#define rear_rz(cache, obj) \ + (*(uintptr_t *)(((uintptr_t)(obj)) + (cache)->sa_objsize - \ + sizeof(uintptr_t))) + +#define VERIFY_REDZONES(cache, obj) \ + do \ + { \ + if (front_rz(obj) != SLAB_REDZONE) \ + panic("alloc: red-zone check failed: *(0x%p)=0x%.8lx\n", \ + (void *)&front_rz(obj), front_rz(obj)); \ + if (rear_rz(cache, obj) != SLAB_REDZONE) \ + panic("alloc: red-zone check failed: *(0x%p)=0x%.8lx\n", \ + (void *)&rear_rz(cache, obj), rear_rz(cache, obj)); \ + } while (0); + +#endif + +struct slab +{ + struct slab *s_next; /* link on list of slabs */ + size_t s_inuse; /* number of allocated objs */ + void *s_free; /* head of obj free list */ + void *s_addr; /* start address */ +}; + +typedef struct slab_allocator +{ + const char *sa_name; /* user-provided name */ + size_t sa_objsize; /* object size */ + struct slab *sa_slabs; /* head of slab list */ + size_t sa_order; /* npages = (1 << order) */ + size_t sa_slab_nobjs; /* number of objs per slab */ + struct slab_allocator *sa_next; /* link on global list of allocators */ +} slab_allocator_t; + +/* Stored at the end of every object to keep track of the + associated slab when allocated or a pointer to the next free object */ +typedef struct slab_bufctl +{ + union { + void *sb_next; /* next free object */ + struct slab *sb_slab; /* containing slab */ + } u; +#ifdef SLAB_CHECK_FREE + uint8_t sb_free; /* true if is object is free */ +#endif +} slab_bufctl_t; +#define sb_next u.sb_next +#define sb_slab u.sb_slab + +/* Returns a pointer to the start of the bufctl struct */ +#define obj_bufctl(allocator, obj) \ + ((slab_bufctl_t *)(((uintptr_t)(obj)) + (allocator)->sa_objsize)) +/* Given a pointer to bufctrl, returns a pointer to the start of the object */ +#define bufctl_obj(allocator, buf) \ + ((void *)(((uintptr_t)(buf)) - (allocator)->sa_objsize)) +/* Given a pointer to the object, returns a pointer to the next object (after bufctl) */ +#define next_obj(allocator, obj) \ + ((void *)(((uintptr_t)(obj)) + (allocator)->sa_objsize + \ + sizeof(slab_bufctl_t))) + +GDB_DEFINE_HOOK(slab_obj_alloc, void *addr, 
slab_allocator_t *allocator) + +GDB_DEFINE_HOOK(slab_obj_free, void *addr, slab_allocator_t *allocator) + +/* Head of global list of slab allocators. This is used in the python gdb script */ +static slab_allocator_t *slab_allocators = NULL; + +/* Special case - allocator for allocation of slab_allocator objects. */ +static slab_allocator_t slab_allocator_allocator; + +/* + * This constant defines how many orders of magnitude (in page block + * sizes) we'll search for an optimal slab size (past the smallest + * possible slab size). + */ +#define SLAB_MAX_ORDER 5 + +/** + * Given the object size and the number of objects, calculates + * the size of the slab. Each object includes a slab_bufctl_t, + * and each slab includes a slab struct. +*/ +static size_t _slab_size(size_t objsize, size_t nobjs) +{ + return (nobjs * (objsize + sizeof(slab_bufctl_t)) + sizeof(struct slab)); +} + +/** + * Given the object size and the order, calculate how many objects + * can fit in a certain number of pages (excluding the slab struct). + * + * PAGE_SIZE << order effectively is just PAGE_SIZE * 2^order. +*/ +static size_t _slab_nobjs(size_t objsize, size_t order) +{ + return (((PAGE_SIZE << order) - sizeof(struct slab)) / + (objsize + sizeof(slab_bufctl_t))); +} + +static size_t _slab_waste(size_t objsize, size_t order) +{ + /* Waste is defined as the amount of unused space in the page + * block, that is the number of bytes in the page block minus + * the optimal slab size for that particular block size. + */ + return ((PAGE_SIZE << order) - + _slab_size(objsize, _slab_nobjs(objsize, order))); +} + +static void _calc_slab_size(slab_allocator_t *allocator) +{ + size_t best_order; + size_t best_waste; + size_t order; + size_t minorder; + size_t minsize; + size_t waste; + + /* Find the minimum page block size that this slab requires. */ + minsize = _slab_size(allocator->sa_objsize, 1); + for (minorder = 0; minorder < PAGE_NSIZES; minorder++) + { + if ((PAGE_SIZE << minorder) >= minsize) + { + break; + } + } + if (minorder == PAGE_NSIZES) + panic("unable to find minorder\n"); + + /* Start the search with the minimum block size for this slab. */ + best_order = minorder; + best_waste = _slab_waste(allocator->sa_objsize, minorder); + + dbg(DBG_MM, "calc_slab_size: minorder %lu, waste %lu\n", minorder, + best_waste); + + /* Find the optimal number of objects per slab and slab size, + * up to a predefined (somewhat arbitrary) limit on the number + * of pages per slab. + */ + for (order = minorder + 1; order < SLAB_MAX_ORDER; order++) + { + if ((waste = _slab_waste(allocator->sa_objsize, order)) < best_waste) + { + best_waste = waste; + best_order = order; + dbg(DBG_MM, "calc_slab_size: replacing with order %lu, waste %lu\n", + best_order, best_waste); + } + } + + /* Finally, the best page block size wins. + */ + allocator->sa_order = best_order; + allocator->sa_slab_nobjs = _slab_nobjs(allocator->sa_objsize, best_order); + KASSERT(allocator->sa_slab_nobjs); +} + +/* + * Initializes a given allocator using the name and size passed in. +*/ +static void _allocator_init(slab_allocator_t *allocator, const char *name, + size_t size) +{ +#ifdef SLAB_REDZONE + /* + * Add space for the front and rear red-zones. 
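+     *
+     * With red-zones enabled, each slot in a slab is laid out as
+     *
+     *   [ front red-zone | object (the original `size` bytes) | rear red-zone ][ slab_bufctl_t ]
+     *
+     * front_rz()/rear_rz() verify the two guard words on every allocation
+     * and free, and slab_obj_alloc() hands the caller a pointer just past
+     * the front red-zone.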
+ */ + size += 2 * sizeof(uintptr_t); +#endif + + if (!name) + { + name = "<unnamed>"; + } + + allocator->sa_name = name; + allocator->sa_objsize = size; + allocator->sa_slabs = NULL; + // this will set the fields sa_order and the number of objects per slab + _calc_slab_size(allocator); + + /* Add cache to global cache list. */ + allocator->sa_next = slab_allocators; + slab_allocators = allocator; + + dbg(DBG_MM, "Initialized new slab allocator:\n"); + dbgq(DBG_MM, " Name: \"%s\" (0x%p)\n", allocator->sa_name, + allocator); + dbgq(DBG_MM, " Object Size: %lu\n", allocator->sa_objsize); + dbgq(DBG_MM, " Order: %lu\n", allocator->sa_order); + dbgq(DBG_MM, " Slab Capacity: %lu\n", allocator->sa_slab_nobjs); +} + +/* + * Given a name and size of object will create a slab_allocator + * to manage slabs that store objects of size `size`, along with + * some metadata. +*/ +slab_allocator_t *slab_allocator_create(const char *name, size_t size) +{ + slab_allocator_t *allocator; + + allocator = (slab_allocator_t *)slab_obj_alloc(&slab_allocator_allocator); + if (!allocator) + { + return NULL; + } + + _allocator_init(allocator, name, size); + return allocator; +} + +/* + * Free a given allocator. +*/ +void slab_allocator_destroy(slab_allocator_t *allocator) +{ + slab_obj_free(&slab_allocator_allocator, allocator); +} + +/* + * In the event that a slab with free objects is not found, + * this routine will be called. +*/ +static long _slab_allocator_grow(slab_allocator_t *allocator) +{ + void *addr; + void *obj; + struct slab *slab; + + addr = page_alloc_n(1UL << allocator->sa_order); + if (!addr) + { + return 0; + } + + /* Initialize each bufctl to be free and point to the next object. */ + obj = addr; + for (size_t i = 0; i < (allocator->sa_slab_nobjs - 1); i++) + { +#ifdef SLAB_CHECK_FREE + obj_bufctl(allocator, obj)->sb_free = 1; +#endif + obj = obj_bufctl(allocator, obj)->sb_next = next_obj(allocator, obj); + } + + /* The last bufctl is the tail of the list. */ +#ifdef SLAB_CHECK_FREE + obj_bufctl(allocator, obj)->sb_free = 1; +#endif + obj_bufctl(allocator, obj)->sb_next = NULL; + + /* After the last object comes the slab structure itself. */ + slab = (struct slab *)next_obj(allocator, obj); + + /* + * The first object in the slab will be the head of the free + * list and the start address of the slab. + */ + slab->s_free = addr; + slab->s_addr = addr; + slab->s_inuse = 0; + + /* Initialize objects. */ + obj = addr; + for (size_t i = 0; i < allocator->sa_slab_nobjs; i++) + { +#ifdef SLAB_REDZONE + front_rz(obj) = SLAB_REDZONE; + rear_rz(allocator, obj) = SLAB_REDZONE; +#endif + obj = next_obj(allocator, obj); + } + + dbg(DBG_MM, "Growing cache \"%s\" (0x%p), new slab 0x%p (%lu pages)\n", + allocator->sa_name, allocator, slab, 1UL << allocator->sa_order); + + /* Place this slab into the cache. */ + slab->s_next = allocator->sa_slabs; + allocator->sa_slabs = slab; + + return 1; +} + +/* + * Given an allocator, will allocate an object. +*/ +void *slab_obj_alloc(slab_allocator_t *allocator) +{ + struct slab *slab; + void *obj; + + /* Find a slab with a free object. */ + for (;;) + { + slab = allocator->sa_slabs; + while (slab && (slab->s_inuse == allocator->sa_slab_nobjs)) + slab = slab->s_next; + if (slab && (slab->s_inuse < allocator->sa_slab_nobjs)) + { + break; + } + if (!_slab_allocator_grow(allocator)) + { + return NULL; + } + } + + /* + * Remove an object from the slab's free list. We'll use the + * free list pointer to store a pointer back to the containing + * slab. 
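+     *
+     * (The bufctl stored after each object is a union: while the object
+     * sits on the free list it holds sb_next, the link to the next free
+     * object; once the object is handed out it holds sb_slab instead, so
+     * slab_obj_free() can find the owning slab without searching.)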
+ */ + obj = slab->s_free; + slab->s_free = obj_bufctl(allocator, obj)->sb_next; + obj_bufctl(allocator, obj)->sb_slab = slab; +#ifdef SLAB_CHECK_FREE + obj_bufctl(allocator, obj)->sb_free = 0; +#endif + + slab->s_inuse++; + + dbg(DBG_MM, + "Allocated object 0x%p from \"%s\" (0x%p), " + "slab 0x%p, inuse %lu\n", + obj, allocator->sa_name, allocator, allocator, slab->s_inuse); + +#ifdef SLAB_REDZONE + VERIFY_REDZONES(allocator, obj); + + /* + * Make object pointer point past the first red-zone. + */ + obj = (void *)((uintptr_t)obj + sizeof(uintptr_t)); +#endif + + GDB_CALL_HOOK(slab_obj_alloc, obj, allocator); + return obj; +} + +void slab_obj_free(slab_allocator_t *allocator, void *obj) +{ + struct slab *slab; + GDB_CALL_HOOK(slab_obj_free, obj, allocator); + +#ifdef SLAB_REDZONE + /* Move pointer back to verify that the REDZONE is unchanged. */ + obj = (void *)((uintptr_t)obj - sizeof(uintptr_t)); + + VERIFY_REDZONES(allocator, obj); +#endif + +#ifdef SLAB_CHECK_FREE + KASSERT(!obj_bufctl(allocator, obj)->sb_free && "INVALID FREE!"); + obj_bufctl(allocator, obj)->sb_free = 1; +#endif + + slab = obj_bufctl(allocator, obj)->sb_slab; + + /* Place this object back on the slab's free list. */ + obj_bufctl(allocator, obj)->sb_next = slab->s_free; + slab->s_free = obj; + + slab->s_inuse--; + + dbg(DBG_MM, "Freed object 0x%p from \"%s\" (0x%p), slab 0x%p, inuse %lu\n", + obj, allocator->sa_name, allocator, slab, slab->s_inuse); +} + +/* + * Reclaims as much memory (up to a target) from + * unused slabs as possible + * @param target - target number of pages to reclaim. If negative, + * try to reclaim as many pages as possible + * @return number of pages freed + */ +long slab_allocators_reclaim(long target) +{ + panic("slab_allocators_reclaim NYI for SMP"); + // spinlock_lock(&allocator->sa_lock); + // int npages_freed = 0, npages; + + // slab_allocator_t *a; + // struct slab *s, **prev; + + // /* Go through all caches */ + // for (a = slab_allocators; NULL != a; a = a->sa_next) { + // prev = &(a->sa_slabs); + // s = a->sa_slabs; + // while (NULL != s) { + // struct slab *next = s->s_next; + // if (0 == s->s_inuse) { + // /* Free Slab */ + // (*prev) = next; + // npages = 1 << a->sa_order; + + // page_free_n(s->s_addr, npages); + // npages_freed += npages; + // } else { + // prev = &(s->s_next); + // } + // /* Check if target was met */ + // if ((target > 0) && (npages_freed >= target)) { + // return npages_freed; + // } + // s = next; + // } + // } + // spinlock_unlock(&allocator->sa_lock); + // return npages_freed; +} + +#define KMALLOC_SIZE_MIN_ORDER (6) +#define KMALLOC_SIZE_MAX_ORDER (18) + +static slab_allocator_t + *kmalloc_allocators[KMALLOC_SIZE_MAX_ORDER - KMALLOC_SIZE_MIN_ORDER + 1]; + +/* Note that kmalloc_allocator_names should be modified to remain consistent + * with KMALLOC_SIZE_MIN_ORDER ... KMALLOC_SIZE_MAX_ORDER. + */ +static const char *kmalloc_allocator_names[] = { + "size-64", "size-128", "size-256", "size-512", "size-1024", + "size-2048", "size-4096", "size-8192", "size-16384", "size-32768", + "size-65536", "size-131072", "size-262144"}; + +void *kmalloc(size_t size) +{ + size += sizeof(slab_allocator_t *); + + /* + * Find the first power of two bucket bigger than the + * requested size, and allocate from it. 
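+     *
+     * Worked example (assuming an 8-byte pointer, i.e. a 64-bit build):
+     * kmalloc(100) searches with size == 108; order 6 gives 64 < 108 but
+     * order 7 gives 128 >= 108, so the object comes from the "size-128"
+     * allocator. The allocator pointer is stashed at the start of the
+     * object and the caller receives addr + 8, which kfree() later undoes.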
+ */ + slab_allocator_t **cs = kmalloc_allocators; + for (size_t order = KMALLOC_SIZE_MIN_ORDER; order <= KMALLOC_SIZE_MAX_ORDER; + order++, cs++) + { + if ((1UL << order) >= size) + { + void *addr = slab_obj_alloc(*cs); + if (!addr) + { + dbg(DBG_MM, "WARNING: kmalloc out of memory\n"); + return NULL; + } +#ifdef MM_POISON + memset(addr, MM_POISON_ALLOC, size); +#endif /* MM_POISON */ + *((slab_allocator_t **)addr) = *cs; + return (void *)(((slab_allocator_t **)addr) + 1); + } + } + + panic("size bigger than maxorder %ld\n", size); +} + +__attribute__((used)) static void *malloc(size_t size) +{ + /* This function is used by gdb to allocate memory + * within the kernel, no code in the kernel should + * call it. */ + return kmalloc(size); +} + +void kfree(void *addr) +{ + addr = (void *)(((slab_allocator_t **)addr) - 1); + slab_allocator_t *sa = *(slab_allocator_t **)addr; + +#ifdef MM_POISON + /* If poisoning is enabled, wipe the memory given in + * this object, as specified by the cache object size + * (minus red-zone overhead, if any). + */ + size_t objsize = sa->sa_objsize; +#ifdef SLAB_REDZONE + objsize -= sizeof(uintptr_t) * 2; +#endif /* SLAB_REDZONE */ + memset(addr, MM_POISON_FREE, objsize); +#endif /* MM_POISON */ + + slab_obj_free(sa, addr); +} + +__attribute__((used)) static void free(void *addr) +{ + /* This function is used by gdb to free memory allocated + * by malloc, no code in the kernel should call it. */ + kfree(addr); +} + +void slab_init() +{ + /* Special case initialization of the allocator for `slab_allocator_t`s */ + /* In other words, initializes a slab allocator for other slab allocators. */ + _allocator_init(&slab_allocator_allocator, "slab_allocators", + sizeof(slab_allocator_t)); + + /* + * Allocate the power of two buckets for generic + * kmalloc/kfree. 
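+     *
+     * Orders 6 through 18 give 13 allocators ("size-64" ... "size-262144"),
+     * matching kmalloc_allocator_names above; the largest request kmalloc()
+     * can satisfy is therefore 256KB minus the pointer-sized header it
+     * prepends to every object.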
+ */ + slab_allocator_t **cs = kmalloc_allocators; + for (size_t order = KMALLOC_SIZE_MIN_ORDER; order <= KMALLOC_SIZE_MAX_ORDER; + order++, cs++) + { + if (NULL == + (*cs = slab_allocator_create( + kmalloc_allocator_names[order - KMALLOC_SIZE_MIN_ORDER], + (1UL << order)))) + { + panic("Couldn't create kmalloc allocators!\n"); + } + } +} diff --git a/kernel/mm/slab.py b/kernel/mm/slab.py new file mode 100644 index 0000000..1b0c8fb --- /dev/null +++ b/kernel/mm/slab.py @@ -0,0 +1,55 @@ +import gdb + +import weenix +import weenix.kmem + + +class SlabCommand(weenix.Command): + def __init__(self): + weenix.Command.__init__(self, "slab", gdb.COMMAND_DATA) + + def _allocators(self): + l = list() + for alloc in weenix.kmem.allocators(): + l.append(alloc) + return l + + def invoke(self, args, tty): + names = list() + slabs = list() + sizes = list() + counts = list() + + names.append("") + slabs.append("slabs") + sizes.append("objsize") + counts.append("allocated") + + for alloc in weenix.kmem.allocators(): + names.append(alloc.name()) + slabs.append(str(len(list(alloc.slabs())))) + sizes.append(str(alloc.size())) + counts.append(str(len(list(alloc.objs())))) + + namewidth = max(map(lambda x: len(x), names)) + slabwidth = max(map(lambda x: len(x), slabs)) + sizewidth = max(map(lambda x: len(x), sizes)) + countwidth = max(map(lambda x: len(x), counts)) + + for name, slab, size, count in zip(names, slabs, sizes, counts): + print( + "{1:<{0}} {3:>{2}} {5:>{4}} {7:>{6}}".format( + namewidth, name, slabwidth, slab, sizewidth, size, countwidth, count + ) + ) + + def complete(self, line, word): + l = map(lambda x: x.name(), self._allocators()) + l = filter(lambda x: x.startswith(word), l) + for used in line.split(): + l = filter(lambda x: x != used, l) + l.sort() + return l + + +SlabCommand() |
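
The commit above defines the slab API but, outside of pframe.c, does not show a client. As a rough, non-authoritative sketch of the intended usage pattern, a subsystem caching objects of a hypothetical type foo_t might wrap the API like this; foo_t, foo_init(), foo_alloc(), and foo_destroy() are illustrative names, not code from this commit:

    static slab_allocator_t *foo_allocator;

    void foo_init(void)
    {
        /* Create one allocator per object type, typically at boot. */
        foo_allocator = slab_allocator_create("foo", sizeof(foo_t));
        KASSERT(foo_allocator);
    }

    foo_t *foo_alloc(void)
    {
        /* Returns NULL if the underlying page allocator is exhausted. */
        return slab_obj_alloc(foo_allocator);
    }

    void foo_destroy(foo_t *f)
    {
        slab_obj_free(foo_allocator, f);
    }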