diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index b608146..e7912fb 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -1,4 +1,4 @@ -/* Guest bootstrap helpers for elfuse +/* Guest bootstrap helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 @@ -30,7 +30,10 @@ #include "debug/log.h" -#define MAX_BOOT_REGIONS 32 +/* Worst case: 7 fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, mmap + * RW) plus up to ELF_MAX_SEGMENTS for both the executable and the interpreter. + */ +#define MAX_BOOT_REGIONS (8 + 2 * ELF_MAX_SEGMENTS) static bool append_boot_region(mem_region_t *regions, int *nregions, @@ -83,12 +86,12 @@ static void log_initial_page_tables(const guest_t *g, uint64_t ttbr0) } } -static int load_interpreter(guest_t *g, - const char *sysroot, - guest_bootstrap_t *boot) +static bool load_interpreter(guest_t *g, + const char *sysroot, + guest_bootstrap_t *boot) { if (boot->elf_info.interp_path[0] == '\0') - return 0; + return true; elf_resolve_interp(sysroot, boot->elf_info.interp_path, boot->interp_resolved, sizeof(boot->interp_resolved)); @@ -96,20 +99,20 @@ static int load_interpreter(guest_t *g, if (elf_load(boot->interp_resolved, &boot->interp_info) < 0) { log_error("failed to load interpreter: %s", boot->interp_resolved); - return -1; + return false; } if (boot->interp_info.e_machine != EM_AARCH64) { log_error("interpreter has unsupported machine type %u: %s", boot->interp_info.e_machine, boot->interp_resolved); - return -1; + return false; } boot->interp_base = g->interp_base; if (elf_map_segments(&boot->interp_info, boot->interp_resolved, g->host_base, g->guest_size, boot->interp_base) < 0) { log_error("failed to map interpreter segments"); - return -1; + return false; } log_debug( @@ -117,20 +120,27 @@ static int load_interpreter(guest_t *g, (unsigned long long) boot->interp_base, (unsigned long long) (boot->interp_info.entry + boot->interp_base), boot->interp_info.num_segments); - return 0; + return true; } -static int build_boot_regions(mem_region_t *regions, - int *nregions, - guest_t *g, - const guest_bootstrap_t *boot, - size_t shim_bin_len) +static bool build_boot_regions(mem_region_t *regions, + int *nregions, + guest_t *g, + const guest_bootstrap_t *boot, + size_t shim_bin_len) { + /* The vDSO trampolines live in the same 2MiB block as the shim. They must + * appear in the region set so finalize_block_perms validates and grants RX + * to the vDSO page when splitting the block; otherwise vdso_build cannot + * write into it through guest_ptr. 
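+     *
+     * Illustrative sketch under the default constants from guest.h/vdso.h
+     * (an example, not an invariant checked here): VDSO_BASE 0xF000 and
+     * SHIM_BASE 0x100000 share the 2MiB block [0x0, 0x200000), so
+     * finalize_block_perms L3-splits that block; the vDSO page and the shim
+     * pages come back RX while uncovered pages (e.g. the page table pool)
+     * stay invalid.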
+ */ if (!append_boot_region(regions, nregions, SHIM_BASE, SHIM_BASE + shim_bin_len, MEM_PERM_RX) || !append_boot_region(regions, nregions, SHIM_DATA_BASE, - SHIM_DATA_BASE + BLOCK_2MB, MEM_PERM_RW)) { - return -1; + SHIM_DATA_BASE + BLOCK_2MIB, MEM_PERM_RW) || + !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE, + MEM_PERM_RX)) { + return false; } for (int i = 0; i < boot->elf_info.num_segments; i++) { @@ -140,7 +150,7 @@ static int build_boot_regions(mem_region_t *regions, boot->elf_info.segments[i].gpa + boot->elf_info.segments[i].memsz + boot->elf_load_base, elf_pf_to_prot(boot->elf_info.segments[i].flags))) { - return -1; + return false; } } @@ -151,7 +161,7 @@ static int build_boot_regions(mem_region_t *regions, boot->interp_info.segments[i].gpa + boot->interp_info.segments[i].memsz + boot->interp_base, elf_pf_to_prot(boot->interp_info.segments[i].flags))) { - return -1; + return false; } } @@ -163,12 +173,12 @@ static int build_boot_regions(mem_region_t *regions, MMAP_RX_INITIAL_END, MEM_PERM_RX) || !append_boot_region(regions, nregions, MMAP_BASE, MMAP_INITIAL_END, MEM_PERM_RW)) { - return -1; + return false; } g->mmap_rx_end = MMAP_RX_INITIAL_END; g->mmap_end = MMAP_INITIAL_END; - return 0; + return true; } int guest_bootstrap_prepare(guest_t *g, @@ -214,7 +224,7 @@ int guest_bootstrap_prepare(guest_t *g, } *guest_initialized = true; - log_debug("IPA size: %u bits (%lluGB primary)", g->ipa_bits, + log_debug("IPA size: %u bits (%llu GiB primary)", g->ipa_bits, (unsigned long long) (g->guest_size / (1024ULL * 1024 * 1024))); boot->elf_load_base = (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0; @@ -229,15 +239,15 @@ int guest_bootstrap_prepare(guest_t *g, g->brk_base = BRK_BASE_DEFAULT; g->brk_current = g->brk_base; - g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MB) + STACK_SIZE; + g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MIB) + STACK_SIZE; if (g->stack_top < STACK_TOP_DEFAULT) g->stack_top = STACK_TOP_DEFAULT; g->stack_base = g->stack_top - STACK_SIZE; - if (load_interpreter(g, sysroot, boot) < 0) + if (!load_interpreter(g, sysroot, boot)) return -1; - if (shim_bin_len > BLOCK_2MB) { + if (shim_bin_len > BLOCK_2MIB) { log_error("shim binary too large (%zu bytes)", shim_bin_len); return -1; } @@ -252,7 +262,7 @@ int guest_bootstrap_prepare(guest_t *g, boot->interp_base); sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_bin_len); - if (build_boot_regions(regions, &nregions, g, boot, shim_bin_len) < 0) { + if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, MAX_BOOT_REGIONS); return -1; @@ -263,25 +273,12 @@ int guest_bootstrap_prepare(guest_t *g, log_error("failed to build page tables"); return -1; } - - for (int i = 1; i < nregions; i++) { - uint64_t prev_block = (regions[i - 1].gpa_end - 1) & ~(BLOCK_2MB - 1); - uint64_t curr_block = regions[i].gpa_start & ~(BLOCK_2MB - 1); - if (prev_block == curr_block && - regions[i - 1].perms != regions[i].perms && - guest_split_block(g, curr_block) == 0) { - guest_update_perms(g, regions[i - 1].gpa_start, - regions[i - 1].gpa_end, regions[i - 1].perms); - guest_update_perms(g, regions[i].gpa_start, regions[i].gpa_end, - regions[i].perms); - } - } g->need_tlbi = true; guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_bin_len, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MB, + guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, 
LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); @@ -386,7 +383,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, uint64_t shim_ipa = guest_ipa(g, SHIM_BASE); uint64_t entry_ipa = guest_ipa(g, boot->entry_point); uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer); - uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MB); + uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MIB); hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; diff --git a/src/core/bootstrap.h b/src/core/bootstrap.h index 0939f95..e2ce4c4 100644 --- a/src/core/bootstrap.h +++ b/src/core/bootstrap.h @@ -1,11 +1,11 @@ -#pragma once - -/* Guest bootstrap helpers for elfuse +/* Guest bootstrap helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 */ +#pragma once + #include #include #include diff --git a/src/core/elf.c b/src/core/elf.c index c8837a5..316ad7c 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -97,7 +97,7 @@ int elf_load(const char *path, elf_info_t *info) fclose(f); return -1; } - /* Linux kernel caps program headers at 64KB. Reject pathological inputs + /* Linux kernel caps program headers at 64KiB. Reject pathological inputs * before allocating to avoid attacker-controlled large allocations. */ if ((size_t) ehdr.e_phnum * ehdr.e_phentsize > 65536) { diff --git a/src/core/guest.c b/src/core/guest.c index 83c5b16..ea4258e 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -6,11 +6,11 @@ * * Identity-mapped guest memory: GVA == GPA == offset into host_base. * The guest address space size is determined by the VM's configured IPA width - * (capped at 40-bit = 1TB): 64GB for native aarch64 on M2 (36-bit), 1TB for M3+ - * (40-bit). Reserved via mmap(MAP_ANON); macOS demand-pages physical memory on - * first touch, so only used pages consume RAM. The slab is mapped RWX to + * (capped at 40-bit = 1TiB): 64GiB for native aarch64 on M2 (36-bit), 1TiB for + * M3+ (40-bit). Reserved via mmap(MAP_ANON); macOS demand-pages physical memory + * on first touch, so only used pages consume RAM. The slab is mapped RWX to * Hypervisor.framework. The guest's own page tables (built here) enforce - * per-region permissions using 2MB block descriptors, which are mandatory for + * per-region permissions using 2MiB block descriptors, which are mandatory for * transparent misaligned access. Page tables can be extended at runtime via * guest_extend_page_tables(). * @@ -21,12 +21,12 @@ * created on demand when mprotect changes PROT_NONE to an accessible * permission. 
 *
- * Page table format: AArch64 4KB granule, up to 4-level:
- * L0 entry covers 512GB: multiple entries for >512GB address spaces
- * L1 entry covers 1GB: either block or table pointing to L2
- * L2 entry covers 2MB: block descriptors with final permissions
- * L3 entry covers 4KB: optional, created by guest_split_block() for
- *     mixed permissions within a 2MB block (W^X)
+ * Page table format: AArch64 4KiB granule, up to 4-level:
+ * L0 entry covers 512GiB: multiple entries for >512GiB address spaces
+ * L1 entry covers 1GiB: either block or table pointing to L2
+ * L2 entry covers 2MiB: block descriptors with final permissions
+ * L3 entry covers 4KiB: optional, created by guest_split_block() for mixed
+ *     permissions within a 2MiB block (W^X)
 */

#include

@@ -57,11 +57,11 @@ static void guest_region_clear(guest_t *g);
 #define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */
 #define PT_AP_RO (3ULL << 6)     /* AP[2:1]=11: RO at EL1, RO at EL0 */
 
-/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MB lives in core/guest.h. */
+/* PAGE_SIZE / ALIGN_2MIB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */
 #define PAGE_SIZE GUEST_PAGE_SIZE
-#define BLOCK_1GB (1ULL * 1024 * 1024 * 1024)
+#define BLOCK_1GIB (1ULL * 1024 * 1024 * 1024)
 
-/* Mask to extract the physical address from a 2MB L2 block descriptor */
+/* Mask to extract the physical address from a 2MiB L2 block descriptor */
 #define L2_BLOCK_ADDR_MASK 0xFFFFFFE00000ULL
 
 /* Forward declaration (defined in the page table section below) */
@@ -77,7 +77,7 @@ static pthread_mutex_t pt_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 2 */
 
 /* Track whether the 80% warning has been emitted (avoid log spam) */
 static bool pt_pool_warned = false;
 
-/* Allocate a zeroed 4KB page from the page table pool.
+/* Allocate a zeroed 4KiB page from the page table pool.
  * Returns GPA of the page, or 0 on pool exhaustion.
  * Acquires pt_lock internally. Caller typically holds mmap_lock.
  */
@@ -136,8 +136,8 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
     g->mmap_rx_next = MMAP_RX_BASE;
 
     /* Query the maximum IPA size supported by the hardware/kernel. macOS 15+
-     * on Apple Silicon reports 40 bits (1TB). Older versions or fallback
-     * yields 36 bits (64GB).
+     * on Apple Silicon reports 40 bits (1TiB). Older versions, or the
+     * fallback path, yield 36 bits (64GiB).
      */
     uint32_t max_ipa = 0;
     hv_vm_config_get_max_ipa_size(&max_ipa);
@@ -157,7 +157,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
         vm_ipa = 36;
 
     /* Primary buffer size: use the VM's configured IPA width (capped at
-     * 40-bit = 1TB). macOS demand-pages the host reservation, so only touched
+     * 40-bit = 1TiB). macOS demand-pages the host reservation, so only touched
      * pages cost physical memory.
      */
     uint32_t buf_bits = (vm_ipa > 40) ? 40 : vm_ipa;
@@ -168,17 +168,17 @@
     g->ipa_bits = vm_ipa;
 
     /* Compute dynamic layout limits from primary buffer size.
-     * interp_base: last 4GB (dynamic linker load address)
-     * mmap_limit: last 8GB reserved (max mmap RW address)
-     * For 64GB: interp=60GB, mmap_limit=56GB
-     * For 1TB: interp=1020GB, mmap_limit=1016GB
+     * interp_base: last 4GiB (dynamic linker load address)
+     * mmap_limit: last 8GiB reserved (max mmap RW address)
+     * For 64GiB: interp=60GiB, mmap_limit=56GiB
+     * For 1TiB: interp=1020GiB, mmap_limit=1016GiB
      */
     g->interp_base = g->guest_size - 0x100000000ULL;
     g->mmap_limit = g->guest_size - 0x200000000ULL;
 
     /* Reserve primary address space via mmap(MAP_ANON).
macOS demand-pages * this: physical pages are allocated only on first touch, so reserving up - * to 1TB costs nothing until pages are actually used. Do NOT memset + * to 1TiB costs nothing until pages are actually used. Do NOT memset * because that would touch all pages and defeat demand paging. */ g->host_base = @@ -261,14 +261,14 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); if (ret != HV_SUCCESS && buf_bits > max_ipa) { - /* 1TB primary map failed; fall back to hardware-default buffer. + /* 1TiB primary map failed; fall back to hardware-default buffer. * This handles undocumented HVF limits on primary buffer size. * Close shm_fd since the fallback uses anonymous memory (the file is no * longer mapped to host_base, so CoW fork cannot work). */ log_info( - "guest: hv_vm_map %lluGB failed (%d), " - "retrying with %u-bit (%lluGB)", + "guest: hv_vm_map %llu GiB failed (%d), " + "retrying with %u-bit (%llu GiB)", (unsigned long long) (size >> 30), (int) ret, max_ipa, 1ULL << (max_ipa - 30)); munmap(g->host_base, size); @@ -372,7 +372,7 @@ int guest_init_from_shm(guest_t *g, } log_debug( - "guest: CoW fork: mapped %lluGB from shm " + "guest: CoW fork: mapped %llu GiB from shm " "(ipa=%u bits)", (unsigned long long) (size / (1024ULL * 1024 * 1024)), ipa_bits); @@ -416,7 +416,7 @@ typedef struct { /* Per-thread GVA TLB cache. * * Single-entry translation cache: avoids 3-4 pointer chases through the page - * table on repeated accesses to the same 2MB block (or 4KB page if L3-split). + * table on repeated accesses to the same 2MiB block (or 4KiB page if L3-split). * Validated by an atomic generation counter in guest_t that is bumped on every * page table modification. */ @@ -424,7 +424,7 @@ static _Thread_local struct { const guest_t *owner; /* Which guest_t this entry belongs to */ uint64_t base_gva; /* Block/page-aligned GVA */ uint64_t base_gpa; /* Corresponding GPA offset */ - uint64_t size; /* 2MB or 4KB (0 = invalid) */ + uint64_t size; /* 2MiB or 4KiB (0 = invalid) */ int perms; /* Cached permissions */ uint64_t gen; /* guest_t.pt_gen at population time */ } gva_tlb; @@ -452,7 +452,7 @@ static int gva_translate_perm(const guest_t *g, uint64_t base = g->ipa_base; const uint64_t *l0 = pt_at(g, g->ttbr0 - base); - unsigned l0_idx = (unsigned) (gva / (512ULL * BLOCK_1GB)); + unsigned l0_idx = (unsigned) (gva / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) return -1; @@ -460,7 +460,7 @@ static int gva_translate_perm(const guest_t *g, if (l1_ipa < base || l1_ipa - base >= g->guest_size) return -1; const uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((gva / BLOCK_1GB) % 512); + unsigned l1_idx = (unsigned) ((gva / BLOCK_1GIB) % 512); if (!(l1[l1_idx] & PT_VALID)) return -1; @@ -468,12 +468,12 @@ static int gva_translate_perm(const guest_t *g, if (l2_ipa < base || l2_ipa - base >= g->guest_size) return -1; const uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((gva / BLOCK_2MB) % 512); + unsigned l2_idx = (unsigned) ((gva / BLOCK_2MIB) % 512); if (!(l2[l2_idx] & PT_VALID)) return -1; if (l2[l2_idx] & PT_TABLE) { - /* L3 page descriptor: 4KB granularity. */ + /* L3 page descriptor: 4KiB granularity. 
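+         * E.g. (illustrative): gva 0x10003123 selects L3 entry
+         * (0x10003123 / 0x1000) % 512 = 3 and page offset 0x123, leaving
+         * chunk = 0x1000 - 0x123 bytes in the page.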
*/ uint64_t l3_ipa = l2[l2_idx] & 0xFFFFFFFFF000ULL; if (l3_ipa < base || l3_ipa - base >= g->guest_size) return -1; @@ -496,7 +496,7 @@ static int gva_translate_perm(const guest_t *g, out->gpa = gpa; out->chunk = PAGE_SIZE - (gva & (PAGE_SIZE - 1)); - /* Populate TLB cache for this 4KB page */ + /* Populate TLB cache for this 4KiB page */ gva_tlb.owner = g; gva_tlb.base_gva = gva & ~(PAGE_SIZE - 1); gva_tlb.base_gpa = page_ipa - base; @@ -506,7 +506,7 @@ static int gva_translate_perm(const guest_t *g, return 0; } - /* L2 block descriptor: 2MB granularity. */ + /* L2 block descriptor: 2MiB granularity. */ int perms = desc_to_perms(l2[l2_idx]); if ((perms & required_perms) != required_perms) return -1; @@ -514,18 +514,18 @@ static int gva_translate_perm(const guest_t *g, uint64_t block_ipa = l2[l2_idx] & L2_BLOCK_ADDR_MASK; if (block_ipa < base) return -1; - uint64_t gpa = (block_ipa - base) + (gva & (BLOCK_2MB - 1)); + uint64_t gpa = (block_ipa - base) + (gva & (BLOCK_2MIB - 1)); if (gpa >= g->guest_size) return -1; out->gpa = gpa; - out->chunk = BLOCK_2MB - (gva & (BLOCK_2MB - 1)); + out->chunk = BLOCK_2MIB - (gva & (BLOCK_2MIB - 1)); - /* Populate TLB cache for this 2MB block */ + /* Populate TLB cache for this 2MiB block */ gva_tlb.owner = g; - gva_tlb.base_gva = gva & ~(BLOCK_2MB - 1); + gva_tlb.base_gva = gva & ~(BLOCK_2MIB - 1); gva_tlb.base_gpa = block_ipa - base; - gva_tlb.size = BLOCK_2MB; + gva_tlb.size = BLOCK_2MIB; gva_tlb.perms = perms; gva_tlb.gen = gen; return 0; @@ -588,7 +588,7 @@ static void *gva_resolve_perm(const guest_t *g, { /* Always walk page tables to enforce permissions. The guest slab is * identity-mapped (GVA == GPA == offset), but L2 block descriptors carry - * permission bits and L3 page tables have per-4KB permissions after + * permission bits and L3 page tables have per-4KiB permissions after * guest_split_block. Skipping the walk would bypass W^X enforcement for * all normal guest addresses. */ @@ -755,7 +755,7 @@ int guest_read_str_small(const guest_t *g, uint64_t gva, char *dst, size_t max) void guest_reset(guest_t *g) { - /* Zero only actually-used memory regions. With a potentially 1TB address + /* Zero only actually-used memory regions. With a potentially 1TiB address * space, memset of the entire range would fault in all demand-paged memory * for no benefit. PROT_NONE regions (e.g., a managed runtime's heap * reservation) were never written to, so they're already in the MAP_ANON @@ -783,7 +783,7 @@ void guest_reset(guest_t *g) * callers; shim regions are added AFTER reset by the exec path) */ memset((uint8_t *) g->host_base + SHIM_BASE, 0, - SHIM_DATA_BASE + BLOCK_2MB - SHIM_BASE); + SHIM_DATA_BASE + BLOCK_2MIB - SHIM_BASE); /* Reset allocation state */ guest_pt_gen_bump(g); @@ -826,10 +826,10 @@ int guest_get_used_regions(const guest_t *g, n++; } - /* Shim data/stack (full 2MB block) */ + /* Shim data/stack (full 2MiB block) */ if (n < max) { out[n].offset = SHIM_DATA_BASE; - out[n].size = BLOCK_2MB; + out[n].size = BLOCK_2MIB; n++; } @@ -1263,7 +1263,7 @@ static void guest_region_clear(guest_t *g) /* Page table builder. */ -/* Build block descriptor for a 2MB block at the given GPA with perms. */ +/* Build block descriptor for a 2MiB block at the given GPA with perms. 
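+ * E.g. (illustrative, using the PT_AP_* encodings above): blocks without
+ * MEM_PERM_W get AP[2:1] = 11 (PT_AP_RO); writable ones get AP[2:1] = 01
+ * (PT_AP_RW_EL0).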
*/
 static uint64_t make_block_desc(uint64_t gpa, int perms)
 {
     uint64_t desc = (gpa & L2_BLOCK_ADDR_MASK) /* PA bits */
@@ -1289,6 +1289,144 @@ static uint64_t make_block_desc(uint64_t gpa, int perms)
     return desc;
 }
 
+/* Convert mixed-permission and partially-covered 2MiB blocks into L3 4KiB
+ * pages.
+ *
+ * The block-emit loop in guest_build_page_tables uses 2MiB block descriptors
+ * and OR-merges permissions when multiple regions touch the same block. The
+ * merge is correct only when every region in the block agrees on perms AND the
+ * union of those regions covers the entire block; otherwise it leaves
+ * over-permissive PTEs (e.g. .text RX + .data RW + heap RW in one 2MiB block
+ * collapses to RWX) and grants access to gap pages that should fault.
+ *
+ * For each unique 2MiB block touched by the input regions, this pass either
+ * keeps the block descriptor in place (single-perm full coverage) or splits it
+ * into 512 L3 pages, invalidates the lot, and re-validates each region's pages
+ * with the correct perms. Pages with no region coverage stay invalid, matching
+ * Linux semantics for inter-segment gaps in small static binaries.
+ */
+static bool finalize_block_perms(guest_t *g, const mem_region_t *regions, int n)
+{
+    /* Walk every 2MiB block touched by any region. Blocks shared by multiple
+     * regions are processed multiple times; the underlying split / invalidate /
+     * re-validate sequence is idempotent (guest_split_block is a no-op once
+     * the L2 entry is a table descriptor; guest_invalidate_ptes + per-region
+     * guest_update_perms produce the same final L3 state on every pass).
+     * Deduplication would need a fixed-size visited set, an optimization the
+     * modest scale here (~127 blocks for the default brk window) does not
+     * justify.
+     */
+    for (int r = 0; r < n; r++) {
+        uint64_t r_block_lo = ALIGN_2MIB_DOWN(regions[r].gpa_start);
+        uint64_t r_block_hi = ALIGN_2MIB_UP(regions[r].gpa_end);
+
+        for (uint64_t b = r_block_lo; b < r_block_hi; b += BLOCK_2MIB) {
+            /* Walk all regions touching this block. Track perm uniformity and
+             * collect them into idx[] sorted by start so coverage can be
+             * checked with a single sweep.
+             */
+            int idx[GUEST_MAX_REGIONS];
+            int nidx = 0;
+            int first_perm = -1;
+            bool same_perm = true;
+
+            for (int s = 0; s < n; s++) {
+                if (regions[s].gpa_end <= b ||
+                    regions[s].gpa_start >= b + BLOCK_2MIB)
+                    continue;
+                if (first_perm < 0)
+                    first_perm = regions[s].perms;
+                else if (regions[s].perms != first_perm)
+                    same_perm = false;
+
+                int pos = nidx;
+                while (pos > 0 &&
+                       regions[idx[pos - 1]].gpa_start > regions[s].gpa_start) {
+                    idx[pos] = idx[pos - 1];
+                    pos--;
+                }
+                idx[pos] = s;
+                nidx++;
+            }
+
+            /* Coverage sweep: regions are sorted by start, so the union covers
+             * the block iff each region begins at or before the running
+             * high-water mark.
+             */
+            uint64_t covered_until = b;
+            bool full_coverage = true;
+            for (int i = 0; i < nidx; i++) {
+                uint64_t cs = regions[idx[i]].gpa_start;
+                uint64_t ce = regions[idx[i]].gpa_end;
+                if (cs > covered_until) {
+                    full_coverage = false;
+                    break;
+                }
+                if (ce > covered_until)
+                    covered_until = ce;
+            }
+            if (covered_until < b + BLOCK_2MIB)
+                full_coverage = false;
+
+            /* Single perm covering the whole block: the existing 2MiB
+             * descriptor is already correct.
+             */
+            if (same_perm && full_coverage)
+                continue;
+
+            /* Split into L3 pages, invalidate the lot, then rebuild the block
+             * from per-page unions.
This preserves the required permission + * union when adjacent ELF segments share a 4KiB page after + * page-granularity rounding. + */ + if (guest_split_block(g, b) < 0) + return false; + if (guest_invalidate_ptes(g, b, b + BLOCK_2MIB) < 0) + return false; + + int page_perms[BLOCK_2MIB / PAGE_SIZE] = {0}; + for (int i = 0; i < nidx; i++) { + uint64_t s_start = regions[idx[i]].gpa_start; + uint64_t s_end = regions[idx[i]].gpa_end; + uint64_t apply_start = (s_start > b) ? s_start : b; + uint64_t apply_end = + (s_end < b + BLOCK_2MIB) ? s_end : b + BLOCK_2MIB; + /* Page-align to 4KiB so partially covered pages are recreated + * with the union of all overlapping segment permissions. + */ + apply_start = ALIGN_DOWN(apply_start, PAGE_SIZE); + apply_end = PAGE_ALIGN_UP(apply_end); + if (apply_end > b + BLOCK_2MIB) + apply_end = b + BLOCK_2MIB; + + for (uint64_t pa = apply_start; pa < apply_end; + pa += PAGE_SIZE) { + unsigned page_idx = (unsigned) ((pa - b) / PAGE_SIZE); + page_perms[page_idx] |= regions[idx[i]].perms; + } + } + + for (int i = 0; i < (int) ARRAY_SIZE(page_perms);) { + int perms = page_perms[i]; + int run_start = i; + + while (i < (int) ARRAY_SIZE(page_perms) && + page_perms[i] == perms) + i++; + if (!perms) + continue; + + uint64_t run_gpa_start = b + (uint64_t) run_start * PAGE_SIZE; + uint64_t run_gpa_end = b + (uint64_t) i * PAGE_SIZE; + if (guest_update_perms(g, run_gpa_start, run_gpa_end, perms) < + 0) + return false; + } + } + } + + return true; +} + uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) { uint64_t base = g->ipa_base; @@ -1300,20 +1438,20 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t *l0 = pt_at(g, l0_gpa); - /* For each region, determine which 2MB blocks need mapping. + /* For each region, determine which 2MiB blocks need mapping. * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block * descriptor output address are both derived from gpa_start + ipa_base. 
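     * For example (illustrative, with ipa_base 0): gpa 0x240200000, i.e.
     * 9GiB + 2MiB, yields l0_idx 0, l1_idx 9, and l2_idx 1 in the divisions
     * below.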
*/ for (int r = 0; r < n; r++) { - uint64_t gpa_start = ALIGN_2MB_DOWN(regions[r].gpa_start); - uint64_t gpa_end = ALIGN_2MB_UP(regions[r].gpa_end); + uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start); + uint64_t gpa_end = ALIGN_2MIB_UP(regions[r].gpa_end); int perms = regions[r].perms; - for (uint64_t gpa = gpa_start; gpa < gpa_end; gpa += BLOCK_2MB) { + for (uint64_t gpa = gpa_start; gpa < gpa_end; gpa += BLOCK_2MIB) { uint64_t lookup_addr = base + gpa; - /* L0 index: which 512GB slot this VA falls in */ - unsigned l0_idx = (unsigned) (lookup_addr / (512ULL * BLOCK_1GB)); + /* L0 index: which 512GiB slot this VA falls in */ + unsigned l0_idx = (unsigned) (lookup_addr / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512) { log_error("guest: VA 0x%llx out of L0 range", (unsigned long long) lookup_addr); @@ -1330,9 +1468,9 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - /* L1 index within the 512GB L0 entry (from VA) */ + /* L1 index within the 512GiB L0 entry (from VA) */ unsigned l1_idx = - (unsigned) ((lookup_addr % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + (unsigned) ((lookup_addr % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512) { log_error("guest: VA 0x%llx out of L1 range", (unsigned long long) lookup_addr); @@ -1347,19 +1485,19 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) l1[l1_idx] = (base + l2_gpa) | PT_VALID | PT_TABLE; } - /* L2 table for this 1GB region (stored in host at gpa offset) */ + /* L2 table for this 1GiB region (stored in host at gpa offset) */ uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t l2_gpa_off = l2_ipa - base; uint64_t *l2 = pt_at(g, l2_gpa_off); - /* L2 index: which 2MB block within the 1GB region (from VA) */ + /* L2 index: which 2MiB block within the 1GiB region (from VA) */ unsigned l2_idx = - (unsigned) ((lookup_addr % BLOCK_1GB) / BLOCK_2MB); + (unsigned) ((lookup_addr % BLOCK_1GIB) / BLOCK_2MIB); /* If block already mapped, merge permissions (most permissive). * Use a local variable for the merged perms. Do NOT modify the * outer perms variable, which would leak accumulated permissions - * to subsequent 2MB blocks in the same region. + * to subsequent 2MiB blocks in the same region. */ int block_perms = perms; if (l2[l2_idx] & PT_BLOCK) { @@ -1380,11 +1518,18 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) /* Store TTBR0 for later use by guest_extend_page_tables */ uint64_t ttbr0 = base + l0_gpa; g->ttbr0 = ttbr0; + + /* Convert blocks shared by regions with mixed perms or partial coverage + * into L3 4KiB pages so each segment's permissions are honored exactly. + */ + if (!finalize_block_perms(g, regions, n)) + return 0; + guest_pt_gen_bump(g); return ttbr0; } -/* Extend page tables to cover [start, end) with 2MB block descriptors. +/* Extend page tables to cover [start, end) with 2MiB block descriptors. * Walks the existing L0->L1 structure (from g->ttbr0) and allocates new * L2 tables as needed. This is safe to call while the vCPU is paused * (during HVC #5 handling). 
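 * (For example, an anonymous 5MiB mmap placed at MMAP_BASE leads sys_mmap to
 * extend [0x200000000, 0x200600000) with RW blocks; illustrative numbers.)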
Sets g->need_tlbi so the shim flushes the @@ -1401,14 +1546,14 @@ int guest_extend_page_tables(guest_t *g, uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* Walk 2MB blocks in [start, end) */ - uint64_t addr_start = ALIGN_2MB_DOWN(start), addr_end = ALIGN_2MB_UP(end); + /* Walk 2MiB blocks in [start, end) */ + uint64_t addr_start = ALIGN_2MIB_DOWN(start), addr_end = ALIGN_2MIB_UP(end); - for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MB) { + for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MIB) { uint64_t ipa = base + addr; - /* L0 index: which 512GB slot (>512GB addresses need L0[1]+) */ - unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GB)); + /* L0 index: which 512GiB slot (>512GiB addresses need L0[1]+) */ + unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512) { log_error("guest: IPA 0x%llx out of L0 range in extend", (unsigned long long) ipa); @@ -1426,7 +1571,8 @@ int guest_extend_page_tables(guest_t *g, uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + unsigned l1_idx = + (unsigned) ((ipa % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512) { log_error("guest: IPA 0x%llx out of L1 range in extend", (unsigned long long) ipa); @@ -1445,7 +1591,7 @@ int guest_extend_page_tables(guest_t *g, uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GB) / BLOCK_2MB); + unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB); /* Only map if not already mapped */ if (!(l2[l2_idx] & PT_BLOCK)) { @@ -1465,7 +1611,7 @@ int guest_extend_page_tables(guest_t *g, */ #define PT_L3_PAGE (3ULL) -/* Build a 4KB L3 page descriptor with the given permissions. +/* Build a 4KiB L3 page descriptor with the given permissions. * Layout matches block descriptors (AF, SH, NS, MAIR, AP, XN) except * bits[1:0]=11 instead of 01. */ @@ -1506,26 +1652,26 @@ static uint64_t *find_l2_entry(guest_t *g, uint64_t gpa_offset) uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* L0 index from actual IPA (not base), correct for >512GB */ - unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GB)); + /* L0 index from actual IPA (not base), correct for >512GiB */ + unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) return NULL; uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512 || !(l1[l1_idx] & PT_VALID)) return NULL; uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GB) / BLOCK_2MB); + unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB); return &l2[l2_idx]; } -/* Split a 2MB L2 block descriptor into 512 × 4KB L3 page descriptors. +/* Split a 2MiB L2 block descriptor into 512 × 4KiB L3 page descriptors. * The caller provides the L2 entry via find_l2_entry. * Extracts the output IPA from the existing descriptor. 
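 * E.g. (illustrative): a descriptor whose output IPA is 0x40000000 becomes
 * 512 page descriptors covering 0x40000000 + n * 0x1000, n = 0..511, each
 * inheriting the block's permission bits.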
*/ @@ -1549,7 +1695,7 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry) return -1; uint64_t *l3 = pt_at(g, l3_gpa); - /* Fill 512 L3 entries with 4KB page descriptors inheriting the + /* Fill 512 L3 entries with 4KiB page descriptors inheriting the * block's permissions. Extract the output IPA from bits [47:21] * of the existing descriptor (not from the caller's address). */ @@ -1564,7 +1710,7 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry) int guest_split_block(guest_t *g, uint64_t block_gpa) { - uint64_t block_start = ALIGN_2MB_DOWN(block_gpa); + uint64_t block_start = ALIGN_2MIB_DOWN(block_gpa); uint64_t *l2_entry = find_l2_entry(g, block_start); return split_l2_block(g, l2_entry); } @@ -1580,13 +1726,13 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); if (!l2_entry) { - /* No L2 entry (already unmapped); skip this 2MB block */ - addr = ALIGN_2MB_UP(addr + 1); + /* No L2 entry (already unmapped); skip this 2MiB block */ + addr = ALIGN_2MIB_UP(addr + 1); continue; } - uint64_t block_start = ALIGN_2MB_DOWN(addr); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = ALIGN_2MIB_DOWN(addr); + uint64_t block_end = block_start + BLOCK_2MIB; /* Not mapped at all: skip */ if (!(*l2_entry & 1)) { @@ -1594,25 +1740,25 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) continue; } - /* Check if this is a 2MB block or already an L3 table */ + /* Check if this is a 2MiB block or already an L3 table */ if ((*l2_entry & 3) == 1) { - /* 2MB block descriptor */ + /* 2MiB block descriptor */ if (start <= block_start && end >= block_end) { - /* Invalidating the entire 2MB block: clear the L2 entry */ + /* Invalidating the entire 2MiB block: clear the L2 entry */ *l2_entry = 0; g->need_tlbi = true; addr = block_end; continue; } - /* Partial invalidation within a 2MB block: split first, + /* Partial invalidation within a 2MiB block: split first, * then invalidate individual L3 pages below. 
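             * E.g. (illustrative) invalidating [block+0x1000, block+0x3000)
             * clears only L3 entries 1 and 2; the other 510 pages keep their
             * existing descriptors.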
*/ if (guest_split_block(g, block_start) < 0) return -1; } - /* L3 table: invalidate individual 4KB page descriptors */ + /* L3 table: invalidate individual 4KiB page descriptors */ uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; uint64_t *l3 = pt_at(g, l3_ipa - base); @@ -1621,7 +1767,7 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = - (unsigned) (((base + pa) % BLOCK_2MB) / PAGE_SIZE); + (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); l3[l3_idx] = 0; /* Invalid descriptor */ } @@ -1644,13 +1790,13 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); if (!l2_entry) { - /* Skip unmapped 2MB blocks */ - addr = ALIGN_2MB_UP(addr + 1); + /* Skip unmapped 2MiB blocks */ + addr = ALIGN_2MIB_UP(addr + 1); continue; } - uint64_t block_start = ALIGN_2MB_DOWN(addr); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = ALIGN_2MIB_DOWN(addr); + uint64_t block_end = block_start + BLOCK_2MIB; /* Not mapped at all: skip */ if (!(*l2_entry & 1)) { @@ -1658,12 +1804,12 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) continue; } - /* Check if this is a 2MB block or already an L3 table */ + /* Check if this is a 2MiB block or already an L3 table */ if ((*l2_entry & 3) == 1) { - /* 2MB block descriptor */ + /* 2MiB block descriptor */ int old_perms = desc_to_perms(*l2_entry); - /* If the whole 2MB block changes permissions, rewrite the block + /* If the whole 2MiB block changes permissions, rewrite the block * descriptor without splitting. Extract the output IPA from the * existing descriptor, correct for both identity and non-identity * mapped regions. @@ -1678,7 +1824,7 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) continue; } - /* Partial update: split the 2MB block into L3 pages first, then + /* Partial update: split the 2MiB block into L3 pages first, then * fall through to update individual pages below. */ if (old_perms != perms) { @@ -1691,17 +1837,17 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) } } - /* L3 table: update individual 4KB page descriptors */ + /* L3 table: update individual 4KiB page descriptors */ uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; uint64_t *l3 = pt_at(g, l3_ipa - base); - /* Update pages within this 2MB block that fall in [start, end) */ + /* Update pages within this 2MiB block that fall in [start, end) */ uint64_t page_start = (addr > block_start) ? addr : block_start; uint64_t page_end = (end < block_end) ? end : block_end; for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = - (unsigned) (((base + pa) % BLOCK_2MB) / PAGE_SIZE); + (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); /* Extract the existing output IPA from the L3 entry. For * non-identity mapped regions, pa is a VA not a GPA, so the builder * must use the IPA already stored in the descriptor (set by @@ -1745,14 +1891,14 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset) if (!region) return -1; /* Not a noreserve region */ - /* Materialize one 2MB block containing the fault address. This is + /* Materialize one 2MiB block containing the fault address. This is * the smallest granule that guest_extend_page_tables works with. 
* For the common case (sparse heap touch), materializing one block * at a time is the right trade-off: it avoids over-committing the * large reservation while keeping the fault rate manageable. */ - uint64_t block_start = fault_offset & ~(BLOCK_2MB - 1); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = fault_offset & ~(BLOCK_2MIB - 1); + uint64_t block_end = block_start + BLOCK_2MIB; /* Clamp to guest size */ if (block_end > g->guest_size) @@ -1791,9 +1937,9 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset) return -1; /* If this block had no page-table entry before the lazy fault, - * guest_extend_page_tables() necessarily created a full 2MB block. + * guest_extend_page_tables() necessarily created a full 2MiB block. * Split it and remove pages outside this noreserve region so holes and - * guards in the same 2MB block remain faults. Existing split blocks + * guards in the same 2MiB block remain faults. Existing split blocks * already encode neighboring mappings, so leave them intact. */ if (!had_mapping) { diff --git a/src/core/guest.h b/src/core/guest.h index ee99cb1..6e57623 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -6,8 +6,8 @@ * * Provides identity-mapped guest physical memory (GVA == GPA == offset into * host buffer). Buffer size is determined by the VM's configured IPA width: - * - Native aarch64 on M2 (36-bit IPA): 64GB - * - Native aarch64 on M3+ (40-bit IPA): 1TB + * - Native aarch64 on M2 (36-bit IPA): 64GiB + * - Native aarch64 on M3+ (40-bit IPA): 1TiB * * Reserved via mmap(MAP_ANON); macOS demand-pages physical memory on first * touch, so unused pages cost nothing. The slab is mapped RWX to @@ -27,49 +27,49 @@ /* Memory layout constants. * * Guest memory size is determined dynamically from the VM's IPA width - * (36-bit = 64GB on M2, 40-bit = 1TB on M3+). See guest.c for the + * (36-bit = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the * runtime probe that selects the correct size. */ #define PT_POOL_BASE 0x00010000ULL /* Page table pool start */ -#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KB) */ -#define SHIM_BASE 0x00100000ULL /* Shim code (2MB block, RX) */ -#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MB block, RW) */ +#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KiB) */ +#define SHIM_BASE 0x00100000ULL /* Shim code (2MiB block, RX) */ +#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MiB block, RW) */ #define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ -#define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MB) */ -#define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MB) */ +#define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MiB) */ +#define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MiB) */ -/* 8MB stack (four 2MB blocks); unused HVF backing pages consume no RAM. */ +/* 8MiB stack (four 2MiB blocks); unused HVF backing pages consume no RAM. */ #define STACK_SIZE 0x00800000ULL -/* Used when brk_start is below 128MB; otherwise placed above brk. */ +/* Used when brk_start is below 128MiB; otherwise placed above brk. 
*/ #define STACK_TOP_DEFAULT 0x08000000ULL -#define STACK_GUARD_SIZE 0x00001000ULL /* 4KB guard page at bottom of stack */ +#define STACK_GUARD_SIZE 0x00001000ULL /* 4KiB guard at stack bottom */ -/* mmap RX region for PROT_EXEC; placed below 8GB to leave the high mmap +/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap * region clear for runtimes that demand a specific minimum heap address. */ #define MMAP_RX_BASE 0x10000000ULL -/* Initial pre-mapped mmap RX end. Only covers the first 2MB block; +/* Initial pre-mapped mmap RX end. Only covers the first 2MiB block; * additional pages are mapped lazily by guest_extend_page_tables() * when sys_mmap needs more PROT_EXEC space. Reduces startup time * and memory pressure for small binaries that never call mmap. */ -#define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MB */ +#define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MiB */ -/* mmap RW region starts at 8GB to match real Linux address layouts. */ +/* mmap RW region starts at 8GiB to match real Linux address layouts. */ #define MMAP_BASE 0x200000000ULL -/* Initial pre-mapped mmap RW end. Only covers the first 2MB block; +/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; * additional pages are mapped lazily by guest_extend_page_tables(). */ -#define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MB */ +#define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MiB */ /* mmap_limit and interp_base are computed dynamically from guest_size * in main.c and stored in guest_t. */ -#define BLOCK_2MB (2ULL * 1024 * 1024) +#define BLOCK_2MIB (2ULL * 1024 * 1024) /* IPA base: guest memory is mapped at this IPA in the hypervisor. * All guest physical addresses = GUEST_IPA_BASE + offset. @@ -91,8 +91,8 @@ * Identity-mapped: VA == GPA. */ typedef struct { - uint64_t gpa_start; /* Output IPA/GPA (2MB aligned) */ - uint64_t gpa_end; /* Output IPA/GPA end (exclusive, 2MB aligned) */ + uint64_t gpa_start; /* Output IPA/GPA (2MiB aligned) */ + uint64_t gpa_end; /* Output IPA/GPA end (exclusive, 2MiB aligned) */ int perms; /* MEM_PERM_* flags */ } mem_region_t; @@ -261,14 +261,14 @@ int guest_read_str(const guest_t *g, uint64_t gva, char *dst, size_t max); int guest_read_str_small(const guest_t *g, uint64_t gva, char *dst, size_t max); /* Build L0->L1->L2 page tables from an array of memory regions. - * Uses 2MB block descriptors. Returns the TTBR0 value (GPA of L0 table), + * Uses 2MiB block descriptors. Returns the TTBR0 value (GPA of L0 table), * or 0 on failure. */ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n); -/* Extend page tables to cover a new address range [start, end) with 2MB +/* Extend page tables to cover a new address range [start, end) with 2MiB * block descriptors. Reuses the existing L0->L1 table structure and * allocates new L2 tables as needed. Sets g->need_tlbi = true. * Returns 0 on success, -1 on failure. @@ -278,8 +278,8 @@ int guest_extend_page_tables(guest_t *g, uint64_t end, int perms); -/* Split a 2MB block descriptor into 512 x 4KB L3 page descriptors. - * block_gpa must be within a currently-mapped 2MB block. The block's +/* Split a 2MiB block descriptor into 512 x 4KiB L3 page descriptors. + * block_gpa must be within a currently-mapped 2MiB block. The block's * permissions are inherited by all 512 page entries. If the block is * already split (L2 entry is a table descriptor), this is a no-op. * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure. 
@@ -290,16 +290,16 @@ int guest_split_block(guest_t *g, uint64_t block_gpa); * Sets L2 block descriptors and L3 page descriptors to 0 (invalid), * causing translation faults on access. Used when mprotect sets * PROT_NONE; the correct behavior is for the guest to fault. - * If a 2MB block is only partially invalidated, the block is split + * If a 2MiB block is only partially invalidated, the block is split * into L3 pages first (preserving the non-invalidated pages). * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure. */ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end); /* Update page table permissions for the range [start, end). - * If a 2MB block needs mixed permissions (only part of it is being - * updated), the block is automatically split into 4KB L3 pages first. - * If the entire 2MB block is being updated, the block descriptor is + * If a 2MiB block needs mixed permissions (only part of it is being + * updated), the block is automatically split into 4KiB L3 pages first. + * If the entire 2MiB block is being updated, the block descriptor is * modified in place without splitting. * perms is a MEM_PERM_R/W/X combination. Sets g->need_tlbi = true. * Returns 0 on success, -1 on failure. @@ -377,7 +377,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot); /* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. * Called from the data/instruction abort handler when the faulting address - * falls within a noreserve region. Creates page table entries for one 2MB + * falls within a noreserve region. Creates page table entries for one 2MiB * block containing the fault address, zeros the memory, and clears the * noreserve flag for the materialized sub-range. * Returns 0 on success (caller should TLBI and retry), -1 if the offset is not diff --git a/src/core/shim.S b/src/core/shim.S index 3b82e74..fe82f9a 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -169,7 +169,7 @@ _start: eret /* Exception Vector Table - * Must be 2KB (0x800) aligned. Each entry is 128 bytes (0x80). + * Must be 2KiB (0x800) aligned. Each entry is 128 bytes (0x80). * * bad_exception vectors: mov x5, #offset + b bad_exception * X5 carries the vector offset for host-side debugging. diff --git a/src/core/stack.c b/src/core/stack.c index 1ff369d..fb75916 100644 --- a/src/core/stack.c +++ b/src/core/stack.c @@ -161,7 +161,7 @@ uint64_t build_linux_stack(guest_t *g, } /* Bounds-check: Linux returns E2BIG for oversized argument/environment. - * ARG_MAX on Linux is typically 2MB; stack setup caps at reasonable stack + * ARG_MAX on Linux is typically 2MiB; stack setup caps at reasonable stack * limits. */ #define MAX_ARGS 131072 diff --git a/src/core/vdso.c b/src/core/vdso.c index 21078ba..444be88 100644 --- a/src/core/vdso.c +++ b/src/core/vdso.c @@ -23,8 +23,6 @@ #include "core/elf.h" #include "debug/log.h" -#define VDSO_SIZE 0x00001000ULL /* 4KB */ - /* ELF section header (not in core/elf.h). 
*/ typedef struct { @@ -72,7 +70,7 @@ typedef struct { * [3] __kernel_gettimeofday */ -/* Offsets within the 4KB page */ +/* Offsets within the 4KiB page */ #define VDSO_OFF_EHDR 0x000 #define VDSO_OFF_PHDR 0x040 #define VDSO_OFF_PHDR1 0x078 @@ -100,7 +98,7 @@ typedef struct { /* 6 * 16 = 96, 0x1D8 + 96 = 0x238 */ #define VDSO_OFF_SHDR 0x238 -/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KB) */ +/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KiB) */ #define VDSO_NUM_SYMS 4 #define HASH_NCHAIN (VDSO_NUM_SYMS + 1) #define HASH_NBUCKET 1 diff --git a/src/core/vdso.h b/src/core/vdso.h index cb63aa4..e3a41d5 100644 --- a/src/core/vdso.h +++ b/src/core/vdso.h @@ -14,14 +14,15 @@ #include "core/guest.h" -/* Guest address where the vDSO is placed (one 4KB page, below PT pool) */ +/* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */ #define VDSO_BASE 0x0000F000ULL -#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ +#define VDSO_SIZE 0x00001000ULL /* 4KiB */ +#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory. - * The image contains a valid ELF header, one LOAD program header, - * SHT_DYNSYM and SHT_STRTAB sections, and a __kernel_rt_sigreturn - * symbol pointing to a small trampoline (mov x8, #139; svc #0). + * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM + * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to + * a small trampoline (mov x8, #139; svc #0). * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure. */ uint64_t vdso_build(guest_t *g); diff --git a/src/debug/gdbstub.c b/src/debug/gdbstub.c index 242478b..a8f3e5b 100644 --- a/src/debug/gdbstub.c +++ b/src/debug/gdbstub.c @@ -50,7 +50,7 @@ /* Constants. */ -#define GDB_PKT_BUF_SIZE ((size_t) 128 * 1024) /* Max packet size (128KB) */ +#define GDB_PKT_BUF_SIZE ((size_t) 128 * 1024) /* Max packet size (128KiB) */ #define MAX_HW_BREAKPOINTS 16 #define MAX_HW_WATCHPOINTS 16 diff --git a/src/hvutil.h b/src/hvutil.h index 5a687f8..6a211d1 100644 --- a/src/hvutil.h +++ b/src/hvutil.h @@ -63,7 +63,7 @@ (1ULL << 7) /* ITD */) /* TCR_EL1. - * 4KB granule, 48-bit VA, EPD1=1 (TTBR1 walks disabled). + * 4KiB granule, 48-bit VA, EPD1=1 (TTBR1 walks disabled). * Used by main.c (initial setup) and syscall/exec.c (exec re-init). */ #define TCR_EL1_VALUE 0x25B5903510ULL diff --git a/src/main.c b/src/main.c index 60397af..fa7ce52 100644 --- a/src/main.c +++ b/src/main.c @@ -8,7 +8,7 @@ * - A minimal EL1 shim (embedded as shim_blob.h) that provides exception * vectors and forwards SVC #0 (Linux syscalls) to the host via HVC #5. * - All system registers configured from the host before vCPU start. - * - Guest memory identity-mapped at GVA=GPA with 2MB block page tables. + * - Guest memory identity-mapped at GVA=GPA with 2MiB block page tables. * - Syscall handlers that translate Linux syscalls to macOS equivalents. * * Usage: elfuse [--verbose] [--timeout N] [--sysroot PATH] [args...] diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 60cc9de..e01e6a0 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -1020,7 +1020,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, * Siblings may mmap/munmap/mprotect after resume, so the code needs a * stable copy for the IPC send. Heap-allocated because * GUEST_MAX_REGIONS * sizeof(guest_region_t) exceeds safe - * stack limits on worker threads (512KB default). + * stack limits on worker threads (512KiB default). 
*/ int nregions_snapshot = g->nregions; size_t snap_sz = (size_t) nregions_snapshot * sizeof(guest_region_t); diff --git a/src/runtime/proctitle.c b/src/runtime/proctitle.c index 10cbacf..4e296dc 100644 --- a/src/runtime/proctitle.c +++ b/src/runtime/proctitle.c @@ -1,4 +1,4 @@ -/* Process-title helpers for elfuse +/* Process-title helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 diff --git a/src/runtime/proctitle.h b/src/runtime/proctitle.h index d2c55c8..15c39ef 100644 --- a/src/runtime/proctitle.h +++ b/src/runtime/proctitle.h @@ -1,6 +1,6 @@ #pragma once -/* Process-title helpers for elfuse +/* Process-title helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 diff --git a/src/runtime/thread.c b/src/runtime/thread.c index a2c8ab5..aedddce 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -21,7 +21,7 @@ #include "runtime/thread.h" #include "debug/log.h" -#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MB, GUEST_IPA_BASE */ +#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MIB, GUEST_IPA_BASE */ #include "hvutil.h" /* vcpu_get_gpr, vcpu_get_sysreg */ /* From syscall/signal.h, included here directly to avoid pulling in @@ -32,8 +32,8 @@ static void thread_ptrace_init(thread_entry_t *t); -/* Top of the EL1 exception stack region (one 4KB slot per thread) */ -#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MB) +/* Top of the EL1 exception stack region (one 4KiB slot per thread) */ +#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MIB) /* Thread table. */ @@ -61,7 +61,7 @@ static _Atomic int active_thread_count = 0; /* Bitmask tracking allocated SP_EL1 slots. Bit N set = slot N in use. * MAX_THREADS=64 fits exactly in a uint64_t. Slot 0 is the main thread (top of - * shim data region); each subsequent slot is 4KB below. + * shim data region); each subsequent slot is 4KiB below. */ static uint64_t sp_el1_allocated = 0; @@ -272,8 +272,8 @@ uint64_t thread_alloc_sp_el1(void) log_error("thread: SP_EL1 slots exhausted"); } else { int slot = bit_ctz64(free_mask); - /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MB. - * Each subsequent thread is 4KB below. + /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MiB. + * Each subsequent thread is 4KiB below. */ uint64_t top = SP_EL1_TOP; sp = top - (uint64_t) slot * 4096; diff --git a/src/runtime/thread.h b/src/runtime/thread.h index 371433c..4304eaa 100644 --- a/src/runtime/thread.h +++ b/src/runtime/thread.h @@ -9,9 +9,9 @@ * threads are added via clone(CLONE_THREAD). A _Thread_local pointer provides * O(1) access to the current thread's entry from any syscall handler. * - * SP_EL1 allocation: each thread gets a 4KB EL1 exception stack carved from the - * shim data region (SHIM_DATA_BASE + 2MB). Thread 0 (main) gets the top, thread - * N gets offset -(N * 4096). + * SP_EL1 allocation: each thread gets a 4KiB EL1 exception stack carved from + * the shim data region (SHIM_DATA_BASE + 2MiB). Thread 0 (main) gets the top, + * thread N gets offset -(N * 4096). */ #pragma once @@ -156,10 +156,10 @@ int thread_active_count(void); /* Fast path: return non-zero when exactly one guest thread is active. */ int thread_is_single_active(void); -/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KB slot counting +/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KiB slot counting * down from the top of the shim data region. 
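 * (For example, slot 3 resolves to SP_EL1_TOP - 3 * 4096, i.e. 12KiB below
 * the main thread's slot 0.)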
The IPA base (GUEST_IPA_BASE + - * SHIM_DATA_BASE + 2MB) is the main thread's SP_EL1; each subsequent thread - * subtracts 4KB. Returns the IPA, or 0 on failure. + * SHIM_DATA_BASE + 2MiB) is the main thread's SP_EL1; each subsequent thread + * subtracts 4KiB. Returns the IPA, or 0 on failure. */ uint64_t thread_alloc_sp_el1(void); diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 7b856ef..52f109d 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -369,7 +369,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, * Cleanup acquires sfd_lock or inotify_lock, which must NOT be held under * fd_lock (lock ordering: fd_lock(3) < sfd_lock(5a) < inotify_lock(7)). * - * Two passes: count first, then heap-allocate. Avoids placing a ~100KB + * Two passes: count first, then heap-allocate. Avoids placing a ~100KiB * VLA on the stack (FD_TABLE_SIZE * sizeof(fd_entry_t+int)). */ int cloexec_count = 0; @@ -529,20 +529,25 @@ int64_t sys_execve(hv_vcpu_t vcpu, g->brk_current = brk_start; /* Keep exec stack placement consistent with initial process startup. */ - uint64_t stack_top = ALIGN_UP(brk_start, BLOCK_2MB); + uint64_t stack_top = ALIGN_UP(brk_start, BLOCK_2MIB); stack_top += STACK_SIZE; if (stack_top < STACK_TOP_DEFAULT) stack_top = STACK_TOP_DEFAULT; g->stack_top = stack_top; g->stack_base = stack_top - STACK_SIZE; -#define MAX_REGIONS 32 + /* Worst case: 7 fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, + * mmap RW) plus up to ELF_MAX_SEGMENTS for both the executable and the + * interpreter. Sized comfortably to keep the bounds-check loops simple + * after the point of no return. + */ +#define MAX_REGIONS (8 + 2 * ELF_MAX_SEGMENTS) mem_region_t regions[MAX_REGIONS]; int nregions = 0; - /* Fixed regions (shim, brk, stack, mmap areas): 6 entries. - * Bounds-check before each to prevent array overflow. After the point of no - * return, overflow is fatal (exit). + /* Fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, mmap RW): 7 + * entries. Bounds-check before each to prevent array overflow. After the + * point of no return, overflow is fatal (exit). */ /* Keep the shim executable-only; HVF faults on merged RWX mappings. */ @@ -555,14 +560,29 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* EL1 exception handlers use this block for stack and scratch state. */ if (nregions >= MAX_REGIONS) goto too_many_regions; - regions[nregions++] = (mem_region_t) {.gpa_start = SHIM_DATA_BASE, - .gpa_end = SHIM_DATA_BASE + BLOCK_2MB, - .perms = MEM_PERM_RW}; + regions[nregions++] = + (mem_region_t) {.gpa_start = SHIM_DATA_BASE, + .gpa_end = SHIM_DATA_BASE + BLOCK_2MIB, + .perms = MEM_PERM_RW}; + + /* The vDSO sits in the same 2MiB block as the shim. The page-table builder + * splits the block into 4KiB L3 pages when its regions don't fully cover + * it, so the vDSO must appear here to keep the trampoline page valid and + * RX after rebuild. + */ + if (nregions >= MAX_REGIONS) + goto too_many_regions; + regions[nregions++] = (mem_region_t) {.gpa_start = VDSO_BASE, + .gpa_end = VDSO_BASE + VDSO_SIZE, + .perms = MEM_PERM_RX}; - /* Translate ELF p_flags into guest page permissions. */ + /* Translate ELF p_flags into guest page permissions. Silent drops would + * leave the loaded segment unmapped, so treat overflow as fatal (we are + * already past the point of no return). 
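+     * (Typical mapping, assuming the usual elf_pf_to_prot behavior: PF_R|PF_X
+     * yields MEM_PERM_RX for .text and PF_R|PF_W yields MEM_PERM_RW for
+     * .data.)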
+ */ for (int i = 0; i < elf_info.num_segments; i++) { if (nregions >= MAX_REGIONS) - break; + goto too_many_regions; regions[nregions++] = (mem_region_t) { .gpa_start = elf_info.segments[i].gpa + elf_load_base, .gpa_end = elf_info.segments[i].gpa + elf_info.segments[i].memsz + @@ -571,11 +591,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, } /* Interpreter segments use the same permission translation, shifted by - * interp_base. + * interp_base. Same fatal-overflow rule as the executable's segments. */ for (int i = 0; i < interp_info.num_segments; i++) { if (nregions >= MAX_REGIONS) - break; + goto too_many_regions; regions[nregions++] = (mem_region_t) { .gpa_start = interp_info.segments[i].gpa + interp_base, .gpa_end = interp_info.segments[i].gpa + @@ -598,7 +618,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, .perms = MEM_PERM_RW}; /* PROT_EXEC mmap allocations start in a separate RX area to preserve W^X - * with 2MB page-table blocks. + * with 2MiB page-table blocks. */ if (nregions >= MAX_REGIONS) goto too_many_regions; @@ -629,7 +649,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_size, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MB, + guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); for (int i = 0; i < elf_info.num_segments; i++) { diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 8d062d3..283ef7b 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -572,7 +572,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) * macOS layout: {off_t l_start, off_t l_len, pid_t l_pid, * short l_type, short l_whence} * Use guest_read/guest_write (not guest_ptr) to safely handle - * structs that span 2MB page table block boundaries. + * structs that span 2MiB page table block boundaries. */ uint8_t lflock[32]; /* Linux struct flock is 32 bytes on aarch64 */ if (guest_read_small(g, arg, lflock, sizeof(lflock)) < 0) @@ -620,7 +620,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return 0; } case 1024: /* F_GETPIPE_SZ */ - /* macOS does not support pipe size queries; return default 64KB */ + /* macOS does not support pipe size queries; return default 64KiB */ return 65536; case 1031: /* F_SETPIPE_SZ */ /* macOS does not support pipe size setting; pretend success */ @@ -720,7 +720,7 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) /* Temp buffer for dirent serialization. Max dirent64 is 280 bytes * (19-byte header + NAME_MAX=255 + null + padding to 8). Using a * stack buffer avoids guest_ptr boundary issues: guest_write() handles - * 2MB block crossings that raw memcpy into guest_ptr() cannot. + * 2MiB block crossings that raw memcpy into guest_ptr() cannot. */ uint8_t entry_buf[280]; @@ -751,7 +751,7 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) lde.d_type = de->d_type; /* Serialize entry into temp buffer, then copy to guest via - * guest_write() which handles 2MB block boundary crossings. + * guest_write() which handles 2MiB block boundary crossings. 
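+         * (E.g. an entry written 64 bytes before a 2MiB boundary spans pages
+         * whose descriptors may differ after an L3 split; a single
+         * guest_ptr() pointer is only valid up to that boundary, while
+         * guest_write() chunks the copy.)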
diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c
index 3398aa4..cf5205e 100644
--- a/src/syscall/inotify.c
+++ b/src/syscall/inotify.c
@@ -650,7 +650,7 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
         pos += event_size;
     }
 
-    /* Copy event data to a local buffer (max 4KB) */
+    /* Copy event data to a local buffer (max 4KiB) */
     uint8_t local_buf[INOTIFY_BUFSIZE];
     if (copied > 0)
         memcpy(local_buf, inst->event_buf, copied);
diff --git a/src/syscall/io.c b/src/syscall/io.c
index bfb14b1..3b40c03 100644
--- a/src/syscall/io.c
+++ b/src/syscall/io.c
@@ -686,7 +686,7 @@ int64_t sys_pwrite64(guest_t *g,
 }
 
 /* Helper: build host iovec array from guest iovec array.
- * Uses guest_read for the iovec array (may cross 2MB block boundary)
+ * Uses guest_read for the iovec array (may cross 2MiB block boundary)
  * and guest_ptr_avail for each buffer (caps to contiguous bytes).
  * required_perms: MEM_PERM_W for readv (host writes to guest buffers),
  * MEM_PERM_R for writev (host reads from guest buffers).
@@ -808,7 +808,7 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
     if (iovcnt <= 0)
         return -LINUX_EINVAL;
 
     /* Use guest_read for the iov array since guest_ptr alone is unsafe
-     * if the array spans a 2MB block boundary.
+     * if the array spans a 2MiB block boundary.
      */
     linux_iovec_t giov;
     if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0)
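The iovec helper described above makes two distinct kinds of copies: the guest iovec array itself (bounce-buffered, since it may straddle a block) and each data buffer (resolved to host pointers, capped to the contiguous span). A standalone sketch of the capping loop; guest_ptr_avail is modeled here as "bytes left until the 2MiB block boundary", which is an assumption, not the real implementation:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/uio.h>

    #define BLOCK_2MIB (2ULL * 1024 * 1024)

    typedef struct { /* Linux struct iovec layout on aarch64 */
        uint64_t iov_base;
        uint64_t iov_len;
    } linux_iovec_t;

    /* Hypothetical translation: returns a host pointer and how many bytes
     * are contiguous before the current 2MiB block ends. */
    static void *guest_ptr_avail(uint64_t gva, uint64_t *avail)
    {
        static uint8_t mem[4 * 1024 * 1024];
        if (gva >= sizeof(mem))
            return NULL;
        *avail = (gva & ~(BLOCK_2MIB - 1)) + BLOCK_2MIB - gva;
        return &mem[gva];
    }

    /* One guest iovec may become several host iovecs when its buffer
     * crosses a block boundary. Returns host count, or -1 on bad address. */
    static int build_host_iov(const linux_iovec_t *giov, int cnt,
                              struct iovec *hiov, int max)
    {
        int n = 0;
        for (int i = 0; i < cnt; i++) {
            uint64_t gva = giov[i].iov_base, left = giov[i].iov_len;
            while (left > 0 && n < max) {
                uint64_t avail;
                void *p = guest_ptr_avail(gva, &avail);
                if (!p)
                    return -1;
                uint64_t take = left < avail ? left : avail;
                hiov[n++] =
                    (struct iovec) {.iov_base = p, .iov_len = (size_t) take};
                gva += take;
                left -= take;
            }
        }
        return n;
    }

    int main(void)
    {
        linux_iovec_t giov[2] = {{BLOCK_2MIB - 64, 128}, {0x1000, 16}};
        struct iovec hiov[8];
        printf("host iovecs: %d\n", build_host_iov(giov, 2, hiov, 8));
        return 0; /* prints 3: the first entry splits at the boundary */
    }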
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index ad584d6..106b189 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -201,8 +201,8 @@ static int mremap_extend_range(guest_t *g,
     }
 
     int page_perms = prot_to_perms(prot);
-    uint64_t ext_start = ALIGN_DOWN(off, BLOCK_2MB);
-    uint64_t ext_end = ALIGN_UP(off + size, BLOCK_2MB);
+    uint64_t ext_start = ALIGN_DOWN(off, BLOCK_2MIB);
+    uint64_t ext_end = ALIGN_UP(off + size, BLOCK_2MIB);
     if (ext_end > g->guest_size)
         ext_end = g->guest_size;
     if (guest_extend_page_tables(g, ext_start, ext_end, page_perms) < 0)
@@ -237,11 +237,11 @@ int64_t sys_brk(guest_t *g, uint64_t addr)
      * The brk region is initially mapped up to MMAP_RX_BASE; if it grows
      * past that, the mmap allocator needs to extend dynamically.
      */
-    uint64_t brk_pt_end = ALIGN_UP(g->brk_current, BLOCK_2MB);
+    uint64_t brk_pt_end = ALIGN_UP(g->brk_current, BLOCK_2MIB);
     if (brk_pt_end < MMAP_RX_BASE)
         brk_pt_end = MMAP_RX_BASE;
     if (new_off > brk_pt_end) {
-        uint64_t new_end = ALIGN_UP(new_off, BLOCK_2MB);
+        uint64_t new_end = ALIGN_UP(new_off, BLOCK_2MIB);
         if (guest_extend_page_tables(g, brk_pt_end, new_end, MEM_PERM_RW) < 0)
             return (int64_t) ipa_brk;
     }
@@ -426,8 +426,8 @@ int64_t sys_mmap(guest_t *g,
          */
         int page_perms = prot_to_perms(prot);
-        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB);
-        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB);
+        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB);
+        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB);
         if (ext_end > g->guest_size)
             ext_end = g->guest_size;
@@ -446,7 +446,7 @@ int64_t sys_mmap(guest_t *g,
         /* Fine-tune permissions for the exact range. Handles L3
          * splitting when MAP_FIXED overlays different permissions
-         * onto an existing 2MB block (e.g., .data RW over .text RX).
+         * onto an existing 2MiB block (e.g., .data RW over .text RX).
          */
         guest_update_perms(g, result_off, result_off + length, page_perms);
@@ -503,8 +503,8 @@ int64_t sys_mmap(guest_t *g,
     if (!is_fixed) {
         if (needs_exec && !(prot & LINUX_PROT_WRITE)) {
             /* PROT_EXEC without PROT_WRITE: allocate from the RX mmap region.
-             * Apple HVF enforces W^X on 2MB block page table entries, so
-             * executable mappings must be in separate 2MB blocks from writable
+             * Apple HVF enforces W^X on 2MiB block page table entries, so
+             * executable mappings must be in separate 2MiB blocks from writable
              * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute
              * permission.
              */
@@ -512,7 +512,7 @@ int64_t sys_mmap(guest_t *g,
             if (result_off == UINT64_MAX) {
                 log_debug(
                     "mmap: RX address space exhausted "
-                    "(len=0x%llx, limit=0x%llx, %u-bit IPA / %lluGB)",
+                    "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)",
                     (unsigned long long) length,
                     (unsigned long long) g->mmap_limit, g->ipa_bits,
                     (unsigned long long) (g->guest_size >> 30));
@@ -526,7 +526,7 @@ int64_t sys_mmap(guest_t *g,
             /* RW (or PROT_NONE, or PROT_READ): allocate from main mmap region.
              * Honor the address hint if provided and within bounds. Some
              * managed-runtime allocators need the heap at a specific high
-             * address range (e.g., ~264GB for a megablock-style map) and
+             * address range (e.g., ~264GiB for a megablock-style map) and
              * spin-retry if they get a low address instead. On real Linux,
              * mmap tries the hint first and falls back to any suitable address.
              */
@@ -543,7 +543,7 @@ int64_t sys_mmap(guest_t *g,
             if (result_off == UINT64_MAX) {
                 log_debug(
                     "mmap: RW address space exhausted "
-                    "(len=0x%llx, limit=0x%llx, %u-bit IPA / %lluGB)",
+                    "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)",
                     (unsigned long long) length,
                     (unsigned long long) g->mmap_limit, g->ipa_bits,
                     (unsigned long long) (g->guest_size >> 30));
@@ -590,8 +590,8 @@ int64_t sys_mmap(guest_t *g,
      * creating entries for PROT_NONE gaps between allocations.
      */
     if (needs_exec && !(prot & LINUX_PROT_WRITE)) {
-        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB);
-        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB);
+        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB);
+        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB);
         if (ext_end > g->mmap_limit)
             ext_end = g->mmap_limit;
         if (guest_extend_page_tables(g, ext_start, ext_end, MEM_PERM_RX) <
@@ -608,8 +608,8 @@ int64_t sys_mmap(guest_t *g,
         if (ext_end > g->mmap_rx_end)
             g->mmap_rx_end = ext_end;
     } else {
-        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB);
-        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB);
+        uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB);
+        uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB);
         if (ext_end > g->mmap_limit)
             ext_end = g->mmap_limit;
 
         /* Preserve execute permission for RWX requests. Stage-2
@@ -1133,7 +1133,7 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length)
     if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE)
         return -LINUX_EINVAL;
 
-    /* Invalidate PTEs first. This may need to split a 2MB block
+    /* Invalidate PTEs first. This may need to split a 2MiB block
      * which can fail if the page table pool is exhausted. Failing
      * before region removal keeps metadata consistent.
      */
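Every extension site in mem.c rounds the byte range out to whole 2MiB blocks and clamps against the guest limit before touching page tables. The arithmetic in isolation (ALIGN_DOWN as in src/utils.h; this ALIGN_UP is one standard way to write it, values chosen arbitrarily):

    #include <assert.h>
    #include <stdint.h>

    #define BLOCK_2MIB (2ULL * 1024 * 1024)
    #define ALIGN_DOWN(x, a) ((uint64_t) (x) & ~((uint64_t) (a) - 1))
    #define ALIGN_UP(x, a) ALIGN_DOWN((uint64_t) (x) + (a) - 1, a)

    int main(void)
    {
        uint64_t guest_size = 32ULL << 30; /* e.g. a 32GiB IPA window */
        uint64_t off = 0x40123000, len = 0x5000;

        uint64_t ext_start = ALIGN_DOWN(off, BLOCK_2MIB);
        uint64_t ext_end = ALIGN_UP(off + len, BLOCK_2MIB);
        if (ext_end > guest_size) /* clamp, as sys_mmap/sys_brk do */
            ext_end = guest_size;

        assert(ext_start == 0x40000000 && ext_end == 0x40200000);
        return 0;
    }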
diff --git a/src/syscall/net-absock.h b/src/syscall/net-absock.h
index 6aec06d..4e5eae8 100644
--- a/src/syscall/net-absock.h
+++ b/src/syscall/net-absock.h
@@ -1,11 +1,11 @@
-#pragma once
-
-/* Abstract AF_UNIX emulation helpers for elfuse
+/* Abstract AF_UNIX emulation helpers
  *
  * Copyright 2026 elfuse contributors
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#pragma once
+
 #include
 #include
diff --git a/src/syscall/proc-state.h b/src/syscall/proc-state.h
index c48c246..77adeca 100644
--- a/src/syscall/proc-state.h
+++ b/src/syscall/proc-state.h
@@ -1,9 +1,9 @@
-#pragma once
-
-/* Process metadata state helpers for elfuse
+/* Process metadata state helpers
  *
  * Copyright 2026 elfuse contributors
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#pragma once
+
 void proc_state_init(void);
diff --git a/src/syscall/proc.c b/src/syscall/proc.c
index 9c0dc62..60adda1 100644
--- a/src/syscall/proc.c
+++ b/src/syscall/proc.c
@@ -1360,12 +1360,13 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
                  * EC=0x24 data abort) and forwards the faulting address
                  * here.
                  *
-                 * Toggling at 2MB granularity causes thrashing when the
+                 * Toggling at 2MiB granularity causes thrashing when the
                  * JIT writes new code and executes existing code within
-                 * the same 2MB block. Instead, the code splits the 2MB
-                 * block into 4KB L3 pages and toggle only the faulting 4KB
-                 * page. This allows different pages within a 2MB block to
-                 * have independent RW/RX permissions simultaneously.
+                 * the same 2MiB block. Instead, the code splits the 2MiB
+                 * block into 4KiB L3 pages and toggles only the faulting
+                 * 4KiB page. This allows different pages within a 2MiB
+                 * block to have independent RW/RX permissions
+                 * simultaneously.
                  *
                  * x0 = FAR_EL1 (faulting virtual address)
                  * x1 = type: 0 = exec fault -> flip to RX
@@ -1421,7 +1422,7 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
                 prefix, (unsigned long long) far,
                 (type == 0) ? "RX" : "RW", (unsigned long long) page_start);
 
-            uint64_t block_start = far & ~(BLOCK_2MB - 1);
+            uint64_t block_start = far & ~(BLOCK_2MIB - 1);
             int sr = guest_split_block(g, block_start);
             int ur = guest_update_perms(g, page_start, page_end, new_perms);
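The fault-toggling path above reduces to a little address arithmetic plus two guest calls. A sketch mirroring the call sites in proc.c; guest_t and the two prototypes are assumed from context, and the MEM_PERM_* values are placeholders:

    #include <stdint.h>

    #define PAGE_4KIB 4096ULL
    #define BLOCK_2MIB (2ULL * 1024 * 1024)
    #define MEM_PERM_RX 1 /* placeholder values; real ones live in guest.h */
    #define MEM_PERM_RW 2

    typedef struct guest guest_t;
    int guest_split_block(guest_t *g, uint64_t block_gpa);
    int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms);

    /* type: 0 = exec fault -> flip to RX, 1 = write fault -> flip to RW */
    static int jit_toggle_page(guest_t *g, uint64_t far, int type)
    {
        uint64_t page_start = far & ~(PAGE_4KIB - 1);
        uint64_t page_end = page_start + PAGE_4KIB;
        uint64_t block_start = far & ~(BLOCK_2MIB - 1);

        /* Split first so only the faulting 4KiB page changes; its siblings
         * in the 2MiB block keep their current RW/RX permissions. */
        if (guest_split_block(g, block_start) < 0)
            return -1;
        return guest_update_perms(g, page_start, page_end,
                                  type == 0 ? MEM_PERM_RX : MEM_PERM_RW);
    }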
diff --git a/src/syscall/sys.c b/src/syscall/sys.c
index 65c26ad..4d540f4 100644
--- a/src/syscall/sys.c
+++ b/src/syscall/sys.c
@@ -88,7 +88,7 @@ static void sysinfo_init_cached_host_state(void)
     size_t ms_len = sizeof(memsize);
     int mib_mem[2] = {CTL_HW, HW_MEMSIZE};
     if (sysctl(mib_mem, 2, &memsize, &ms_len, NULL, 0) == 0) {
-        const uint64_t vm_ram_cap = 4094595072ULL; /* Match Lima VZ 4GB VM */
+        const uint64_t vm_ram_cap = 4094595072ULL; /* Match Lima VZ 4GiB VM */
         cached_real_memsize = memsize;
         cached_totalram = (memsize > vm_ram_cap) ? vm_ram_cap : memsize;
     }
@@ -367,8 +367,8 @@ static linux_rlimit64_t translate_host_rlimit(int resource, struct rlimit rl)
     lim.rlim_cur = (rl.rlim_cur == RLIM_INFINITY) ? UINT64_MAX : rl.rlim_cur;
     lim.rlim_max = (rl.rlim_max == RLIM_INFINITY) ? UINT64_MAX : rl.rlim_max;
 
-    /* macOS returns ~8MB-16KB for the default stack; round to Linux's
-     * conventional 8MB to keep guest userspace behavior stable.
+    /* macOS returns 8MiB minus 16KiB for the default stack; round to Linux's
+     * conventional 8MiB to keep guest userspace behavior stable.
      */
     if (resource == 3 /* RLIMIT_STACK */ && lim.rlim_cur > 0 &&
         lim.rlim_cur < 8388608) {
diff --git a/src/utils.h b/src/utils.h
index 153b94a..efe55c5 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -30,18 +30,18 @@
 /* Align x down to the previous multiple of a; a must be a power of two.
  */
 #define ALIGN_DOWN(x, a) ((uint64_t) (x) & ~((uint64_t) (a) - 1))
 
-/* The Linux ABI fixes the page size at 4KB on aarch64 regardless of the host
+/* The Linux ABI fixes the page size at 4KiB on aarch64 regardless of the host
  * page size, so this is shared by every guest memory path (mmap, brk,
  * mprotect, ELF loading).
  */
 #define GUEST_PAGE_SIZE 4096ULL
 #define PAGE_ALIGN_UP(x) ALIGN_UP(x, GUEST_PAGE_SIZE)
 
-/* 2MB block alignment shared by region setup, page table walking, and stack
- * placement. BLOCK_2MB itself is defined in core/guest.h.
+/* 2MiB block alignment shared by region setup, page table walking, and stack
+ * placement. BLOCK_2MIB itself is defined in core/guest.h.
  */
-#define ALIGN_2MB_DOWN(x) ALIGN_DOWN(x, 2ULL * 1024 * 1024)
-#define ALIGN_2MB_UP(x) ALIGN_UP(x, 2ULL * 1024 * 1024)
+#define ALIGN_2MIB_DOWN(x) ALIGN_DOWN(x, 2ULL * 1024 * 1024)
+#define ALIGN_2MIB_UP(x) ALIGN_UP(x, 2ULL * 1024 * 1024)
 
 /* Branchless range check: true when minx <= x < minx + size.
  *
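The "branchless range check" whose comment closes the utils.h hunk is a classic unsigned-wraparound idiom: if x < minx, the subtraction wraps to a huge value and the single comparison fails. A sketch of the idea (my reconstruction; the macro's exact form isn't shown in this hunk):

    #include <assert.h>
    #include <stdint.h>

    /* True when minx <= x < minx + size, using one compare: if x < minx,
     * (x - minx) wraps around to a value >= size. */
    static int in_range(uint64_t x, uint64_t minx, uint64_t size)
    {
        return (x - minx) < size;
    }

    int main(void)
    {
        assert(in_range(5, 5, 10));   /* lower bound inclusive */
        assert(in_range(14, 5, 10));  /* 5 <= 14 < 15 */
        assert(!in_range(15, 5, 10)); /* upper bound exclusive */
        assert(!in_range(4, 5, 10));  /* wraps: 4 - 5 is huge */
        return 0;
    }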
diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c
index 3df6ffd..8770420 100644
--- a/tests/test-cow-fork.c
+++ b/tests/test-cow-fork.c
@@ -170,7 +170,7 @@ static void test_mmap_isolation(void)
 
 static void test_large_cow(void)
 {
-    TEST("fork: 1MB COW integrity");
+    TEST("fork: 1MiB COW integrity");
 
     int pipefd[2];
     if (pipe(pipefd) != 0) {
@@ -182,7 +182,7 @@ static void test_large_cow(void)
     char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (buf == MAP_FAILED) {
-        FAIL("mmap 1MB");
+        FAIL("mmap 1MiB");
         return;
     }
@@ -229,7 +229,7 @@ static void test_large_cow(void)
     int status;
     waitpid(pid, &status, 0);
 
-    EXPECT_TRUE(parent_ok && child_ok, "1MB COW integrity failed");
+    EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed");
 
     munmap(buf, sz);
 }
diff --git a/tests/test-futex-pi.c b/tests/test-futex-pi.c
index 0bfe294..8c5ca60 100644
--- a/tests/test-futex-pi.c
+++ b/tests/test-futex-pi.c
@@ -68,7 +68,7 @@ static long raw_futex_unlock_pi(uint32_t *addr)
 
 /* Child thread for dead-owner test */
 
-/* Stack for child thread (8KB, 16-byte aligned) */
+/* Stack for child thread (8KiB, 16-byte aligned) */
 static char child_stack_buf[8192] __attribute__((aligned(16)));
 
 /* Child: acquire PI lock, signal parent, exit WITHOUT releasing.
diff --git a/tests/test-guard-page.c b/tests/test-guard-page.c
index b4809ce..caf91ac 100644
--- a/tests/test-guard-page.c
+++ b/tests/test-guard-page.c
@@ -61,13 +61,13 @@ static void test_prot_none(void)
 
 static void test_large_mmap(void)
 {
-    TEST("mmap 64MB anonymous");
+    TEST("mmap 64MiB anonymous");
 
     size_t sz = 64UL * 1024 * 1024;
     void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (p == MAP_FAILED) {
-        FAIL("mmap 64MB failed");
+        FAIL("mmap 64MiB failed");
         return;
     }
@@ -78,7 +78,7 @@ static void test_large_mmap(void)
     c[sz - 1] = 'C';
 
     EXPECT_TRUE(c[0] == 'A' && c[sz / 2] == 'B' && c[sz - 1] == 'C',
-                "data mismatch in 64MB region");
+                "data mismatch in 64MiB region");
 
     munmap(p, sz);
 }
diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c
index 18230d8..28b76e7 100644
--- a/tests/test-large-io-boundary.c
+++ b/tests/test-large-io-boundary.c
@@ -4,7 +4,7 @@
  * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
  * SPDX-License-Identifier: Apache-2.0
  *
- * Tests: read/write buffers crossing 2MB L2 blocks and split L3 tables.
+ * Tests: read/write buffers crossing 2MiB L2 blocks and split L3 tables.
  */
 
 #include
@@ -19,7 +19,7 @@
 int passes = 0, fails = 0;
 
-#define BLOCK_2MB (2UL * 1024 * 1024)
+#define BLOCK_2MIB (2UL * 1024 * 1024)
 #define MAP_SIZE (6UL * 1024 * 1024)
 #define IO_OFFSET 12345UL
 #define IO_SIZE (3UL * 1024 * 1024)
@@ -27,7 +27,7 @@ int passes = 0, fails = 0;
 static unsigned char *next_2mb_boundary(unsigned char *p)
 {
     uintptr_t addr = (uintptr_t) p;
-    addr = (addr + BLOCK_2MB - 1) & ~(uintptr_t) (BLOCK_2MB - 1);
+    addr = (addr + BLOCK_2MIB - 1) & ~(uintptr_t) (BLOCK_2MIB - 1);
     return (unsigned char *) addr;
 }
@@ -53,7 +53,7 @@ static int verify_pattern(const unsigned char *buf, size_t len)
     return 0;
 }
 
-/* Verify a repeating 4KB seed pattern across a large buffer.
+/* Verify a repeating 4KiB seed pattern across a large buffer.
  * The seed is: seed[i] = (i * 131 + 17) for i in [0, 4096).
  */
 static int verify_repeating_seed(const unsigned char *buf, size_t len)
@@ -68,7 +68,7 @@ static int verify_repeating_seed(const unsigned char *buf, size_t len)
 
 static void test_large_write(void)
 {
-    TEST("write crosses 2MB boundary");
+    TEST("write crosses 2MiB boundary");
 
     unsigned char *map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -97,7 +97,7 @@ static void test_large_write(void)
         ok = 0;
 
     /* Read back the entire write and verify all bytes, including those
-     * spanning the 2MB page table boundary.
+     * spanning the 2MiB page table boundary.
      */
     unsigned char *readback = mmap(NULL, IO_SIZE, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -129,12 +129,12 @@ static void test_large_read_from_split_block(void)
         return;
     }
 
-    /* Force the first 2MB block to remain split into L3 pages while ending
+    /* Force the first 2MiB block to remain split into L3 pages while ending
      * with RW permissions, then read across the L3-to-L2 boundary.
      */
     unsigned char *block = next_2mb_boundary(map);
     unsigned char *buf = block + IO_OFFSET;
-    void *page = block + BLOCK_2MB / 2;
+    void *page = block + BLOCK_2MIB / 2;
     if (mprotect(page, 4096, PROT_READ) != 0 ||
         mprotect(page, 4096, PROT_READ | PROT_WRITE) != 0) {
         munmap(map, MAP_SIZE);
@@ -170,7 +170,7 @@ static void test_large_read_from_split_block(void)
         ssize_t ret = read(fd, buf, IO_SIZE);
         ok = (ret == (ssize_t) IO_SIZE);
     }
 
-    /* Verify the entire read buffer, including the 2MB boundary
+    /* Verify the entire read buffer, including the 2MiB boundary
      * crossing where L3-to-L2 page table transitions happen.
      */
     if (ok && verify_repeating_seed(buf, IO_SIZE) != 0)
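test_large_read_from_split_block leans on a subtle trick: a transient mprotect forces the VMM to split the 2MiB block into L3 pages, and the block stays split even after permissions return to RW. The same trick in isolation (under elfuse this pins the split; on bare Linux it is just two mprotect calls):

    #include <stdint.h>
    #include <string.h>
    #include <sys/mman.h>

    #define BLOCK_2MIB (2UL * 1024 * 1024)

    int main(void)
    {
        size_t sz = 3 * BLOCK_2MIB; /* guarantees one aligned 2MiB block */
        unsigned char *map = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map == MAP_FAILED)
            return 1;

        unsigned char *block =
            (unsigned char *) (((uintptr_t) map + BLOCK_2MIB - 1) &
                               ~(uintptr_t) (BLOCK_2MIB - 1));

        /* RO then back to RW on one interior page: the 2MiB block must be
         * split into 4KiB L3 entries to express the difference. */
        unsigned char *page = block + BLOCK_2MIB / 2;
        if (mprotect(page, 4096, PROT_READ) != 0 ||
            mprotect(page, 4096, PROT_READ | PROT_WRITE) != 0)
            return 1;

        memset(block, 0xAB, BLOCK_2MIB); /* writes now cross a split block */
        return munmap(map, sz);
    }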
diff --git a/tests/test-madvise.c b/tests/test-madvise.c
index 0b153d4..a81a9ba 100644
--- a/tests/test-madvise.c
+++ b/tests/test-madvise.c
@@ -212,7 +212,7 @@ static void test_advisory_hints(void)
 
 static void test_dontneed_large(void)
 {
-    TEST("MADV_DONTNEED 1MB range");
+    TEST("MADV_DONTNEED 1MiB range");
 
     size_t sz = 1024 * 1024;
     void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -239,7 +239,7 @@ static void test_dontneed_large(void)
         }
     }
 
-    EXPECT_TRUE(ok, "1MB range not zeroed");
+    EXPECT_TRUE(ok, "1MiB range not zeroed");
 
     munmap(p, sz);
 }
diff --git a/tests/test-mremap.c b/tests/test-mremap.c
index 5375d7f..61d2764 100644
--- a/tests/test-mremap.c
+++ b/tests/test-mremap.c
@@ -234,7 +234,7 @@ static void test_same_size(void)
 
 static void test_large_realloc(void)
 {
-    TEST("mremap large (256KB->512KB)");
+    TEST("mremap large (256KiB->512KiB)");
 
     size_t old_sz = 256 * 1024, new_sz = 512 * 1024;
     void *p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
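test_large_realloc exercises the grow path; the core calling pattern is small enough to show whole. A minimal Linux-only example (mremap may move the mapping, so the old pointer must not be reused):

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t old_sz = 256 * 1024, new_sz = 512 * 1024;
        char *p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;
        memset(p, 0x5A, old_sz);

        /* MREMAP_MAYMOVE lets the kernel relocate when it cannot grow in
         * place; contents must be preserved either way. */
        char *q = mremap(p, old_sz, new_sz, MREMAP_MAYMOVE);
        if (q == MAP_FAILED)
            return 1;
        int ok = (q[0] == 0x5A && q[old_sz - 1] == 0x5A);
        munmap(q, new_sz);
        return !ok;
    }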
diff --git a/tests/test-multi-vcpu.c b/tests/test-multi-vcpu.c
index b3277e4..01d9301 100644
--- a/tests/test-multi-vcpu.c
+++ b/tests/test-multi-vcpu.c
@@ -55,9 +55,9 @@
 #define PT_AP_RO (3ULL << 6) /* RO at EL0 */
 
 #define PAGE_SIZE_4K 4096ULL
-#define BLOCK_2MB (2ULL * 1024 * 1024)
+#define BLOCK_2MIB (2ULL * 1024 * 1024)
 
-/* Memory layout (16MB total, much smaller than elfuse's 32GB) */
+/* Memory layout (16MiB total, much smaller than elfuse's 32GiB) */
 
 #define GUEST_SIZE (16ULL * 1024 * 1024)
 
@@ -70,18 +70,18 @@
 #define STACK_A_BASE 0x00A00000ULL /* EL0 stack A (RW) */
 #define STACK_B_BASE 0x00C00000ULL /* EL0 stack B (RW) */
 
-/* vCPU-A and vCPU-B SP_EL1 (top of respective 512KB regions within shim data)
+/* vCPU-A and vCPU-B SP_EL1 (top of respective 512KiB regions within shim data)
  */
-#define SP_EL1_A (SHIM_DATA_BASE + BLOCK_2MB) /* 0x400000 */
-#define SP_EL1_B (SHIM_DATA_BASE + BLOCK_2MB / 2) /* 0x300000 */
+#define SP_EL1_A (SHIM_DATA_BASE + BLOCK_2MIB) /* 0x400000 */
+#define SP_EL1_B (SHIM_DATA_BASE + BLOCK_2MIB / 2) /* 0x300000 */
 
 /* vCPU-A and vCPU-B EL0 code offsets within GUEST_CODE region */
 #define CODE_A_OFF 0x0000ULL
-#define CODE_B_OFF 0x1000ULL /* 4KB apart */
+#define CODE_B_OFF 0x1000ULL /* 4KiB apart */
 
-/* EL0 stack tops (top of each 2MB region) */
-#define SP_EL0_A (STACK_A_BASE + BLOCK_2MB) /* 0xC00000 */
-#define SP_EL0_B (STACK_B_BASE + BLOCK_2MB) /* 0xE00000 */
+/* EL0 stack tops (top of each 2MiB region) */
+#define SP_EL0_A (STACK_A_BASE + BLOCK_2MIB) /* 0xC00000 */
+#define SP_EL0_B (STACK_B_BASE + BLOCK_2MIB) /* 0xE00000 */
 
 /* System register values (from main.c) */
@@ -134,7 +134,7 @@ static uint64_t pt_alloc(vm_state_t *vm)
     return off;
 }
 
-/* Build a 2MB block descriptor at a given GPA with RX or RW perms. */
+/* Build a 2MiB block descriptor at a given GPA with RX or RW perms. */
 static uint64_t make_block(uint64_t gpa, int perm)
 {
     uint64_t desc = (gpa & 0xFFFFFFFFE00000ULL) | PT_AF | PT_SH_ISH | PT_NS |
@@ -162,10 +162,10 @@ static uint64_t build_page_tables(vm_state_t *vm, int include_tlbi_region)
     uint64_t *l0 = (uint64_t *) ((uint8_t *) vm->host_base + l0_off);
     uint64_t *l1 = (uint64_t *) ((uint8_t *) vm->host_base + l1_off);
 
-    /* L0[0] -> L1 table (all the current addresses are < 512GB) */
+    /* L0[0] -> L1 table (all the current addresses are < 512GiB) */
     l0[0] = l1_off | PT_VALID | PT_TABLE;
 
-    /* L1[0] -> L2 table (all the current addresses are < 1GB) */
+    /* L1[0] -> L2 table (all the current addresses are < 1GiB) */
     uint64_t l2_off = pt_alloc(vm);
     if (!l2_off)
         return 0;
@@ -173,7 +173,7 @@ static uint64_t build_page_tables(vm_state_t *vm, int include_tlbi_region)
 
     uint64_t *l2 = (uint64_t *) ((uint8_t *) vm->host_base + l2_off);
 
-    /* Map 2MB blocks. L2 index = addr / 2MB. */
+    /* Map 2MiB blocks. L2 index = addr / 2MiB. */
 
     /* Shim code (RX) at 0x100000 -> L2[0] (shares 0x0-0x1FFFFF) */
     l2[0] = make_block(0x000000, PERM_RX);
@@ -199,8 +199,8 @@ static uint64_t build_page_tables(vm_state_t *vm, int include_tlbi_region)
 
     /* Stack B spills into 0xE00000 (SP=0xE00000 grows down into 0xC00000
      * block), already covered by L2[6] since SP_EL0_B = 0xE00000 is top of
-     * 0xC00000 block. Actually 0xE00000 = 7 * 2MB, that's a separate block. Map
-     * it too:
+     * 0xC00000 block. Actually 0xE00000 = 7 * 2MiB, that's a separate block.
+     * Map it too:
      */
     l2[7] = make_block(0xE00000, PERM_RW);
 
@@ -475,7 +475,7 @@ static int vm_create(vm_state_t *vm)
     vm->pt_next = PT_POOL_BASE;
 
     /* Query max IPA size and configure VM (matches guest.c pattern).
-     * The test uses only 16MB, so any IPA size works; this is for
+     * The test uses only 16MiB, so any IPA size works; this is for
      * API consistency with elfuse's production code path.
      */
     uint32_t max_ipa = 0;
diff --git a/tests/test-perf.sh b/tests/test-perf.sh
index f729cfc..8175409 100755
--- a/tests/test-perf.sh
+++ b/tests/test-perf.sh
@@ -104,10 +104,10 @@ benchmark "elfuse guest wc" sh -c "'$ELFUSE' '$TOOL_BIN/wc' -l '$SRC_SUBDIR'/*.c
 echo
 
 # --- Test 4: I/O throughput — cat large file through wc ---
-printf "${YELLOW}▸ cat ~10MB | wc -l (I/O throughput)${RESET}\n"
+printf "${YELLOW}▸ cat ~10MiB | wc -l (I/O throughput)${RESET}\n"
 TMPFILE=$(mktemp)
 trap 'rm -f "$TMPFILE"' EXIT
-# Build ~10MB test file by repeating syscall.c (~100 times)
+# Build ~10MiB test file by repeating syscall.c (~100 times)
 for _ in $(seq 1 100); do cat "$SYSCALL_C" >> "$TMPFILE"; done
 TMPSIZE=$(wc -c < "$TMPFILE" | tr -d ' ')
 printf "  ${CYAN}(test file: %s bytes)${RESET}\n" "$TMPSIZE"
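make_block in the test above composes an L2 block descriptor from individual VMSAv8-64 fields. A standalone version for the RW case; the bit positions are architectural, and the names mirror the test's PT_* defines (PT_NS omitted for brevity):

    #include <stdint.h>
    #include <stdio.h>

    #define PT_VALID     (1ULL << 0)
    #define PT_BLOCK     PT_VALID     /* bits[1:0] = 01: block at L2 */
    #define PT_ATTR1     (1ULL << 2)  /* MAIR attribute slot 1 */
    #define PT_AP_RW_EL0 (1ULL << 6)  /* AP[2:1] = 01: RW at EL0 and EL1 */
    #define PT_SH_ISH    (3ULL << 8)  /* inner shareable */
    #define PT_AF        (1ULL << 10) /* access flag: avoid access faults */
    #define PT_PXN       (1ULL << 53)
    #define PT_UXN       (1ULL << 54)

    /* RW, non-executable 2MiB block; gpa must be 2MiB-aligned. */
    static uint64_t make_block_rw(uint64_t gpa)
    {
        return (gpa & 0xFFFFFFFFE00000ULL) | PT_AF | PT_SH_ISH | PT_ATTR1 |
               PT_BLOCK | PT_AP_RW_EL0 | PT_UXN | PT_PXN;
    }

    int main(void)
    {
        printf("L2 desc = 0x%016llx\n",
               (unsigned long long) make_block_rw(0x600000));
        return 0;
    }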
diff --git a/tests/test-rwx.c b/tests/test-rwx.c
index e04b75e..180e743 100644
--- a/tests/test-rwx.c
+++ b/tests/test-rwx.c
@@ -9,8 +9,8 @@
  * page table entries work at stage-1 when SCTLR_EL1.WXN=0.
  *
  * Tests:
- *  1. RWX 2MB block: L2 block descriptor with AP=RW_EL0, UXN=0, PXN=0
- *  2. RWX 4KB page: L3 page descriptor with the same RWX permissions
+ *  1. RWX 2MiB block: L2 block descriptor with AP=RW_EL0, UXN=0, PXN=0
+ *  2. RWX 4KiB page: L3 page descriptor with the same RWX permissions
  *  3. Baseline RX: Confirm execution works on a normal RX page
 *  4. Baseline RW: Confirm writes work on a normal RW page
  *
@@ -65,9 +65,9 @@
 #define PT_AP_RO (3ULL << 6) /* AP[2:1]=11 -> RO at EL0 */
 
 #define PAGE_SIZE_4K 4096ULL
-#define BLOCK_2MB (2ULL * 1024 * 1024)
+#define BLOCK_2MIB (2ULL * 1024 * 1024)
 
-/* Memory layout (16MB total) */
+/* Memory layout (16MiB total) */
 
 #define GUEST_SIZE (16ULL * 1024 * 1024)
 
@@ -75,20 +75,20 @@
 #define SHIM_BASE 0x00100000ULL      /* Shim code (RX) */
 #define SHIM_DATA_BASE 0x00200000ULL /* Shim data / EL1 stack (RW) */
 #define GUEST_CODE 0x00400000ULL     /* EL0 test code (RX) */
-#define RWX_BLOCK 0x00600000ULL      /* 2MB block for RWX test (test 1) */
+#define RWX_BLOCK 0x00600000ULL      /* 2MiB block for RWX test (test 1) */
 #define RWX_PAGE_BLOCK \
-    0x00800000ULL /* 2MB region containing RWX 4KB page (test 2) */
+    0x00800000ULL /* 2MiB region containing RWX 4KiB page (test 2) */
 #define GUEST_DATA 0x00A00000ULL /* RW data (test 4 baseline) */
 #define STACK_BASE 0x00C00000ULL /* EL0 stack (RW) */
 
-/* Within RWX_PAGE_BLOCK, the RWX 4KB page is at offset 0 */
+/* Within RWX_PAGE_BLOCK, the RWX 4KiB page is at offset 0 */
 #define RWX_PAGE_ADDR RWX_PAGE_BLOCK
 
 /* EL0 stack top and SP_EL1 */
-#define SP_EL0 (STACK_BASE + BLOCK_2MB)
-#define SP_EL1 (SHIM_DATA_BASE + BLOCK_2MB)
+#define SP_EL0 (STACK_BASE + BLOCK_2MIB)
+#define SP_EL1 (SHIM_DATA_BASE + BLOCK_2MIB)
 
-/* Code offsets within GUEST_CODE (4KB apart for different tests) */
+/* Code offsets within GUEST_CODE (4KiB apart for different tests) */
 #define CODE_TEST1 0x0000ULL /* Test 1: RWX block write+exec */
 #define CODE_TEST2 0x1000ULL /* Test 2: RWX page write+exec */
 #define CODE_TEST3 0x2000ULL /* Test 3: baseline RX exec */
@@ -142,27 +142,27 @@ static uint64_t pt_alloc(vm_state_t *vm)
 
 /* Descriptor builders */
 
-/* Common base attributes for a 2MB block or 4KB page */
+/* Common base attributes for a 2MiB block or 4KiB page */
 static uint64_t common_attrs(void)
 {
     return PT_AF | PT_SH_ISH | PT_NS | PT_ATTR1;
 }
 
-/* 2MB block: RX (executable, read-only at EL0) */
+/* 2MiB block: RX (executable, read-only at EL0) */
 static uint64_t make_block_rx(uint64_t gpa)
 {
     return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK | PT_AP_RO;
     /* UXN=0, PXN=0 -> executable */
 }
 
-/* 2MB block: RW (writable, not executable) */
+/* 2MiB block: RW (writable, not executable) */
 static uint64_t make_block_rw(uint64_t gpa)
 {
     return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK |
            PT_AP_RW_EL0 | PT_UXN | PT_PXN;
 }
 
-/* 2MB block: RWX (writable AND executable at EL0, the test subject) */
+/* 2MiB block: RWX (writable AND executable at EL0, the test subject) */
 static uint64_t make_block_rwx(uint64_t gpa)
 {
     return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK |
@@ -170,7 +170,7 @@ static uint64_t make_block_rwx(uint64_t gpa)
     /* UXN=0, PXN=0 -> executable; AP=01 -> writable at EL0 */
 }
 
-/* 4KB L3 page: RWX (writable AND executable at EL0) */
+/* 4KiB L3 page: RWX (writable AND executable at EL0) */
 static uint64_t make_page_rwx(uint64_t gpa)
 {
     return (gpa & 0xFFFFFFFFF000ULL) | common_attrs() | PT_VALID | PT_PAGE |
@@ -178,7 +178,7 @@ static uint64_t make_page_rwx(uint64_t gpa)
     /* UXN=0, PXN=0 -> executable; AP=01 -> writable at EL0 */
 }
 
-/* 4KB L3 page: RW (not executable) */
+/* 4KiB L3 page: RW (not executable) */
 static uint64_t make_page_rw(uint64_t gpa)
 {
     return (gpa & 0xFFFFFFFFF000ULL) | common_attrs() | PT_VALID | PT_PAGE |
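The L3 split used for Test 2 is mechanical: one 2MiB block becomes 512 page descriptors, so each page can carry its own permissions. A sketch of the fill loop, reusing the builders above (shown as prototypes here; the L2 slot must then point at this table with a VALID|TABLE descriptor):

    #include <stdint.h>

    #define PAGE_SIZE_4K 4096ULL

    /* Defined in the test above; declared here only for the sketch. */
    uint64_t make_page_rwx(uint64_t gpa);
    uint64_t make_page_rw(uint64_t gpa);

    /* Cover one 2MiB block with 512 L3 entries: page 0 RWX, rest RW. */
    static void fill_l3_for_block(uint64_t *l3, uint64_t block_gpa)
    {
        for (int i = 0; i < 512; i++) {
            uint64_t gpa = block_gpa + (uint64_t) i * PAGE_SIZE_4K;
            l3[i] = (i == 0) ? make_page_rwx(gpa) : make_page_rw(gpa);
        }
    }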
@@ -230,7 +230,7 @@ static uint64_t build_page_tables(vm_state_t *vm)
     l2[3] = make_block_rwx(0x600000);
 
     /* L2[4]: Table descriptor -> L3 page table for Test 2.
-     * the code splits this 2MB block into 512 x 4KB pages. The first page
+     * The code splits this 2MiB block into 512 x 4KiB pages. The first page
      * at 0x800000 is RWX, the rest are RW (non-executable).
      */
     {
@@ -519,9 +519,9 @@ static void print_bad_exception(const vcpu_exit_t *ex)
     }
 }
 
-/* TEST 1: RWX 2MB Block
+/* TEST 1: RWX 2MiB Block
  *
- * Stage-1 page table has a 2MB block at 0x600000 with:
+ * Stage-1 page table has a 2MiB block at 0x600000 with:
  *   AP[2:1]=01 (RW at EL0), UXN=0, PXN=0 (executable)
  * This is a true RWX mapping.
  *
@@ -678,10 +678,10 @@ static int test1_rwx_block(void)
     return result;
 }
 
-/* TEST 2: RWX 4KB Page (L3 descriptor)
+/* TEST 2: RWX 4KiB Page (L3 descriptor)
  *
- * Same as test 1, but using a 4KB L3 page descriptor at 0x800000
- * instead of a 2MB L2 block descriptor. Tests whether the
+ * Same as test 1, but using a 4KiB L3 page descriptor at 0x800000
+ * instead of a 2MiB L2 block descriptor. Tests whether the
  * granularity matters for W^X enforcement.
  */
 
@@ -753,13 +753,13 @@ static int test2_rwx_page(void)
             (unsigned long long) ex.x0, (unsigned long long) ex.x1,
             ex.x1 == 0 ? "exec fault -> flip to RX"
                        : "write fault -> flip to RW");
-        printf("    " YELLOW "HVF enforces W^X at stage-2 (4KB page)" RESET
+        printf("    " YELLOW "HVF enforces W^X at stage-2 (4KiB page)" RESET
                "\n");
         result = -1;
     } else if (ex.reason == HVF_EXIT_HVC5 && ex.x0 == 42) {
         printf("\n    " GREEN "RWX works!" RESET
-               " Written code executed (4KB page, x0=%llu)\n",
+               " Written code executed (4KiB page, x0=%llu)\n",
               (unsigned long long) ex.x0);
         result = 0;
@@ -769,11 +769,11 @@ static int test2_rwx_page(void)
         uint32_t ec = (uint32_t) (ex.esr >> 26) & 0x3F;
         if (ec == 0x20)
             printf("    " YELLOW
-                   "Instruction abort: W^X blocks execution (4KB page)" RESET
+                   "Instruction abort: W^X blocks execution (4KiB page)" RESET
                    "\n");
         else if (ec == 0x24)
-            printf("    " YELLOW "Data abort: W^X blocks write (4KB page)" RESET
-                   "\n");
+            printf("    " YELLOW
+                   "Data abort: W^X blocks write (4KiB page)" RESET "\n");
         result = -1;
     } else {
@@ -952,8 +952,8 @@ int main(void)
     } tests[] = {
         {"Baseline: RX execution", test3_baseline_rx},
         {"Baseline: RW write", test4_baseline_rw},
-        {"RWX 2MB block (write+exec)", test1_rwx_block},
-        {"RWX 4KB page (write+exec)", test2_rwx_page},
+        {"RWX 2MiB block (write+exec)", test1_rwx_block},
+        {"RWX 4KiB page (write+exec)", test2_rwx_page},
     };
     int ntests = (int) ARRAY_SIZE(tests);
diff --git a/tests/test-stress.c b/tests/test-stress.c
index 687afd3..f81d599 100644
--- a/tests/test-stress.c
+++ b/tests/test-stress.c
@@ -97,7 +97,7 @@ static void test_mmap_churn(void)
     TEST("mmap/munmap churn (256 cycles)");
 
 #define CHURN_CYCLES 256
-#define CHURN_SIZE (64 * 1024) /* 64KB each */
+#define CHURN_SIZE (64 * 1024) /* 64KiB each */
 
     bool ok = true;
     for (int i = 0; i < CHURN_CYCLES; i++) {
@@ -275,7 +275,7 @@ static void test_mprotect_cycling(void)
 
 static void test_large_mmap(void)
 {
-    TEST("large mmap (16MB)");
+    TEST("large mmap (16MiB)");
 
     size_t sz = 16 * 1024 * 1024;
     void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
@@ -285,7 +285,7 @@ static void test_large_mmap(void)
         return;
     }
 
-    /* Touch every page (4KB stride) */
+    /* Touch every page (4KiB stride) */
     volatile char *vp = (volatile char *) p;
     for (size_t off = 0; off < sz; off += 4096) {
         vp[off] = (char) (off >> 12);
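Both thread tests hand clone() a small static stack; the only trap is that AArch64 stacks grow downward, so clone() must receive the buffer's top. A runnable Linux example with the same 8KiB, 16-byte-aligned shape:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <signal.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static char child_stack[8192] __attribute__((aligned(16)));

    static int child_fn(void *arg)
    {
        (void) arg;
        write(1, "child ran\n", 10);
        return 0;
    }

    int main(void)
    {
        /* Pass the TOP of the stack buffer: it grows down from there. */
        pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                          SIGCHLD, NULL);
        if (pid < 0)
            return 1;
        waitpid(pid, NULL, 0);
        return 0;
    }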
diff --git a/tests/test-thread.c b/tests/test-thread.c
index d4f88c6..da427df 100644
--- a/tests/test-thread.c
+++ b/tests/test-thread.c
@@ -57,7 +57,7 @@ static void child_work(void)
 
 /* Tests */
 
-/* Stack for child thread (8KB, 16-byte aligned) */
+/* Stack for child thread (8KiB, 16-byte aligned) */
 static char child_stack_buf[8192] __attribute__((aligned(16)));
 
 /* Test 1: clone(CLONE_THREAD) creates a new thread that runs concurrently */