From 3458687cc97d7e36446e1df78887b6147dd5f66e Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 5 May 2026 10:23:49 +0800 Subject: [PATCH] Harden signalfd read semantics 1. Off-by-one in signal_collect_signalfd / signal_take_signalfd_exact excluded signum == LINUX_NSIG (SIGRTMAX, 64 on aarch64) from the iteration. Bare-musl applications targeting SIGRTMAX directly were silently dropped from signalfd reads even when the signal was present in sig_state.pending. Both loops now use inclusive bounds. 2. signalfd_read previously took the rt-queue head before writing to the guest, which forced a re-queue on guest_write_small EFAULT. The re-queue path had three latent hazards Codex flagged in review: it could exceed RT_SIGQUEUE_MAX under concurrent same-signal pressure, it issued duplicate signalfd_notify writes that desynced the pipe-byte count from the actual pending-signal count (causing spurious EAGAIN on later blocking reads), and an earlier draft also risked duplicate delivery of records that already reached the guest. Restructured to peek -> write -> take only the prefix that wrote successfully. An EFAULT before any record lands returns -EFAULT with the rt-queue intact (preserving the elfuse promise locked in by tests/test-tier-b's test_signalfd_efault_preserves_pending). A partial fault after N>0 records returns N*sizeof(signalfd_siginfo) bytes and leaves the unwritten entries pending; if a concurrent consumer advanced the rt-queue head between peek and take, the read loop restarts via a retry label so the caller never sees stale records. 3. Standard signals (1-31) used to fabricate SI_USER/proc_pid/proc_uid defaults at signalfd-read time, dropping the sender pid/uid and any sigval payload that sigqueue() supplied. Linux coalesces standard signals on the pending bitmask but preserves one siginfo for the pending instance. 
Mirrored that in sig_state via std_info[] + std_info_valid[]; new signal_queue_info() routes both standard and RT queued signals through one path. signal_deliver, signal_queue, signal_queue_rt, and sc_rt_tgsigqueueinfo all read or write through the new path. signal_default_info() consolidates the SI_USER fallback construction. 4. Wire SYS_rt_sigqueueinfo (138) so glibc and musl sigqueue() work. sc_rt_sigqueueinfo is a thin forwarder to sc_rt_tgsigqueueinfo with tgid == tid == pid. Single-VM divergence: targeting a pid that is not the current guest returns -ESRCH because guest threads of another guest_t are unreachable. sc_rt_tgsigqueueinfo also now surfaces -EFAULT when guest_read_small fails on the siginfo pointer instead of silently queueing a zero-payload signal. Per-thread blocked mask non-interference is documented inline at signalfd_read in src/syscall/fd.c: signalfd is the standard mechanism for reading signals blocked from synchronous delivery via sigprocmask, so consulting the pthread mask would defeat the purpose. The hardening test covers RT multiplicity FIFO with distinct si_int payloads, standard-signal coalescing, SIGRTMAX reachability (the regression for the off-by-one), ssi_int / ssi_ptr sigval round-trip, sender ssi_pid / ssi_uid carry-through, signalfd-mask-only filtering, libc sigqueue() smoke, standard-signal sigqueue metadata round-trip, partial-fault preservation via mmap guard page, and rt_sigqueueinfo EFAULT on unreadable siginfo. The partial-fault assertion accepts both Linux's looser "lose the failed record" semantics and elfuse's stricter "preserve every unwritten record" semantics so the test is green under both runners. 
--- Makefile | 6 + src/syscall/abi.h | 1 + src/syscall/dispatch.tbl | 1 + src/syscall/fd.c | 71 ++- src/syscall/signal.c | 107 ++-- src/syscall/signal.h | 19 +- src/syscall/syscall.c | 82 ++- tests/manifest.txt | 1 + tests/test-matrix.sh | 2 + tests/test-signalfd-hardening.c | 871 ++++++++++++++++++++++++++++++++ 10 files changed, 1095 insertions(+), 66 deletions(-) create mode 100644 tests/test-signalfd-hardening.c diff --git a/Makefile b/Makefile index 8ca6e4c..2895a93 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,12 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-signalfd-hardening needs -lpthread for the worker-thread tid +# regression case in test_rt_sigqueueinfo_rejects_thread_tid. +$(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + endif include mk/tests.mk diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 578253c..6315039 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -97,6 +97,7 @@ #define SYS_rt_sigaction 134 #define SYS_rt_sigprocmask 135 #define SYS_rt_sigpending 136 +#define SYS_rt_sigqueueinfo 138 #define SYS_rt_sigreturn 139 #define SYS_setpriority 140 #define SYS_getpriority 141 diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index ea421c6..2925ca1 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -111,6 +111,7 @@ SYS_rt_sigaction sc_rt_sigaction 1 SYS_rt_sigprocmask sc_rt_sigprocmask 1 SYS_rt_sigpending sc_rt_sigpending 0 SYS_rt_sigreturn sc_rt_sigreturn 1 +SYS_rt_sigqueueinfo sc_rt_sigqueueinfo 1 SYS_rt_tgsigqueueinfo sc_rt_tgsigqueueinfo 1 # Time and timers diff --git a/src/syscall/fd.c b/src/syscall/fd.c index 04c7675..ebc2d95 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -885,15 +885,31 @@ int64_t 
sys_signalfd4(guest_t *g, return gfd; } -/* Read from signalfd: consume pending signals matching the mask. - * Each signal produces one signalfd_siginfo (128 bytes). - * Returns number of bytes read, or -EAGAIN if nothing pending. +/* Read from signalfd: consume pending signals matching the signalfd's mask. + * + * Each signal produces one signalfd_siginfo (128 bytes). RT signals (32-64) + * are queued: each sigqueue/rt_tgsigqueueinfo enqueues a distinct instance with + * its own si_int/si_ptr payload, and signalfd_read returns them in FIFO order + * without coalescing (Linux behavior). + * + * Per-thread signal mask is intentionally not consulted: signalfd is the + * standard mechanism for reading signals that were blocked from synchronous + * delivery via sigprocmask(). The signalfd's own mask (set at create time or + * via signalfd(fd, &mask, ...)) is the only filter applied. + * + * ssi_int/ssi_ptr are populated from queued metadata when present. + * Standard signals (1-31) still coalesce to one pending instance, but Linux + * preserves one siginfo payload for that instance. + * + * Returns the number of bytes read (multiple of sizeof(signalfd_siginfo)), or + * -EAGAIN if nothing pending and the fd is non-blocking. */ int64_t signalfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) { +retry: /* Capture slot state under sfd_lock, then release BEFORE calling * signal_get_state() which acquires sig_lock(4). Holding sfd_lock(5a) * while taking sig_lock(4) would violate lock ordering. @@ -963,10 +979,21 @@ int64_t signalfd_read(int guest_fd, if (deliverable == 0) goto no_pending; } - total = signal_peek_signalfd(mask, pending, max_signals); - if (total == 0) + size_t peeked = signal_peek_signalfd(mask, pending, max_signals); + if (peeked == 0) goto no_pending; - for (size_t i = 0; i < total; i++) { + + /* Write-then-take. 
Writing first means that on a guest_write_small EFAULT + * the rt-queue is still intact and signals are not lost: no re-queue dance, + * no RT_SIGQUEUE_MAX overflow window, no extra signalfd_notify writes that + * would desync the pipe-byte count from the actual pending-signal count. + * Take only the prefix the writer landed; if a concurrent consumer advanced + * the rt-queue head between peek and take, take returns less than the + * written count and the bridge restarts the read loop via the retry label + * below. + */ + size_t written = 0; + for (size_t i = 0; i < peeked; i++) { linux_signalfd_siginfo_t info; memset(&info, 0, sizeof(info)); info.ssi_signo = (uint32_t) pending[i].signum; @@ -978,12 +1005,34 @@ int64_t signalfd_read(int guest_fd, uint64_t off = i * sizeof(linux_signalfd_siginfo_t); if (guest_write_small(g, buf_gva + off, &info, sizeof(info)) < 0) { - if (pending != pending_stack) - free(pending); - return -LINUX_EFAULT; + if (written == 0) { + /* No bytes transferred: surface EFAULT, leave the queue + * untouched so the signal is not lost. Matches the elfuse + * promise locked in by tests/test-tier-b's + * test_signalfd_efault_preserves_pending. + */ + if (pending != pending_stack) + free(pending); + return -LINUX_EFAULT; + } + + /* Partial success: stop writing and let take consume only the + * delivered prefix. The unwritten entries stay in the rt-queue + * naturally because the take call has not run yet. + */ + break; } + written++; + } + + total = signal_take_signalfd_exact(pending, written); + if (total == 0) { + if (written == 0) + goto no_pending; + if (pending != pending_stack) + free(pending); + goto retry; } - total = signal_take_signalfd_exact(pending, total); /* Drain pipe: consume exactly one byte per signal read. 
If the code drains * ALL bytes, the code would lose notifications for signals that arrived @@ -998,7 +1047,7 @@ int64_t signalfd_read(int guest_fd, if (pending != pending_stack) free(pending); - return (int64_t) (total * sizeof(linux_signalfd_siginfo_t)); + return (int64_t) total * (int64_t) sizeof(linux_signalfd_siginfo_t); no_pending: if (pending != pending_stack) diff --git a/src/syscall/signal.c b/src/syscall/signal.c index d8e9149..1ea952f 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -4,12 +4,12 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Implements Linux-compatible signal delivery for aarch64 guests. When a - * signal is queued (e.g., SIGPIPE from write() to broken pipe), signal - * emulation builds an rt_sigframe on the guest stack matching the kernel's - * setup_rt_frame() layout, then redirects the vCPU to the guest's signal - * handler. The guest handler eventually calls rt_sigreturn (SYS 139), which - * restores the saved register state from the frame. + * Implements Linux-compatible signal delivery for aarch64 guests. When a signal + * is queued (e.g., SIGPIPE from write() to broken pipe), signal emulation + * builds an rt_sigframe on the guest stack matching the kernel's setup_rt_frame + * layout, then redirects the vCPU to the guest's signal handler. The guest + * handler eventually calls rt_sigreturn (SYS 139), which restores the saved + * register state from the frame. 
* * Reference: Linux arch/arm64/kernel/signal.c */ @@ -161,10 +161,9 @@ static inline int sig_uncatchable(int signum) return signum == LINUX_SIGKILL || signum == LINUX_SIGSTOP; } -static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) +static signal_rt_info_t signal_default_info(int signum) { - int idx = signum - LINUX_SIGRTMIN; - signal_rt_info_t fallback = { + return (signal_rt_info_t) { .signum = signum, .si_code = LINUX_SI_USER, .si_pid = (int32_t) proc_get_pid(), @@ -172,6 +171,33 @@ static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) .si_int = 0, .si_ptr = 0, }; +} + +static void signal_standard_enqueue_locked(int signum, + const signal_rt_info_t *info) +{ + int idx = signum - 1; + uint64_t bit = sig_bit(signum); + + if (!(sig_state.pending & bit)) { + sig_state.std_info[idx] = info ? *info : signal_default_info(signum); + sig_state.std_info_valid[idx] = info != NULL; + } + sig_state.pending |= bit; +} + +static signal_rt_info_t signal_standard_peek_locked(int signum) +{ + int idx = signum - 1; + if (sig_state.std_info_valid[idx]) + return sig_state.std_info[idx]; + return signal_default_info(signum); +} + +static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) +{ + int idx = signum - LINUX_SIGRTMIN; + signal_rt_info_t fallback = signal_default_info(signum); const signal_rt_info_t *entry = info ? info : &fallback; sig_state.pending |= sig_bit(signum); @@ -279,9 +305,10 @@ void signal_queue(int signum) if (signum < 1 || signum > LINUX_NSIG) return; pthread_mutex_lock(&sig_lock); - sig_state.pending |= sig_bit(signum); if (signum >= LINUX_SIGRTMIN) signal_rt_enqueue_locked(signum, NULL); + else + signal_standard_enqueue_locked(signum, NULL); /* Publish hint before releasing lock so vCPU hot path sees it. 
*/ atomic_store_explicit(&sig_pending_hint, sig_state.pending, memory_order_release); @@ -317,7 +344,17 @@ void signal_queue_rt(int signum, int32_t si_int, uint64_t si_ptr) { - if (signum < LINUX_SIGRTMIN || signum > LINUX_NSIG) + signal_queue_info(signum, si_code, si_pid, si_uid, si_int, si_ptr); +} + +void signal_queue_info(int signum, + int32_t si_code, + int32_t si_pid, + uint32_t si_uid, + int32_t si_int, + uint64_t si_ptr) +{ + if (signum < 1 || signum > LINUX_NSIG) return; pthread_mutex_lock(&sig_lock); signal_rt_info_t info = { @@ -328,7 +365,10 @@ void signal_queue_rt(int signum, .si_int = si_int, .si_ptr = si_ptr, }; - signal_rt_enqueue_locked(signum, &info); + if (signum >= LINUX_SIGRTMIN) + signal_rt_enqueue_locked(signum, &info); + else + signal_standard_enqueue_locked(signum, &info); atomic_store_explicit(&sig_pending_hint, sig_state.pending, memory_order_release); pthread_mutex_unlock(&sig_lock); @@ -416,7 +456,12 @@ static size_t signal_collect_signalfd(uint64_t mask, pthread_mutex_lock(&sig_lock); uint64_t deliverable = sig_state.pending & mask; - for (int signum = 1; signum < LINUX_NSIG && total < max; signum++) { + /* signum runs 1..LINUX_NSIG inclusive (64 is the highest valid RT signal + * on aarch64 Linux). Bare-musl applications can target SIGRTMAX directly, + * so the inclusive bound matters even though glibc reserves the top of the + * RT range for itself. 
+ */ + for (int signum = 1; signum <= LINUX_NSIG && total < max; signum++) { uint64_t bit = BIT64(signum - 1); if (!(deliverable & bit)) continue; @@ -446,14 +491,9 @@ static size_t signal_collect_signalfd(uint64_t mask, total++; } } else { - signal_rt_info_t info = { - .signum = signum, - .si_code = LINUX_SI_USER, - .si_pid = (int32_t) proc_get_pid(), - .si_uid = proc_get_uid(), - .si_int = 0, - .si_ptr = 0, - }; + signal_rt_info_t info = signal_standard_peek_locked(signum); + if (consume) + sig_state.std_info_valid[signum - 1] = false; if (consume) sig_state.pending &= ~bit; if (out) @@ -482,7 +522,7 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max) pthread_mutex_lock(&sig_lock); for (; total < max; total++) { int signum = expected[total].signum; - if (signum <= 0 || signum >= LINUX_NSIG) + if (signum <= 0 || signum > LINUX_NSIG) break; uint64_t bit = sig_bit(signum); @@ -508,6 +548,15 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max) continue; } + signal_rt_info_t current = signal_standard_peek_locked(signum); + const signal_rt_info_t *want = &expected[total]; + if (current.signum != want->signum || + current.si_code != want->si_code || + current.si_pid != want->si_pid || current.si_uid != want->si_uid || + current.si_int != want->si_int || current.si_ptr != want->si_ptr) + break; + + sig_state.std_info_valid[signum - 1] = false; sig_state.pending &= ~bit; } atomic_store_explicit(&sig_pending_hint, sig_state.pending, @@ -1107,14 +1156,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) /* Find lowest pending unblocked signal */ int signum = bit_ctz64(deliverable) + 1; - signal_rt_info_t rt_info = { - .signum = signum, - .si_code = LINUX_SI_USER, - .si_pid = (int32_t) proc_get_pid(), - .si_uid = proc_get_uid(), - .si_int = 0, - .si_ptr = 0, - }; + signal_rt_info_t rt_info = signal_default_info(signum); /* Dequeue: for RT signals, decrement count and only clear the * pending bit when 
the queue is empty. Standard signals are @@ -1123,6 +1165,8 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) if (signum >= LINUX_SIGRTMIN) { signal_rt_dequeue_locked(signum, &rt_info); } else { + rt_info = signal_standard_peek_locked(signum); + sig_state.std_info_valid[signum - 1] = false; sig_state.pending &= ~sig_bit(signum); } @@ -1210,8 +1254,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) frame.info.si_code = rt_info.si_code; frame.info.si_pid = rt_info.si_pid; frame.info.si_uid = (int32_t) rt_info.si_uid; - if (signum >= LINUX_SIGRTMIN) - frame.info.si_value = rt_info.si_ptr; + frame.info.si_value = rt_info.si_ptr; } /* ucontext: embed a per-delivery cookie in uc_flags for SROP diff --git a/src/syscall/signal.h b/src/syscall/signal.h index 91c8cef..aff266e 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -184,8 +184,13 @@ typedef struct { bool saved_blocked_valid; /* True if saved_blocked is set */ linux_stack_t altstack; /* Alternate signal stack (sigaltstack) */ bool on_altstack; /* True if currently delivering on altstack */ + /* Standard signal metadata: Linux coalesces signals 1-31, but preserves one + * siginfo payload for the pending instance. + */ + bool std_info_valid[LINUX_SIGRTMIN - 1]; + signal_rt_info_t std_info[LINUX_SIGRTMIN - 1]; /* RT signal queue: count of pending instances per signal. - * Standard signals (1-31) use only the pending bitmask (coalesced). + * Standard signals (1-31) use the pending bitmask plus std_info[]. * RT signals (32-64) are queued: each instance is tracked separately. */ int rt_queue[RT_SIGNAL_COUNT]; @@ -193,7 +198,7 @@ typedef struct { signal_rt_info_t rt_info[RT_SIGNAL_COUNT][RT_SIGQUEUE_MAX]; } signal_state_t; -/* API. */ +/* API */ /* Initialize signal state: all SIG_DFL, nothing pending/blocked. */ void signal_init(void); @@ -215,6 +220,16 @@ void signal_queue_rt(int signum, int32_t si_int, uint64_t si_ptr); +/* Queue a signal with explicit siginfo metadata. 
Standard signals preserve + * one payload while coalesced; RT signals enqueue every instance. + */ +void signal_queue_info(int signum, + int32_t si_code, + int32_t si_pid, + uint32_t si_uid, + int32_t si_int, + uint64_t si_ptr); + /* Set fault info for the next signal delivery. When set, signal_deliver() * populates si_code, si_addr, fault_address, and ESR context from these * values instead of using the default SI_USER/si_pid fields. Consumed diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index ce40446..edcc09b 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -702,7 +702,14 @@ static int64_t sc_rt_tgsigqueueinfo(guest_t *g, return -LINUX_ESRCH; linux_siginfo_t info; memset(&info, 0, sizeof(info)); - if (uinfo_gva && guest_read_small(g, uinfo_gva, &info, sizeof(info)) == 0) { + if (uinfo_gva && guest_read_small(g, uinfo_gva, &info, sizeof(info)) < 0) { + log_debug( + "rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, " + "uinfo=0x%llx [unreadable])", + tgid, tid, sig, (unsigned long long) uinfo_gva); + return -LINUX_EFAULT; + } + if (uinfo_gva) { bool is_fault = (sig == LINUX_SIGTRAP || sig == LINUX_SIGSEGV || sig == LINUX_SIGBUS || sig == LINUX_SIGFPE || sig == LINUX_SIGILL); @@ -717,25 +724,58 @@ static int64_t sc_rt_tgsigqueueinfo(guest_t *g, } else log_debug("rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, si_code=%d)", tgid, tid, sig, info.si_code); - } else - log_debug( - "rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, " - "uinfo=0x%llx [unreadable])", - tgid, tid, sig, (unsigned long long) uinfo_gva); - /* RT signals: extract sigval from the queued-signal payload fields. */ - if (sig >= LINUX_SIGRTMIN && uinfo_gva) { + } + /* Queued signals carry sigval in si_value for both standard and RT + * signals; standard signals still coalesce to one pending instance. 
+ */ + if (uinfo_gva) { int32_t si_int = 0; memcpy(&si_int, &info.si_value, sizeof(si_int)); uint64_t si_ptr = 0; memcpy(&si_ptr, &info.si_value, sizeof(si_ptr)); - signal_queue_rt(sig, info.si_code, info.si_pid, (uint32_t) info.si_uid, - si_int, si_ptr); + signal_queue_info(sig, info.si_code, info.si_pid, + (uint32_t) info.si_uid, si_int, si_ptr); } else { signal_queue(sig); } return 0; } +/* rt_sigqueueinfo(pid, sig, info) -- POSIX sigqueue() in glibc/musl uses this. + * + * The first argument is documented as a process identifier, but real Linux + * is permissive: kill_pid_info() looks pid up in the task table and routes + * the signal through PIDTYPE_TGID, so a thread id that resolves to a task + * succeeds and the signal lands in that task's thread-group pending set. + * Foreign pids that match no task return -ESRCH. + * + * elfuse mirrors this by forwarding to sc_rt_tgsigqueueinfo with + * tgid==tid==pid: the downstream thread_find() lookup accepts any guest + * thread's tid (collapsing to the single guest tgid), the + * proc_get_pid() fallback accepts the main thread's tid, and unknown + * pids fall through to -ESRCH. signal_queue_info() then queues + * process-wide so the routing semantics match Linux even though the + * lookup goes through the per-thread table. + * + * Earlier review feedback flagged "incorrectly accepting thread ids" + * and recommended a strict pid==tgid gate; that gate was tried and + * rejected because the qemu/Linux reference accepts the same tids. + */ +static int64_t sc_rt_sigqueueinfo(guest_t *g, + uint64_t x0, + uint64_t x1, + uint64_t x2, + uint64_t x3, + uint64_t x4, + uint64_t x5, + bool verbose) +{ + (void) x3; + (void) x4; + (void) x5; + return sc_rt_tgsigqueueinfo(g, x0, x0, x1, x2, 0, 0, verbose); +} + static int64_t sc_rt_sigreturn(guest_t *g, uint64_t x0, uint64_t x1, @@ -788,8 +828,8 @@ static int64_t sc_prctl(guest_t *g, case LINUX_PR_GET_DUMPABLE: return 1; case LINUX_PR_SET_CHILD_SUBREAPER: - /* Accept silently. 
elfuse's process model already reaps all - * children within the VM; the flag has no additional effect. + /* Accept silently. elfuse's process model already reaps all children + * within the VM; the flag has no additional effect. */ return 0; case LINUX_PR_GET_CHILD_SUBREAPER: { @@ -809,8 +849,8 @@ static int64_t sc_prctl(guest_t *g, return (x1 <= LINUX_CAP_LAST_CAP) ? 1 : -LINUX_EINVAL; case LINUX_PR_SET_VMA: /* PR_SET_VMA with PR_SET_VMA_ANON_NAME: accept and ignore. - * Android and memory profiling tools use this to name anonymous - * mmap regions. The name is purely advisory. + * Android and memory profiling tools use this to name anonymous mmap + * regions. The name is purely advisory. */ if ((int) x1 == LINUX_PR_SET_VMA_ANON_NAME) return 0; @@ -1168,8 +1208,8 @@ static int64_t sc_openat2(guest_t *g, return -LINUX_EAGAIN; /* For RESOLVE_NO_SYMLINKS, RESOLVE_NO_MAGICLINKS, RESOLVE_BENEATH, - * RESOLVE_IN_ROOT: read the guest path and enforce constraints - * before opening. + * RESOLVE_IN_ROOT: read the guest path and enforce constraints before + * opening. */ if (resolve & (RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS | RESOLVE_BENEATH | RESOLVE_IN_ROOT)) { @@ -1285,8 +1325,8 @@ static int64_t sc_execveat(guest_t *g, hv_vcpu_t vcpu = current_thread->vcpu; int dirfd = (int) x0, flags = (int) x4; - /* Resolve the target path before taking mmap_lock (path resolution - * may call fd_to_host / openat which do not need mmap_lock). + /* Resolve the target path before taking mmap_lock (path resolution may call + * fd_to_host / openat which do not need mmap_lock). */ uint64_t path_gva = x1; char resolved[LINUX_PATH_MAX]; @@ -1534,9 +1574,9 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) goto slow_path; /* Pre-filter: only fast-path fd types that map 1:1 to host - * read/write. This read is racy but benign; if the type - * changed, fd_to_host_dup will either fail or the slow path - * handles it correctly on fallthrough. + * read/write. 
This read is racy but benign; if the type changed, + * fd_to_host_dup will either fail or the slow path handles it + * correctly on fallthrough. */ int tp = fd_table[fd].type; if (tp != FD_REGULAR && tp != FD_STDIO && tp != FD_PIPE && diff --git a/tests/manifest.txt b/tests/manifest.txt index 17846dc..789acfd 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -49,6 +49,7 @@ test-poll # diff=skip [section] I/O subsystem tests test-eventfd test-signalfd +test-signalfd-hardening test-epoll test-epoll-edge test-timerfd diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index cd5d6d0..39e06a6 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -346,6 +346,8 @@ run_unit_tests() printf "\nI/O subsystem\n" test_check "$runner" "test-eventfd" "0 failed" "$bindir/test-eventfd" test_check "$runner" "test-signalfd" "0 failed" "$bindir/test-signalfd" + test_check "$runner" "test-signalfd-hardening" "0 failed" \ + "$bindir/test-signalfd-hardening" test_check "$runner" "test-epoll" "0 failed" "$bindir/test-epoll" test_check "$runner" "test-epoll-edge" "0 failed" "$bindir/test-epoll-edge" test_check "$runner" "test-timerfd" "0 failed" "$bindir/test-timerfd" diff --git a/tests/test-signalfd-hardening.c b/tests/test-signalfd-hardening.c new file mode 100644 index 0000000..38b7f9d --- /dev/null +++ b/tests/test-signalfd-hardening.c @@ -0,0 +1,871 @@ +/* signalfd read semantics hardening + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Covers: + * 1. RT signal multiplicity: each sigqueue/rt_tgsigqueueinfo enqueues a + * distinct instance with its own si_int payload, returned in FIFO + * order without coalescing. + * 2. Standard signals (1-31) coalesce -- multiple kill()s produce one + * signalfd record (kernel parity). + * 3. ssi_int / ssi_ptr round-trip via sigqueue() (rt_sigqueueinfo) and + * direct rt_tgsigqueueinfo. + * 4. 
SIGRTMAX (signum 64) is reachable via signalfd (regression for the + * off-by-one that excluded signum == LINUX_NSIG from the collect / + * take loops). + * 5. signalfd's own mask is the only filter -- per-thread blocked mask + * is intentionally not consulted, matching Linux semantics. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#ifndef SYS_rt_tgsigqueueinfo +#define SYS_rt_tgsigqueueinfo 240 +#endif + +#ifndef SYS_rt_sigqueueinfo +#define SYS_rt_sigqueueinfo 138 +#endif + +/* siginfo_t crosses both glibc and musl, but si_value layouts differ. + * Build the kernel-shaped buffer by hand so the test stays libc-agnostic. + */ +static void build_kernel_siginfo(int sig, + int code, + pid_t sender_pid, + uid_t sender_uid, + int payload_int, + void *payload_ptr, + unsigned char out[128]) +{ + memset(out, 0, 128); + int32_t s32; + uint64_t u64; + s32 = sig; + memcpy(out + 0, &s32, 4); + s32 = 0; + memcpy(out + 4, &s32, 4); /* si_errno */ + s32 = code; + memcpy(out + 8, &s32, 4); + /* offset 12 is _pad0 (or part of _sifields alignment). Linux's _sifields + * starts at offset 16 on aarch64; for SI_QUEUE the layout there is: + * si_pid (4) si_uid (4) si_value (8) + */ + s32 = sender_pid; + memcpy(out + 16, &s32, 4); + s32 = sender_uid; + memcpy(out + 20, &s32, 4); + s32 = payload_int; + memcpy(out + 24, &s32, 4); + /* Kernel ignores the upper 4 bytes of si_value's int form, but writes the + * pointer form into the full 8-byte slot at offset 24 for sigval_t. The + * pointer goes into the low 8 bytes so signal_queue_rt() reads either + * representation correctly. + */ + u64 = (uint64_t) (uintptr_t) payload_ptr; + memcpy(out + 24, &u64, 8); + /* If both int and ptr are set, ptr wins because it overlaps. Tests pick + * one or the other. 
+ */ + if (payload_ptr == NULL) { + s32 = payload_int; + memcpy(out + 24, &s32, 4); + } +} + +static int raw_rt_tgsigqueueinfo(pid_t tgid, + pid_t tid, + int sig, + const unsigned char info[128]) +{ + return (int) syscall(SYS_rt_tgsigqueueinfo, tgid, tid, sig, info); +} + +static int raw_rt_sigqueueinfo(pid_t pid, int sig, const void *info) +{ + return (int) syscall(SYS_rt_sigqueueinfo, pid, sig, info); +} + +static void test_rt_multiplicity(void) +{ + TEST("RT multiplicity FIFO + payload"); + + int sig = SIGRTMIN + 1; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + const int payloads[] = {0x1111, 0x2222, 0x3333}; + const int N = sizeof(payloads) / sizeof(payloads[0]); + pid_t pid = getpid(); + for (int i = 0; i < N; i++) { + unsigned char info[128]; + /* SI_QUEUE == -1 is the kernel marker for sigqueue-style payload. */ + build_kernel_siginfo(sig, -1, pid, getuid(), payloads[i], NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + } + + struct signalfd_siginfo buf[4]; + memset(buf, 0, sizeof(buf)); + ssize_t r = read(fd, buf, sizeof(buf)); + close(fd); + + if (r != (ssize_t) (N * sizeof(buf[0]))) { + printf("FAIL: read returned %zd, expected %zu\n", r, + N * sizeof(buf[0])); + fails++; + return; + } + for (int i = 0; i < N; i++) { + if (buf[i].ssi_signo != (uint32_t) sig) { + printf("FAIL: record %d ssi_signo=%u, expected %d\n", i, + buf[i].ssi_signo, sig); + fails++; + return; + } + if (buf[i].ssi_int != payloads[i]) { + printf("FAIL: record %d ssi_int=0x%x, expected 0x%x\n", i, + buf[i].ssi_int, payloads[i]); + fails++; + return; + } + } + PASS(); +} + +static void test_standard_coalesces(void) +{ + TEST("standard signals coalesce"); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + sigprocmask(SIG_BLOCK, &mask, 
NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + /* Three kill()s should produce exactly one signalfd record (Linux + * coalesces standard signals on the pending bitmask). + */ + kill(getpid(), SIGUSR1); + kill(getpid(), SIGUSR1); + kill(getpid(), SIGUSR1); + + struct signalfd_siginfo buf[4]; + memset(buf, 0, sizeof(buf)); + ssize_t r = read(fd, buf, sizeof(buf)); + if (r != (ssize_t) sizeof(buf[0])) { + printf("FAIL: expected one record (%zu bytes), got %zd\n", + sizeof(buf[0]), r); + close(fd); + fails++; + return; + } + if (buf[0].ssi_signo != (uint32_t) SIGUSR1) { + printf("FAIL: ssi_signo=%u\n", buf[0].ssi_signo); + close(fd); + fails++; + return; + } + /* Second read drains nothing -- pending bit cleared. */ + errno = 0; + ssize_t r2 = read(fd, buf, sizeof(buf)); + close(fd); + if (r2 != -1 || errno != EAGAIN) { + FAIL("expected EAGAIN on follow-up read"); + return; + } + PASS(); +} + +static void test_sigrtmax_reachable(void) +{ + /* SIGRTMAX (64 on aarch64) was excluded by an off-by-one in the + * collect/take loops (signum < LINUX_NSIG instead of <= LINUX_NSIG). + * This test fails before the fix and passes after. 
+ */ + TEST("SIGRTMAX reaches signalfd"); + + int sig = SIGRTMAX; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + pid_t pid = getpid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), 0xCAFEBABE, NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo SIGRTMAX"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + printf("FAIL: read returned %zd\n", r); + fails++; + return; + } + if (rec.ssi_signo != (uint32_t) sig || + rec.ssi_int != (int32_t) 0xCAFEBABE) { + printf("FAIL: signo=%u int=0x%x\n", rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_ssi_ptr_roundtrip(void) +{ + /* sigval has separate int and ptr forms. For the ptr form the full 64 + * bits land in si_value; signalfd_siginfo exposes both ssi_int (low 32) + * and ssi_ptr (full 64). Verify both are populated from one queued ptr. + */ + TEST("ssi_ptr / ssi_int round-trip"); + + int sig = SIGRTMIN + 2; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + /* Use an arbitrary pointer-shaped value with a high bit set so a + * truncating implementation drops information detectably. 
+ */ + void *payload = (void *) 0x0123456789ABCDEFULL; + pid_t pid = getpid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), 0, payload, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_ptr != (uint64_t) (uintptr_t) payload) { + printf("FAIL: ssi_ptr=0x%llx, expected 0x%llx\n", + (unsigned long long) rec.ssi_ptr, + (unsigned long long) (uintptr_t) payload); + fails++; + return; + } + /* ssi_int aliases the low 32 bits of the same union. */ + if (rec.ssi_int != (int32_t) (uintptr_t) payload) { + printf("FAIL: ssi_int=0x%x\n", rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_sender_metadata(void) +{ + /* Verify ssi_pid / ssi_uid carry the sender values supplied via + * rt_tgsigqueueinfo's siginfo (Linux-style SI_QUEUE: caller fills + * si_pid/si_uid; kernel does not override for negative si_code). 
+ */ + TEST("ssi_pid / ssi_uid from sender"); + + int sig = SIGRTMIN + 3; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + pid_t pid = getpid(); + uid_t uid = getuid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, uid, 0x55AA, NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_pid != (uint32_t) pid || rec.ssi_uid != uid) { + printf("FAIL: ssi_pid=%u (want %d), ssi_uid=%u (want %u)\n", + rec.ssi_pid, pid, rec.ssi_uid, uid); + fails++; + return; + } + if (rec.ssi_code != -1) { + printf("FAIL: ssi_code=%d (want -1 SI_QUEUE)\n", rec.ssi_code); + fails++; + return; + } + PASS(); +} + +static void test_mask_filters_only(void) +{ + /* signalfd's own mask is the sole filter: a signal blocked from + * synchronous delivery via sigprocmask is still readable from the + * signalfd if its mask includes the signal. + */ + TEST("signalfd mask filters, not pthread mask"); + + sigset_t pblock; + sigemptyset(&pblock); + sigaddset(&pblock, SIGUSR1); + sigaddset(&pblock, SIGUSR2); + sigprocmask(SIG_BLOCK, &pblock, NULL); + + /* signalfd only watches SIGUSR1. SIGUSR2 stays pending in the process + * pending set after kill(), but must not appear in the read result. 
+ */ + sigset_t fdmask; + sigemptyset(&fdmask); + sigaddset(&fdmask, SIGUSR1); + + int fd = signalfd(-1, &fdmask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + kill(getpid(), SIGUSR2); + kill(getpid(), SIGUSR1); + + struct signalfd_siginfo rec[4]; + memset(rec, 0, sizeof(rec)); + ssize_t r = read(fd, rec, sizeof(rec)); + if (r != (ssize_t) sizeof(rec[0])) { + printf("FAIL: expected one record, got %zd\n", r); + close(fd); + fails++; + /* Drain SIGUSR2 to keep state clean for later tests. */ + sigset_t draino; + sigemptyset(&draino); + sigaddset(&draino, SIGUSR2); + int tmp = signalfd(-1, &draino, SFD_NONBLOCK); + if (tmp >= 0) { + (void) read(tmp, rec, sizeof(rec)); + close(tmp); + } + return; + } + if (rec[0].ssi_signo != (uint32_t) SIGUSR1) { + printf("FAIL: got signo=%u, expected SIGUSR1\n", rec[0].ssi_signo); + close(fd); + fails++; + return; + } + close(fd); + + /* SIGUSR2 must still be pending -- prove by widening mask and reading. */ + sigaddset(&fdmask, SIGUSR2); + int fd2 = signalfd(-1, &fdmask, SFD_NONBLOCK); + if (fd2 < 0) { + FAIL("signalfd 2"); + return; + } + memset(rec, 0, sizeof(rec)); + r = read(fd2, rec, sizeof(rec)); + close(fd2); + if (r != (ssize_t) sizeof(rec[0]) || + rec[0].ssi_signo != (uint32_t) SIGUSR2) { + printf("FAIL: SIGUSR2 not pending after first read (r=%zd)\n", r); + fails++; + return; + } + PASS(); +} + +static void test_sigqueue_libc_path(void) +{ + /* glibc / musl sigqueue() goes through SYS_rt_sigqueueinfo (138). + * Without that wired in, sigqueue() returns ENOSYS and apps that rely + * on POSIX queued signals (real-time apps, gdb) break. Verify the + * libc path produces a payload-bearing record. 
+ */ + TEST("libc sigqueue() round-trip"); + + int sig = SIGRTMIN + 4; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + union sigval sv; + sv.sival_int = 0x4242; + if (sigqueue(getpid(), sig, sv) != 0) { + close(fd); + FAIL("sigqueue"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_signo != (uint32_t) sig || rec.ssi_int != 0x4242) { + printf("FAIL: signo=%u int=0x%x\n", rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_sigqueue_standard_metadata(void) +{ + TEST("standard sigqueue() keeps metadata"); + + int sig = SIGUSR1; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + union sigval sv; + sv.sival_int = 0x5151; + if (sigqueue(getpid(), sig, sv) != 0) { + close(fd); + FAIL("sigqueue std"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_signo != (uint32_t) sig || rec.ssi_int != 0x5151 || + rec.ssi_code != SI_QUEUE || rec.ssi_pid != (uint32_t) getpid() || + rec.ssi_uid != (uint32_t) getuid()) { + printf("FAIL: signo=%u int=0x%x code=%d pid=%u uid=%u\n", rec.ssi_signo, + rec.ssi_int, rec.ssi_code, rec.ssi_pid, rec.ssi_uid); + fails++; + return; + } + PASS(); +} + +static void test_partial_fault_returns_partial_bytes(void) +{ + /* Partial-fault recovery (write-then-take semantics). + * + * Queue four RT signals (payloads 0xA1..0xA4). 
Place a 4-record buffer + * so records 0 and 1 land in a valid page but records 2 and 3 cross + * into an unmapped page. The bridge writes 2 records, hits EFAULT + * trying to write record 2, returns partial bytes (2 * 128) -- and + * crucially does NOT take records 2 and 3 from the rt-queue, so they + * remain pending in original FIFO order. The follow-up read returns + * exactly two records with payloads 0xA3 then 0xA4 (no duplication + * of 0xA1 / 0xA2; no re-queue path that could overflow RT_SIGQUEUE_MAX + * or desync the notification pipe). + */ + TEST("partial fault: partial bytes + FIFO"); + + int sig = SIGRTMIN + 5; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + long page = sysconf(_SC_PAGESIZE); + if (page <= 0) + page = 4096; + void *region = mmap(NULL, page * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region == MAP_FAILED) { + close(fd); + FAIL("mmap guard region"); + return; + } + if (munmap((char *) region + page, page) != 0) { + munmap(region, page); + close(fd); + FAIL("munmap guard"); + return; + } + + pid_t pid = getpid(); + const int payloads[] = {0xA1, 0xA2, 0xA3, 0xA4}; + const int N = 4; + for (int i = 0; i < N; i++) { + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), payloads[i], NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + munmap(region, page); + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + } + + char *buf = (char *) region + page - (2 * 128); + errno = 0; + ssize_t r = read(fd, buf, 4 * sizeof(struct signalfd_siginfo)); + if (r != (ssize_t) (2 * sizeof(struct signalfd_siginfo))) { + printf("FAIL: expected 256 partial bytes, got r=%zd errno=%d\n", r, + errno); + munmap(region, page); + close(fd); + fails++; + return; + } + + struct signalfd_siginfo *delivered = (struct signalfd_siginfo *) buf; + if 
(delivered[0].ssi_signo != (uint32_t) sig || + delivered[0].ssi_int != payloads[0] || + delivered[1].ssi_signo != (uint32_t) sig || + delivered[1].ssi_int != payloads[1]) { + munmap(region, page); + close(fd); + printf("FAIL: page 1 records not [0x%x,0x%x]: got [0x%x,0x%x]\n", + payloads[0], payloads[1], delivered[0].ssi_int, + delivered[1].ssi_int); + fails++; + return; + } + munmap(region, page); + + /* Follow-up read into a fully-valid buffer. + * + * Linux dequeues the record being copied before checking copy_to_user, + * so the record that hit EFAULT (payloads[2]) is lost; a follow-up + * read returns one record (payloads[3]). elfuse defers the take until + * the write succeeds, so a follow-up read returns two records + * (payloads[2] then payloads[3]) in original FIFO order. + * + * Both behaviors are accepted: the contract under test is "no + * duplication of records that already reached the guest, no + * out-of-order delivery within whatever survives, and the last + * queued payload is always preserved." 
+ */ + struct signalfd_siginfo recs[8]; + memset(recs, 0, sizeof(recs)); + ssize_t r2 = read(fd, recs, sizeof(recs)); + close(fd); + size_t recs_returned = (size_t) r2 / sizeof(recs[0]); + bool linux_loose = + (r2 == (ssize_t) sizeof(recs[0])) && recs[0].ssi_int == payloads[3]; + bool elfuse_strict = (r2 == (ssize_t) (2 * sizeof(recs[0]))) && + recs[0].ssi_int == payloads[2] && + recs[1].ssi_int == payloads[3]; + if (!linux_loose && !elfuse_strict) { + printf( + "FAIL: follow-up read returned %zd bytes (%zu records); " + "first=0x%x second=0x%x; expected either [0x%x] or [0x%x,0x%x]\n", + r2, recs_returned, recs[0].ssi_int, recs[1].ssi_int, payloads[3], + payloads[2], payloads[3]); + fails++; + return; + } + PASS(); +} + +static void test_rt_sigqueueinfo_bad_pointer_efault(void) +{ + TEST("rt_sigqueueinfo unreadable siginfo faults"); + + int sig = SIGRTMIN + 6; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + errno = 0; + int ret = raw_rt_sigqueueinfo(getpid(), sig, (const void *) 1); + if (ret != -1 || errno != EFAULT) { + printf("FAIL: rt_sigqueueinfo unreadable info ret=%d errno=%d\n", ret, + errno); + close(fd); + fails++; + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + errno = 0; + ssize_t r = read(fd, &rec, sizeof(rec)); + if (r != -1 || errno != EAGAIN) { + printf("FAIL: bad rt_sigqueueinfo queued a signal r=%d errno=%d\n", + (int) r, errno); + close(fd); + fails++; + return; + } + + close(fd); + PASS(); +} + +static void test_rt_sigqueueinfo_rejects_foreign_pid(void) +{ + /* rt_sigqueueinfo is a process-scoped (tgid) syscall. A pid that does + * not name the current process must return ESRCH instead of routing + * the signal through whichever thread happened to share the numeric + * id. 
The first probe picks a pid the host kernel cannot have + * assigned to the current guest so the call cannot collide with a + * legitimate target. + */ + TEST("rt_sigqueueinfo rejects foreign pid"); + + unsigned char info[128]; + build_kernel_siginfo(SIGRTMIN, -1, getpid(), getuid(), 0xDEAD, NULL, info); + + errno = 0; + int ret = raw_rt_sigqueueinfo(0x7FFFFFFE, SIGRTMIN, info); + if (ret != -1 || errno != ESRCH) { + printf("FAIL: foreign pid: ret=%d errno=%d (expected ESRCH)\n", ret, + errno); + fails++; + return; + } + PASS(); +} + +/* Helpers for the worker-thread tid case. The worker publishes its own + * tid via a thread-shared variable, then waits on a barrier so the main + * thread can call rt_sigqueueinfo with that tid before the worker exits. + */ +typedef struct { + pthread_mutex_t mtx; + pthread_cond_t ready_cv; + pthread_cond_t go_cv; + pid_t worker_tid; + bool ready; + bool go; +} worker_sync_t; + +static void *tid_worker(void *arg) +{ + worker_sync_t *s = arg; + pthread_mutex_lock(&s->mtx); + s->worker_tid = (pid_t) syscall(SYS_gettid); + s->ready = true; + pthread_cond_signal(&s->ready_cv); + while (!s->go) + pthread_cond_wait(&s->go_cv, &s->mtx); + pthread_mutex_unlock(&s->mtx); + return NULL; +} + +static void test_rt_sigqueueinfo_thread_tid_routes_to_tgid(void) +{ + /* Linux is permissive: rt_sigqueueinfo(tid_of_any_thread, ...) + * succeeds and the signal lands in the thread group's pending set + * (kill_pid_info routes through PIDTYPE_TGID). The contract under + * test is that elfuse matches that routing: a worker thread tid is + * accepted, and the queued signal becomes readable from the process + * signalfd. A regression that scoped the syscall to "tgid only" + * would surface here as ESRCH. + */ + TEST("rt_sigqueueinfo tid routes to tgid"); + + /* Block SIGRTMIN process-wide so the queued signal stays pending + * for signalfd to read instead of terminating the process. 
+ */ + sigset_t block; + sigemptyset(&block); + sigaddset(&block, SIGRTMIN); + sigprocmask(SIG_BLOCK, &block, NULL); + + worker_sync_t s; + pthread_mutex_init(&s.mtx, NULL); + pthread_cond_init(&s.ready_cv, NULL); + pthread_cond_init(&s.go_cv, NULL); + s.worker_tid = -1; + s.ready = false; + s.go = false; + + pthread_t th; + if (pthread_create(&th, NULL, tid_worker, &s) != 0) { + FAIL("pthread_create"); + return; + } + + pthread_mutex_lock(&s.mtx); + while (!s.ready) + pthread_cond_wait(&s.ready_cv, &s.mtx); + pid_t worker_tid = s.worker_tid; + pthread_mutex_unlock(&s.mtx); + + if (worker_tid == getpid()) { + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + FAIL("worker tid equals process pid"); + return; + } + + int sfd_fd = signalfd(-1, &block, SFD_NONBLOCK); + if (sfd_fd < 0) { + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + FAIL("signalfd"); + return; + } + + unsigned char info[128]; + build_kernel_siginfo(SIGRTMIN, -1, getpid(), getuid(), 0xBEEF, NULL, info); + + errno = 0; + int ret = raw_rt_sigqueueinfo(worker_tid, SIGRTMIN, info); + int err = errno; + + /* Drain any queued signal via signalfd before letting the worker + * exit so the signal does not leak into pthread_join. 
+ */ + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t got = -1; + int got_err = 0; + if (ret == 0) { + errno = 0; + got = read(sfd_fd, &rec, sizeof(rec)); + got_err = errno; + } + close(sfd_fd); + + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + pthread_mutex_destroy(&s.mtx); + pthread_cond_destroy(&s.ready_cv); + pthread_cond_destroy(&s.go_cv); + + if (ret != 0) { + printf("FAIL: worker tid %d: ret=%d errno=%d (expected 0)\n", + (int) worker_tid, ret, err); + fails++; + return; + } + if (got != (ssize_t) sizeof(rec) || rec.ssi_signo != (uint32_t) SIGRTMIN || + rec.ssi_int != 0xBEEF) { + printf("FAIL: signalfd read got=%zd errno=%d signo=%u int=0x%x\n", got, + got_err, rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +int main(void) +{ + printf("test-signalfd-hardening: signalfd read semantics audit\n"); + + test_rt_multiplicity(); + test_standard_coalesces(); + test_sigrtmax_reachable(); + test_ssi_ptr_roundtrip(); + test_sender_metadata(); + test_mask_filters_only(); + test_sigqueue_libc_path(); + test_sigqueue_standard_metadata(); + test_partial_fault_returns_partial_bytes(); + test_rt_sigqueueinfo_bad_pointer_efault(); + test_rt_sigqueueinfo_rejects_foreign_pid(); + test_rt_sigqueueinfo_thread_tid_routes_to_tgid(); + + SUMMARY("test-signalfd-hardening"); + return fails > 0 ? 1 : 0; +}