diff --git a/guides/self-hosting.md b/guides/self-hosting.md index 2d218eb..b1601a4 100644 --- a/guides/self-hosting.md +++ b/guides/self-hosting.md @@ -166,6 +166,23 @@ Put this behind a TLS-terminating reverse proxy (Caddy, nginx, Traefik) — asobi_lua speaks plain HTTP/WebSocket and expects the proxy to handle certificates. +## Tuning knobs + +These are read at start time from your `sys.config`. + +| Key | Default | What it does | +|---|---|---| +| `asobi_lua.max_heap_words` | `5_000_000` | Per-eval heap cap (in Erlang words) for every Lua callback the runtime invokes. If a single eval allocates past this, the eval process is killed by the VM and the runtime returns `{error, heap_exhausted}`. Persistent state held by the gen_server is not touched — only the runaway eval. Raise only if a single tick legitimately constructs a very large local structure; long-lived tables belong in the persistent Luerl state and cost nothing per eval. | + +```erlang +%% sys.config +[ + {asobi_lua, [ + {max_heap_words, 10_000_000} + ]} +]. +``` + ## Operating notes - **Database backups.** Postgres holds session tokens, world diff --git a/src/lua/asobi_lua_loader.erl b/src/lua/asobi_lua_loader.erl index 0bcf546..a7343aa 100644 --- a/src/lua/asobi_lua_loader.erl +++ b/src/lua/asobi_lua_loader.erl @@ -42,6 +42,16 @@ load a specific script and pin its base directory for `require`. %% notices the hang. -define(DEFAULT_INIT_TIMEOUT_MS, 2000). +%% Per-eval heap cap. A correctly-written tick handler should not +%% allocate near 40MB; legitimate large state lives in the persistent +%% Luerl state held by the gen_server, not in the per-eval process. +%% Configurable via `asobi_lua.max_heap_words` for ops with unusual +%% workloads. `kill => true` makes the VM kill the eval process if it +%% allocates past the limit; the parent receives `{'DOWN', _, _, _, +%% killed}` and surfaces `{error, heap_exhausted}` so the caller can +%% distinguish heap-blow from timeout. 
+-define(DEFAULT_MAX_HEAP_WORDS, 5_000_000). + -spec new(binary() | string()) -> {ok, dynamic()} | {error, term()}. new(ScriptPath) -> new(ScriptPath, ?DEFAULT_INIT_TIMEOUT_MS). @@ -67,10 +77,8 @@ new(ScriptPath, TimeoutMs) -> -spec do_with_timeout(string() | binary(), dynamic(), non_neg_integer()) -> {ok, dynamic()} | {error, term()}. do_with_timeout(Code, St, TimeoutMs) -> - Self = self(), - Ref = make_ref(), - Pid = spawn(fun() -> - Result = + bounded_eval( + fun() -> try luerl:do(ensure_string(Code), St) of {ok, _Results, St1} -> {ok, St1}; {error, Errors, _} -> {error, {lua_error, Errors}}; @@ -78,19 +86,10 @@ do_with_timeout(Code, St, TimeoutMs) -> catch error:{lua_error, Reason, _} -> {error, {lua_error, Reason}}; error:Reason -> {error, Reason} - end, - Self ! {Ref, Result} - end), - receive - {Ref, Result} -> Result - after TimeoutMs -> - exit(Pid, kill), - receive - {Ref, _} -> ok - after 0 -> ok + end end, - {error, timeout} - end. + TimeoutMs + ). -spec init_sandboxed() -> dynamic(). init_sandboxed() -> @@ -119,23 +118,67 @@ call(FuncPath, Args, St) -> end. -spec call(atom() | [atom() | binary()], [term()], dynamic(), non_neg_integer()) -> - {ok, [term()], dynamic()} | {error, timeout | term()}. + {ok, [term()], dynamic()} | {error, timeout | heap_exhausted | term()}. call(FuncPath, Args, St, TimeoutMs) -> + bounded_eval(fun() -> call(FuncPath, Args, St) end, TimeoutMs). + +%% Spawn the work in a child with a bounded wall-clock budget AND a +%% bounded heap, monitor it, and translate the three terminal states +%% the parent might observe into return values: +%% - normal exit + {Ref, Result} message → Result +%% - timeout (we kill it, exit reason `kill`) → {error, timeout} +%% - VM kills it for heap (exit reason `killed`) → {error, heap_exhausted} +%% A heap kill happens *before* the worker can send {Ref, _}, so the +%% DOWN message races. We give the message a tiny grace window in case +%% it is in flight. 
+-spec bounded_eval(fun(() -> R), non_neg_integer()) ->
+    R | {error, timeout | heap_exhausted | {worker_exit, term()}}.
+bounded_eval(Fun, TimeoutMs) ->
     Self = self(),
     Ref = make_ref(),
-    Pid = spawn(fun() ->
-        Result = call(FuncPath, Args, St),
-        Self ! {Ref, Result}
-    end),
+    %% NOTE(review): spawning with a fun copies its captured environment to
+    %% the child's heap, so large terms closed over by Fun presumably count
+    %% toward max_heap_size — confirm against the tuning docs.
+    SpawnOpts = [
+        monitor,
+        {max_heap_size, #{
+            size => max_heap_words(),
+            kill => true,
+            error_logger => true,
+            include_shared_binaries => false
+        }}
+    ],
+    {Pid, MonRef} = spawn_opt(fun() -> Self ! {Ref, Fun()} end, SpawnOpts),
     receive
-        {Ref, Result} -> Result
+        {Ref, Result} ->
+            erlang:demonitor(MonRef, [flush]),
+            Result;
+        %% `killed` before we have sent any kill of our own is attributed
+        %% to the VM enforcing max_heap_size.
+        {'DOWN', MonRef, process, Pid, killed} ->
+            {error, heap_exhausted};
+        {'DOWN', MonRef, process, Pid, Reason} ->
+            {error, {worker_exit, Reason}}
     after TimeoutMs ->
         exit(Pid, kill),
         receive
-            {Ref, _} -> ok
-        after 0 -> ok
-        end,
-        {error, timeout}
+            %% A monitored pid always yields DOWN. Once it is here, any
+            %% {Ref, _} the worker managed to send is already queued.
+            {'DOWN', MonRef, process, Pid, _} -> ok
+        end,
+        receive
+            %% Finished right at the deadline: keep the result.
+            {Ref, Result} -> Result
+        after 0 -> {error, timeout}
+        end
+    end.
+
+%% Heap cap in words: a positive integer from the app env, else the default.
+-spec max_heap_words() -> pos_integer().
+max_heap_words() ->
+    case application:get_env(asobi_lua, max_heap_words) of
+        {ok, N} when is_integer(N), N > 0 -> N;
+        _ -> ?DEFAULT_MAX_HEAP_WORDS
     end.
 
%% --- Internal: state construction & sandbox --- diff --git a/test/asobi_lua_loader_tests.erl b/test/asobi_lua_loader_tests.erl index d4acdca..94f67b2 100644 --- a/test/asobi_lua_loader_tests.erl +++ b/test/asobi_lua_loader_tests.erl @@ -28,6 +28,8 @@ loader_test_() -> {"require loads submodule", fun require_loads_submodule/0}, {"call with timeout succeeds", fun call_with_timeout_ok/0}, {"call with timeout returns error on slow script", fun call_with_timeout_slow/0}, + {"call with heap cap returns error on heap bomb", fun call_heap_bomb/0}, + {"max_heap_words honors application env override", fun max_heap_env_override/0}, {"math.random works", fun math_random_works/0}, {"math.sqrt works", fun math_sqrt_works/0}, {"math.random no args returns float", fun math_random_no_args/0} @@ -73,6 +75,42 @@ call_with_timeout_slow() -> Cfg = encode_map(#{}, St), {error, timeout} = asobi_lua_loader:call(tick, [Cfg], St, 50). +%% A tick that allocates an unbounded table must be killed by the per-eval +%% heap cap and surface as `heap_exhausted`, not as a timeout. Use a +%% small heap budget so the eval trips quickly even on fast hardware. +call_heap_bomb() -> + OldEnv = application:get_env(asobi_lua, max_heap_words), + application:set_env(asobi_lua, max_heap_words, 200_000), + try + {ok, St} = asobi_lua_loader:new(fixture("heap_bomb.lua")), + Cfg = encode_map(#{}, St), + ?assertEqual( + {error, heap_exhausted}, + asobi_lua_loader:call(tick, [Cfg], St, 5000) + ) + after + case OldEnv of + {ok, V} -> application:set_env(asobi_lua, max_heap_words, V); + undefined -> application:unset_env(asobi_lua, max_heap_words) + end + end. + +%% A normal call still succeeds when an env override is set, proving the +%% override path is read on every eval rather than baked in once. 
+max_heap_env_override() -> + OldEnv = application:get_env(asobi_lua, max_heap_words), + application:set_env(asobi_lua, max_heap_words, 5_000_000), + try + {ok, St} = asobi_lua_loader:new(fixture("test_match.lua")), + Cfg = encode_map(#{}, St), + {ok, [_ | _], _} = asobi_lua_loader:call(init, [Cfg], St, 5000) + after + case OldEnv of + {ok, V} -> application:set_env(asobi_lua, max_heap_words, V); + undefined -> application:unset_env(asobi_lua, max_heap_words) + end + end. + math_random_works() -> {ok, St} = asobi_lua_loader:new(fixture("test_match.lua")), {ok, [Result | _], _} = asobi_lua_loader:call( diff --git a/test/fixtures/lua/heap_bomb.lua b/test/fixtures/lua/heap_bomb.lua new file mode 100644 index 0000000..28b729f --- /dev/null +++ b/test/fixtures/lua/heap_bomb.lua @@ -0,0 +1,33 @@ +-- Match whose tick allocates an unbounded table to trip the per-eval +-- heap cap. The init/join/leave/get_state callbacks are minimal; only +-- tick is the heap bomb so we can construct a state and then trigger +-- the limit from a single call. +function init(config) + return { players = {} } +end + +function join(player_id, state) + state.players[player_id] = {} + return state +end + +function leave(player_id, state) + state.players[player_id] = nil + return state +end + +function handle_input(player_id, input, state) + return state +end + +function tick(state) + local t = {} + for i = 1, 100000000 do + t[i] = { i, i, i, i, i, i, i, i, i, i } + end + return state +end + +function get_state(player_id, state) + return { players = state.players } +end