From c049b04e766709d713d391b43ebd5127d3a04915 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Wed, 22 Apr 2026 20:46:21 -0700 Subject: [PATCH 01/10] Add list_strings, get_callers, and disassemble to DecompilerInterface Adds three new cross-decompiler APIs, wires them through the client, and implements them for angr and Ghidra: - list_strings(filter=regex): list (addr, text) tuples, regex-filterable - get_callers(target): Function, address, or name -> callers - disassemble(addr): text disassembly of a function Also fixes a latent bug in the angr xrefs_to where self.main_instance.kb was used in headless mode (main_instance == self). --- libbs/api/decompiler_client.py | 12 +++++ libbs/api/decompiler_interface.py | 66 +++++++++++++++++++++++++++ libbs/decompilers/angr/interface.py | 63 +++++++++++++++++++++++-- libbs/decompilers/ghidra/interface.py | 53 +++++++++++++++++++++ 4 files changed, 189 insertions(+), 5 deletions(-) diff --git a/libbs/api/decompiler_client.py b/libbs/api/decompiler_client.py index 95c4fee..17282be 100644 --- a/libbs/api/decompiler_client.py +++ b/libbs/api/decompiler_client.py @@ -532,6 +532,18 @@ def decompile(self, addr: int, map_lines=False, **kwargs) -> Optional[Decompilat def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List[Artifact]: """Get cross-references to an artifact""" return self._send_request({"type": "method_call", "method_name": "xrefs_to", "args": [artifact], "kwargs": {"decompile": decompile, "only_code": only_code}}) + + def get_callers(self, target) -> List[Function]: + """Get callers of a function (by target Function, address, or symbol name)""" + return self._send_request({"type": "method_call", "method_name": "get_callers", "args": [target]}) + + def list_strings(self, filter: Optional[str] = None) -> List: + """List strings in the binary with an optional regex filter""" + return self._send_request({"type": "method_call", "method_name": "list_strings", "kwargs": {"filter": filter}}) + + def 
disassemble(self, addr: int, **kwargs) -> Optional[str]: + """Disassemble a function""" + return self._send_request({"type": "method_call", "method_name": "disassemble", "args": [addr], "kwargs": kwargs}) def get_callgraph(self, only_names=False): """Get the call graph""" diff --git a/libbs/api/decompiler_interface.py b/libbs/api/decompiler_interface.py index d1438aa..7ff2cfe 100644 --- a/libbs/api/decompiler_interface.py +++ b/libbs/api/decompiler_interface.py @@ -397,6 +397,72 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List return [] + def get_callers(self, target) -> List[Function]: + """ + Returns a list of Functions that call/reference the provided target. + + @param target: A Function, address (int), or symbol name (str). + @return: List of Function objects whose bodies reference `target`. Each result is a (light) + Function; only its addr (and name when resolvable) are guaranteed to be populated. + """ + func: Optional[Function] = None + if isinstance(target, Function): + func = target + elif isinstance(target, int): + func = self.fast_get_function(target) + if func is None: + func = Function(target, 0) + elif isinstance(target, str): + for addr, light_func in self.functions.items(): + if light_func.name == target: + func = self.fast_get_function(addr) or Function(addr, 0) + break + if func is None: + raise ValueError(f"Unable to locate function named {target!r}") + else: + raise ValueError(f"Unsupported target type for get_callers: {type(target)}") + + callers: List[Function] = [] + seen = set() + for xref in self.xrefs_to(func): + if not isinstance(xref, Function): + continue + if xref.addr in seen: + continue + seen.add(xref.addr) + if not xref.name: + resolved = self.fast_get_function(xref.addr) + if resolved is not None: + xref = resolved + callers.append(xref) + + return callers + + def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: + """ + Returns a list of (addr, string) tuples for 
strings found in the binary. + + Subclasses are expected to override this to provide decompiler-native string discovery + (which is typically much faster and more accurate). The base implementation returns an + empty list. + + @param filter: Optional regex string; only strings that match will be returned. + @return: List of (address, string) tuples. + """ + return [] + + def disassemble(self, addr: int, **kwargs) -> Optional[str]: + """ + Returns the disassembly of a function as a single string. + + Subclasses should override this to emit decompiler-native disassembly. The default + implementation returns None. + + @param addr: Address of the function (or any address inside the function). + @return: The disassembly string, or None if unavailable. + """ + return None + def get_callgraph(self, only_names=False) -> nx.DiGraph: """ Returns the callgraph of the binary. This is a dict of function addresses to a list of function addresses diff --git a/libbs/decompilers/angr/interface.py b/libbs/decompilers/angr/interface.py index 4d9add7..e5694a1 100644 --- a/libbs/decompilers/angr/interface.py +++ b/libbs/decompilers/angr/interface.py @@ -1,8 +1,9 @@ import logging import os +import re from collections import defaultdict from functools import lru_cache -from typing import Optional, Dict, List +from typing import Optional, Dict, List, Tuple from pathlib import Path import angr @@ -114,7 +115,7 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List l.warning("only_code is not supported in angr.") function: Function = self.art_lifter.lower(artifact) - program_cfg = self.main_instance.kb.cfgs.get_most_accurate() + program_cfg = self.main_instance.project.kb.cfgs.get_most_accurate() if program_cfg is None: return [] @@ -123,15 +124,67 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List return [] xrefs = [] + seen_callers = set() for node in program_cfg.graph.predecessors(func_node): func_addr = node.function_address 
- if func_addr is None: + if func_addr is None or func_addr == function.addr: continue - - xrefs.append(Function(func_addr, 0)) + if func_addr in seen_callers: + continue + seen_callers.add(func_addr) + xrefs.append(self.art_lifter.lift(Function(func_addr, 0))) return xrefs + def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: + pattern = re.compile(filter) if filter else None + try: + cfg = self.main_instance.project.kb.cfgs.get_most_accurate() + except Exception: + cfg = None + results: List[Tuple[int, str]] = [] + seen = set() + if cfg is not None: + for addr, mem_data in cfg.memory_data.items(): + if mem_data.sort != "string" or not mem_data.content: + continue + try: + text = mem_data.content.decode("utf-8", errors="replace") + except Exception: + continue + lifted_addr = self.art_lifter.lift_addr(addr) + if lifted_addr in seen: + continue + seen.add(lifted_addr) + if pattern is None or pattern.search(text): + results.append((lifted_addr, text)) + results.sort(key=lambda item: item[0]) + return results + + def disassemble(self, addr: int, **kwargs) -> Optional[str]: + lowered = self.art_lifter.lower_addr(addr) + func = self.main_instance.project.kb.functions.get(lowered, None) + if func is None: + for _addr, _func in self.main_instance.project.kb.functions.items(): + if _addr <= lowered < (_addr + (_func.size or 0)): + func = _func + break + if func is None: + return None + + lines: List[str] = [] + try: + blocks = sorted(func.blocks, key=lambda b: b.addr) + except Exception: + blocks = list(func.blocks) + for block in blocks: + try: + for insn in block.capstone.insns: + lines.append(f"0x{insn.address:x}:\t{insn.mnemonic}\t{insn.op_str}".rstrip()) + except Exception: + continue + return "\n".join(lines) if lines else None + def _decompile(self, function: Function, map_lines=False, **kwargs) -> Optional[Decompilation]: if function.dec_obj is None: function.dec_obj = self.get_decompilation_object(function, do_lower=False) diff --git 
a/libbs/decompilers/ghidra/interface.py b/libbs/decompilers/ghidra/interface.py index b85b064..cd2bf46 100644 --- a/libbs/decompilers/ghidra/interface.py +++ b/libbs/decompilers/ghidra/interface.py @@ -1,4 +1,5 @@ import os +import re import sys import time import typing @@ -303,6 +304,58 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List lifted_xrefs = [self.art_lifter.lift(x) for x in xrefs + new_xrefs] return lifted_xrefs + def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: + pattern = re.compile(filter) if filter else None + results: List[Tuple[int, str]] = [] + try: + program = self.currentProgram + listing = program.getListing() + # Iterate all defined data; pull strings. + data_iter = listing.getDefinedData(True) + while data_iter.hasNext(): + data = data_iter.next() + if not data.hasStringValue(): + continue + try: + raw = data.getValue() + text = str(raw) if raw is not None else "" + except Exception: + continue + if not text: + continue + addr = int(data.getAddress().getOffset()) + lifted = self.art_lifter.lift_addr(addr) + if pattern is None or pattern.search(text): + results.append((lifted, text)) + except Exception as exc: + _l.warning("Ghidra list_strings failed: %s", exc) + results.sort(key=lambda item: item[0]) + return results + + def disassemble(self, addr: int, **kwargs) -> Optional[str]: + lowered = self.art_lifter.lower_addr(addr) + func = self._get_nearest_function(lowered) + if func is None: + return None + + lines: List[str] = [] + try: + listing = self.currentProgram.getListing() + body = func.getBody() + insn_iter = listing.getInstructions(body, True) + while insn_iter.hasNext(): + insn = insn_iter.next() + try: + insn_addr = int(insn.getAddress().getOffset()) + lifted = self.art_lifter.lift_addr(insn_addr) + lines.append(f"0x{lifted:x}:\t{str(insn)}") + except Exception: + continue + except Exception as exc: + _l.warning("Ghidra disassemble failed: %s", exc) + return None + 
return "\n".join(lines) if lines else None + # # Extra API # From 68e4a47c2ca0d6f3163297d1576acf09fe7db565 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Wed, 22 Apr 2026 20:56:56 -0700 Subject: [PATCH 02/10] Add `decompiler` CLI, multi-instance server registry Introduces an LLM-friendly command line entry point, `decompiler`, backed by DecompilerServer + DecompilerClient: - New commands: load, list, stop, decompile, disassemble, xref_to, xref_from, rename (func | var), list_strings, get_callers. - First `load` of a binary spawns a headless server in the background; later CLI calls auto-connect via the shared registry. - Multiple servers can run concurrently; each one is keyed by a short server ID, and commands disambiguate with --id, --binary, or --backend. - Backend selection via --backend {angr,ghidra,binja,ida}. - `libbs --server` grows a --server-id flag so subprocesses can be named. Implementation: - libbs/api/server_registry.py: per-server JSON records under the platform state dir, with stale-record pruning (PID/socket liveness check). - DecompilerServer: accepts server_id, writes/unregisters a registry entry, exposes server_id + binary_path in server_info. - DecompilerClient.discover_from_registry: filter by id/binary/hash/backend. - Tests cover load/list/stop, multi-instance, decompile/disassemble by name and address, xref_to/xref_from, rename func/var, list_strings (+ regex), get_callers, and direct tests of the new core APIs. 
--- libbs/__main__.py | 22 +- libbs/api/decompiler_client.py | 34 ++ libbs/api/decompiler_server.py | 64 ++- libbs/api/server_registry.py | 171 ++++++++ libbs/cli/__init__.py | 3 + libbs/cli/decompiler_cli.py | 637 ++++++++++++++++++++++++++++ libbs/decompilers/angr/interface.py | 8 +- pyproject.toml | 1 + tests/test_decompiler_cli.py | 290 +++++++++++++ 9 files changed, 1207 insertions(+), 23 deletions(-) create mode 100644 libbs/api/server_registry.py create mode 100644 libbs/cli/__init__.py create mode 100644 libbs/cli/decompiler_cli.py create mode 100644 tests/test_decompiler_cli.py diff --git a/libbs/__main__.py b/libbs/__main__.py index a85769d..aed22c2 100644 --- a/libbs/__main__.py +++ b/libbs/__main__.py @@ -11,15 +11,15 @@ def install(): LibBSPluginInstaller().install() -def start_server(socket_path=None, decompiler=None, binary_path=None, headless=False): +def start_server(socket_path=None, decompiler=None, binary_path=None, headless=False, server_id=None): """Start the DecompilerServer (AF_UNIX socket-based)""" try: from libbs.api.decompiler_server import DecompilerServer from libbs.api.decompiler_interface import DecompilerInterface - + # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') - + # Prepare interface kwargs interface_kwargs = {} if decompiler: @@ -28,7 +28,7 @@ def start_server(socket_path=None, decompiler=None, binary_path=None, headless=F interface_kwargs['binary_path'] = binary_path if headless: interface_kwargs['headless'] = headless - + # Create and start server if socket_path: _l.info(f"Starting AF_UNIX DecompilerServer on {socket_path}") @@ -36,15 +36,15 @@ def start_server(socket_path=None, decompiler=None, binary_path=None, headless=F _l.info("Starting AF_UNIX DecompilerServer with auto-generated socket path") if interface_kwargs: _l.info(f"Interface options: {interface_kwargs}") - - with DecompilerServer(socket_path=socket_path, **interface_kwargs) as server: + 
+ with DecompilerServer(socket_path=socket_path, server_id=server_id, **interface_kwargs) as server: _l.info("Server started successfully. Press Ctrl+C to stop.") _l.info("Connect with: DecompilerClient.discover('unix://{}')".format(server.socket_path)) try: server.wait_for_shutdown() except KeyboardInterrupt: _l.info("Shutting down server...") - + except ImportError as e: _l.error(f"Failed to import required modules: {e}") sys.exit(1) @@ -146,6 +146,11 @@ def main(): Run the decompiler in headless mode (no GUI). Requires --binary-path. """ ) + parser.add_argument( + "--server-id", help=""" + Explicit server ID to use; if omitted, a unique one is generated. + """ + ) args = parser.parse_args() if args.single_decompiler_install: @@ -160,7 +165,8 @@ def main(): socket_path=args.socket_path, decompiler=args.decompiler, binary_path=args.binary_path, - headless=args.headless + headless=args.headless, + server_id=args.server_id, ) else: parser.print_help() diff --git a/libbs/api/decompiler_client.py b/libbs/api/decompiler_client.py index 17282be..4af8068 100644 --- a/libbs/api/decompiler_client.py +++ b/libbs/api/decompiler_client.py @@ -19,6 +19,7 @@ from libbs.api.decompiler_server import SocketProtocol from libbs.api.type_parser import CTypeParser from libbs.configuration import LibbsConfig +from libbs.api import server_registry _l = logging.getLogger(__name__) @@ -794,6 +795,39 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): self.shutdown() + @staticmethod + def discover_from_registry( + server_id: Optional[str] = None, + binary_path: Optional[str] = None, + binary_hash: Optional[str] = None, + backend: Optional[str] = None, + **kwargs, + ) -> 'DecompilerClient': + """ + Find a running server via the shared registry and connect to it. + + Filters narrow the pool in the order: server_id, binary_path, binary_hash, backend. + If no server matches, a ConnectionError is raised. 
+ """ + record = server_registry.find_server( + server_id=server_id, + binary_path=binary_path, + binary_hash=binary_hash, + backend=backend, + ) + if not record: + filters = { + "server_id": server_id, + "binary_path": binary_path, + "binary_hash": binary_hash, + "backend": backend, + } + active = {k: v for k, v in filters.items() if v} + raise ConnectionError( + f"No matching DecompilerServer in registry. Filters: {active or 'none'}." + ) + return DecompilerClient(socket_path=record["socket_path"], **kwargs) + # Static methods for compatibility @staticmethod def discover(server_url: str = None, binary_hash: str = None, **kwargs) -> 'DecompilerClient': diff --git a/libbs/api/decompiler_server.py b/libbs/api/decompiler_server.py index e1c1d0c..f27502d 100644 --- a/libbs/api/decompiler_server.py +++ b/libbs/api/decompiler_server.py @@ -13,6 +13,7 @@ from typing import Optional, Dict, Any, List from libbs.api.decompiler_interface import DecompilerInterface +from libbs.api import server_registry _l = logging.getLogger(__name__) @@ -146,7 +147,9 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket "version": "3.0.0", "decompiler": self.deci.name if self.deci else "unknown", "protocol": "unix_socket", - "binary_hash": self.deci.binary_hash if self.deci else None + "binary_hash": self.deci.binary_hash if self.deci else None, + "binary_path": str(self.deci.binary_path) if (self.deci and self.deci.binary_path) else None, + "server_id": self.server.server_id if self.server else None, } elif request_type == "get_light_artifacts": @@ -285,22 +288,29 @@ class DecompilerServer: to all its public methods and artifact collections through AF_UNIX sockets. """ - def __init__(self, + def __init__(self, decompiler_interface: Optional[DecompilerInterface] = None, socket_path: Optional[str] = None, + server_id: Optional[str] = None, + register: bool = True, **interface_kwargs): """ Initialize the DecompilerServer. 
- + Args: decompiler_interface: An existing DecompilerInterface instance. If None, one will be created using DecompilerInterface.discover() - socket_path: Path for the AF_UNIX socket. If None, a temporary path will be used + socket_path: Path for the AF_UNIX socket. If None, a path is derived from server_id. + server_id: Optional explicit server ID. If None, a new one is generated. + register: If True, write the server info into the shared registry. **interface_kwargs: Arguments passed to DecompilerInterface.discover() if decompiler_interface is None """ - + + self.server_id = server_id or server_registry.new_server_id() self.socket_path = socket_path + self._register = register + self._registered = False self._server_socket = None self._server_thread = None self._running = False @@ -335,13 +345,13 @@ def __init__(self, # Generate socket path if not provided if self.socket_path is None: - temp_dir = tempfile.mkdtemp(prefix="libbs_server_") - self.socket_path = os.path.join(temp_dir, "decompiler.sock") - self._temp_dir = temp_dir + socket_path = server_registry.default_socket_path(self.server_id) + self.socket_path = socket_path + self._temp_dir = os.path.dirname(socket_path) else: self._temp_dir = None - - _l.info(f"DecompilerServer initialized with {self.deci.name} interface") + + _l.info(f"DecompilerServer initialized with {self.deci.name} interface (id={self.server_id})") _l.info(f"Socket path: {self.socket_path}") def _register_artifact_callbacks(self): @@ -440,11 +450,31 @@ def start(self): # Set running flag before starting thread self._running = True - + # Start server in a separate thread self._server_thread = threading.Thread(target=self._server_loop, daemon=True) self._server_thread.start() - + + # Register in shared registry so other processes can find us. 
+ if self._register: + try: + binary_path = str(self.deci.binary_path) if self.deci and self.deci.binary_path else None + binary_hash = None + try: + binary_hash = self.deci.binary_hash if self.deci else None + except Exception: + binary_hash = None + server_registry.register_server({ + "id": self.server_id, + "socket_path": self.socket_path, + "backend": self.deci.name if self.deci else None, + "binary_path": binary_path, + "binary_hash": binary_hash, + }) + self._registered = True + except Exception as exc: + _l.warning("Failed to register server: %s", exc) + _l.info(f"DecompilerServer started successfully on unix://{self.socket_path}") _l.info("Connect with: DecompilerClient.discover('unix://{}')".format(self.socket_path)) @@ -510,13 +540,21 @@ def stop(self): # Clean up socket file and temp directory if os.path.exists(self.socket_path): os.unlink(self.socket_path) - + if self._temp_dir and os.path.exists(self._temp_dir): try: os.rmdir(self._temp_dir) except: pass + # Remove from registry + if self._registered: + try: + server_registry.unregister_server(self.server_id) + except Exception as exc: + _l.debug("Failed to unregister server %s: %s", self.server_id, exc) + self._registered = False + # Shutdown the decompiler interface if self.deci: try: diff --git a/libbs/api/server_registry.py b/libbs/api/server_registry.py new file mode 100644 index 0000000..caf2ea0 --- /dev/null +++ b/libbs/api/server_registry.py @@ -0,0 +1,171 @@ +""" +Server registry for libbs DecompilerServer instances. + +Each running server writes a small JSON descriptor into a shared registry +directory so that the `decompiler` CLI (and DecompilerClient.discover) can +find, filter, and connect to the right server instance. Stale records +(servers whose process has exited or whose socket has vanished) are pruned +on read. 
+""" +import json +import logging +import os +import tempfile +import time +import uuid +from pathlib import Path +from typing import Dict, List, Optional + +import psutil +from platformdirs import user_state_dir + +_l = logging.getLogger(__name__) + + +def _registry_dir() -> Path: + """Return the registry directory, creating it if missing.""" + env_override = os.environ.get("LIBBS_SERVER_REGISTRY") + if env_override: + path = Path(env_override) + else: + path = Path(user_state_dir("libbs")) / "servers" + path.mkdir(parents=True, exist_ok=True) + return path + + +def new_server_id() -> str: + """Generate a short unique ID for a new server.""" + return uuid.uuid4().hex[:10] + + +def default_socket_path(server_id: str) -> str: + """Compute a default socket path for a server with the given ID.""" + temp_dir = Path(tempfile.gettempdir()) / f"libbs_server_{server_id}" + temp_dir.mkdir(parents=True, exist_ok=True) + return str(temp_dir / "decompiler.sock") + + +def registry_path(server_id: str) -> Path: + return _registry_dir() / f"{server_id}.json" + + +def register_server(info: Dict) -> Path: + """Write a server descriptor into the registry. 
Required keys: id, socket_path.""" + server_id = info["id"] + path = registry_path(server_id) + payload = dict(info) + payload.setdefault("started_at", time.time()) + payload.setdefault("pid", os.getpid()) + tmp_path = path.with_suffix(".json.tmp") + with open(tmp_path, "w") as f: + json.dump(payload, f, indent=2, default=str) + os.replace(tmp_path, path) + return path + + +def unregister_server(server_id: str) -> bool: + path = registry_path(server_id) + try: + path.unlink() + return True + except FileNotFoundError: + return False + + +def _is_record_live(record: Dict) -> bool: + pid = record.get("pid") + socket_path = record.get("socket_path") + if not socket_path or not os.path.exists(socket_path): + return False + if pid is not None: + try: + if not psutil.pid_exists(int(pid)): + return False + except Exception: + return False + return True + + +def list_servers(prune_stale: bool = True) -> List[Dict]: + """Return all server records, optionally dropping and removing stale entries.""" + records: List[Dict] = [] + try: + entries = sorted(_registry_dir().glob("*.json")) + except FileNotFoundError: + return [] + + for entry in entries: + try: + with open(entry, "r") as f: + record = json.load(f) + except Exception as exc: + _l.debug("Failed to read server registry file %s: %s", entry, exc) + continue + + if prune_stale and not _is_record_live(record): + try: + entry.unlink() + except FileNotFoundError: + pass + except Exception as exc: + _l.debug("Failed to remove stale registry entry %s: %s", entry, exc) + continue + + records.append(record) + return records + + +def find_server( + server_id: Optional[str] = None, + binary_path: Optional[str] = None, + binary_hash: Optional[str] = None, + backend: Optional[str] = None, +) -> Optional[Dict]: + """Return the first server record matching all provided filters, else None.""" + binary_path_resolved = str(Path(binary_path).expanduser().resolve()) if binary_path else None + for record in list_servers(): + if server_id and 
record.get("id") != server_id: + continue + if binary_path_resolved: + record_path = record.get("binary_path") + if not record_path: + continue + try: + if str(Path(record_path).expanduser().resolve()) != binary_path_resolved: + continue + except Exception: + if record_path != binary_path_resolved: + continue + if binary_hash and record.get("binary_hash") != binary_hash: + continue + if backend and record.get("backend") != backend: + continue + return record + return None + + +def find_servers( + binary_path: Optional[str] = None, + binary_hash: Optional[str] = None, + backend: Optional[str] = None, +) -> List[Dict]: + """Return all server records matching the provided filters.""" + matches: List[Dict] = [] + binary_path_resolved = str(Path(binary_path).expanduser().resolve()) if binary_path else None + for record in list_servers(): + if binary_path_resolved: + record_path = record.get("binary_path") + if not record_path: + continue + try: + if str(Path(record_path).expanduser().resolve()) != binary_path_resolved: + continue + except Exception: + if record_path != binary_path_resolved: + continue + if binary_hash and record.get("binary_hash") != binary_hash: + continue + if backend and record.get("backend") != backend: + continue + matches.append(record) + return matches diff --git a/libbs/cli/__init__.py b/libbs/cli/__init__.py new file mode 100644 index 0000000..51f251b --- /dev/null +++ b/libbs/cli/__init__.py @@ -0,0 +1,3 @@ +from libbs.cli.decompiler_cli import main + +__all__ = ["main"] diff --git a/libbs/cli/decompiler_cli.py b/libbs/cli/decompiler_cli.py new file mode 100644 index 0000000..15bccc9 --- /dev/null +++ b/libbs/cli/decompiler_cli.py @@ -0,0 +1,637 @@ +""" +The `decompiler` CLI: a simplified, LLM-friendly interface to libbs. + +The CLI is a client that connects to a DecompilerServer. 
The first `load` of +a binary auto-starts a headless server in the background; subsequent CLI +invocations (including `load`s of other binaries) connect to the right server +via the shared server registry (see libbs.api.server_registry). + +Subcommands implemented: +- load start a server on a binary +- list list running servers +- stop stop one or all servers +- decompile decompile a function by name or address +- disassemble disassemble a function by name or address +- xref_to list callers/references to a name or address +- xref_from list callees of a function (things it calls) +- rename rename a function or local variable +- list_strings list strings in the binary, optionally filtered by regex +- get_callers list callers of a function +""" +import argparse +import json +import logging +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from libbs.api import server_registry +from libbs.decompilers import SUPPORTED_DECOMPILERS + +_l = logging.getLogger("libbs.cli.decompiler") + +_SERVER_START_TIMEOUT = 300.0 # seconds; Ghidra initial analysis can be slow +_SERVER_POLL_INTERVAL = 0.25 + + +def _configure_logging(verbose: bool) -> None: + level = logging.DEBUG if verbose else logging.WARNING + logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") + # Keep libbs chatter quiet unless --verbose; otherwise INFO logs clobber the CLI output. + if not verbose: + logging.getLogger("libbs").setLevel(logging.WARNING) + + +def _parse_target(target: str) -> Tuple[Optional[int], Optional[str]]: + """Parse a user-supplied target into (addr, name). + + Accepts hex (0x...), decimal, or a symbol name. Returns (addr, None) if numeric, + otherwise (None, target). 
+ """ + if target is None: + return None, None + t = target.strip() + if t.lower().startswith("0x"): + try: + return int(t, 16), None + except ValueError: + pass + if t.isdigit(): + try: + return int(t, 10), None + except ValueError: + pass + return None, t + + +def _resolve_function_addr(client, target: str) -> Optional[int]: + """Resolve a function reference to its address using a client. + + Names are resolved by scanning light artifacts. Addresses may be given in either + lifted (relative to base) or lowered (absolute/loaded) form; we match whichever + the server's artifact dict uses. + """ + addr, name = _parse_target(target) + if name is not None: + for _addr, func in client.functions.items(): + if func.name == name: + return _addr + return None + if addr is None: + return None + + # Addresses may be given as absolute; the server exposes lifted addresses. + known = set(client.functions.keys()) + if addr in known: + return addr + try: + base = client.binary_base_addr + except Exception: + base = 0 + if base and addr >= base and (addr - base) in known: + return addr - base + if base and (addr + base) in known: + return addr + base + return addr # let the caller raise if it's truly invalid + + +def _select_server( + server_id: Optional[str], + binary_path: Optional[str], + backend: Optional[str], +) -> Dict: + """Pick a server record from the registry, or error out with a helpful message.""" + records = server_registry.find_servers( + binary_path=binary_path, + backend=backend, + ) + if server_id: + records = [r for r in records if r.get("id") == server_id] + + if not records: + filters = {"id": server_id, "binary_path": binary_path, "backend": backend} + active = {k: v for k, v in filters.items() if v} + raise SystemExit( + "No running decompiler server matches " + f"{active or '(no filters)'}. Start one with `decompiler load `." 
+ ) + if len(records) > 1 and not server_id: + lines = [ + f"{r['id']} backend={r.get('backend')} binary={r.get('binary_path')}" + for r in records + ] + raise SystemExit( + "Multiple servers match. Specify --id to disambiguate:\n " + + "\n ".join(lines) + ) + return records[0] + + +def _connect_client(record: Dict): + from libbs.api.decompiler_client import DecompilerClient + + return DecompilerClient(socket_path=record["socket_path"]) + + +def _with_client(args): + """Resolve & connect to the selected server, returning the client.""" + record = _select_server( + server_id=getattr(args, "id", None), + binary_path=getattr(args, "binary", None), + backend=getattr(args, "backend", None), + ) + return _connect_client(record) + + +# --------------------------------------------------------------------------- +# load +# --------------------------------------------------------------------------- + +def _spawn_server(binary_path: Path, backend: str, server_id: str) -> subprocess.Popen: + """Start a detached headless server process for the given binary.""" + cmd = [ + sys.executable, "-m", "libbs", + "--server", + "--decompiler", backend, + "--headless", + "--binary-path", str(binary_path), + "--server-id", server_id, + ] + env = os.environ.copy() + # Inherit env so things like GHIDRA_INSTALL_DIR flow through. + + # Fully detach: new session so Ctrl-C in the CLI won't kill the server. 
+ kwargs = { + "stdout": subprocess.DEVNULL, + "stderr": subprocess.DEVNULL, + "stdin": subprocess.DEVNULL, + "env": env, + "close_fds": True, + } + if os.name == "posix": + kwargs["start_new_session"] = True + else: + kwargs["creationflags"] = getattr(subprocess, "DETACHED_PROCESS", 0) | getattr( + subprocess, "CREATE_NEW_PROCESS_GROUP", 0 + ) + return subprocess.Popen(cmd, **kwargs) + + +def _wait_for_server(server_id: str, timeout: float = _SERVER_START_TIMEOUT) -> Dict: + """Block until a server with `server_id` appears in the registry or timeout.""" + deadline = time.time() + timeout + while time.time() < deadline: + record = server_registry.find_server(server_id=server_id) + if record and record.get("socket_path") and os.path.exists(record["socket_path"]): + return record + time.sleep(_SERVER_POLL_INTERVAL) + raise SystemExit( + f"Timed out waiting {timeout:.0f}s for server {server_id} to start. " + "Check backend dependencies (e.g. GHIDRA_INSTALL_DIR) and retry." + ) + + +def cmd_load(args) -> int: + binary_path = Path(args.binary).expanduser().resolve() + if not binary_path.exists(): + raise SystemExit(f"Binary not found: {binary_path}") + + backend = args.backend + if backend not in SUPPORTED_DECOMPILERS: + raise SystemExit( + f"Unsupported backend {backend!r}; pick one of: {sorted(SUPPORTED_DECOMPILERS)}" + ) + + # If there's already a matching server for this exact binary+backend, prefer that. 
+ existing = server_registry.find_servers(binary_path=str(binary_path), backend=backend) + if existing and not args.force: + record = existing[0] + _emit(args, { + "status": "already_loaded", + "id": record["id"], + "binary_path": record.get("binary_path"), + "backend": record.get("backend"), + "socket_path": record.get("socket_path"), + }) + return 0 + + server_id = args.id or server_registry.new_server_id() + _spawn_server(binary_path, backend, server_id) + record = _wait_for_server(server_id) + _emit(args, { + "status": "started", + "id": record["id"], + "binary_path": record.get("binary_path"), + "backend": record.get("backend"), + "socket_path": record.get("socket_path"), + }) + return 0 + + +# --------------------------------------------------------------------------- +# list / stop +# --------------------------------------------------------------------------- + +def cmd_list(args) -> int: + records = server_registry.list_servers() + if args.json: + print(json.dumps(records, indent=2, default=str)) + return 0 + if not records: + print("No running decompiler servers.") + return 0 + print(f"{'ID':<12} {'BACKEND':<8} {'PID':<8} BINARY") + for r in records: + print(f"{r.get('id',''):<12} {str(r.get('backend','')):<8} {str(r.get('pid','')):<8} {r.get('binary_path','')}") + return 0 + + +def _stop_server_by_record(record: Dict) -> bool: + """Shut down a server via a client; returns True if a shutdown request was sent.""" + from libbs.api.decompiler_client import DecompilerClient + + try: + client = DecompilerClient(socket_path=record["socket_path"]) + except Exception as exc: + _l.warning("Could not connect to server %s: %s", record.get("id"), exc) + # Best-effort: drop stale registry entry so it's not stuck. 
+ server_registry.unregister_server(record.get("id")) + return False + try: + try: + client._send_request({"type": "shutdown_deci"}) + except Exception: + pass + finally: + try: + client.shutdown() + except Exception: + pass + # Remove from registry in case the server exits before cleaning up. + time.sleep(0.2) + server_registry.unregister_server(record.get("id")) + return True + + +def cmd_stop(args) -> int: + records = server_registry.list_servers() + if args.all: + targets = records + elif args.id: + targets = [r for r in records if r.get("id") == args.id] + elif args.binary: + bp = str(Path(args.binary).expanduser().resolve()) + targets = [r for r in records if r.get("binary_path") == bp] + else: + raise SystemExit("decompiler stop needs --id, --binary, or --all") + + if not targets: + raise SystemExit("No matching server to stop") + + results = [] + for record in targets: + ok = _stop_server_by_record(record) + results.append({"id": record.get("id"), "stopped": bool(ok)}) + _emit(args, {"stopped": results}) + return 0 + + +# --------------------------------------------------------------------------- +# decompile / disassemble +# --------------------------------------------------------------------------- + +def cmd_decompile(args) -> int: + with _with_client(args) as client: + addr = _resolve_function_addr(client, args.target) + if addr is None: + raise SystemExit(f"Function not found: {args.target!r}") + dec = client.decompile(addr) + if dec is None: + raise SystemExit(f"Failed to decompile function at 0x{addr:x}") + out = { + "addr": addr, + "decompiler": dec.decompiler if hasattr(dec, "decompiler") else None, + "text": dec.text if hasattr(dec, "text") else str(dec), + } + _emit(args, out, text_field="text") + return 0 + + +def cmd_disassemble(args) -> int: + with _with_client(args) as client: + addr = _resolve_function_addr(client, args.target) + if addr is None: + raise SystemExit(f"Function not found: {args.target!r}") + text = client.disassemble(addr) + 
if text is None: + raise SystemExit(f"Failed to disassemble function at 0x{addr:x}") + _emit(args, {"addr": addr, "text": text}, text_field="text") + return 0 + + +# --------------------------------------------------------------------------- +# xrefs +# --------------------------------------------------------------------------- + +def _format_function(func) -> Dict: + out = { + "addr": getattr(func, "addr", None), + "name": getattr(func, "name", None), + } + return out + + +def cmd_xref_to(args) -> int: + with _with_client(args) as client: + addr = _resolve_function_addr(client, args.target) + if addr is None: + raise SystemExit(f"Function not found: {args.target!r}") + callers = client.get_callers(addr) + data = [_format_function(c) for c in callers] + _emit_xrefs(args, addr, data, direction="to") + return 0 + + +def cmd_xref_from(args) -> int: + """Return the callees of a function (what the function calls). + + Implementation: decompile the function then scan the callgraph for edges leaving + this function. Falls back to parsing `call` instructions in disassembly. + """ + with _with_client(args) as client: + addr = _resolve_function_addr(client, args.target) + if addr is None: + raise SystemExit(f"Function not found: {args.target!r}") + + callees: List[Dict] = [] + seen = set() + try: + cg = client.get_callgraph(only_names=False) + for caller, callee in cg.out_edges(nbunch=None): # type: ignore[attr-defined] + caller_addr = getattr(caller, "addr", None) + if caller_addr == addr: + callee_addr = getattr(callee, "addr", None) + if callee_addr in seen: + continue + seen.add(callee_addr) + callees.append(_format_function(callee)) + except Exception as exc: + _l.debug("Callgraph-based xref_from failed (%s); falling back to disasm scan.", exc) + + if not callees: + # Fallback: parse `call 0x...` from disassembly. 
+ disasm = client.disassemble(addr) or "" + call_re = re.compile(r"\bcall\b[^0-9]*0x([0-9a-fA-F]+)") + functions_by_addr = dict(client.functions.items()) + for match in call_re.finditer(disasm): + try: + callee_addr = int(match.group(1), 16) + except ValueError: + continue + if callee_addr in seen: + continue + seen.add(callee_addr) + func = functions_by_addr.get(callee_addr) + callees.append({ + "addr": callee_addr, + "name": func.name if func else None, + }) + + _emit_xrefs(args, addr, callees, direction="from") + return 0 + + +def _emit_xrefs(args, addr: int, xrefs: List[Dict], *, direction: str) -> None: + payload = {"addr": addr, "direction": direction, "xrefs": xrefs} + if args.json: + print(json.dumps(payload, indent=2, default=str)) + return + if not xrefs: + print(f"No xrefs {direction} 0x{addr:x}") + return + for x in xrefs: + a = x.get("addr") + n = x.get("name") or "" + print(f"0x{a:x}\t{n}" if a is not None else f"?\t{n}") + + +# --------------------------------------------------------------------------- +# rename +# --------------------------------------------------------------------------- + +def cmd_rename(args) -> int: + kind = args.kind + with _with_client(args) as client: + if kind == "func": + addr = _resolve_function_addr(client, args.target) + if addr is None: + raise SystemExit(f"Function not found: {args.target!r}") + func = client.functions[addr] + if not func: + raise SystemExit(f"Could not load function at 0x{addr:x}") + func.name = args.new_name + if func.header is not None: + func.header.name = args.new_name + ok = bool(client.set_artifact(func)) + _emit(args, {"kind": "func", "addr": addr, "new_name": args.new_name, "success": ok}) + return 0 if ok else 2 + elif kind == "var": + if not args.function: + raise SystemExit("--function is required when renaming a variable") + func_addr = _resolve_function_addr(client, args.function) + if func_addr is None: + raise SystemExit(f"Function not found: {args.function!r}") + func = 
client.functions[func_addr] + if not func: + raise SystemExit(f"Could not load function at 0x{func_addr:x}") + name_map = {args.target: args.new_name} + ok = bool(client.rename_local_variables_by_names(func, name_map)) + _emit(args, {"kind": "var", "function_addr": func_addr, + "old_name": args.target, "new_name": args.new_name, + "success": ok}) + return 0 if ok else 2 + raise SystemExit(f"Unknown rename kind: {kind}") + + +# --------------------------------------------------------------------------- +# list_strings / get_callers (new core APIs) +# --------------------------------------------------------------------------- + +def cmd_list_strings(args) -> int: + with _with_client(args) as client: + strings = client.list_strings(filter=args.filter) + if args.json: + print(json.dumps( + [{"addr": a, "string": s} for a, s in strings], + indent=2, default=str, + )) + else: + for addr, s in strings: + print(f"0x{addr:x}\t{s}") + return 0 + + +def cmd_get_callers(args) -> int: + with _with_client(args) as client: + addr, name = _parse_target(args.target) + target = addr if addr is not None else name + try: + callers = client.get_callers(target) + except ValueError as exc: + raise SystemExit(str(exc)) + data = [_format_function(c) for c in callers] + if args.json: + print(json.dumps({"target": args.target, "callers": data}, indent=2, default=str)) + else: + if not data: + print(f"No callers found for {args.target!r}") + else: + for entry in data: + a = entry.get("addr") + n = entry.get("name") or "" + print(f"0x{a:x}\t{n}" if a is not None else f"?\t{n}") + return 0 + + +# --------------------------------------------------------------------------- +# shared helpers +# --------------------------------------------------------------------------- + +def _emit(args, payload: Dict, *, text_field: Optional[str] = None) -> None: + """Emit a response either as JSON or as a human-readable block.""" + if args.json: + print(json.dumps(payload, indent=2, default=str)) + return + if 
text_field and text_field in payload: + print(payload[text_field]) + return + # Default: key: value lines + for k, v in payload.items(): + print(f"{k}: {v}") + + +# --------------------------------------------------------------------------- +# argparse plumbing +# --------------------------------------------------------------------------- + +def _add_server_filter_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--id", dest="id", help="Server ID to target (see `decompiler list`).") + p.add_argument("--binary", dest="binary", help="Match server by binary path.") + p.add_argument("--backend", dest="backend", choices=sorted(SUPPORTED_DECOMPILERS), help="Match server by backend.") + + +def _add_output_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--json", action="store_true", help="Emit JSON output instead of text.") + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="decompiler", + description=( + "LLM-friendly decompiler CLI powered by LibBS. " + "Load a binary once, then run decompile/disassemble/xref/rename " + "commands. Multiple binaries/backends can run concurrently." 
+ ), + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") + sub = parser.add_subparsers(dest="cmd", required=True) + + # load + p_load = sub.add_parser("load", help="Load a binary, starting a server if needed.") + p_load.add_argument("binary", help="Path to the binary to analyze.") + p_load.add_argument("--backend", default="angr", choices=sorted(SUPPORTED_DECOMPILERS), + help="Backend decompiler to use (default: angr).") + p_load.add_argument("--id", dest="id", help="Explicit server ID (otherwise auto-generated).") + p_load.add_argument("--force", action="store_true", + help="Start a new server even if one already exists for this binary.") + _add_output_args(p_load) + p_load.set_defaults(func=cmd_load) + + # list + p_list = sub.add_parser("list", help="List running decompiler servers.") + _add_output_args(p_list) + p_list.set_defaults(func=cmd_list) + + # stop + p_stop = sub.add_parser("stop", help="Stop a running server.") + p_stop.add_argument("--id", dest="id", help="Server ID to stop.") + p_stop.add_argument("--binary", dest="binary", help="Stop servers for this binary.") + p_stop.add_argument("--all", action="store_true", help="Stop every running server.") + _add_output_args(p_stop) + p_stop.set_defaults(func=cmd_stop) + + # decompile + p_dec = sub.add_parser("decompile", help="Decompile a function by name or address.") + p_dec.add_argument("target", help="Function name or address (hex/decimal).") + _add_server_filter_args(p_dec) + _add_output_args(p_dec) + p_dec.set_defaults(func=cmd_decompile) + + # disassemble + p_dis = sub.add_parser("disassemble", help="Disassemble a function by name or address.") + p_dis.add_argument("target", help="Function name or address (hex/decimal).") + _add_server_filter_args(p_dis) + _add_output_args(p_dis) + p_dis.set_defaults(func=cmd_disassemble) + + # xref_to + p_xto = sub.add_parser("xref_to", help="Functions/code that call or reference a target.") + p_xto.add_argument("target", 
help="Function name or address (hex/decimal).") + _add_server_filter_args(p_xto) + _add_output_args(p_xto) + p_xto.set_defaults(func=cmd_xref_to) + + # xref_from + p_xfrom = sub.add_parser("xref_from", help="Things a function calls (callees).") + p_xfrom.add_argument("target", help="Function name or address (hex/decimal).") + _add_server_filter_args(p_xfrom) + _add_output_args(p_xfrom) + p_xfrom.set_defaults(func=cmd_xref_from) + + # rename + p_ren = sub.add_parser("rename", help="Rename a function or a local variable.") + p_ren.add_argument("kind", choices=["func", "var"], help="What to rename.") + p_ren.add_argument("target", help="Function name/address (for `func`) or variable name (for `var`).") + p_ren.add_argument("new_name", help="New name.") + p_ren.add_argument("--function", help="When renaming a variable, the containing function.") + _add_server_filter_args(p_ren) + _add_output_args(p_ren) + p_ren.set_defaults(func=cmd_rename) + + # list_strings + p_ls = sub.add_parser("list_strings", help="List strings in the binary.") + p_ls.add_argument("--filter", dest="filter", help="Regex to filter strings.") + _add_server_filter_args(p_ls) + _add_output_args(p_ls) + p_ls.set_defaults(func=cmd_list_strings) + + # get_callers + p_gc = sub.add_parser("get_callers", help="List callers of a function (Function|addr|name).") + p_gc.add_argument("target", help="Function name or address (hex/decimal).") + _add_server_filter_args(p_gc) + _add_output_args(p_gc) + p_gc.set_defaults(func=cmd_get_callers) + + return parser + + +def main(argv: Optional[List[str]] = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + _configure_logging(getattr(args, "verbose", False)) + try: + return args.func(args) or 0 + except SystemExit: + raise + except Exception as exc: # noqa: BLE001 + _l.exception("Unhandled error: %s", exc) + print(f"Error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": # pragma: no cover + sys.exit(main()) diff --git 
a/libbs/decompilers/angr/interface.py b/libbs/decompilers/angr/interface.py index e5694a1..ce66418 100644 --- a/libbs/decompilers/angr/interface.py +++ b/libbs/decompilers/angr/interface.py @@ -236,11 +236,15 @@ def rename_local_variables_by_names(self, func: Function, name_map: Dict[str, st if not codegen or not codegen.cfunc or not codegen.cfunc.variable_manager: return False + changed = False for v in codegen.cfunc.variable_manager._unified_variables: - if v.name in name_map: + if v.name in name_map and v.name != name_map[v.name]: v.name = name_map[v.name] + changed = True - return self.refresh_decompilation(func.addr) + if not self.headless: + self.refresh_decompilation(func.addr) + return changed @property def binary_arch(self) -> str | None: diff --git a/pyproject.toml b/pyproject.toml index 57a6630..6901244 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ ghidra = [ [project.scripts] libbs = "libbs.__main__:main" +decompiler = "libbs.cli:main" [tool.setuptools] include-package-data = true diff --git a/tests/test_decompiler_cli.py b/tests/test_decompiler_cli.py new file mode 100644 index 0000000..eafee0b --- /dev/null +++ b/tests/test_decompiler_cli.py @@ -0,0 +1,290 @@ +""" +Tests for the `decompiler` CLI and the new libbs core features it exposes +(list_strings, get_callers, disassemble). + +These tests use the angr backend so they work without external installs (IDA, +Ghidra, Binary Ninja). They run the CLI by spawning subprocesses so that the +real entry point and server-registry flow are exercised. 
+""" +import json +import os +import re +import shutil +import subprocess +import sys +import tempfile +import time +import unittest +from pathlib import Path + +from libbs.api import server_registry +from libbs.api.decompiler_client import DecompilerClient +from libbs.api.decompiler_interface import DecompilerInterface +from libbs.api.decompiler_server import DecompilerServer + + +TEST_BINARIES_DIR = Path( + os.getenv("TEST_BINARIES_DIR", Path(__file__).parent.parent.parent / "bs-artifacts" / "binaries") +) +FAUXWARE_PATH = TEST_BINARIES_DIR / "fauxware" +POSIX_SYSCALL_PATH = TEST_BINARIES_DIR / "posix_syscall" + + +def _cli_env(): + env = os.environ.copy() + # Isolate registry per-test so concurrent test runs don't collide and stale + # servers from previous runs don't leak in. + env["LIBBS_SERVER_REGISTRY"] = _REGISTRY_DIR + return env + + +def _run_cli(*args, check=True, timeout=600) -> subprocess.CompletedProcess: + """Run the `decompiler` CLI and return the result.""" + cmd = [sys.executable, "-m", "libbs.cli.decompiler_cli", *args] + env = _cli_env() + return subprocess.run(cmd, capture_output=True, text=True, check=check, timeout=timeout, env=env) + + +# Shared registry directory for this module's tests +_REGISTRY_DIR = tempfile.mkdtemp(prefix="libbs_cli_registry_") + + +def _stop_all_servers(): + """Best-effort teardown: kill every server present in the registry.""" + os.environ["LIBBS_SERVER_REGISTRY"] = _REGISTRY_DIR + try: + records = server_registry.list_servers(prune_stale=False) + except Exception: + records = [] + for record in records: + try: + client = DecompilerClient(socket_path=record["socket_path"]) + try: + client._send_request({"type": "shutdown_deci"}) + except Exception: + pass + client.shutdown() + except Exception: + pass + finally: + server_registry.unregister_server(record.get("id")) + # Also try to SIGKILL the PID as a fallback + pid = record.get("pid") + if pid: + try: + os.kill(int(pid), 9) + except Exception: + pass + + 
+@unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") +class TestDecompilerCLI(unittest.TestCase): + """End-to-end tests for the decompiler CLI using angr backend.""" + + @classmethod + def setUpClass(cls): + os.environ["LIBBS_SERVER_REGISTRY"] = _REGISTRY_DIR + _stop_all_servers() + + @classmethod + def tearDownClass(cls): + _stop_all_servers() + try: + shutil.rmtree(_REGISTRY_DIR, ignore_errors=True) + except Exception: + pass + + def tearDown(self): + _stop_all_servers() + + def _load_fauxware(self): + result = _run_cli("load", str(FAUXWARE_PATH), "--backend", "angr", "--json") + payload = json.loads(result.stdout) + self.assertIn(payload["status"], ("started", "already_loaded")) + self.assertEqual(payload["backend"], "angr") + return payload + + def test_load_and_list(self): + loaded = self._load_fauxware() + server_id = loaded["id"] + + list_result = _run_cli("list", "--json") + servers = json.loads(list_result.stdout) + ids = {s["id"] for s in servers} + self.assertIn(server_id, ids) + + def test_load_idempotent(self): + first = self._load_fauxware() + second = self._load_fauxware() + self.assertEqual(first["id"], second["id"]) + self.assertEqual(second["status"], "already_loaded") + + def test_multi_instance_same_binary_with_force(self): + first = self._load_fauxware() + forced = _run_cli( + "load", str(FAUXWARE_PATH), "--backend", "angr", "--force", "--json" + ) + second = json.loads(forced.stdout) + self.assertNotEqual(first["id"], second["id"]) + + # Ambiguous selection should fail helpfully. + result = _run_cli("decompile", "main", check=False) + self.assertNotEqual(result.returncode, 0) + self.assertIn("Specify --id", result.stdout + result.stderr) + + # Selecting a specific id disambiguates. 
+ ok = _run_cli("decompile", "main", "--id", first["id"]) + self.assertIn("main", ok.stdout) + + def test_decompile(self): + self._load_fauxware() + result = _run_cli("decompile", "main", "--json") + payload = json.loads(result.stdout) + self.assertIn("text", payload) + self.assertIn("main", payload["text"]) + + # By address (lifted) + addr_dec = _run_cli("decompile", "0x71d", "--json") + self.assertIn("text", json.loads(addr_dec.stdout)) + + def test_disassemble(self): + self._load_fauxware() + result = _run_cli("disassemble", "main", "--json") + payload = json.loads(result.stdout) + self.assertIn("text", payload) + # sanity: some assembly + self.assertTrue(any(op in payload["text"] for op in ("push", "mov", "call"))) + + def test_xref_to(self): + self._load_fauxware() + result = _run_cli("xref_to", "authenticate", "--json") + payload = json.loads(result.stdout) + names = {x.get("name") for x in payload["xrefs"]} + self.assertIn("main", names) + + def test_xref_from(self): + self._load_fauxware() + result = _run_cli("xref_from", "main", "--json") + payload = json.loads(result.stdout) + # main should call at least `authenticate`; address is always populated. + addrs = {x.get("addr") for x in payload["xrefs"]} + self.assertGreaterEqual(len(addrs), 1) + names = {x.get("name") for x in payload["xrefs"] if x.get("name")} + # At least one named callee (puts/read/authenticate/accepted/rejected) + self.assertTrue(names & {"authenticate", "puts", "read", "accepted", "rejected"}) + + def test_rename_func(self): + self._load_fauxware() + result = _run_cli("rename", "func", "authenticate", "my_auth", "--json") + payload = json.loads(result.stdout) + self.assertTrue(payload["success"]) + + def test_rename_var(self): + self._load_fauxware() + # Fetch an existing local variable name dynamically via the client API + # so this doesn't depend on angr's specific naming. 
+ record = server_registry.find_servers(binary_path=str(FAUXWARE_PATH))[0] + client = DecompilerClient(socket_path=record["socket_path"]) + try: + addrs = [a for a, f in client.functions.items() if f.name == "main"] + main_addr = addrs[0] + main_func = client.functions[main_addr] + names = client.local_variable_names(main_func) + target = next((n for n in names if n not in ("a0", "a1")), names[0]) + finally: + client.shutdown() + + result = _run_cli( + "rename", "var", target, "renamed_var", + "--function", "main", "--json", + ) + payload = json.loads(result.stdout) + self.assertTrue(payload["success"]) + + def test_list_strings(self): + self._load_fauxware() + result = _run_cli("list_strings", "--filter", "Welcome", "--json") + payload = json.loads(result.stdout) + self.assertTrue(any("Welcome" in s["string"] for s in payload)) + + def test_get_callers(self): + self._load_fauxware() + by_name = _run_cli("get_callers", "authenticate", "--json") + payload = json.loads(by_name.stdout) + names = {c.get("name") for c in payload["callers"]} + self.assertIn("main", names) + + def test_stop(self): + loaded = self._load_fauxware() + stop = _run_cli("stop", "--id", loaded["id"], "--json") + payload = json.loads(stop.stdout) + self.assertTrue(payload["stopped"][0]["stopped"]) + listing = _run_cli("list", "--json") + ids = {s["id"] for s in json.loads(listing.stdout)} + self.assertNotIn(loaded["id"], ids) + + @unittest.skipUnless(POSIX_SYSCALL_PATH.exists(), f"Missing: {POSIX_SYSCALL_PATH}") + def test_two_binaries_concurrent(self): + first = self._load_fauxware() + second_result = _run_cli( + "load", str(POSIX_SYSCALL_PATH), "--backend", "angr", "--json" + ) + second = json.loads(second_result.stdout) + self.assertNotEqual(first["id"], second["id"]) + + # Each CLI call with --id should return results from its binary. 
+ fauxware_strings = _run_cli("list_strings", "--id", first["id"], "--json") + self.assertTrue(any("Welcome" in s["string"] for s in json.loads(fauxware_strings.stdout))) + + +@unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") +class TestNewLibbsFeatures(unittest.TestCase): + """Direct tests (not via CLI) for the new list_strings/get_callers/disassemble APIs.""" + + @classmethod + def setUpClass(cls): + cls.deci = DecompilerInterface.discover( + force_decompiler="angr", + headless=True, + binary_path=str(FAUXWARE_PATH), + ) + + def test_list_strings_no_filter(self): + strings = self.deci.list_strings() + self.assertGreater(len(strings), 0) + for addr, s in strings: + self.assertIsInstance(addr, int) + self.assertIsInstance(s, str) + + def test_list_strings_filter(self): + welcome = self.deci.list_strings(filter=r"Welcome") + self.assertEqual(len(welcome), 1) + self.assertIn("Welcome", welcome[0][1]) + # Ensure non-matching regex yields nothing. + self.assertEqual(self.deci.list_strings(filter=r"zzz_no_match_zzz"), []) + + def test_disassemble(self): + addrs = [a for a, f in self.deci.functions.items() if f.name == "main"] + self.assertEqual(len(addrs), 1) + main_addr = addrs[0] + text = self.deci.disassemble(main_addr) + self.assertIsNotNone(text) + self.assertTrue(any(mnem in text for mnem in ("push", "mov", "call"))) + + def test_get_callers_by_addr_name_and_function(self): + addrs_by_name = {f.name: a for a, f in self.deci.functions.items()} + auth_addr = addrs_by_name["authenticate"] + + by_addr = self.deci.get_callers(auth_addr) + by_name = self.deci.get_callers("authenticate") + self.assertGreater(len(by_addr), 0) + self.assertGreater(len(by_name), 0) + self.assertEqual({f.addr for f in by_addr}, {f.addr for f in by_name}) + + # A made-up name raises. 
+ with self.assertRaises(ValueError): + self.deci.get_callers("no_such_function_xyz") + + +if __name__ == "__main__": + unittest.main() From b1e136f5b523dc9d5f5492a97449753ae96ff2dc Mon Sep 17 00:00:00 2001 From: mahaloz Date: Wed, 22 Apr 2026 21:03:19 -0700 Subject: [PATCH 03/10] Ship a bundled Agent Skill and `decompiler install-skill` Adds `libbs/skills/decompiler/SKILL.md` so that after `pip install libbs` an LLM-facing Agent Skill is available describing the full `decompiler` workflow: load, list, stop, decompile, disassemble, xref_to/xref_from, rename, list_strings, get_callers, plus multi-instance targeting. New `decompiler install-skill [--dest DIR] [--force]` copies the skill into `~/.claude/skills/` (or any path) so Claude Code and similar agents can pick it up. Tests verify the skill is present, installs cleanly, errors on re-install, and respects --force. --- libbs/cli/decompiler_cli.py | 68 ++++++++++++--- libbs/skills/__init__.py | 24 ++++++ libbs/skills/decompiler/SKILL.md | 143 +++++++++++++++++++++++++++++++ pyproject.toml | 3 + tests/test_decompiler_cli.py | 30 +++++++ 5 files changed, 258 insertions(+), 10 deletions(-) create mode 100644 libbs/skills/__init__.py create mode 100644 libbs/skills/decompiler/SKILL.md diff --git a/libbs/cli/decompiler_cli.py b/libbs/cli/decompiler_cli.py index 15bccc9..1c4d9dc 100644 --- a/libbs/cli/decompiler_cli.py +++ b/libbs/cli/decompiler_cli.py @@ -7,22 +7,24 @@ via the shared server registry (see libbs.api.server_registry). 
Subcommands implemented: -- load start a server on a binary -- list list running servers -- stop stop one or all servers -- decompile decompile a function by name or address -- disassemble disassemble a function by name or address -- xref_to list callers/references to a name or address -- xref_from list callees of a function (things it calls) -- rename rename a function or local variable -- list_strings list strings in the binary, optionally filtered by regex -- get_callers list callers of a function +- load start a server on a binary +- list list running servers +- stop stop one or all servers +- decompile decompile a function by name or address +- disassemble disassemble a function by name or address +- xref_to list callers/references to a name or address +- xref_from list callees of a function (things it calls) +- rename rename a function or local variable +- list_strings list strings in the binary, optionally filtered by regex +- get_callers list callers of a function +- install-skill install the bundled Agent Skill so LLMs learn the CLI """ import argparse import json import logging import os import re +import shutil import subprocess import sys import time @@ -31,6 +33,7 @@ from libbs.api import server_registry from libbs.decompilers import SUPPORTED_DECOMPILERS +from libbs import skills _l = logging.getLogger("libbs.cli.decompiler") @@ -497,6 +500,38 @@ def cmd_get_callers(args) -> int: return 0 +# --------------------------------------------------------------------------- +# install-skill +# --------------------------------------------------------------------------- + +def _default_skill_dest() -> Path: + return Path(os.path.expanduser("~/.claude/skills")) + + +def cmd_install_skill(args) -> int: + dest_root = Path(args.dest).expanduser().resolve() if args.dest else _default_skill_dest() + names = args.names or skills.available_skills() + if not names: + raise SystemExit("No bundled skills to install") + + dest_root.mkdir(parents=True, exist_ok=True) + 
installed: List[Dict] = [] + for name in names: + src = skills.skill_path(name) + dest = dest_root / name + if dest.exists() and not args.force: + raise SystemExit( + f"Skill already exists at {dest}. Pass --force to overwrite." + ) + if dest.exists() and args.force: + shutil.rmtree(dest) + shutil.copytree(src, dest) + installed.append({"name": name, "path": str(dest)}) + + _emit(args, {"installed": installed}) + return 0 + + # --------------------------------------------------------------------------- # shared helpers # --------------------------------------------------------------------------- @@ -616,6 +651,19 @@ def build_parser() -> argparse.ArgumentParser: _add_output_args(p_gc) p_gc.set_defaults(func=cmd_get_callers) + # install-skill + p_sk = sub.add_parser( + "install-skill", + help="Install the bundled Agent Skill (SKILL.md) into ~/.claude/skills/.", + ) + p_sk.add_argument("names", nargs="*", + help="Specific skill names to install (default: all bundled).") + p_sk.add_argument("--dest", help="Install destination (default: ~/.claude/skills).") + p_sk.add_argument("--force", action="store_true", + help="Overwrite an existing skill directory.") + _add_output_args(p_sk) + p_sk.set_defaults(func=cmd_install_skill) + return parser diff --git a/libbs/skills/__init__.py b/libbs/skills/__init__.py new file mode 100644 index 0000000..f599f10 --- /dev/null +++ b/libbs/skills/__init__.py @@ -0,0 +1,24 @@ +"""Bundled Agent Skills for libbs. + +Each subdirectory holds a SKILL.md (and any optional resources) that an LLM can +load to learn how to drive libbs via the `decompiler` CLI. Use +`decompiler install-skill` to copy a skill into the user's `~/.claude/skills/`. 
+""" +from pathlib import Path + +SKILLS_DIR = Path(__file__).parent + + +def available_skills() -> list[str]: + return sorted( + p.name + for p in SKILLS_DIR.iterdir() + if p.is_dir() and (p / "SKILL.md").is_file() + ) + + +def skill_path(name: str) -> Path: + path = SKILLS_DIR / name + if not (path / "SKILL.md").is_file(): + raise FileNotFoundError(f"Unknown bundled skill: {name!r}") + return path diff --git a/libbs/skills/decompiler/SKILL.md b/libbs/skills/decompiler/SKILL.md new file mode 100644 index 0000000..0112b08 --- /dev/null +++ b/libbs/skills/decompiler/SKILL.md @@ -0,0 +1,143 @@ +--- +name: decompiler +description: Reverse-engineer and modify binaries with a single `decompiler` CLI that drives IDA Pro, Ghidra, Binary Ninja, or angr via LibBS. Use whenever the user asks to decompile, disassemble, look up cross references, rename functions or variables, search strings, or otherwise inspect a binary file. Also use for multi-binary workflows (load several binaries at once and switch between them with --id). +--- + +# `decompiler` — LibBS CLI for LLMs + +The `decompiler` command is a thin client that talks to a long-running +`DecompilerServer` (IDA / Ghidra / Binary Ninja / angr). The first `load` of a +binary spawns a server in the background; every subsequent call reuses that +server, so repeated `decompile`/`disassemble`/`xref_*` calls are fast. + +## Setup (once per environment) + +```bash +pip install libbs # installs the `decompiler` and `libbs` entry points +libbs --install # registers LibBS plugins into detected decompilers +``` + +If you only want one backend (for example, Binary Ninja), use: +```bash +libbs --single-decompiler-install binja /Applications/Binary\ Ninja.app +``` + +`angr` needs no host install — it's a Python dependency and the fastest way +to verify the pipeline end-to-end. + +## Mental model + +| Concept | Description | +|---|---| +| **Server** | A headless `libbs --server` process holding a single binary open. 
Identified by a short ID. |
+| **Client** | Every `decompiler <subcommand>` call is a short-lived client that picks a server, does one thing, and exits. |
+| **Registry** | `decompiler list` / the shared registry under the libbs state dir. Each record has `id`, `backend`, `binary_path`, `socket_path`, `pid`. |
+| **Address form** | Servers expose **lifted** addresses (relative to the binary base). The CLI accepts either lifted (`0x71d`) or absolute (`0x40071d`) and does the conversion. |
+
+## Core workflow
+
+```bash
+# 1. Load a binary (auto-starts a server; default backend = angr).
+decompiler load ./fauxware
+# 2. Poke around.
+decompiler decompile main
+decompiler disassemble authenticate
+decompiler xref_to authenticate # who calls this?
+decompiler xref_from main # what does main call?
+decompiler list_strings --filter 'pass|key'
+decompiler get_callers 0x71d
+# 3. Mutate the database.
+decompiler rename func sub_400662 trampoline
+decompiler rename var v2 auth_result --function main
+# 4. Tidy up when done.
+decompiler stop --all
+```
+
+## Running multiple binaries concurrently
+
+Each binary gets its own server ID:
+
+```bash
+decompiler load ./my-binary # id=abc1234
+decompiler load ./my-binary-2 # id=def5678
+decompiler list
+# ID BACKEND PID BINARY
+# abc1234... angr 4213 .../my-binary
+# def5678... angr 4217 .../my-binary-2
+decompiler decompile main --id abc1234
+decompiler decompile main --binary ./my-binary-2 # or target by path
+```
+
+When more than one server matches, the CLI refuses and prints a
+disambiguation list. Narrow with `--id`, `--binary`, or `--backend`.
+
+## Choosing a backend
+
+```bash
+decompiler load ./my-binary --backend ghidra # needs GHIDRA_INSTALL_DIR
+decompiler load ./my-binary --backend angr # pure-Python, always available
+decompiler load ./my-binary --backend binja # Binary Ninja, needs license
+decompiler load ./my-binary --backend ida # IDA Pro, needs install
+```
+
+`--backend` is also accepted on `decompile`/`disassemble`/`xref_*`/`rename`/
+`list_strings`/`get_callers` to narrow which server to target when multiple
+backends are loaded for the same binary.
+
+## Full subcommand reference
+
+| Subcommand | Purpose | Key flags |
+|---|---|---|
+| `load <binary>` | Start a server on the binary. Idempotent: returns existing server unless `--force`. | `--backend`, `--id`, `--force`, `--json` |
+| `list` | Show all running servers. | `--json` |
+| `stop` | Shut down one or all servers. | `--id`, `--binary`, `--all`, `--json` |
+| `decompile <target>` | Pseudocode for a function (name or address). | `--id`, `--binary`, `--backend`, `--json` |
+| `disassemble <target>` | Assembly for a function. | same |
+| `xref_to <target>` | Functions that call `target`. | same |
+| `xref_from <target>` | Functions that `target` calls. | same |
+| `rename func <old> <new>` | Rename a function. | same + `--json` |
+| `rename var <old> <new> --function <func>` | Rename a local variable inside a function. | same |
+| `list_strings [--filter REGEX]` | Strings in the binary, regex-filterable. | same |
+| `get_callers <target>` | Functions that call `target` (by addr, lifted addr, or name). | same |
+
+## Machine-readable output
+
+Pass `--json` on any subcommand to get a structured payload suitable for
+downstream parsing — ideal when an LLM wants to chain commands:
+
+```bash
+decompiler list_strings --filter 'flag' --json
+# [{"addr": 4197168, "string": "flag{...}"}]
+decompiler decompile main --json
+# {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}"}
+```
+
+## Gotchas and tips
+
+- **First `load` is slow** (analysis pass). Subsequent calls on the same
+ server are fast.
+- **Rename's "success" is authoritative**: if the old name is missing the + command exits non-zero and reports `success: false`. +- **Servers persist until explicitly stopped** (`decompiler stop --all`) or + the host reboots; `decompiler list` always reflects live processes. +- **Address formats**: `0x71d`, `0x40071d`, and `1821` all resolve the same + function in fauxware. Names are also accepted anywhere an address is. +- **Binary Ninja / IDA / Ghidra backends**: the CLI still works the same; only + the server process differs. `--backend` on `load` is what matters. + +## Library-level API (for Python scripts) + +Everything the CLI does is also available as a library: + +```python +from libbs.api.decompiler_client import DecompilerClient + +client = DecompilerClient.discover_from_registry(binary_path="./fauxware") +for addr, func in client.functions.items(): + if func.name == "main": + print(client.decompile(addr).text) +``` + +The new core APIs (`list_strings(filter=...)`, `get_callers(target)`, +`disassemble(addr)`) are on both the local `DecompilerInterface` and the +`DecompilerClient` proxy. 
diff --git a/pyproject.toml b/pyproject.toml index 6901244..d0e6943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,9 @@ decompiler = "libbs.cli:main" include-package-data = true license-files = ["LICENSE"] +[tool.setuptools.package-data] +"libbs.skills" = ["**/SKILL.md", "**/*.md"] + [tool.setuptools.packages] find = {namespaces = false} diff --git a/tests/test_decompiler_cli.py b/tests/test_decompiler_cli.py index eafee0b..5354a7a 100644 --- a/tests/test_decompiler_cli.py +++ b/tests/test_decompiler_cli.py @@ -237,6 +237,36 @@ def test_two_binaries_concurrent(self): self.assertTrue(any("Welcome" in s["string"] for s in json.loads(fauxware_strings.stdout))) +class TestSkillInstaller(unittest.TestCase): + """The bundled `decompiler` skill should ship with the package and install cleanly.""" + + def test_bundled_skill_present(self): + from libbs import skills + + names = skills.available_skills() + self.assertIn("decompiler", names) + skill = skills.skill_path("decompiler") / "SKILL.md" + content = skill.read_text() + self.assertIn("name: decompiler", content) + self.assertIn("decompiler load", content) + + def test_install_skill_via_cli(self): + with tempfile.TemporaryDirectory() as dest: + result = _run_cli("install-skill", "--dest", dest, "--json") + payload = json.loads(result.stdout) + self.assertEqual(len(payload["installed"]), 1) + installed_path = Path(payload["installed"][0]["path"]) + self.assertTrue((installed_path / "SKILL.md").is_file()) + + # Re-install without --force should fail helpfully. + again = _run_cli("install-skill", "--dest", dest, "--json", check=False) + self.assertNotEqual(again.returncode, 0) + + # --force overwrites. 
+ forced = _run_cli("install-skill", "--dest", dest, "--json", "--force") + self.assertEqual(len(json.loads(forced.stdout)["installed"]), 1) + + @unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") class TestNewLibbsFeatures(unittest.TestCase): """Direct tests (not via CLI) for the new list_strings/get_callers/disassemble APIs.""" From 5e0b70650595e03a4db5e7b17e68fb839838b576 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Wed, 22 Apr 2026 21:51:58 -0700 Subject: [PATCH 04/10] Add decompiler CLI --- README.md | 5 +- docs/decompiler_cli.md | 409 ++++++++++++++++++++++++++++ libbs/api/decompiler_server.py | 8 + libbs/cli/decompiler_cli.py | 96 +++++-- libbs/decompilers/angr/interface.py | 24 +- 5 files changed, 519 insertions(+), 23 deletions(-) create mode 100644 docs/decompiler_cli.md diff --git a/README.md b/README.md index e450a01..01a9a82 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,10 @@ can be used in either the default mode, which assumes a GUI, or `headless` mode. start a new process using a specified decompiler. You can find various examples using LibBS in the [examples](./examples) folder. Examples that are plugins show off -more of the complicated API that allows you to use an abstracted UI, artifacts, and more. +more of the complicated API that allows you to use an abstracted UI, artifacts, and more. + +If you want a simplified command line interface (especially well-suited for LLMs), see the +[`decompiler` CLI guide](./docs/decompiler_cli.md). ### UI Mode (default) To use the same script everywhere, use the convenience function `DecompilerInterface.discover_interface()`, which will diff --git a/docs/decompiler_cli.md b/docs/decompiler_cli.md new file mode 100644 index 0000000..0798600 --- /dev/null +++ b/docs/decompiler_cli.md @@ -0,0 +1,409 @@ +# `decompiler` CLI + +The `decompiler` command is a thin, LLM-friendly client over LibBS. 
You load a +binary once (which spawns a headless decompiler server in the background) and +then run quick inspection or mutation commands against it. Multiple binaries +and backends can be loaded at the same time; each server is identified by a +short ID. + +This document is for humans; the short reference version used by LLM agents +lives at [`libbs/skills/decompiler/SKILL.md`](../libbs/skills/decompiler/SKILL.md) +and can be installed with `decompiler install-skill`. + +--- + +## Table of contents + +- [Install & setup](#install--setup) +- [Quick start](#quick-start) +- [How it works](#how-it-works) +- [Subcommand reference](#subcommand-reference) + - [`load`](#load) + - [`list`](#list) + - [`stop`](#stop) + - [`decompile`](#decompile) + - [`disassemble`](#disassemble) + - [`xref_to`](#xref_to) + - [`xref_from`](#xref_from) + - [`rename`](#rename) + - [`list_strings`](#list_strings) + - [`get_callers`](#get_callers) + - [`install-skill`](#install-skill) +- [Server selection (`--id`, `--binary`, `--backend`)](#server-selection) +- [JSON output (`--json`)](#json-output) +- [Running multiple binaries at once](#running-multiple-binaries-at-once) +- [Address formats](#address-formats) +- [Library-level API](#library-level-api) +- [Troubleshooting](#troubleshooting) + +--- + +## Install & setup + +```bash +pip install libbs +# Register LibBS plugins into every detected decompiler. +libbs --install +# Or point the installer at one specific decompiler: +libbs --single-decompiler-install binja "/Applications/Binary Ninja.app" +``` + +After `pip install libbs`, two entry points are available: + +- `libbs` — the existing management CLI (install plugins, run the server, + etc.) +- `decompiler` — the new LLM-facing CLI documented here. + +Pick a backend you have available: + +- **angr** — pure Python, always available. Good for end-to-end testing and + small/medium binaries. +- **ghidra** — requires `GHIDRA_INSTALL_DIR` and uses PyGhidra. 
+- **binja** — requires a Binary Ninja license. +- **ida** — requires IDA Pro. + +--- + +## Quick start + +```bash +# 1. Load a binary. The first call spawns a detached headless server. +decompiler load ./fauxware --backend angr +# id: 3308b81cf8 … + +# 2. Poke around. +decompiler decompile main # by name +decompiler disassemble 0x40071d # by absolute address +decompiler xref_to authenticate # callers of a function +decompiler xref_from main # what main calls +decompiler list_strings --filter 'pass|key' # regex-filtered strings +decompiler get_callers 0x71d # lifted address works too + +# 3. Mutate the database. +decompiler rename func sub_400662 trampoline +decompiler rename var v2 auth_result --function main + +# 4. Tear it down when you're done. +decompiler stop --all +``` + +--- + +## How it works + +``` +┌─────────────┐ spawns ┌─────────────────────────┐ +│ decompiler │ ────────────────▶ │ libbs --server (headless│ +│ CLI │ (first load) │ decompiler + AF_UNIX │ +│ │ │ socket) │ +│ │ ◀─────────────────│ │ +└─────────────┘ every command └─────────────────────────┘ + │ + ▼ +~/.local/state/libbs/servers/.json ← the shared registry +``` + +Each running server writes a small JSON descriptor (`id`, `socket_path`, +`binary_path`, `binary_hash`, `backend`, `pid`, `started_at`) into a shared +registry directory. The CLI reads the registry to figure out which server to +talk to. Stale records (server exited, socket missing) are pruned on read. + +Every subcommand except `load`, `list`, and `install-skill` accepts +`--id`, `--binary`, and `--backend` to pick which server to target when you +have more than one running. + +--- + +## Subcommand reference + +### `load` + +Load a binary, starting a headless server if one isn't already running for +it. + +```bash +decompiler load [--backend {angr,ghidra,binja,ida}] + [--id SERVER_ID] + [--force] + [--json] +``` + +- **`--backend`** (default: `angr`) — which decompiler to use. 
+- **`--id`** — explicit server ID; otherwise one is auto-generated.
+- **`--force`** — start a fresh server even if an existing one matches this
+ `(binary, backend)`.
+
+Outputs `id`, `socket_path`, `binary_path`, `backend`, and `status` (either
+`started` or `already_loaded`).
+
+### `list`
+
+Show all running decompiler servers.
+
+```bash
+decompiler list [--json]
+```
+
+Text output:
+
+```
+ID BACKEND PID BINARY
+3308b81cf8 angr 57613 /…/fauxware
+9d77ab8fd4 angr 57786 /…/posix_syscall
+```
+
+### `stop`
+
+Stop one or all servers.
+
+```bash
+decompiler stop [--id SERVER_ID] [--binary PATH] [--all] [--json]
+```
+
+You must pass one of `--id`, `--binary`, or `--all`.
+
+### `decompile`
+
+Decompile a function to pseudocode.
+
+```bash
+decompiler decompile <target> [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+`<target>` is a function name or address (hex/decimal, lifted or absolute —
+see [Address formats](#address-formats)).
+
+Text output is the decompilation. JSON output includes `addr`, `decompiler`,
+and `text`.
+
+### `disassemble`
+
+Disassemble a function to text assembly.
+
+```bash
+decompiler disassemble <target> [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+### `xref_to`
+
+Functions that reference/call `target`.
+
+```bash
+decompiler xref_to <target> [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+### `xref_from`
+
+Functions that `target` calls (its callees).
+
+```bash
+decompiler xref_from <target> [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+Implementation note: this prefers the backend's call-graph. If the call-graph
+is unavailable it falls back to scanning the function's disassembly for
+`call 0x…` instructions.
+
+### `rename`
+
+Rename a function or a local variable.
+
+```bash
+# Rename a function.
+decompiler rename func <old> <new> [--id ID] [--json]
+
+# Rename a local variable inside a function.
+decompiler rename var <old> <new> --function <func> [--id ID] [--json]
+```
+
+The CLI exits non-zero if the rename didn't actually change anything (the
+response's `success` field is authoritative).
+
+### `list_strings`
+
+List strings in the binary, optionally filtered by regex.
+
+```bash
+decompiler list_strings [--filter REGEX] [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+Text output is `0x<addr>\t<string>` per line. JSON output is a list of
+`{"addr": int, "string": str}`.
+
+### `get_callers`
+
+List callers of a function (by address or symbol name). Equivalent to
+`xref_to`, but accepts a `Function`, `int` (address), or `str` (name) and is
+exposed as a first-class core API.
+
+```bash
+decompiler get_callers <target> [--id ID] [--binary PATH] [--backend BACKEND] [--json]
+```
+
+### `install-skill`
+
+Copy the bundled Agent Skill into `~/.claude/skills/` so Claude Code (or any
+agent that picks up skills from that path) learns how to drive the CLI.
+
+```bash
+decompiler install-skill [names ...] [--dest DIR] [--force] [--json]
+```
+
+With no `names`, every bundled skill is installed. Use `--dest` to copy the
+skill somewhere else, and `--force` to overwrite an existing directory.
+
+---
+
+## Server selection
+
+When more than one server is running, the inspection/mutation commands need
+to know which one to talk to. Narrow with any combination of:
+
+- **`--id <id>`** — exact match.
+- **`--binary <path>`** — match by binary path (resolved to an absolute
+ path).
+- **`--backend <backend>`** — match by backend.
+
+If zero servers match, the CLI errors out and tells you to run
+`decompiler load`. If multiple match, it prints a disambiguation list:
+
+```
+Multiple servers match. Specify --id to disambiguate:
+ 3308b81cf8 backend=angr binary=/…/fauxware
+ 9d77ab8fd4 backend=angr binary=/…/posix_syscall
+```
+
+---
+
+## JSON output
+
+Pass `--json` on any subcommand to get a structured payload suitable for
+downstream parsing.
This is the recommended mode for scripts and LLM +callers: + +```bash +decompiler decompile main --json +# {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}"} + +decompiler list_strings --filter 'flag' --json +# [{"addr": 4197168, "string": "flag{...}"}] + +decompiler xref_to authenticate --json +# {"addr": 1636, "direction": "to", "xrefs": [{"addr": 1821, "name": "main"}, ...]} +``` + +--- + +## Running multiple binaries at once + +```bash +decompiler load ./my-binary # id=abc1234 +decompiler load ./my-binary-2 # id=def5678 + +decompiler list +# ID BACKEND PID BINARY +# abc1234... angr 4213 .../my-binary +# def5678... angr 4217 .../my-binary-2 + +# Target by ID … +decompiler decompile main --id abc1234 + +# … or by binary path. +decompiler decompile main --binary ./my-binary-2 + +# Tear them both down. +decompiler stop --all +``` + +You can even mix backends on the same binary — add `--force` to `load` to +launch a second server for the same file: + +```bash +decompiler load ./bin --backend ghidra +decompiler load ./bin --backend angr --force +decompiler decompile main --binary ./bin --backend ghidra +decompiler decompile main --binary ./bin --backend angr +``` + +--- + +## Address formats + +LibBS normalizes addresses to a **lifted** form (relative to the binary's +base address), so artifacts stay stable across decompilers. The CLI, though, +accepts whatever is natural for the user: + +- `0x71d`, `1821` — lifted +- `0x40071d` — absolute (base + lifted) +- `main` — symbol name + +The CLI converts on the fly. The returned `addr` fields in JSON output are +**always lifted**, which matches what the server's artifact dictionaries +use. + +--- + +## Library-level API + +Everything the CLI does is also available as a library — useful when you +want to chain operations or integrate LibBS into a larger tool: + +```python +from libbs.api.decompiler_client import DecompilerClient + +# Pick a running server out of the shared registry. 
+client = DecompilerClient.discover_from_registry(binary_path="./fauxware") + +for addr, func in client.functions.items(): + if func.name == "main": + print(client.decompile(addr).text) + print(client.disassemble(addr)) + for caller in client.get_callers(addr): + print(caller.addr, caller.name) +``` + +The three APIs added to power the CLI are also usable directly through +`DecompilerInterface` (headless/embedded) and `DecompilerClient` (remote): + +- `list_strings(filter: str | None = None) -> list[tuple[int, str]]` +- `get_callers(target: Function | int | str) -> list[Function]` +- `disassemble(addr: int) -> str | None` + +Backends currently implementing them: angr and Ghidra. IDA and Binary Ninja +fall back to the default implementations. + +--- + +## Troubleshooting + +**`No running decompiler server matches …`** +You haven't loaded the binary yet. Run +`decompiler load --backend ` first, or use +`decompiler list` to see what's already running. + +**`Multiple servers match. Specify --id to disambiguate`** +Two servers match your filters. Either pass `--id` with one of the printed +IDs, or narrow with `--binary`/`--backend`. + +**`Timed out waiting … for server … to start.`** +The detached server process didn't come up in time (default 5 minutes). +Check backend prerequisites: +- Ghidra: `GHIDRA_INSTALL_DIR` must be set. +- IDA/Binary Ninja: their Python bindings must be importable. +- angr: should just work. + +**Rename reports `success: False`** +The old name was not found in the function (e.g. it was already renamed, or +you targeted the wrong function). The exit code will be non-zero so it's +easy to detect from a script. + +**Server-side logs** +Spawned servers have their stdout/stderr sent to `/dev/null`. 
If you're +debugging server startup, start one by hand in a foreground terminal: + +```bash +libbs --server --headless --decompiler angr --binary-path ./bin --server-id my-srv +``` + +That will print log output to the terminal, and the CLI in another terminal +can still drive it via `decompiler decompile main --id my-srv`. diff --git a/libbs/api/decompiler_server.py b/libbs/api/decompiler_server.py index f27502d..9ddf2f2 100644 --- a/libbs/api/decompiler_server.py +++ b/libbs/api/decompiler_server.py @@ -220,6 +220,14 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket if self.deci: self.deci.shutdown() return {"status": "shutdown"} + + elif request_type == "shutdown_server": + # Tear the server down asynchronously so we can still reply. + if self.server is not None: + threading.Thread( + target=self.server.stop, name="libbs-server-shutdown", daemon=True + ).start() + return {"status": "stopping"} else: raise ValueError(f"Unknown request type: {request_type}") diff --git a/libbs/cli/decompiler_cli.py b/libbs/cli/decompiler_cli.py index 1c4d9dc..8ff99ea 100644 --- a/libbs/cli/decompiler_cli.py +++ b/libbs/cli/decompiler_cli.py @@ -25,6 +25,7 @@ import os import re import shutil +import signal import subprocess import sys import time @@ -254,30 +255,81 @@ def cmd_list(args) -> int: def _stop_server_by_record(record: Dict) -> bool: - """Shut down a server via a client; returns True if a shutdown request was sent.""" + """Shut down the server process backing `record`. + + Asks the server to shut itself down gracefully, falling back to SIGTERM/SIGKILL + on the PID if the request fails. Returns True if we believe the process is + gone (or never existed) by the time we return. 
+ """ from libbs.api.decompiler_client import DecompilerClient + server_id = record.get("id") + pid = record.get("pid") + socket_path = record.get("socket_path") + graceful = False try: - client = DecompilerClient(socket_path=record["socket_path"]) + client = DecompilerClient(socket_path=socket_path) except Exception as exc: - _l.warning("Could not connect to server %s: %s", record.get("id"), exc) - # Best-effort: drop stale registry entry so it's not stuck. - server_registry.unregister_server(record.get("id")) - return False - try: + _l.warning("Could not connect to server %s: %s", server_id, exc) + client = None + if client is not None: try: - client._send_request({"type": "shutdown_deci"}) - except Exception: - pass - finally: + client._send_request({"type": "shutdown_server"}) + graceful = True + except Exception as exc: + _l.debug("shutdown_server rejected by %s: %s", server_id, exc) + # Close the socket directly instead of calling client.shutdown(); the + # latter also fires `shutdown_deci`, which noisily fails once the server + # has stopped listening. try: - client.shutdown() + if client._socket is not None: + client._socket.close() except Exception: pass - # Remove from registry in case the server exits before cleaning up. - time.sleep(0.2) - server_registry.unregister_server(record.get("id")) - return True + client._connected = False + + if not _wait_for_process_exit(pid, timeout=3.0): + # Graceful request didn't land or server is stuck — escalate. 
+ _signal_process(pid, signal.SIGTERM) + if not _wait_for_process_exit(pid, timeout=2.0): + _signal_process(pid, signal.SIGKILL) + _wait_for_process_exit(pid, timeout=1.0) + + server_registry.unregister_server(server_id) + return graceful or not _process_alive(pid) + + +def _process_alive(pid) -> bool: + if not pid: + return False + try: + import psutil + + return psutil.pid_exists(int(pid)) + except Exception: + return False + + +def _signal_process(pid, sig) -> None: + if not pid: + return + try: + os.kill(int(pid), sig) + except ProcessLookupError: + return + except Exception as exc: + _l.debug("Signal %s to pid %s failed: %s", sig, pid, exc) + + +def _wait_for_process_exit(pid, timeout: float) -> bool: + if not pid: + return True + deadline = time.time() + timeout + while time.time() < deadline: + if not _process_alive(pid): + return True + time.sleep(0.05) + return not _process_alive(pid) def cmd_stop(args) -> int: @@ -287,8 +339,7 @@ def cmd_stop(args) -> int: elif args.id: targets = [r for r in records if r.get("id") == args.id] elif args.binary: - bp = str(Path(args.binary).expanduser().resolve()) - targets = [r for r in records if r.get("binary_path") == bp] + targets = server_registry.find_servers(binary_path=args.binary) else: raise SystemExit("decompiler stop needs --id, --binary, or --all") @@ -480,10 +531,13 @@ def cmd_list_strings(args) -> int: def cmd_get_callers(args) -> int: with _with_client(args) as client: - addr, name = _parse_target(args.target) - target = addr if addr is not None else name + # Reuse the resolver so absolute addresses get normalized to the lifted + # form the server expects. 
+ resolved = _resolve_function_addr(client, args.target) + if resolved is None: + raise SystemExit(f"Function not found: {args.target!r}") try: - callers = client.get_callers(target) + callers = client.get_callers(resolved) except ValueError as exc: raise SystemExit(str(exc)) data = [_format_function(c) for c in callers] diff --git a/libbs/decompilers/angr/interface.py b/libbs/decompilers/angr/interface.py index ce66418..483a63e 100644 --- a/libbs/decompilers/angr/interface.py +++ b/libbs/decompilers/angr/interface.py @@ -172,6 +172,26 @@ def disassemble(self, addr: int, **kwargs) -> Optional[str]: if func is None: return None + try: + base_addr = self.binary_base_addr + except Exception: + base_addr = 0 + hex_re = re.compile(r"0x([0-9a-fA-F]+)") + + def _rewrite_operands(op_str: str) -> str: + # Rewrite absolute addresses in operands to their lifted form so the + # output is consistent across decompilers (e.g. ghidra lifts addresses). + def _sub(match: "re.Match[str]") -> str: + try: + raw = int(match.group(1), 16) + except ValueError: + return match.group(0) + if base_addr and raw >= base_addr: + return f"0x{raw - base_addr:x}" + return match.group(0) + + return hex_re.sub(_sub, op_str) + lines: List[str] = [] try: blocks = sorted(func.blocks, key=lambda b: b.addr) @@ -180,7 +200,9 @@ def disassemble(self, addr: int, **kwargs) -> Optional[str]: for block in blocks: try: for insn in block.capstone.insns: - lines.append(f"0x{insn.address:x}:\t{insn.mnemonic}\t{insn.op_str}".rstrip()) + lifted = self.art_lifter.lift_addr(insn.address) + op_str = _rewrite_operands(insn.op_str) + lines.append(f"0x{lifted:x}:\t{insn.mnemonic}\t{op_str}".rstrip()) except Exception: continue return "\n".join(lines) if lines else None From 7a15d45598df0fd5fb244cbe5f8890c7b8099357 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Wed, 22 Apr 2026 22:31:39 -0700 Subject: [PATCH 05/10] Address CLI_FEEDBACK: list_functions, richer strings, clearer errors MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Changes driven by CLI_FEEDBACK.md: P0 - Add `list_functions [--filter REGEX]` subcommand (was the biggest gap — `decompile main` was the only entry for stripped binaries). - `list_strings` now has `--min-length`, `--rescan` / `--no-rescan`, section labeling for ELF files, and an automatic raw-bytes fallback scan when the backend detector returns fewer than 32 entries (angr in particular is thin). P1 - `xref_to` now returns *every* reference (code AND data) with a `kind` field, distinct from `get_callers` which is call-sites only. Add `--decompile` flag so Ghidra can pull in globals. Function names are enriched from the light cache when backends return (addr, 0) stubs. - JSON output is address-consistent: every `*addr` int field now has a `*_hex` sibling string so either form can be copied verbatim. - Unify every non-success exit code to 1 (rename var previously exited 2). - Distinct error messages for "no function starts at addr" vs "decompile engine failed" vs "target not found". - Add `--raw` to `decompile` and `disassemble` to print the body without JSON wrapping (avoids unreadable `\n` escapes at a terminal). P3 - `install-skill --json` emits real JSON instead of a Python-repr dict. - `decompiler list` now prints the registry directory, with `--show-registry` for just-the-path output. - `load --replace` stops any existing server for the binary+backend and starts a fresh one (vs `--force` which spawns alongside). Docs / Skill - SKILL.md and docs/decompiler_cli.md updated with `xref_to` vs `get_callers` guidance, the `list_strings` fidelity disclaimer, the new address-format / `addr_hex` rules, and a "first moves on a new binary" section pointing at `list_functions` first. 
Tests - +12 new CLI tests covering every feedback item (list_functions, --raw, not-a-function-start error, rename missing exits 1, --show-registry, --replace tears down old server, --rescan picks up more, --min-length, install-skill text/JSON formats, addr_hex annotations). - Existing tests updated for the `list --json` shape change. - Full suite: 70 passing (57→70), 2 preexisting env failures unchanged. --- CLI_FEEDBACK.md | 168 ++++++++++++ docs/decompiler_cli.md | 194 +++++++++++--- libbs/cli/decompiler_cli.py | 424 +++++++++++++++++++++++++++---- libbs/skills/decompiler/SKILL.md | 135 +++++++--- tests/test_decompiler_cli.py | 139 +++++++++- 5 files changed, 931 insertions(+), 129 deletions(-) create mode 100644 CLI_FEEDBACK.md diff --git a/CLI_FEEDBACK.md b/CLI_FEEDBACK.md new file mode 100644 index 0000000..266958b --- /dev/null +++ b/CLI_FEEDBACK.md @@ -0,0 +1,168 @@ +# `decompiler` CLI + Skill — field report from solving `rpc.out` + +Context: used the new `decompiler` CLI (angr backend) end-to-end to reverse +`challenge/rpc.out` and construct a solve script. Everything below is a +concrete friction point hit during that session, in rough priority order. + +## P0 — missing capability that hurt the workflow + +### 1. No way to *list functions* +There is no `decompiler list_functions` / `decompiler functions` command. For +a stripped binary the only entry point from the CLI is `decompile main`, and +from there you discover the call graph one `decompile sub_XXXX` at a time. +For a CTF workflow (or any exploratory reverse), this is painful — I'd expect +something like: + +``` +decompiler list_functions [--filter REGEX] [--json] +# ADDR SIZE NAME +# 0x401b53 240 serve_forever +# 0x401ad3 112 handle_client +# ... +``` + +The data is clearly available server-side (`client.functions.items()` is +mentioned in `SKILL.md`), it just isn't exposed as a subcommand. This was the +single biggest gap. + +### 2. 
`list_strings` under-reports drastically +On this binary: + +``` +$ decompiler list_strings | wc -l +18 +``` + +Only 18 strings, and most are PLT/data-table fillers (`0x4f38`, `" "`, `"("`, +`"0"`, ...). `admin`, `private`, `r` were caught. Running `strings -n 4` on +the same binary yields the same "real" strings, so angr isn't *missing* the +ASCII runs in this case — but the output is clearly not a full `.rodata` +walk. For larger binaries it will miss data. + +Would be nice if `list_strings`: +- had a `--min-length N` flag (default 4 would cut the noise above), +- distinguished source section (.rodata vs .data vs inline), +- optionally fell back to a raw-bytes scan when the backend's string + detector is thin (angr is). + +## P1 — correctness / UX + +### 3. `get_callers` and `xref_to` look redundant +Both commands return the same thing for the same target: + +``` +$ decompiler xref_to check_auth +0x1690 run_vm +$ decompiler get_callers check_auth +0x1690 run_vm +``` + +`SKILL.md` lists them as separate subcommands with slightly different +descriptions but no concrete difference in output. Either unify them, or +document precisely when you'd reach for one over the other (e.g. `xref_to` +is data *and* code refs, `get_callers` is call-site only?). As shipped I +never had a reason to use both. + +### 4. Mixed hex / decimal addresses in output +Text output prints addresses as hex (`0x3004`), `--json` output prints them +as decimals (`"addr": 12298`). Same data, different radix. Mildly annoying +when piping between commands or copying from a prior output. + +Suggestion: JSON should emit `"addr": "0x3004"` (or ship `addr` as int and +`addr_hex` as string), and be consistent across `list_strings`, `xref_to`, +`decompile`, `rename`. + +### 5. `rename var` on a nonexistent name exits 2, not 1 +``` +$ decompiler rename var not_there missing --function run_vm ; echo $? +success: False +2 +``` + +Everywhere else (`decompile nonexistent`, `rename func nonexistent`) exits +`1`. 
Exit-code inconsistency trips up scripts that branch on the exact exit code.
+
+### 6. Decompile failure message doesn't distinguish failure modes
+```
+$ decompiler decompile 0x999999
+Failed to decompile function at 0x999999
+```
+Was the address invalid? Not a function start? Decompiler bug? Same message
+for all three. At minimum, split "no function at address" from "decompilation
+engine failed".
+
+### 7. `decompile --json` stuffs the whole pseudocode into one string
+```
+{"addr": 6995, "decompiler": "angr", "text": "typedef struct ...\n ..."}
+```
+For LLM consumption this is fine, but when debugging from a terminal the
+embedded `\n`s are unreadable. A `--no-escape` or `--raw` flag to print the
+`text` body directly (with a JSON header on stderr) would be nice.
+
+## P2 — skill / docs
+
+### 8. `SKILL.md` promises a richer `list_strings` than the angr backend ships
+The skill example shows:
+```
+decompiler list_strings --filter 'flag' --json
+# [{"addr": 4197168, "string": "flag{...}"}]
+```
+On angr this detector is thin (see #2). The skill should either warn
+"`list_strings` fidelity varies by backend (angr < ghidra < ida)" or include
+a fallback pattern ("if nothing comes back, fall through to `strings(1)` on
+the binary directly").
+
+### 9. `SKILL.md` says "Address forms: 0x71d, 0x40071d, and 1821 all resolve the same function"
+This worked correctly in my session (both lifted `0x1b53` and absolute
+`0x401b53` decompiled `serve_forever`), but the skill doesn't mention that
+`decompile` will happily accept an *invalid* address and print
+"Failed to decompile function at 0x999999" rather than validating the input.
+A note that "address form doesn't round-trip — the CLI canonicalises to
+lifted internally" would help downstream agents reason about output.
+
+### 10. Skill lacks a "functions" example
+Since `list_functions` doesn't exist (see #1), the skill also can't show
+"first step in a new binary: see all functions." 
A seasoned reverser opens a +new binary and immediately wants the function list. The skill currently +pushes toward `decompile main` as the entry, which only helps if `main` is a +known name — and on stripped binaries where main is `sub_XXXX`, that fails +silently. + +## P3 — nits + +### 11. Stale server records pile up after `--force` +Running `load ... --force` leaves the old server running (good) and starts a +new one. `decompiler list` shows both, but there's no visual cue that the +two are for the same binary (apart from the path). A column marking +`ORIGINAL`/`FORCED` or a `--kill-existing` flag would be clearer. + +### 12. `install-skill` success output is a Python repr, not JSON +``` +installed: [{'name': 'decompiler', 'path': '...'}] +``` +Single-quoted dict. Either make it valid JSON (so `decompiler install-skill +--json | jq` works) or print a human-friendly line. Currently it's neither. + +### 13. Registry state-dir path isn't obvious +`decompiler list` doesn't say *where* the registry lives. When a stale +record survives a reboot or a kill -9, a user has no obvious breadcrumb to +go clean it up. Add the path to `list --json` output or a `--show-registry` +flag. + +## What worked well + +- `load` + automatic server spawn: zero ceremony, fast on small binaries. +- `--id` / `--binary` disambiguation refusal is exactly right. +- `rename func` / `rename var` worked first try, renames stuck across + subsequent `decompile` calls, and the JSON surface (`{kind, addr, + new_name, success}`) is clean. +- `xref_from` on the VM dispatcher gave me the five opcode handlers + instantly — this is the CLI's sweet spot. +- `install-skill --force` behaved correctly, exit codes correct. +- Multi-server support works; `stop --id` targets cleanly. + +## Summary +The CLI handled the solve well enough that I never dropped to `objdump` or +`strings(1)` for *analysis* — only for cross-checking. 
The two biggest gaps +are **no `list_functions`** (P0) and **under-reporting `list_strings`** (P1). +Everything else is polish. diff --git a/docs/decompiler_cli.md b/docs/decompiler_cli.md index 0798600..53dbf6b 100644 --- a/docs/decompiler_cli.md +++ b/docs/decompiler_cli.md @@ -21,6 +21,7 @@ and can be installed with `decompiler install-skill`. - [`load`](#load) - [`list`](#list) - [`stop`](#stop) + - [`list_functions`](#list_functions) - [`decompile`](#decompile) - [`disassemble`](#disassemble) - [`xref_to`](#xref_to) @@ -30,7 +31,8 @@ and can be installed with `decompiler install-skill`. - [`get_callers`](#get_callers) - [`install-skill`](#install-skill) - [Server selection (`--id`, `--binary`, `--backend`)](#server-selection) -- [JSON output (`--json`)](#json-output) +- [JSON output (`--json`, `--raw`)](#json-output) +- [Exit codes](#exit-codes) - [Running multiple binaries at once](#running-multiple-binaries-at-once) - [Address formats](#address-formats) - [Library-level API](#library-level-api) @@ -72,12 +74,13 @@ decompiler load ./fauxware --backend angr # id: 3308b81cf8 … # 2. Poke around. +decompiler list_functions # enumerate every function first decompiler decompile main # by name decompiler disassemble 0x40071d # by absolute address -decompiler xref_to authenticate # callers of a function +decompiler xref_to authenticate # every code+data reference +decompiler get_callers authenticate # call-sites only (subset of xref_to) decompiler xref_from main # what main calls decompiler list_strings --filter 'pass|key' # regex-filtered strings -decompiler get_callers 0x71d # lifted address works too # 3. Mutate the database. decompiler rename func sub_400662 trampoline @@ -107,6 +110,7 @@ Each running server writes a small JSON descriptor (`id`, `socket_path`, `binary_path`, `binary_hash`, `backend`, `pid`, `started_at`) into a shared registry directory. The CLI reads the registry to figure out which server to talk to. 
Stale records (server exited, socket missing) are pruned on read. +Run `decompiler list --show-registry` to print just the path. Every subcommand except `load`, `list`, and `install-skill` accepts `--id`, `--binary`, and `--backend` to pick which server to target when you @@ -124,14 +128,17 @@ it. ```bash decompiler load [--backend {angr,ghidra,binja,ida}] [--id SERVER_ID] - [--force] + [--force | --replace] [--json] ``` - **`--backend`** (default: `angr`) — which decompiler to use. - **`--id`** — explicit server ID; otherwise one is auto-generated. -- **`--force`** — start a fresh server even if an existing one matches this - `(binary, backend)`. +- **`--force`** — start an additional server even if one already matches + this `(binary, backend)`. Keeps the old server alive. +- **`--replace`** — stop any existing server for this `(binary, backend)` + first, then start a fresh one. Use this when you want to re-analyze from + scratch. Outputs `id`, `socket_path`, `binary_path`, `backend`, and `status` (either `started` or `already_loaded`). @@ -141,7 +148,7 @@ Outputs `id`, `socket_path`, `binary_path`, `backend`, and `status` (either Show all running decompiler servers. ```bash -decompiler list [--json] +decompiler list [--show-registry] [--json] ``` Text output: @@ -150,8 +157,14 @@ Text output: ID BACKEND PID BINARY 3308b81cf8 angr 57613 /…/fauxware 9d77ab8fd4 angr 57786 /…/posix_syscall + +(registry: /Users/me/Library/Application Support/libbs/servers) ``` +- **`--show-registry`** — print the registry directory and exit (useful for + scripting manual cleanup). +- **`--json`** emits `{"registry_dir": "...", "servers": [...]}`. + ### `stop` Stop one or all servers. @@ -162,36 +175,80 @@ decompiler stop [--id SERVER_ID] [--binary PATH] [--all] [--json] You must pass one of `--id`, `--binary`, or `--all`. +### `list_functions` + +Enumerate every function in the loaded binary. This is usually the first +thing you want on a new (possibly stripped) binary. 
+ +```bash +decompiler list_functions [--filter REGEX] [--id ID] [--binary PATH] [--backend BACKEND] [--json] +``` + +Text output: + +``` +ADDR SIZE NAME +0x540 6 __libc_start_main +0x71d 184 main +0x664 184 authenticate +... +``` + +JSON output is a list of `{"addr": int, "size": int, "name": str, "addr_hex": str}`. + ### `decompile` Decompile a function to pseudocode. ```bash -decompiler decompile [--id ID] [--binary PATH] [--backend BACKEND] [--json] +decompiler decompile [--raw] [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` `` is a function name or address (hex/decimal, lifted or absolute — see [Address formats](#address-formats)). -Text output is the decompilation. JSON output includes `addr`, `decompiler`, -and `text`. +- **`--raw`** — print the decompilation text directly, skipping all + wrapping. Useful at a terminal when `--json`'s escaped `\n`s are + unreadable. + +Default text output is the decompilation. JSON output includes `addr`, +`addr_hex`, `decompiler`, and `text`. + +Error messages distinguish three failure modes: + +- **target not found** — function name/address doesn't resolve. +- **not a function start** — address resolves, but isn't a function + boundary. Exit 1. +- **decompiler engine failed** — address is a known function start, but + the backend gave up. Exit 1. ### `disassemble` Disassemble a function to text assembly. ```bash -decompiler disassemble [--id ID] [--binary PATH] [--backend BACKEND] [--json] +decompiler disassemble [--raw] [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` +Same error semantics and `--raw` flag as `decompile`. + ### `xref_to` -Functions that reference/call `target`. +**Every reference** to `target` — code AND data. 
```bash -decompiler xref_to [--id ID] [--binary PATH] [--backend BACKEND] [--json] +decompiler xref_to [--decompile] [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` +Each row has a `kind` field (`Function`, `GlobalVariable`, …) so you can +tell code refs from data refs. + +- **`--decompile`** — ask the backend to decompile first. On Ghidra this + surfaces additional references (e.g. globals pulled in through the + HighFunction's global symbol map). + +When you want only call-sites, reach for `get_callers` instead. + ### `xref_from` Functions that `target` calls (its callees). @@ -200,9 +257,9 @@ Functions that `target` calls (its callees). decompiler xref_from [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` -Implementation note: this prefers the backend's call-graph. If the call-graph -is unavailable it falls back to scanning the function's disassembly for -`call 0x…` instructions. +Implementation note: this prefers the backend's call-graph. If the +call-graph is unavailable it falls back to scanning the function's +disassembly for `call 0x…` instructions. ### `rename` @@ -216,34 +273,49 @@ decompiler rename func [--id ID] [--json] decompiler rename var --function [--id ID] [--json] ``` -The CLI exits non-zero if the rename didn't actually change anything (the +The CLI exits `1` if the rename didn't actually change anything (the response's `success` field is authoritative). ### `list_strings` -List strings in the binary, optionally filtered by regex. +List strings in the binary, combining the backend's native detector with a +raw `strings(1)`-style scan of the file. ```bash -decompiler list_strings [--filter REGEX] [--id ID] [--binary PATH] [--backend BACKEND] [--json] +decompiler list_strings [--filter REGEX] + [--min-length N] + [--rescan] [--no-rescan] + [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` -Text output is `0x\t` per line. JSON output is a list of -`{"addr": int, "string": str}`. 
+- **`--filter REGEX`** — only return strings matching the regex. +- **`--min-length N`** — drop strings shorter than N characters (default 4). +- **`--rescan`** — always run the raw-bytes scan in addition to the + backend detector. +- **`--no-rescan`** — never run the raw-bytes scan. +- Default: the CLI auto-rescan when the backend returns fewer than 32 + strings (angr in particular misses most of `.rodata`). + +Text output is `0x\t[
]\t` per line. JSON +output is a list of `{"addr", "addr_hex", "string", "source", "section"?}` +where `source` is `"backend"` or `"rescan"` and `section` (when present) +is the ELF section the raw-scanned byte lives in. ### `get_callers` -List callers of a function (by address or symbol name). Equivalent to -`xref_to`, but accepts a `Function`, `int` (address), or `str` (name) and is -exposed as a first-class core API. +Functions that contain a call to `target` — a strict subset of `xref_to`. ```bash decompiler get_callers [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` +Unlike `xref_to`, this never returns globals or other data refs. Rows are +always of kind `Function`. + ### `install-skill` -Copy the bundled Agent Skill into `~/.claude/skills/` so Claude Code (or any -agent that picks up skills from that path) learns how to drive the CLI. +Copy the bundled Agent Skill into `~/.claude/skills/` so Claude Code (or +any agent that picks up skills from that path) learns how to drive the CLI. ```bash decompiler install-skill [names ...] [--dest DIR] [--force] [--json] @@ -251,6 +323,8 @@ decompiler install-skill [names ...] [--dest DIR] [--force] [--json] With no `names`, every bundled skill is installed. Use `--dest` to copy the skill somewhere else, and `--force` to overwrite an existing directory. +`--json` emits a well-formed JSON payload suitable for piping through +`jq`. --- @@ -279,19 +353,43 @@ Multiple servers match. Specify --id to disambiguate: Pass `--json` on any subcommand to get a structured payload suitable for downstream parsing. This is the recommended mode for scripts and LLM -callers: +callers. 
Every JSON payload that mentions an address provides both +`addr` (integer, lifted) and `addr_hex` (hex string, also lifted), so you +can copy either form without re-formatting: ```bash -decompiler decompile main --json -# {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}"} - -decompiler list_strings --filter 'flag' --json -# [{"addr": 4197168, "string": "flag{...}"}] +decompiler list_functions --filter '^main$' --json +# [{"addr": 1821, "size": 184, "name": "main", "addr_hex": "0x71d"}] decompiler xref_to authenticate --json -# {"addr": 1636, "direction": "to", "xrefs": [{"addr": 1821, "name": "main"}, ...]} +# {"addr": 1636, "direction": "to", +# "xrefs": [{"kind": "Function", "addr": 1821, "name": "main", "addr_hex": "0x71d"}, ...], +# "addr_hex": "0x664"} ``` +For decompile/disassemble output, JSON wraps the text in a `text` field +with escaped newlines. At a terminal this is awkward; pass `--raw` +instead: + +```bash +decompiler decompile main --raw # prints the pseudocode directly +decompiler disassemble 0x71d --raw # prints assembly directly +``` + +--- + +## Exit codes + +Every CLI command uses these exit codes: + +| Code | Meaning | +|---|---| +| `0` | Success. | +| `1` | User-visible error — target not found, rename didn't apply, decompile failed, etc. All failure modes unify to `1` so that shell `&&` chaining works cleanly. | + +Argparse-level errors (unknown subcommand, missing required argument) exit +with Python's standard argparse code `2`. + --- ## Running multiple binaries at once @@ -304,6 +402,8 @@ decompiler list # ID BACKEND PID BINARY # abc1234... angr 4213 .../my-binary # def5678... angr 4217 .../my-binary-2 +# +# (registry: /…/libbs/servers) # Target by ID … decompiler decompile main --id abc1234 @@ -311,7 +411,13 @@ decompiler decompile main --id abc1234 # … or by binary path. decompiler decompile main --binary ./my-binary-2 -# Tear them both down. 
+# Restart a server cleanly (stop existing, spawn fresh): +decompiler load ./my-binary --replace + +# Run an additional server alongside the existing one: +decompiler load ./my-binary --force + +# Tear them all down. decompiler stop --all ``` @@ -339,7 +445,7 @@ accepts whatever is natural for the user: The CLI converts on the fly. The returned `addr` fields in JSON output are **always lifted**, which matches what the server's artifact dictionaries -use. +use. `addr_hex` is the same value as a hex string for convenience. --- @@ -392,10 +498,20 @@ Check backend prerequisites: - IDA/Binary Ninja: their Python bindings must be importable. - angr: should just work. -**Rename reports `success: False`** -The old name was not found in the function (e.g. it was already renamed, or -you targeted the wrong function). The exit code will be non-zero so it's -easy to detect from a script. +**`No function starts at 0x…`** +The address is valid in the binary but doesn't correspond to the first +byte of any known function. Use `decompiler list_functions` to find a +valid start. (Prior to v2 this was reported with the same error as +"decompiler engine failed"; they're now distinct.) + +**Rename reports `success: False` (exit 1)** +The old name was not found in the function (e.g. it was already renamed, +or you targeted the wrong function). + +**`list_strings` looks thin** +angr's string detector is minimal. The CLI auto-falls-back to a raw-bytes +scan — if you don't see `source: "rescan"` entries, the threshold (32 +backend entries) wasn't crossed. Pass `--rescan` to force it. **Server-side logs** Spawned servers have their stdout/stderr sent to `/dev/null`. If you're diff --git a/libbs/cli/decompiler_cli.py b/libbs/cli/decompiler_cli.py index 8ff99ea..cdad63e 100644 --- a/libbs/cli/decompiler_cli.py +++ b/libbs/cli/decompiler_cli.py @@ -7,17 +7,18 @@ via the shared server registry (see libbs.api.server_registry). 
Subcommands implemented: -- load start a server on a binary -- list list running servers -- stop stop one or all servers -- decompile decompile a function by name or address -- disassemble disassemble a function by name or address -- xref_to list callers/references to a name or address -- xref_from list callees of a function (things it calls) -- rename rename a function or local variable -- list_strings list strings in the binary, optionally filtered by regex -- get_callers list callers of a function -- install-skill install the bundled Agent Skill so LLMs learn the CLI +- load start a server on a binary +- list list running servers +- stop stop one or all servers +- list_functions list functions in the binary, optionally filtered by regex +- decompile decompile a function by name or address +- disassemble disassemble a function by name or address +- xref_to data + code references to a target +- xref_from things a function calls (callees) +- rename rename a function or local variable +- list_strings list strings in the binary, optionally filtered by regex +- get_callers functions (call sites only) that call a target +- install-skill install the bundled Agent Skill so LLMs learn the CLI """ import argparse import json @@ -30,7 +31,14 @@ import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple + +# Standardized exit codes — keep these consistent across subcommands so that +# `&&` chaining and scripts have predictable behavior. 
+EXIT_OK = 0 +EXIT_USER_ERROR = 1 # user asked for something that didn't happen +EXIT_NOT_FOUND = 1 # missing function/name/binary +EXIT_RUNTIME_ERROR = 1 # unhandled/unknown failure from libbs.api import server_registry from libbs.decompilers import SUPPORTED_DECOMPILERS @@ -210,8 +218,13 @@ def cmd_load(args) -> int: f"Unsupported backend {backend!r}; pick one of: {sorted(SUPPORTED_DECOMPILERS)}" ) - # If there's already a matching server for this exact binary+backend, prefer that. + # Existing server(s) for this binary+backend. existing = server_registry.find_servers(binary_path=str(binary_path), backend=backend) + if existing and args.replace: + # --replace: tear the old one(s) down first, then start fresh. + for record in existing: + _stop_server_by_record(record) + existing = [] if existing and not args.force: record = existing[0] _emit(args, { @@ -242,15 +255,20 @@ def cmd_load(args) -> int: def cmd_list(args) -> int: records = server_registry.list_servers() + registry_dir = str(server_registry._registry_dir()) # type: ignore[attr-defined] + if args.show_registry and not args.json: + print(registry_dir) + return 0 if args.json: - print(json.dumps(records, indent=2, default=str)) + print(json.dumps({"registry_dir": registry_dir, "servers": records}, indent=2, default=str)) return 0 if not records: - print("No running decompiler servers.") + print(f"No running decompiler servers. 
(registry: {registry_dir})") return 0 print(f"{'ID':<12} {'BACKEND':<8} {'PID':<8} BINARY") for r in records: print(f"{r.get('id',''):<12} {str(r.get('backend','')):<8} {str(r.get('pid','')):<8} {r.get('binary_path','')}") + print(f"\n(registry: {registry_dir})") return 0 @@ -358,18 +376,42 @@ def cmd_stop(args) -> int: # decompile / disassemble # --------------------------------------------------------------------------- +def _known_function_addrs(client) -> set: + try: + return set(client.functions.keys()) + except Exception: + return set() + + def cmd_decompile(args) -> int: with _with_client(args) as client: addr = _resolve_function_addr(client, args.target) + known = _known_function_addrs(client) if addr is None: raise SystemExit(f"Function not found: {args.target!r}") + if known and addr not in known: + raise SystemExit( + f"No function starts at 0x{addr:x}. " + f"Try `decompiler list_functions --filter '{args.target}'` or " + "pick a function-start address." + ) dec = client.decompile(addr) if dec is None: - raise SystemExit(f"Failed to decompile function at 0x{addr:x}") + raise SystemExit( + f"Decompiler engine returned no result for 0x{addr:x}. " + "The address is a known function start, but decompilation " + "failed — this usually means the backend can't handle this " + "function (unreachable code, ARM/x86 mode mismatch, etc.)." + ) + text = dec.text if hasattr(dec, "text") else str(dec) + if getattr(args, "raw", False): + # --raw: dump just the text body to stdout, regardless of --json. 
+ print(text) + return 0 out = { "addr": addr, "decompiler": dec.decompiler if hasattr(dec, "decompiler") else None, - "text": dec.text if hasattr(dec, "text") else str(dec), + "text": text, } _emit(args, out, text_field="text") return 0 @@ -378,34 +420,101 @@ def cmd_decompile(args) -> int: def cmd_disassemble(args) -> int: with _with_client(args) as client: addr = _resolve_function_addr(client, args.target) + known = _known_function_addrs(client) if addr is None: raise SystemExit(f"Function not found: {args.target!r}") + if known and addr not in known: + raise SystemExit( + f"No function starts at 0x{addr:x}. " + f"Try `decompiler list_functions --filter '{args.target}'` or " + "pick a function-start address." + ) text = client.disassemble(addr) if text is None: - raise SystemExit(f"Failed to disassemble function at 0x{addr:x}") + raise SystemExit( + f"Disassembler returned no instructions for 0x{addr:x} " + "(likely a function too small to disassemble or a backend bug)." + ) + if getattr(args, "raw", False): + print(text) + return 0 _emit(args, {"addr": addr, "text": text}, text_field="text") return 0 +def cmd_list_functions(args) -> int: + with _with_client(args) as client: + pattern = re.compile(args.filter) if args.filter else None + entries: List[Dict] = [] + for addr, func in sorted(client.functions.items(), key=lambda kv: kv[0]): + name = getattr(func, "name", None) or "" + if pattern and not pattern.search(name): + continue + size = getattr(func, "size", 0) or 0 + entries.append({"addr": addr, "size": int(size), "name": name}) + + if args.json: + _emit_list(args, entries) + else: + if not entries: + print("No functions matched.") + return 0 + print(f"{'ADDR':<12} {'SIZE':<8} NAME") + for e in entries: + print(f"0x{e['addr']:<10x} {e['size']:<8} {e['name']}") + return 0 + + # --------------------------------------------------------------------------- # xrefs # --------------------------------------------------------------------------- -def 
_format_function(func) -> Dict: - out = { - "addr": getattr(func, "addr", None), - "name": getattr(func, "name", None), +def _format_xref(artifact) -> Dict: + """Render any artifact (Function, GlobalVariable, etc.) as a uniform dict. + + Unlike `_format_function`, this keeps the artifact kind so callers can + tell code refs apart from data refs. + """ + return { + "kind": type(artifact).__name__, + "addr": getattr(artifact, "addr", None), + "name": getattr(artifact, "name", None), } - return out def cmd_xref_to(args) -> int: + """All references — code and data — to the target. + + Note: distinct from `get_callers`, which is call-sites only. `xref_to` + here asks the backend for *every* artifact that points at the target, + including globals, strings, and non-call code references. + """ + from libbs.artifacts import Function + with _with_client(args) as client: addr = _resolve_function_addr(client, args.target) if addr is None: raise SystemExit(f"Function not found: {args.target!r}") - callers = client.get_callers(addr) - data = [_format_function(c) for c in callers] + # Build a Function stub to hand to xrefs_to so backends that *do* + # surface non-function refs (Ghidra via `decompile=True`) can add them. + func_stub = Function(addr, 0) + try: + xrefs = client.xrefs_to(func_stub, decompile=bool(args.decompile)) + except Exception as exc: + _l.debug("xrefs_to raised %s; falling back to get_callers", exc) + xrefs = client.get_callers(addr) + + # Enrich Function entries with names from the light artifact cache, + # since some backends only return (addr, 0) stubs from xrefs_to. 
+ light_funcs = dict(client.functions.items()) + data: List[Dict] = [] + for x in xrefs: + entry = _format_xref(x) + if entry["kind"] == "Function" and not entry.get("name"): + func = light_funcs.get(entry.get("addr")) + if func is not None: + entry["name"] = getattr(func, "name", None) + data.append(entry) _emit_xrefs(args, addr, data, direction="to") return 0 @@ -432,7 +541,7 @@ def cmd_xref_from(args) -> int: if callee_addr in seen: continue seen.add(callee_addr) - callees.append(_format_function(callee)) + callees.append(_format_xref(callee)) except Exception as exc: _l.debug("Callgraph-based xref_from failed (%s); falling back to disasm scan.", exc) @@ -451,6 +560,7 @@ def cmd_xref_from(args) -> int: seen.add(callee_addr) func = functions_by_addr.get(callee_addr) callees.append({ + "kind": "Function", "addr": callee_addr, "name": func.name if func else None, }) @@ -462,7 +572,7 @@ def cmd_xref_from(args) -> int: def _emit_xrefs(args, addr: int, xrefs: List[Dict], *, direction: str) -> None: payload = {"addr": addr, "direction": direction, "xrefs": xrefs} if args.json: - print(json.dumps(payload, indent=2, default=str)) + print(json.dumps(_annotate_addrs(payload), indent=2, default=str)) return if not xrefs: print(f"No xrefs {direction} 0x{addr:x}") @@ -470,7 +580,11 @@ def _emit_xrefs(args, addr: int, xrefs: List[Dict], *, direction: str) -> None: for x in xrefs: a = x.get("addr") n = x.get("name") or "" - print(f"0x{a:x}\t{n}" if a is not None else f"?\t{n}") + kind = x.get("kind") or "" + if a is not None: + print(f"0x{a:x}\t{kind}\t{n}" if kind else f"0x{a:x}\t{n}") + else: + print(f"?\t{kind}\t{n}" if kind else f"?\t{n}") # --------------------------------------------------------------------------- @@ -492,7 +606,7 @@ def cmd_rename(args) -> int: func.header.name = args.new_name ok = bool(client.set_artifact(func)) _emit(args, {"kind": "func", "addr": addr, "new_name": args.new_name, "success": ok}) - return 0 if ok else 2 + return EXIT_OK if ok else 
EXIT_USER_ERROR elif kind == "var": if not args.function: raise SystemExit("--function is required when renaming a variable") @@ -507,7 +621,7 @@ def cmd_rename(args) -> int: _emit(args, {"kind": "var", "function_addr": func_addr, "old_name": args.target, "new_name": args.new_name, "success": ok}) - return 0 if ok else 2 + return EXIT_OK if ok else EXIT_USER_ERROR raise SystemExit(f"Unknown rename kind: {kind}") @@ -516,20 +630,80 @@ def cmd_rename(args) -> int: # --------------------------------------------------------------------------- def cmd_list_strings(args) -> int: + """List strings. Two data sources: + + 1. The backend's native string detector (default). Fast but fidelity + varies — angr's detector is thin and will miss most of `.rodata`. + 2. A raw-bytes scan of the binary file (`--rescan`). Equivalent to + `strings -n ` plus ELF section labeling. Always enabled + automatically if the native detector returns fewer than `_RESCAN_FLOOR` + entries; pass `--no-rescan` to disable. + """ + _RESCAN_FLOOR = 32 + with _with_client(args) as client: - strings = client.list_strings(filter=args.filter) + filter_pat = re.compile(args.filter) if args.filter else None + native = client.list_strings(filter=args.filter) or [] + + results: List[Dict] = [] + seen = set() + for addr, s in native: + if len(s) < args.min_length: + continue + seen.add((addr, s)) + results.append({"addr": addr, "string": s, "source": "backend"}) + + should_rescan = args.rescan or ( + not args.no_rescan and len(results) < _RESCAN_FLOOR + ) + if should_rescan: + # Find the binary path via the registry record. 
+ record = _select_server( + server_id=getattr(args, "id", None), + binary_path=getattr(args, "binary", None), + backend=getattr(args, "backend", None), + ) + binary_path = record.get("binary_path") + if binary_path and os.path.exists(binary_path): + data = _read_binary_bytes(binary_path) + if data is not None: + sections = _elf_sections_from_file(binary_path) + for offset, text in _scan_ascii_strings(data, min_length=args.min_length): + if filter_pat and not filter_pat.search(text): + continue + key = (offset, text) + if key in seen: + continue + seen.add(key) + record_entry: Dict = { + "addr": offset, + "string": text, + "source": "rescan", + } + sec = _section_for_offset(sections, offset) + if sec: + record_entry["section"] = sec + results.append(record_entry) + + # Sort by addr. + results.sort(key=lambda e: e.get("addr", 0)) + if args.json: - print(json.dumps( - [{"addr": a, "string": s} for a, s in strings], - indent=2, default=str, - )) + _emit_list(args, results) else: - for addr, s in strings: - print(f"0x{addr:x}\t{s}") + for entry in results: + sec = entry.get("section") or entry.get("source") or "" + sec_col = f"[{sec}]\t" if sec else "" + print(f"0x{entry['addr']:x}\t{sec_col}{entry['string']}") return 0 def cmd_get_callers(args) -> int: + """Functions that contain a call to the target (call-sites only). + + Distinct from `xref_to`, which returns every reference (code *or* data). + If you want the full reference set, use `xref_to` instead. + """ with _with_client(args) as client: # Reuse the resolver so absolute addresses get normalized to the lifted # form the server expects. 
@@ -540,9 +714,9 @@ def cmd_get_callers(args) -> int: callers = client.get_callers(resolved) except ValueError as exc: raise SystemExit(str(exc)) - data = [_format_function(c) for c in callers] + data = [_format_xref(c) for c in callers] if args.json: - print(json.dumps({"target": args.target, "callers": data}, indent=2, default=str)) + _emit(args, {"target": args.target, "target_addr": resolved, "callers": data}) else: if not data: print(f"No callers found for {args.target!r}") @@ -582,7 +756,11 @@ def cmd_install_skill(args) -> int: shutil.copytree(src, dest) installed.append({"name": name, "path": str(dest)}) - _emit(args, {"installed": installed}) + if args.json: + print(json.dumps({"installed": installed}, indent=2, default=str)) + else: + for entry in installed: + print(f"installed {entry['name']} → {entry['path']}") return 0 @@ -590,10 +768,36 @@ def cmd_install_skill(args) -> int: # shared helpers # --------------------------------------------------------------------------- +def _annotate_addrs(payload): + """Recursively add `*_hex` siblings for every `*addr` integer field. + + JSON historically emitted addresses as decimals; feedback was that this + is awkward when copying from one command to another. Instead of breaking + existing int fields, we add a sibling hex-string field so both forms + are available. A key named `addr` gets `addr_hex`, `target_addr` gets + `target_addr_hex`, `function_addr` gets `function_addr_hex`, etc. 
+ """ + if isinstance(payload, dict): + for key in list(payload.keys()): + value = payload[key] + if ( + (key == "addr" or key.endswith("_addr")) + and isinstance(value, int) + and f"{key}_hex" not in payload + ): + payload[f"{key}_hex"] = f"0x{value:x}" + for v in payload.values(): + _annotate_addrs(v) + elif isinstance(payload, list): + for item in payload: + _annotate_addrs(item) + return payload + + def _emit(args, payload: Dict, *, text_field: Optional[str] = None) -> None: """Emit a response either as JSON or as a human-readable block.""" if args.json: - print(json.dumps(payload, indent=2, default=str)) + print(json.dumps(_annotate_addrs(payload), indent=2, default=str)) return if text_field and text_field in payload: print(payload[text_field]) @@ -603,6 +807,92 @@ def _emit(args, payload: Dict, *, text_field: Optional[str] = None) -> None: print(f"{k}: {v}") +def _emit_list(args, payload): + """Same as _emit but for a top-level list payload (JSON arrays).""" + if args.json: + print(json.dumps(_annotate_addrs(payload), indent=2, default=str)) + return + # Fallback: print each item on its own line as "key: value" pairs if + # it's a dict; otherwise str(item). + for item in payload: + if isinstance(item, dict): + print(" ".join(f"{k}={v}" for k, v in item.items())) + else: + print(item) + + +def _format_function(func) -> Dict: + return { + "addr": getattr(func, "addr", None), + "name": getattr(func, "name", None), + } + + +def _read_binary_bytes(binary_path: str, max_bytes: int = 32 * 1024 * 1024) -> Optional[bytes]: + """Read up to `max_bytes` from `binary_path`. Returns None on failure.""" + try: + with open(binary_path, "rb") as f: + return f.read(max_bytes) + except OSError as exc: + _l.debug("Could not read binary %s: %s", binary_path, exc) + return None + + +def _scan_ascii_strings(data: bytes, min_length: int = 4) -> List[Tuple[int, str]]: + """strings(1)-equivalent scan over a raw byte buffer. + + Returns `(offset_in_buffer, decoded_ascii)` tuples. 
The caller is + responsible for relocating `offset_in_buffer` into whatever address + space makes sense (e.g. file offset vs mapped vaddr). + """ + results: List[Tuple[int, str]] = [] + start = -1 + for i, b in enumerate(data): + # Printable ASCII (space..tilde) plus tab as an allowed interior byte. + if 0x20 <= b < 0x7f or b == 0x09: + if start < 0: + start = i + else: + if start >= 0 and (i - start) >= min_length: + try: + text = data[start:i].decode("ascii", errors="strict") + except UnicodeDecodeError: + pass + else: + results.append((start, text)) + start = -1 + if start >= 0 and (len(data) - start) >= min_length: + try: + text = data[start:].decode("ascii", errors="strict") + except UnicodeDecodeError: + pass + else: + results.append((start, text)) + return results + + +def _section_for_offset(elf_sections: Iterable, offset: int) -> Optional[str]: + """Return the name of the ELF section a file offset lives in, or None.""" + for name, start, size in elf_sections: + if start <= offset < start + size: + return name + return None + + +def _elf_sections_from_file(binary_path: str): + """Return [(name, file_offset, size), ...] 
for an ELF, or [] if not ELF.""" + try: + from elftools.elf.elffile import ELFFile # type: ignore + except ImportError: + return [] + try: + with open(binary_path, "rb") as f: + elf = ELFFile(f) + return [(sec.name, sec["sh_offset"], sec["sh_size"]) for sec in elf.iter_sections()] + except Exception: + return [] + + # --------------------------------------------------------------------------- # argparse plumbing # --------------------------------------------------------------------------- @@ -637,14 +927,25 @@ def build_parser() -> argparse.ArgumentParser: p_load.add_argument("--id", dest="id", help="Explicit server ID (otherwise auto-generated).") p_load.add_argument("--force", action="store_true", help="Start a new server even if one already exists for this binary.") + p_load.add_argument("--replace", action="store_true", + help="Stop the existing server for this binary+backend (if any) before starting.") _add_output_args(p_load) p_load.set_defaults(func=cmd_load) # list p_list = sub.add_parser("list", help="List running decompiler servers.") + p_list.add_argument("--show-registry", action="store_true", + help="Print just the registry directory path and exit.") _add_output_args(p_list) p_list.set_defaults(func=cmd_list) + # list_functions + p_lf = sub.add_parser("list_functions", help="List functions in the binary.") + p_lf.add_argument("--filter", dest="filter", help="Regex to filter function names.") + _add_server_filter_args(p_lf) + _add_output_args(p_lf) + p_lf.set_defaults(func=cmd_list_functions) + # stop p_stop = sub.add_parser("stop", help="Stop a running server.") p_stop.add_argument("--id", dest="id", help="Server ID to stop.") @@ -656,6 +957,8 @@ def build_parser() -> argparse.ArgumentParser: # decompile p_dec = sub.add_parser("decompile", help="Decompile a function by name or address.") p_dec.add_argument("target", help="Function name or address (hex/decimal).") + p_dec.add_argument("--raw", action="store_true", + help="Print the decompilation text 
directly (no JSON or header wrapping).") _add_server_filter_args(p_dec) _add_output_args(p_dec) p_dec.set_defaults(func=cmd_decompile) @@ -663,13 +966,23 @@ def build_parser() -> argparse.ArgumentParser: # disassemble p_dis = sub.add_parser("disassemble", help="Disassemble a function by name or address.") p_dis.add_argument("target", help="Function name or address (hex/decimal).") + p_dis.add_argument("--raw", action="store_true", + help="Print the disassembly text directly (no JSON or header wrapping).") _add_server_filter_args(p_dis) _add_output_args(p_dis) p_dis.set_defaults(func=cmd_disassemble) # xref_to - p_xto = sub.add_parser("xref_to", help="Functions/code that call or reference a target.") + p_xto = sub.add_parser( + "xref_to", + help=( + "Every reference (code AND data) to a target. " + "For call-sites only, see `get_callers`." + ), + ) p_xto.add_argument("target", help="Function name or address (hex/decimal).") + p_xto.add_argument("--decompile", action="store_true", + help="Ask the backend to decompile first (picks up more refs on Ghidra).") _add_server_filter_args(p_xto) _add_output_args(p_xto) p_xto.set_defaults(func=cmd_xref_to) @@ -692,14 +1005,33 @@ def build_parser() -> argparse.ArgumentParser: p_ren.set_defaults(func=cmd_rename) # list_strings - p_ls = sub.add_parser("list_strings", help="List strings in the binary.") + p_ls = sub.add_parser( + "list_strings", + help=( + "List strings in the binary. Backend detectors vary in fidelity " + "(angr < ghidra < ida); --rescan does a raw strings(1)-like scan " + "of the file as a fallback." 
+ ), + ) p_ls.add_argument("--filter", dest="filter", help="Regex to filter strings.") + p_ls.add_argument("--min-length", dest="min_length", type=int, default=4, + help="Minimum string length to keep (default: 4).") + p_ls.add_argument("--rescan", action="store_true", + help="Force a raw-bytes scan of the binary file on top of the backend result.") + p_ls.add_argument("--no-rescan", action="store_true", + help="Never fall back to the raw scan, even if the backend returns few results.") _add_server_filter_args(p_ls) _add_output_args(p_ls) p_ls.set_defaults(func=cmd_list_strings) # get_callers - p_gc = sub.add_parser("get_callers", help="List callers of a function (Function|addr|name).") + p_gc = sub.add_parser( + "get_callers", + help=( + "Functions that call a target (call-sites only). " + "For every reference (code AND data), see `xref_to`." + ), + ) p_gc.add_argument("target", help="Function name or address (hex/decimal).") _add_server_filter_args(p_gc) _add_output_args(p_gc) @@ -726,13 +1058,13 @@ def main(argv: Optional[List[str]] = None) -> int: args = parser.parse_args(argv) _configure_logging(getattr(args, "verbose", False)) try: - return args.func(args) or 0 + return args.func(args) or EXIT_OK except SystemExit: raise except Exception as exc: # noqa: BLE001 _l.exception("Unhandled error: %s", exc) print(f"Error: {exc}", file=sys.stderr) - return 1 + return EXIT_RUNTIME_ERROR if __name__ == "__main__": # pragma: no cover diff --git a/libbs/skills/decompiler/SKILL.md b/libbs/skills/decompiler/SKILL.md index 0112b08..995d8f7 100644 --- a/libbs/skills/decompiler/SKILL.md +++ b/libbs/skills/decompiler/SKILL.md @@ -1,6 +1,6 @@ --- name: decompiler -description: Reverse-engineer and modify binaries with a single `decompiler` CLI that drives IDA Pro, Ghidra, Binary Ninja, or angr via LibBS. Use whenever the user asks to decompile, disassemble, look up cross references, rename functions or variables, search strings, or otherwise inspect a binary file. 
Also use for multi-binary workflows (load several binaries at once and switch between them with --id). +description: Reverse-engineer and modify binaries with a single `decompiler` CLI that drives IDA Pro, Ghidra, Binary Ninja, or angr via LibBS. Use whenever the user asks to decompile, disassemble, look up cross references, rename functions or variables, search strings or functions, or otherwise inspect a binary file. Also use for multi-binary workflows (load several binaries at once and switch between them with --id). --- # `decompiler` — LibBS CLI for LLMs @@ -31,25 +31,35 @@ to verify the pipeline end-to-end. |---|---| | **Server** | A headless `libbs --server` process holding a single binary open. Identified by a short ID. | | **Client** | Every `decompiler ` call is a short-lived client that picks a server, does one thing, and exits. | -| **Registry** | `decompiler list` / the shared registry under the libbs state dir. Each record has `id`, `backend`, `binary_path`, `socket_path`, `pid`. | -| **Address form** | Servers expose **lifted** addresses (relative to the binary base). The CLI accepts either lifted (`0x71d`) or absolute (`0x40071d`) and does the conversion. | +| **Registry** | `decompiler list` / the shared registry under the libbs state dir. Each record has `id`, `backend`, `binary_path`, `socket_path`, `pid`. Use `decompiler list --show-registry` to print just the path. | +| **Address form** | Servers expose **lifted** addresses (relative to the binary base). The CLI accepts either lifted (`0x71d`) or absolute (`0x40071d`) and does the conversion. JSON output always includes both `addr` (int) and `addr_hex` (hex string). 
| + +## First moves on a new binary + +```bash +decompiler load ./target # start a server (angr by default) +decompiler list_functions # enumerate every function +decompiler list_functions --filter 'main|auth' # or narrow by regex +decompiler list_strings --filter 'flag|pass' # find useful string constants +``` + +For stripped binaries `decompile main` often fails — use `list_functions` +first to discover the real entry (`sub_XXXX`, `entry`, etc.) and start from +there. ## Core workflow ```bash -# 1. Load a binary (auto-starts a server; default backend = angr). -decompiler load ./fauxware -# 2. Poke around. -decompiler decompile main -decompiler disassemble authenticate -decompiler xref_to authenticate # who calls this? -decompiler xref_from main # what does main call? -decompiler list_strings --filter 'pass|key' -decompiler get_callers 0x71d -# 3. Mutate the database. -decompiler rename func sub_400662 trampoline -decompiler rename var v2 auth_result --function main -# 4. Tidy up when done. +decompiler load ./fauxware # start a server +decompiler list_functions # enumerate functions +decompiler decompile main # by name +decompiler disassemble 0x40071d # by absolute address +decompiler xref_to authenticate # every code+data reference +decompiler get_callers authenticate # call-sites only (subset of xref_to) +decompiler xref_from main # what does main call? +decompiler list_strings --filter 'pass|key' # backend detector + raw fallback +decompiler rename func sub_400662 trampoline # rename a function +decompiler rename var v2 auth_result --function main # rename a local decompiler stop --all ``` @@ -69,7 +79,10 @@ decompiler decompile main --binary ./my-binary-2 # or target by path ``` When more than one server matches, the CLI refuses and prints a -disambiguation list. Narrow with `--id`, `--binary`, or `--backend`. +disambiguation list. Narrow with `--id`, `--binary`, or `--backend`. If you +want to restart the server for a binary cleanly, use `load ... 
--replace` +which stops the old server and starts a new one (vs `--force` which adds a +second server alongside the existing one). ## Choosing a backend @@ -80,50 +93,92 @@ decompiler load ./my-binary --backend binja # Binary Ninja, needs license decompiler load ./my-binary --backend ida # IDA Pro, needs install ``` -`--backend` is also accepted on `decompile`/`disassemble`/`xref_*`/`rename`/ -`list_strings`/`get_callers` to narrow which server to target when multiple -backends are loaded for the same binary. +`--backend` is also accepted on the inspection/mutation subcommands to +narrow which server to target when multiple backends are loaded for the +same binary. ## Full subcommand reference | Subcommand | Purpose | Key flags | |---|---|---| -| `load ` | Start a server on the binary. Idempotent: returns existing server unless `--force`. | `--backend`, `--id`, `--force`, `--json` | -| `list` | Show all running servers. | `--json` | +| `load ` | Start a server on the binary. Idempotent: returns existing unless `--force`/`--replace`. | `--backend`, `--id`, `--force`, `--replace`, `--json` | +| `list` | Show all running servers and the registry path. | `--show-registry`, `--json` | | `stop` | Shut down one or all servers. | `--id`, `--binary`, `--all`, `--json` | -| `decompile ` | Pseudocode for a function (name or address). | `--id`, `--binary`, `--backend`, `--json` | -| `disassemble ` | Assembly for a function. | same | -| `xref_to ` | Functions that call `target`. | same | +| `list_functions` | Enumerate every function (ADDR, SIZE, NAME). | `--filter REGEX`, `--json` | +| `decompile ` | Pseudocode for a function (name or address). | `--raw`, `--id`, `--binary`, `--backend`, `--json` | +| `disassemble ` | Assembly for a function. | `--raw`, same | +| `xref_to ` | Every reference (code + data) to the target. | `--decompile`, same | | `xref_from ` | Functions that `target` calls. | same | | `rename func ` | Rename a function. 
| same + `--json` | | `rename var --function ` | Rename a local variable inside a function. | same | -| `list_strings [--filter REGEX]` | Strings in the binary, regex-filterable. | same | -| `get_callers ` | Functions that call `target` (by addr, lifted addr, or name). | same | +| `list_strings` | Strings (backend + raw rescan fallback). | `--filter`, `--min-length N`, `--rescan`, `--no-rescan`, same | +| `get_callers ` | Call-sites only — subset of `xref_to`. | same | +| `install-skill` | Install this file into `~/.claude/skills/`. | `--dest`, `--force`, `--json` | + +### `xref_to` vs `get_callers` + +- `xref_to` asks the backend for **every reference** — code *and* data. On + Ghidra with `--decompile` this includes global variables and string + references. Rows include a `kind` field (`Function`, `GlobalVariable`, + ...). +- `get_callers` is the narrower call-sites-only view: only functions that + contain a `call` to the target. When you want "who calls this?" reach + for `get_callers`; when you want "who touches this in any way?" reach + for `xref_to`. + +### `list_strings` fidelity + +String detection quality varies by backend: `angr < ghidra < ida`. The CLI +hides this: if the backend returns few results (threshold: 32), we auto-run +a raw `strings(1)`-like scan of the binary file and label those entries +with their ELF section (`.rodata`, `.dynstr`, `.text`, etc.). Use +`--rescan` to force the scan, or `--no-rescan` to skip it and see only the +backend result. `--min-length` defaults to 4. ## Machine-readable output Pass `--json` on any subcommand to get a structured payload suitable for -downstream parsing — ideal when an LLM wants to chain commands: +downstream parsing — ideal when an LLM wants to chain commands. 
Every +JSON payload that mentions an address provides both `addr` (int, lifted) +and `addr_hex` (hex string, also lifted): ```bash +decompiler list_functions --filter '^main$' --json +# [{"addr": 1821, "size": 184, "name": "main", "addr_hex": "0x71d"}] + decompiler list_strings --filter 'flag' --json -# [{"addr": 4197168, "string": "flag{...}"}] +# [{"addr": 4197168, "string": "flag{...}", "source": "backend", "addr_hex": "0x4008e0"}, +# {"addr": 4197232, "string": "flag_check_ok", "source": "rescan", +# "section": ".rodata", "addr_hex": "0x400900"}] + decompiler decompile main --json -# {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}"} +# {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}", "addr_hex": "0x71d"} + +# Terminal-friendly form of decompile: skip JSON wrapping entirely. +decompiler decompile main --raw ``` ## Gotchas and tips -- **First `load` is slow** (analysis pass). Subsequent calls on the same - server are fast. -- **Rename's "success" is authoritative**: if the old name is missing the - command exits non-zero and reports `success: false`. -- **Servers persist until explicitly stopped** (`decompiler stop --all`) or - the host reboots; `decompiler list` always reflects live processes. -- **Address formats**: `0x71d`, `0x40071d`, and `1821` all resolve the same - function in fauxware. Names are also accepted anywhere an address is. -- **Binary Ninja / IDA / Ghidra backends**: the CLI still works the same; only - the server process differs. `--backend` on `load` is what matters. +- **First `load` is slow** (backend analysis pass). Subsequent calls on the + same server are fast. +- **`rename` exit codes**: every CLI command exits `0` on success and `1` + on any failure (including "rename didn't find the old name"). Use + `&&` safely. +- **Stripped binaries**: use `list_functions` before `decompile` to find + the real entry. `main` may not exist; look for `sub_XXXX` with plausible + sizes and xrefs. 
+- **Invalid addresses** fail with a clear message distinguishing "no + function starts here" from "decompiler engine failed". The CLI does not + auto-round-trip invalid addresses. +- **Address formats**: `0x71d`, `0x40071d`, and `1821` all resolve the + same function in fauxware. Names are also accepted wherever an address + is. +- **Servers persist** until explicitly stopped (`decompiler stop --all`) + or the host reboots; `decompiler list` always reflects live processes. +- **Registry path**: `decompiler list --show-registry` prints just the + directory so you can clean up manually if you ever need to (e.g. after + a `kill -9`). ## Library-level API (for Python scripts) diff --git a/tests/test_decompiler_cli.py b/tests/test_decompiler_cli.py index 5354a7a..fe833c4 100644 --- a/tests/test_decompiler_cli.py +++ b/tests/test_decompiler_cli.py @@ -109,10 +109,17 @@ def test_load_and_list(self): server_id = loaded["id"] list_result = _run_cli("list", "--json") - servers = json.loads(list_result.stdout) - ids = {s["id"] for s in servers} + payload = json.loads(list_result.stdout) + # Feedback P3.13: list --json should expose the registry path. 
+ self.assertIn("registry_dir", payload) + self.assertTrue(payload["registry_dir"]) + ids = {s["id"] for s in payload["servers"]} self.assertIn(server_id, ids) + def test_list_show_registry(self): + result = _run_cli("list", "--show-registry") + self.assertTrue(result.stdout.strip()) + def test_load_idempotent(self): first = self._load_fauxware() second = self._load_fauxware() @@ -136,31 +143,99 @@ def test_multi_instance_same_binary_with_force(self): ok = _run_cli("decompile", "main", "--id", first["id"]) self.assertIn("main", ok.stdout) + def test_load_replace_stops_old_server(self): + """`load --replace` should tear the existing server down, not leave two.""" + first = self._load_fauxware() + replaced_result = _run_cli( + "load", str(FAUXWARE_PATH), "--backend", "angr", "--replace", "--json" + ) + replaced = json.loads(replaced_result.stdout) + self.assertEqual(replaced["status"], "started") + self.assertNotEqual(replaced["id"], first["id"]) + + # Only one server should remain. + listing = _run_cli("list", "--json") + servers = json.loads(listing.stdout)["servers"] + fauxware_servers = [s for s in servers if s["binary_path"] == str(FAUXWARE_PATH)] + self.assertEqual(len(fauxware_servers), 1) + self.assertEqual(fauxware_servers[0]["id"], replaced["id"]) + def test_decompile(self): self._load_fauxware() result = _run_cli("decompile", "main", "--json") payload = json.loads(result.stdout) self.assertIn("text", payload) self.assertIn("main", payload["text"]) + # Feedback P1.4: JSON should include addr_hex alongside addr. + self.assertIn("addr_hex", payload) + self.assertTrue(payload["addr_hex"].startswith("0x")) # By address (lifted) addr_dec = _run_cli("decompile", "0x71d", "--json") self.assertIn("text", json.loads(addr_dec.stdout)) + def test_decompile_raw(self): + """Feedback P1.7: --raw should print text directly, not JSON-wrapped.""" + self._load_fauxware() + raw = _run_cli("decompile", "main", "--raw") + # Raw output: no literal '\\n' escape or JSON quoting. 
+ self.assertNotIn('\\n', raw.stdout) + self.assertNotIn('{"addr"', raw.stdout) + self.assertIn("main", raw.stdout) + + def test_decompile_not_a_function_start(self): + """Feedback P1.6: clear error distinguishing 'not a start' from engine failure.""" + self._load_fauxware() + # 0x71e is inside main (main starts at 0x71d). + result = _run_cli("decompile", "0x71e", check=False) + self.assertEqual(result.returncode, 1) + self.assertIn("No function starts at", result.stdout + result.stderr) + def test_disassemble(self): self._load_fauxware() result = _run_cli("disassemble", "main", "--json") payload = json.loads(result.stdout) self.assertIn("text", payload) + self.assertIn("addr_hex", payload) # sanity: some assembly self.assertTrue(any(op in payload["text"] for op in ("push", "mov", "call"))) + def test_disassemble_raw(self): + self._load_fauxware() + raw = _run_cli("disassemble", "main", "--raw") + self.assertNotIn('\\n', raw.stdout) + self.assertNotIn('{"addr"', raw.stdout) + + def test_list_functions(self): + """Feedback P0.1: `list_functions` subcommand.""" + self._load_fauxware() + result = _run_cli("list_functions", "--filter", "main", "--json") + entries = json.loads(result.stdout) + names = {e["name"] for e in entries} + self.assertIn("main", names) + # Each entry must carry addr, addr_hex, size, name. + for e in entries: + self.assertIn("addr", e) + self.assertIn("addr_hex", e) + self.assertIn("size", e) + self.assertIn("name", e) + # Text output is tabular. + text = _run_cli("list_functions", "--filter", "main").stdout + self.assertIn("ADDR", text) + self.assertIn("main", text) + def test_xref_to(self): self._load_fauxware() result = _run_cli("xref_to", "authenticate", "--json") payload = json.loads(result.stdout) names = {x.get("name") for x in payload["xrefs"]} self.assertIn("main", names) + # Feedback P1.3: rows should be kind-tagged. 
+ kinds = {x.get("kind") for x in payload["xrefs"]} + self.assertIn("Function", kinds) + # addr_hex present for each xref + for x in payload["xrefs"]: + self.assertIn("addr_hex", x) def test_xref_from(self): self._load_fauxware() @@ -179,6 +254,23 @@ def test_rename_func(self): payload = json.loads(result.stdout) self.assertTrue(payload["success"]) + def test_rename_func_missing_exits_1(self): + """Feedback P1.5: non-existent rename should exit 1, not 2.""" + self._load_fauxware() + result = _run_cli( + "rename", "func", "nonexistent_fn_xyz", "whatever", check=False + ) + self.assertEqual(result.returncode, 1) + + def test_rename_var_missing_exits_1(self): + """Feedback P1.5: var rename with missing old name should exit 1, not 2.""" + self._load_fauxware() + result = _run_cli( + "rename", "var", "no_such_var_xyz", "whatever", + "--function", "main", check=False, + ) + self.assertEqual(result.returncode, 1) + def test_rename_var(self): self._load_fauxware() # Fetch an existing local variable name dynamically via the client API @@ -205,7 +297,32 @@ def test_list_strings(self): self._load_fauxware() result = _run_cli("list_strings", "--filter", "Welcome", "--json") payload = json.loads(result.stdout) - self.assertTrue(any("Welcome" in s["string"] for s in payload)) + # Each entry has addr, addr_hex, string, source. + hit = next((s for s in payload if "Welcome" in s["string"]), None) + self.assertIsNotNone(hit) + self.assertIn("addr_hex", hit) + self.assertIn("source", hit) + + def test_list_strings_rescan_picks_up_more(self): + """Feedback P0/P1.2: fallback scan should surface more entries than the thin angr detector.""" + self._load_fauxware() + backend_only = _run_cli("list_strings", "--no-rescan", "--json") + with_rescan = _run_cli("list_strings", "--rescan", "--json") + self.assertGreater(len(json.loads(with_rescan.stdout)), + len(json.loads(backend_only.stdout))) + # rescan entries should include section info for ELF binaries. 
+ rescan_entries = [s for s in json.loads(with_rescan.stdout) + if s.get("source") == "rescan"] + self.assertTrue(rescan_entries) + self.assertTrue(any("section" in e for e in rescan_entries)) + + def test_list_strings_min_length(self): + self._load_fauxware() + result = _run_cli("list_strings", "--min-length", "20", "--json") + entries = json.loads(result.stdout) + # Every entry must meet the threshold. + for e in entries: + self.assertGreaterEqual(len(e["string"]), 20) def test_get_callers(self): self._load_fauxware() @@ -213,6 +330,10 @@ def test_get_callers(self): payload = json.loads(by_name.stdout) names = {c.get("name") for c in payload["callers"]} self.assertIn("main", names) + # Every *_addr field should have a hex sibling (target_addr -> target_addr_hex, etc.) + self.assertIn("target_addr_hex", payload) + for c in payload["callers"]: + self.assertIn("addr_hex", c) def test_stop(self): loaded = self._load_fauxware() @@ -220,7 +341,8 @@ def test_stop(self): payload = json.loads(stop.stdout) self.assertTrue(payload["stopped"][0]["stopped"]) listing = _run_cli("list", "--json") - ids = {s["id"] for s in json.loads(listing.stdout)} + servers = json.loads(listing.stdout)["servers"] + ids = {s["id"] for s in servers} self.assertNotIn(loaded["id"], ids) @unittest.skipUnless(POSIX_SYSCALL_PATH.exists(), f"Missing: {POSIX_SYSCALL_PATH}") @@ -253,6 +375,7 @@ def test_bundled_skill_present(self): def test_install_skill_via_cli(self): with tempfile.TemporaryDirectory() as dest: result = _run_cli("install-skill", "--dest", dest, "--json") + # Feedback P3.12: --json must emit parseable JSON (not Python repr). 
payload = json.loads(result.stdout) self.assertEqual(len(payload["installed"]), 1) installed_path = Path(payload["installed"][0]["path"]) @@ -266,6 +389,14 @@ def test_install_skill_via_cli(self): forced = _run_cli("install-skill", "--dest", dest, "--json", "--force") self.assertEqual(len(json.loads(forced.stdout)["installed"]), 1) + def test_install_skill_text_output_is_parsable(self): + """Text output should be readable (not single-quoted Python repr).""" + with tempfile.TemporaryDirectory() as dest: + result = _run_cli("install-skill", "--dest", dest) + # No Python-style single quotes around the payload. + self.assertNotIn("[{'name'", result.stdout) + self.assertIn("decompiler", result.stdout) + @unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") class TestNewLibbsFeatures(unittest.TestCase): From 9e0a4f9af0d7f47a0de57038d4c8ea3710fe963b Mon Sep 17 00:00:00 2001 From: mahaloz Date: Thu, 23 Apr 2026 20:33:28 -0700 Subject: [PATCH 06/10] remove feedback file --- CLI_FEEDBACK.md | 168 ------------------------------------------------ 1 file changed, 168 deletions(-) delete mode 100644 CLI_FEEDBACK.md diff --git a/CLI_FEEDBACK.md b/CLI_FEEDBACK.md deleted file mode 100644 index 266958b..0000000 --- a/CLI_FEEDBACK.md +++ /dev/null @@ -1,168 +0,0 @@ -# `decompiler` CLI + Skill — field report from solving `rpc.out` - -Context: used the new `decompiler` CLI (angr backend) end-to-end to reverse -`challenge/rpc.out` and construct a solve script. Everything below is a -concrete friction point hit during that session, in rough priority order. - -## P0 — missing capability that hurt the workflow - -### 1. No way to *list functions* -There is no `decompiler list_functions` / `decompiler functions` command. For -a stripped binary the only entry point from the CLI is `decompile main`, and -from there you discover the call graph one `decompile sub_XXXX` at a time. 
-For a CTF workflow (or any exploratory reverse), this is painful — I'd expect -something like: - -``` -decompiler list_functions [--filter REGEX] [--json] -# ADDR SIZE NAME -# 0x401b53 240 serve_forever -# 0x401ad3 112 handle_client -# ... -``` - -The data is clearly available server-side (`client.functions.items()` is -mentioned in `SKILL.md`), it just isn't exposed as a subcommand. This was the -single biggest gap. - -### 2. `list_strings` under-reports drastically -On this binary: - -``` -$ decompiler list_strings | wc -l -18 -``` - -Only 18 strings, and most are PLT/data-table fillers (`0x4f38`, `" "`, `"("`, -`"0"`, ...). `admin`, `private`, `r` were caught. Running `strings -n 4` on -the same binary yields the same "real" strings, so angr isn't *missing* the -ASCII runs in this case — but the output is clearly not a full `.rodata` -walk. For larger binaries it will miss data. - -Would be nice if `list_strings`: -- had a `--min-length N` flag (default 4 would cut the noise above), -- distinguished source section (.rodata vs .data vs inline), -- optionally fell back to a raw-bytes scan when the backend's string - detector is thin (angr is). - -## P1 — correctness / UX - -### 3. `get_callers` and `xref_to` look redundant -Both commands return the same thing for the same target: - -``` -$ decompiler xref_to check_auth -0x1690 run_vm -$ decompiler get_callers check_auth -0x1690 run_vm -``` - -`SKILL.md` lists them as separate subcommands with slightly different -descriptions but no concrete difference in output. Either unify them, or -document precisely when you'd reach for one over the other (e.g. `xref_to` -is data *and* code refs, `get_callers` is call-site only?). As shipped I -never had a reason to use both. - -### 4. Mixed hex / decimal addresses in output -Text output prints addresses as hex (`0x3004`), `--json` output prints them -as decimals (`"addr": 12298`). Same data, different radix. 
Mildly annoying -when piping between commands or copying from a prior output. - -Suggestion: JSON should emit `"addr": "0x3004"` (or ship `addr` as int and -`addr_hex` as string), and be consistent across `list_strings`, `xref_to`, -`decompile`, `rename`. - -### 5. `rename var` on a nonexistent name exits 2, not 1 -``` -$ decompiler rename var not_there missing --function run_vm ; echo $? -success: False -2 -``` - -Everywhere else (`decompile nonexistent`, `rename func nonexistent`) exits -`1`. Exit-code inconsistency breaks simple `&&` chaining in shell scripts. - -### 6. Decompile failure message doesn't distinguish failure modes -``` -$ decompiler decompile 0x999999 -Failed to decompile function at 0x999999 -``` -Was the address invalid? Not a function start? Decompiler bug? Same message -for all three. At minimum, split "no function at address" from "decompilation -engine failed". - -### 7. `decompile --json` stuffs the whole pseudocode into one string -``` -{"addr": 6995, "decompiler": "angr", "text": "typedef struct ...\n ..."} -``` -For LLM consumption this is fine, but when debugging from a terminal the -embedded `\n`s are unreadable. A `--no-escape` or `--raw` flag to print the -`text` body directly (with a JSON header on stderr) would be nice. - -## P2 — skill / docs - -### 8. `SKILL.md` promises a richer `list_strings` than the angr backend ships -The skill example shows: -``` -decompiler list_strings --filter 'flag' --json -# [{"addr": 4197168, "string": "flag{...}"}] -``` -On angr this detector is thin (see #2). The skill should either warn -"`list_strings` fidelity varies by backend (angr < ghidra < ida)" or include -a fallback pattern ("if nothing comes back, fall through to `strings(1)` on -the binary directly"). - -### 9. 
`SKILL.md` says "Address forms: 0x71d, 0x40071d, and 1821 all resolve the same function" -This worked correctly in my session (both lifted `0x1b53` and absolute -`0x401b53` decompiled `serve_forever`), but the skill doesn't mention that -`decompile` will happily accept an *invalid* address and print -"Failed to decompile function at 0x999999" rather than validating the input. -A note that "address form doesn't round-trip — the CLI canonicalises to -lifted internally" would help downstream agents reason about output. - -### 10. Skill lacks a "functions" example -Since `list_functions` doesn't exist (see #1), the skill also can't show -"first step in a new binary: see all functions." A seasoned reverser opens a -new binary and immediately wants the function list. The skill currently -pushes toward `decompile main` as the entry, which only helps if `main` is a -known name — and on stripped binaries where main is `sub_XXXX`, that fails -silently. - -## P3 — nits - -### 11. Stale server records pile up after `--force` -Running `load ... --force` leaves the old server running (good) and starts a -new one. `decompiler list` shows both, but there's no visual cue that the -two are for the same binary (apart from the path). A column marking -`ORIGINAL`/`FORCED` or a `--kill-existing` flag would be clearer. - -### 12. `install-skill` success output is a Python repr, not JSON -``` -installed: [{'name': 'decompiler', 'path': '...'}] -``` -Single-quoted dict. Either make it valid JSON (so `decompiler install-skill ---json | jq` works) or print a human-friendly line. Currently it's neither. - -### 13. Registry state-dir path isn't obvious -`decompiler list` doesn't say *where* the registry lives. When a stale -record survives a reboot or a kill -9, a user has no obvious breadcrumb to -go clean it up. Add the path to `list --json` output or a `--show-registry` -flag. - -## What worked well - -- `load` + automatic server spawn: zero ceremony, fast on small binaries. 
-- `--id` / `--binary` disambiguation refusal is exactly right. -- `rename func` / `rename var` worked first try, renames stuck across - subsequent `decompile` calls, and the JSON surface (`{kind, addr, - new_name, success}`) is clean. -- `xref_from` on the VM dispatcher gave me the five opcode handlers - instantly — this is the CLI's sweet spot. -- `install-skill --force` behaved correctly, exit codes correct. -- Multi-server support works; `stop --id` targets cleanly. - -## Summary -The CLI handled the solve well enough that I never dropped to `objdump` or -`strings(1)` for *analysis* — only for cross-checking. The two biggest gaps -are **no `list_functions`** (P0) and **under-reporting `list_strings`** (P1). -Everything else is polish. From e7ae2de6610e0dc038062cb7225acb597cfa09f8 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Thu, 23 Apr 2026 22:42:51 -0700 Subject: [PATCH 07/10] Fix IDA main-thread dispatch, Ghidra string coverage, CLI robustness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - IDA: cache server_info metadata at init so the client handshake never re-enters idalib from a worker thread (root cause of "Function can be called from the main thread only"). Use IDA's -o flag to redirect .id*/.i64 sidecars into the project dir. Fill in list_strings, disassemble, xrefs_from, and xrefs_to_addr (with 2-hop data-indirection chasing for PIE string references). - Ghidra: supplement listing.getDefinedData() with a StringSearcher pass so byte arrays typed as uchar[N] (e.g. base64 alphabets) still surface in list_strings. Pass TaskMonitor.DUMMY — null NPE's inside the searcher. - Wire format: JSON instead of TOML (the toml package mangles backslash-x escapes in decompilation text). - xref_from: direct per-function callee query on each backend. - project_dir: default to platformdirs user_cache so backends stop cluttering the binary's directory; --project-dir "" restores legacy. 
- list_strings rescan removed; docs now point at strings(1) / rabin2 / readelf for exhaustive scans. - Tests: parametrized CLI suite (angr, ghidra, ida subclasses) plus a Ghidra-specific regression for the base64 alphabet at rpc.out:.data:0x5020. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/decompiler_cli.md | 91 +++- libbs/__main__.py | 16 +- libbs/api/decompiler_client.py | 54 ++- libbs/api/decompiler_interface.py | 65 ++- libbs/api/decompiler_server.py | 253 ++++++++-- libbs/cli/decompiler_cli.py | 438 ++++++++++-------- libbs/decompilers/angr/interface.py | 74 ++- libbs/decompilers/ghidra/interface.py | 184 +++++++- libbs/decompilers/ida/compat.py | 51 +++ libbs/decompilers/ida/interface.py | 136 +++++- libbs/skills/__init__.py | 2 +- libbs/skills/decompiler/SKILL.md | 78 +++- tests/test_decompiler_cli.py | 637 ++++++++++++++++++-------- 13 files changed, 1580 insertions(+), 499 deletions(-) diff --git a/docs/decompiler_cli.md b/docs/decompiler_cli.md index 53dbf6b..8ab3d07 100644 --- a/docs/decompiler_cli.md +++ b/docs/decompiler_cli.md @@ -129,6 +129,7 @@ it. decompiler load [--backend {angr,ghidra,binja,ida}] [--id SERVER_ID] [--force | --replace] + [--project-dir PATH] [--json] ``` @@ -139,9 +140,16 @@ decompiler load [--backend {angr,ghidra,binja,ida}] - **`--replace`** — stop any existing server for this `(binary, backend)` first, then start a fresh one. Use this when you want to re-analyze from scratch. +- **`--project-dir PATH`** — where to keep the backend's + project/database files (Ghidra project, IDA `.id*`/`.til`, etc.). + Default: a per-binary directory under the user cache + (`/libbs/projects/-/`), so analysis + artifacts don't pollute the binary's directory. Pass `--project-dir ""` + to disable the cache dir and let the backend drop files alongside the + binary (legacy behavior). -Outputs `id`, `socket_path`, `binary_path`, `backend`, and `status` (either -`started` or `already_loaded`). 
+Outputs `id`, `socket_path`, `binary_path`, `backend`, `project_dir`, and +`status` (either `started` or `already_loaded`). ### `list` @@ -240,8 +248,20 @@ Same error semantics and `--raw` flag as `decompile`. decompiler xref_to [--decompile] [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` +`` can be: + +- a **function name or address** — resolves to function xrefs (who calls + this function), +- a **raw address** that isn't a function start — resolves via the + backend's address-level reference table (useful for globals, jump + table entries, etc.), +- a **string literal** — looked up via `list_strings` first, then queried + as a raw-address xref. Great for "who reads this constant?". + Each row has a `kind` field (`Function`, `GlobalVariable`, …) so you can -tell code refs from data refs. +tell code refs from data refs. The JSON payload also carries +`target_kind` (`function`, `address`, or `string`) so callers can tell +which resolution path fired. - **`--decompile`** — ask the backend to decompile first. On Ghidra this surfaces additional references (e.g. globals pulled in through the @@ -278,28 +298,36 @@ response's `success` field is authoritative). ### `list_strings` -List strings in the binary, combining the backend's native detector with a -raw `strings(1)`-style scan of the file. +List strings the decompiler's own string detector has identified in the +binary. ```bash decompiler list_strings [--filter REGEX] [--min-length N] - [--rescan] [--no-rescan] [--id ID] [--binary PATH] [--backend BACKEND] [--json] ``` - **`--filter REGEX`** — only return strings matching the regex. - **`--min-length N`** — drop strings shorter than N characters (default 4). -- **`--rescan`** — always run the raw-bytes scan in addition to the - backend detector. -- **`--no-rescan`** — never run the raw-bytes scan. -- Default: the CLI auto-rescan when the backend returns fewer than 32 - strings (angr in particular misses most of `.rodata`). -Text output is `0x\t[
]\t` per line. JSON -output is a list of `{"addr", "addr_hex", "string", "source", "section"?}` -where `source` is `"backend"` or `"rescan"` and `section` (when present) -is the ELF section the raw-scanned byte lives in. +Text output is `0x\t` per line. JSON output is a list of +`{"addr", "addr_hex", "string"}` entries. + +**Fidelity caveat.** This command only returns what the decompiler +itself surfaced — it does not second-guess the backend or supplement with +a file-level scan. Backend string detection quality varies +(`angr < ghidra < ida`); angr in particular misses most of `.rodata`. +If the output looks thin, cross-check with an external tool before +concluding a string isn't in the binary: + +```bash +strings -a -n 4 ./target # classic strings(1) +rabin2 -z ./target # radare2, structured output +readelf -p .rodata ./target # ELF-specific, per section +``` + +Once you've located a string that way you can feed its address back into +the CLI via `decompiler xref_to 0x...` or `decompiler decompile 0x...`. ### `get_callers` @@ -314,17 +342,23 @@ always of kind `Function`. ### `install-skill` -Copy the bundled Agent Skill into `~/.claude/skills/` so Claude Code (or -any agent that picks up skills from that path) learns how to drive the CLI. +Copy the bundled Agent Skill into a supported agent skill directory so Claude +Code or Codex learns how to drive the CLI. ```bash -decompiler install-skill [names ...] [--dest DIR] [--force] [--json] +decompiler install-skill [names ...] [--agent claude|codex|all] [--dest DIR] [--force] [--json] ``` -With no `names`, every bundled skill is installed. Use `--dest` to copy the -skill somewhere else, and `--force` to overwrite an existing directory. -`--json` emits a well-formed JSON payload suitable for piping through -`jq`. +With no `names`, every bundled skill is installed. By default the installer +uses Codex when `CODEX_*` environment variables are present, otherwise Claude. 
+Use `--agent codex`, `--agent claude`, repeated `--agent` flags, or +`--agent all` to choose explicitly. Claude installs under `~/.claude/skills`; +Codex installs under `$CODEX_HOME/skills` when `CODEX_HOME` is set, otherwise +`~/.codex/skills`. + +Use `--dest` to copy the skill somewhere else, and `--force` to overwrite an +existing directory. `--json` emits a well-formed JSON payload suitable for +piping through `jq`. --- @@ -509,9 +543,16 @@ The old name was not found in the function (e.g. it was already renamed, or you targeted the wrong function). **`list_strings` looks thin** -angr's string detector is minimal. The CLI auto-falls-back to a raw-bytes -scan — if you don't see `source: "rescan"` entries, the threshold (32 -backend entries) wasn't crossed. Pass `--rescan` to force it. +This is expected on angr (and can happen on Ghidra for stripped binaries) — +`list_strings` returns only what the decompiler itself identified. Use an +external scanner to see every ASCII constant in the file, then feed the +address back into `xref_to` / `decompile`: + +```bash +strings -a -n 4 ./target +rabin2 -z ./target +readelf -p .rodata ./target +``` **Server-side logs** Spawned servers have their stdout/stderr sent to `/dev/null`. 
If you're diff --git a/libbs/__main__.py b/libbs/__main__.py index aed22c2..9663f81 100644 --- a/libbs/__main__.py +++ b/libbs/__main__.py @@ -11,7 +11,10 @@ def install(): LibBSPluginInstaller().install() -def start_server(socket_path=None, decompiler=None, binary_path=None, headless=False, server_id=None): +def start_server( + socket_path=None, decompiler=None, binary_path=None, headless=False, + server_id=None, project_dir=None, +): """Start the DecompilerServer (AF_UNIX socket-based)""" try: from libbs.api.decompiler_server import DecompilerServer @@ -28,6 +31,8 @@ def start_server(socket_path=None, decompiler=None, binary_path=None, headless=F interface_kwargs['binary_path'] = binary_path if headless: interface_kwargs['headless'] = headless + if project_dir: + interface_kwargs['project_dir'] = project_dir # Create and start server if socket_path: @@ -151,6 +156,14 @@ def main(): Explicit server ID to use; if omitted, a unique one is generated. """ ) + parser.add_argument( + "--project-dir", help=""" + Directory where the backend should store its project/database files + (Ghidra project, IDA .id*, etc.). If omitted, backend defaults apply + (Ghidra creates a project next to the binary; IDA writes .id* next + to the binary). 
+ """ + ) args = parser.parse_args() if args.single_decompiler_install: @@ -167,6 +180,7 @@ def main(): binary_path=args.binary_path, headless=args.headless, server_id=args.server_id, + project_dir=args.project_dir, ) else: parser.print_help() diff --git a/libbs/api/decompiler_client.py b/libbs/api/decompiler_client.py index 4af8068..0989262 100644 --- a/libbs/api/decompiler_client.py +++ b/libbs/api/decompiler_client.py @@ -16,6 +16,7 @@ Artifact, Function, Comment, Patch, GlobalVariable, Segment, Struct, Enum, Typedef, Context, Decompilation ) +from libbs.artifacts.formatting import ArtifactFormat from libbs.api.decompiler_server import SocketProtocol from libbs.api.type_parser import CTypeParser from libbs.configuration import LibbsConfig @@ -23,6 +24,10 @@ _l = logging.getLogger(__name__) +# Must match decompiler_server._WIRE_FMT; JSON avoids the `toml` package's +# buggy handling of raw `\x` escapes inside decompilation text. +_WIRE_FMT = ArtifactFormat.JSON + class ArtLifterProxy: """ @@ -168,13 +173,13 @@ def _get_light_artifacts(self) -> Dict: module_name = artifact_info['module'] class_name = artifact_info['type'] serialized_data = artifact_info['data'] - + # Import the module and get the class module = __import__(module_name, fromlist=[class_name]) artifact_class = getattr(module, class_name) - + # Reconstruct the artifact using its loads method - artifact = artifact_class.loads(serialized_data) + artifact = artifact_class.loads(serialized_data, fmt=_WIRE_FMT) reconstructed_artifacts[addr] = artifact except Exception as e: @@ -442,13 +447,13 @@ def _send_request(self, request: Dict[str, Any]) -> Any: module_name = response['module'] class_name = response['type'] serialized_data = response['data'] - + # Import the module and get the class module = __import__(module_name, fromlist=[class_name]) artifact_class = getattr(module, class_name) - + # Reconstruct the artifact using its loads method - artifact = artifact_class.loads(serialized_data) + artifact = 
artifact_class.loads(serialized_data, fmt=_WIRE_FMT) return artifact except Exception as e: @@ -534,6 +539,14 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List """Get cross-references to an artifact""" return self._send_request({"type": "method_call", "method_name": "xrefs_to", "args": [artifact], "kwargs": {"decompile": decompile, "only_code": only_code}}) + def xrefs_to_addr(self, addr: int, only_code: bool = False) -> List[Artifact]: + """Get references to a raw address (e.g. a string constant)""" + return self._send_request({"type": "method_call", "method_name": "xrefs_to_addr", "args": [addr], "kwargs": {"only_code": only_code}}) + + def xrefs_from(self, func_addr: int) -> List[Function]: + """Get the callees of a function (what the function calls).""" + return self._send_request({"type": "method_call", "method_name": "xrefs_from", "args": [func_addr]}) + def get_callers(self, target) -> List[Function]: """Get callers of a function (by target Function, address, or symbol name)""" return self._send_request({"type": "method_call", "method_name": "get_callers", "args": [target]}) @@ -725,7 +738,7 @@ def _process_event(self, event: Dict[str, Any]) -> None: artifact_class = getattr(module, class_name) # Reconstruct the artifact - artifact = artifact_class.loads(serialized_data) + artifact = artifact_class.loads(serialized_data, fmt=_WIRE_FMT) # Extract additional kwargs kwargs = event.get("kwargs", {}) @@ -753,7 +766,13 @@ def _process_event(self, event: Dict[str, Any]) -> None: # Lifecycle methods def shutdown(self) -> None: - """Shutdown the client""" + """Disconnect this client from the server. + + This only tears down the *local* client; the server (and its loaded + decompiler project) keeps running so other clients can still connect. + To actually stop the server, use ``shutdown_server()`` or the + ``decompiler stop`` CLI. 
+ """ _l.info("DecompilerClient shutting down") # Stop event listener first @@ -762,14 +781,25 @@ def shutdown(self) -> None: if self._socket: try: - # Send shutdown request to server - self._send_request({"type": "shutdown_deci"}) - except: + self._socket.close() + except Exception: pass - self._socket.close() self._connected = False _l.info("DecompilerClient shut down complete") + def shutdown_server(self) -> None: + """Ask the server to tear down its decompiler interface, then disconnect. + + Used by CLI commands like ``decompiler stop``. Regular usage should + prefer :meth:`shutdown`, which leaves the server running. + """ + if self._socket: + try: + self._send_request({"type": "shutdown_deci"}) + except Exception: + pass + self.shutdown() + def is_connected(self) -> bool: """Check if connected to the server""" return self._connected and self._socket diff --git a/libbs/api/decompiler_interface.py b/libbs/api/decompiler_interface.py index 7ff2cfe..bec3f01 100644 --- a/libbs/api/decompiler_interface.py +++ b/libbs/api/decompiler_interface.py @@ -397,6 +397,49 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List return [] + def xrefs_to_addr(self, addr: int, only_code: bool = False) -> List[Artifact]: + """Return artifacts that reference ``addr``. + + Unlike :meth:`xrefs_to`, which assumes a Function target and therefore + only fires on function entry points, this is a raw "who references + this address?" query. It's what you want after ``list_strings`` finds + a candidate string and you need to know which functions read it. + + The default implementation turns the address into a stub Function and + delegates to :meth:`xrefs_to`; subclasses should override this with a + real data-xref query when their backend exposes one. + + @param addr: Address (lifted) to find references to. + @param only_code: Restrict to code references if the backend supports it. + @return: List of referencing artifacts (typically Function stubs). 
+ """ + return self.xrefs_to(Function(addr, 0), only_code=only_code) + + def xrefs_from(self, func_addr: int) -> List[Function]: + """Return the functions that ``func_addr`` calls (its direct callees). + + The default implementation falls back to get_callgraph + out_edges, + which is expensive because it computes xrefs for every function in + the binary. Subclasses should override with a direct per-function + callee query when their backend exposes one. + """ + try: + cg = self.get_callgraph(only_names=False) + except Exception as exc: + _l.debug("get_callgraph failed: %s", exc) + return [] + callees: List[Function] = [] + seen = set() + for caller, callee in cg.out_edges(nbunch=None): + if getattr(caller, "addr", None) != func_addr: + continue + callee_addr = getattr(callee, "addr", None) + if callee_addr in seen: + continue + seen.add(callee_addr) + callees.append(callee) + return callees + def get_callers(self, target) -> List[Function]: """ Returns a list of Functions that call/reference the provided target. @@ -442,9 +485,16 @@ def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: """ Returns a list of (addr, string) tuples for strings found in the binary. - Subclasses are expected to override this to provide decompiler-native string discovery - (which is typically much faster and more accurate). The base implementation returns an - empty list. + This reports **only what the decompiler's own string detector + surfaced** — it is deliberately not a substitute for a full-file + scan. Backend fidelity varies (angr in particular misses most of + ``.rodata``); callers that need an exhaustive list should fall + back to external tools such as ``strings(1)``, ``rabin2 -z``, or + ``readelf -p`` and then use the resulting addresses with the + other APIs (``decompile``, ``xrefs_to_addr``, etc.). + + Subclasses are expected to override this with native, fast string + discovery. The base implementation returns an empty list. 
@param filter: Optional regex string; only strings that match will be returned. @return: List of (address, string) tuples. @@ -1161,10 +1211,17 @@ def discover( else: current_decompiler = DecompilerInterface.find_current_decompiler(force=force_decompiler) + # `project_dir` is a user-facing kwarg that translates to the + # backend-specific cache/project location. Backends without a concept + # of this simply ignore it. + project_dir = interface_kwargs.pop("project_dir", None) + if current_decompiler == IDA_DECOMPILER: from libbs.decompilers.ida.interface import IDAInterface deci_class = IDAInterface extra_kwargs = {} + if project_dir: + extra_kwargs["project_dir"] = project_dir elif current_decompiler == BINJA_DECOMPILER: from libbs.decompilers.binja.interface import BinjaInterface deci_class = BinjaInterface @@ -1177,6 +1234,8 @@ def discover( from libbs.decompilers.ghidra.interface import GhidraDecompilerInterface deci_class = GhidraDecompilerInterface extra_kwargs = {"flat_api": DecompilerInterface._find_global_in_call_frames('__this__')} + if project_dir: + extra_kwargs["project_location"] = project_dir else: raise ValueError("Please use LibBS with our supported decompiler set!") diff --git a/libbs/api/decompiler_server.py b/libbs/api/decompiler_server.py index 9ddf2f2..a6785f6 100644 --- a/libbs/api/decompiler_server.py +++ b/libbs/api/decompiler_server.py @@ -4,6 +4,7 @@ import logging import pickle +import queue import socket import struct import threading @@ -14,9 +15,27 @@ from libbs.api.decompiler_interface import DecompilerInterface from libbs.api import server_registry +from libbs.artifacts.formatting import ArtifactFormat _l = logging.getLogger(__name__) +# JSON, not TOML: the `toml` package's encoder mangles raw `\x` escapes, +# which show up in decompilation text for C char literals like `'\x01'`. +_WIRE_FMT = ArtifactFormat.JSON + +# Sentinel used to poke the main-thread dispatcher awake on shutdown. 
+_MAIN_THREAD_SHUTDOWN = object() + + +class _MainThreadError: + """Wrap exceptions that occurred on the main thread so the waiting client + thread can re-raise them after receiving the result.""" + + __slots__ = ("exc",) + + def __init__(self, exc: BaseException): + self.exc = exc + class SocketProtocol: """Helper class for socket protocol message framing""" @@ -83,6 +102,18 @@ def __init__(self, deci: DecompilerInterface, server: 'DecompilerServer' = None) self._light_caches = {} self._cache_lock = threading.Lock() self._cache_ttl = 10.0 + + def _dispatch(self, func, *args, **kwargs): + """Call ``func`` either directly or via the server's main-thread queue. + + Backends like IDA reject cross-thread API access, so the server + declares ``_requires_main_thread`` and we route everything through + its dispatcher. For thread-safe backends (ghidra headless, angr, + binja) we short-circuit to a direct call. + """ + if self.server is None or not self.server.requires_main_thread: + return func(*args, **kwargs) + return self.server.run_on_main_thread(func, *args, **kwargs) def handle_client(self, client_socket: socket.socket, addr: str): """Handle a client connection""" @@ -142,33 +173,43 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket return {"status": "error", "message": "Server not available"} elif request_type == "server_info": + # Return the metadata cached by the server at init time. Reading + # ``deci.binary_hash`` or ``deci.binary_path`` here would re-enter + # the backend from a worker thread — which IDA/idalib rejects + # with "Function can be called from the main thread only". 
+ if self.server is not None and self.server._cached_server_info is not None: + return dict(self.server._cached_server_info) return { "name": "LibBS DecompilerServer (AF_UNIX)", "version": "3.0.0", "decompiler": self.deci.name if self.deci else "unknown", "protocol": "unix_socket", - "binary_hash": self.deci.binary_hash if self.deci else None, - "binary_path": str(self.deci.binary_path) if (self.deci and self.deci.binary_path) else None, + "binary_hash": None, + "binary_path": None, "server_id": self.server.server_id if self.server else None, } - + elif request_type == "get_light_artifacts": collection_name = request.get("collection_name") return self._get_light_artifacts(collection_name) - + elif request_type == "get_full_artifact": collection_name = request.get("collection_name") key = request.get("key") - collection = getattr(self.deci, collection_name) - artifact = collection[key] - + + def _fetch_full_artifact(): + collection = getattr(self.deci, collection_name) + return collection[key] + + artifact = self._dispatch(_fetch_full_artifact) + # Serialize the full artifact safely if hasattr(artifact, 'dumps') and hasattr(artifact, '__class__'): try: return { 'type': artifact.__class__.__name__, 'module': artifact.__class__.__module__, - 'data': artifact.dumps(), + 'data': artifact.dumps(fmt=_WIRE_FMT), 'is_artifact': True } except Exception as e: @@ -177,12 +218,12 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket return artifact else: return artifact - + elif request_type == "method_call": method_name = request.get("method_name") args = request.get("args", []) kwargs = request.get("kwargs", {}) - + # Handle dotted method names like "art_lifter.lift" if "." 
in method_name: obj = self.deci @@ -192,7 +233,7 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket else: # Get the method from the decompiler interface method = getattr(self.deci, method_name) - result = method(*args, **kwargs) + result = self._dispatch(method, *args, **kwargs) # Check if result is an artifact and serialize it properly if hasattr(result, 'dumps') and hasattr(result, '__class__'): @@ -201,7 +242,7 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket return { 'type': result.__class__.__name__, 'module': result.__class__.__module__, - 'data': result.dumps(), + 'data': result.dumps(fmt=_WIRE_FMT), 'is_artifact': True } except Exception as e: @@ -214,11 +255,15 @@ def _process_request(self, request: Dict[str, Any], client_socket: socket.socket elif request_type == "property_get": property_name = request.get("property_name") - return getattr(self.deci, property_name) + return self._dispatch(lambda: getattr(self.deci, property_name)) elif request_type == "shutdown_deci": - if self.deci: - self.deci.shutdown() + if self.deci and self.server is not None and not self.server._deci_shutdown_done: + # Route through the main-thread dispatcher for IDA — calling + # idapro.close_database() from a worker thread raises + # "Function can be called from the main thread only". 
+ self._dispatch(self.deci.shutdown) + self.server._deci_shutdown_done = True return {"status": "shutdown"} elif request_type == "shutdown_server": @@ -236,18 +281,18 @@ def _get_light_artifacts(self, collection_name: str) -> Dict: """Get light artifacts for a collection, computing and caching on first request""" with self._cache_lock: cache_entry = self._light_caches.get(collection_name) - + # Check if we have a valid cache entry if cache_entry and time.time() - cache_entry["timestamp"] < self._cache_ttl: return cache_entry["items"] - + # Cache miss or stale - compute light artifacts on-demand _l.debug(f"Computing light artifacts for {collection_name} on-demand") try: collection = getattr(self.deci, collection_name) if hasattr(collection, '_lifted_art_lister'): start_time = time.time() - light_items = collection._lifted_art_lister() + light_items = self._dispatch(collection._lifted_art_lister) end_time = time.time() # Convert artifacts to serializable format using their own serialization @@ -255,7 +300,7 @@ def _get_light_artifacts(self, collection_name: str) -> Dict: for addr, artifact in light_items.items(): try: # Use the artifact's built-in serialization which handles complex objects - serialized = artifact.dumps() + serialized = artifact.dumps(fmt=_WIRE_FMT) # Store as a tuple of (type_name, serialized_data) for reconstruction serializable_items[addr] = { 'type': artifact.__class__.__name__, @@ -325,10 +370,20 @@ def __init__(self, self._clients = [] self._client_threads = [] + # Main-thread dispatch: some backends (notably IDA/idalib) reject + # cross-thread API access. For those we route backend calls through + # a queue so they run on the thread that set the backend up. 
+ self._main_thread_queue: "queue.Queue" = queue.Queue() + self._main_thread_ident: Optional[int] = None + # Event subscription tracking self._event_subscribers = [] # List of sockets subscribed to events self._event_subscribers_lock = threading.Lock() - + + # Track whether deci.shutdown() already ran, so teardown is idempotent + # across the worker-initiated stop() and the main-thread __exit__. + self._deci_shutdown_done = False + # Initialize the decompiler interface if decompiler_interface is not None: self.deci = decompiler_interface @@ -344,7 +399,13 @@ def __init__(self, self.deci = DecompilerInterface.discover(**interface_kwargs) if self.deci is None: raise RuntimeError("Failed to discover decompiler interface") - + + # Cache static metadata on the *main* thread so that the connection + # handshake (`server_info`) never touches the backend from a worker + # thread — IDA/idalib raises "Function can be called from the main + # thread only" the moment such access happens. + self._cached_server_info = self._build_static_server_info() + # Create socket handler self.handler = SocketServerHandler(self.deci, server=self) @@ -362,6 +423,38 @@ def __init__(self, _l.info(f"DecompilerServer initialized with {self.deci.name} interface (id={self.server_id})") _l.info(f"Socket path: {self.socket_path}") + def _build_static_server_info(self) -> Dict[str, Any]: + """Collect immutable server metadata on whatever thread calls us. + + This runs from ``__init__`` — i.e. the thread that constructed the + deci (the main thread in the CLI path). Capturing the values here + means ``server_info`` replies can be served from any worker thread + without re-entering backends like IDA that reject cross-thread API + calls. 
+ """ + binary_path = None + binary_hash = None + if self.deci: + try: + raw_path = self.deci.binary_path + binary_path = str(raw_path) if raw_path else None + except Exception as exc: + _l.debug("Failed to cache binary_path: %s", exc) + try: + binary_hash = self.deci.binary_hash + except Exception as exc: + _l.debug("Failed to cache binary_hash: %s", exc) + + return { + "name": "LibBS DecompilerServer (AF_UNIX)", + "version": "3.0.0", + "decompiler": self.deci.name if self.deci else "unknown", + "protocol": "unix_socket", + "binary_hash": binary_hash, + "binary_path": binary_path, + "server_id": self.server_id, + } + def _register_artifact_callbacks(self): """Register callbacks to broadcast artifact changes to subscribed clients""" from libbs.artifacts import Comment, Struct, Enum, Typedef, GlobalVariable, FunctionHeader, StackVariable @@ -401,7 +494,7 @@ def _broadcast_event(self, event_type: str, artifact, **kwargs): serialized_artifact = { 'type': artifact.__class__.__name__, 'module': artifact.__class__.__module__, - 'data': artifact.dumps(), + 'data': artifact.dumps(fmt=_WIRE_FMT), 'is_artifact': True } @@ -525,7 +618,13 @@ def stop(self): _l.info("Stopping DecompilerServer...") self._running = False - + + # Wake the main-thread dispatcher so it can exit `wait_for_shutdown`. + try: + self._main_thread_queue.put_nowait(_MAIN_THREAD_SHUTDOWN) + except Exception: + pass + # Close all client connections for client in self._clients: try: @@ -563,35 +662,121 @@ def stop(self): _l.debug("Failed to unregister server %s: %s", self.server_id, exc) self._registered = False - # Shutdown the decompiler interface - if self.deci: - try: - self.deci.shutdown() - except Exception as e: - _l.warning(f"Error shutting down decompiler: {e}") + # Shutdown the decompiler interface. 
For backends that need the main + # thread (IDA/idalib), defer to the main thread which will run the + # shutdown after leaving the dispatch loop — doing it from a worker + # thread here raises "Function can be called from the main thread + # only". wait_for_shutdown() / __exit__ pick it up via + # _shutdown_deci_if_needed(). + if self.deci and not self._deci_shutdown_done: + on_main = ( + self._main_thread_ident is None + or threading.get_ident() == self._main_thread_ident + ) + if on_main or not self.requires_main_thread: + try: + self.deci.shutdown() + self._deci_shutdown_done = True + except Exception as e: + _l.warning(f"Error shutting down decompiler: {e}") _l.info("DecompilerServer stopped") def is_running(self) -> bool: """Check if the server is currently running""" return self._running - + + @property + def requires_main_thread(self) -> bool: + """Whether backend API calls must be routed to the main thread. + + Set by the decompiler interface; IDA's idalib is the canonical case. + """ + if not self.deci: + return False + return bool(getattr(self.deci, "requires_main_thread_dispatch", False)) + + def run_on_main_thread(self, func, *args, **kwargs): + """Run ``func(*args, **kwargs)`` on the server's main thread. + + If the calling thread *is* the main thread, execute inline — this + avoids a deadlock when the main thread is itself invoking a method + (e.g. during ``__enter__`` / ``start``). + """ + if self._main_thread_ident is not None and threading.get_ident() == self._main_thread_ident: + return func(*args, **kwargs) + + result_q: "queue.Queue" = queue.Queue(maxsize=1) + self._main_thread_queue.put((func, args, kwargs, result_q)) + result = result_q.get() + if isinstance(result, _MainThreadError): + raise result.exc + return result + + def _main_thread_dispatch_loop(self): + """Drain backend work from the main-thread queue until shutdown. + + Only used for backends that require main-thread dispatch (IDA). 
+ Runs on the thread that called ``wait_for_shutdown`` — i.e. the + thread that originally created the ``deci``. + """ + self._main_thread_ident = threading.get_ident() + while self._running: + try: + item = self._main_thread_queue.get(timeout=0.25) + except queue.Empty: + continue + if item is _MAIN_THREAD_SHUTDOWN: + break + func, args, kwargs, result_q = item + try: + result = func(*args, **kwargs) + except BaseException as exc: # relay every failure, including Java exceptions + result = _MainThreadError(exc) + result_q.put(result) + def wait_for_shutdown(self): """Wait for the server to be shut down (blocking)""" + if self.requires_main_thread: + # Become the main-thread dispatcher. This blocks until stop(). + try: + self._main_thread_dispatch_loop() + except KeyboardInterrupt: + _l.info("Received interrupt signal, stopping server...") + self.stop() + # Now that we're back on the main thread with the dispatch loop + # drained, finish any backend teardown stop() had to defer. + self._shutdown_deci_if_needed() + return + if self._server_thread and self._server_thread.is_alive(): try: self._server_thread.join() except KeyboardInterrupt: _l.info("Received interrupt signal, stopping server...") self.stop() - + + def _shutdown_deci_if_needed(self): + """Run deci.shutdown() once, from the caller's thread. + + Callers must ensure they are on the thread that owns the backend + (typically the main thread). Idempotent. 
+ """ + if not self.deci or self._deci_shutdown_done: + return + try: + self.deci.shutdown() + except Exception as e: + _l.warning(f"Error shutting down decompiler: {e}") + finally: + self._deci_shutdown_done = True + def __enter__(self): """Context manager entry""" self.start() return self - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.stop() - if self.deci: - self.deci.shutdown() \ No newline at end of file + self._shutdown_deci_if_needed() \ No newline at end of file diff --git a/libbs/cli/decompiler_cli.py b/libbs/cli/decompiler_cli.py index cdad63e..7f17d2e 100644 --- a/libbs/cli/decompiler_cli.py +++ b/libbs/cli/decompiler_cli.py @@ -31,7 +31,7 @@ import sys import time from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple # Standardized exit codes — keep these consistent across subcommands so that # `&&` chaining and scripts have predictable behavior. @@ -163,7 +163,12 @@ def _with_client(args): # load # --------------------------------------------------------------------------- -def _spawn_server(binary_path: Path, backend: str, server_id: str) -> subprocess.Popen: +def _spawn_server( + binary_path: Path, + backend: str, + server_id: str, + project_dir: Optional[Path] = None, +) -> subprocess.Popen: """Start a detached headless server process for the given binary.""" cmd = [ sys.executable, "-m", "libbs", @@ -173,6 +178,8 @@ def _spawn_server(binary_path: Path, backend: str, server_id: str) -> subprocess "--binary-path", str(binary_path), "--server-id", server_id, ] + if project_dir is not None: + cmd.extend(["--project-dir", str(project_dir)]) env = os.environ.copy() # Inherit env so things like GHIDRA_INSTALL_DIR flow through. 
@@ -237,7 +244,18 @@ def cmd_load(args) -> int: return 0 server_id = args.id or server_registry.new_server_id() - _spawn_server(binary_path, backend, server_id) + # Default project/database location: a per-binary folder under the user + # cache dir so analysis artifacts don't pollute the binary's directory. + # Pass --project-dir "" to disable and let the backend drop files beside + # the binary (legacy behavior). + project_dir: Optional[Path] + if args.project_dir == "": + project_dir = None + elif args.project_dir is not None: + project_dir = Path(args.project_dir).expanduser().resolve() + else: + project_dir = _default_project_dir(binary_path, backend) + _spawn_server(binary_path, backend, server_id, project_dir=project_dir) record = _wait_for_server(server_id) _emit(args, { "status": "started", @@ -245,10 +263,26 @@ def cmd_load(args) -> int: "binary_path": record.get("binary_path"), "backend": record.get("backend"), "socket_path": record.get("socket_path"), + "project_dir": str(project_dir) if project_dir is not None else None, }) return 0 +def _default_project_dir(binary_path: Path, backend: str) -> Path: + """Return a stable per-binary cache dir under the user cache root. + + Keyed by binary name + short hash of the absolute path, so two binaries + with the same basename don't collide. The directory is created lazily + by the backend (Ghidra creates `/_ghidra/`; IDA writes its + `.id*` files directly into ``). 
+ """ + from platformdirs import user_cache_dir + import hashlib + + path_hash = hashlib.sha1(str(binary_path).encode()).hexdigest()[:8] + return Path(user_cache_dir("libbs")) / "projects" / f"{binary_path.name}-{path_hash}" + + # --------------------------------------------------------------------------- # list / stop # --------------------------------------------------------------------------- @@ -296,15 +330,7 @@ def _stop_server_by_record(record: Dict) -> bool: graceful = True except Exception as exc: _l.debug("shutdown_server rejected by %s: %s", server_id, exc) - # Close the socket directly instead of calling client.shutdown(); the - # latter also fires `shutdown_deci`, which noisily fails once the server - # has stopped listening. - try: - if client._socket is not None: - client._socket.close() - except Exception: - pass - client._connected = False + client.shutdown() if not _wait_for_process_exit(pid, timeout=3.0): # Graceful request didn't land or server is stuck — escalate. @@ -488,21 +514,59 @@ def cmd_xref_to(args) -> int: Note: distinct from `get_callers`, which is call-sites only. `xref_to` here asks the backend for *every* artifact that points at the target, including globals, strings, and non-call code references. + + Resolution order for ``target``: + 1. Function name or address that matches a known function — use the + function-level xref path (entry-point references). + 2. A raw numeric address or a string literal surfaced by `list_strings` + — use the raw-address xref path (data refs to strings, globals, etc.). 
""" from libbs.artifacts import Function with _with_client(args) as client: - addr = _resolve_function_addr(client, args.target) - if addr is None: + parsed_addr, parsed_name = _parse_target(args.target) + func_addr = _resolve_function_addr(client, args.target) + known = _known_function_addrs(client) + is_function_target = func_addr is not None and (not known or func_addr in known) + + resolved_addr: Optional[int] = None + target_kind: str # "function" | "address" | "string" + + if is_function_target: + resolved_addr = func_addr + target_kind = "function" + elif parsed_addr is not None: + # Raw address that isn't a function start — try data xrefs. + resolved_addr = parsed_addr + target_kind = "address" + elif parsed_name is not None: + # Treat as a string literal: find that string and xref its address. + match = _find_string_addr(client, parsed_name) + if match is None: + raise SystemExit( + f"Not found: {args.target!r} is not a function, address, " + "or known string. Try `decompiler list_strings --filter " + f"'{parsed_name}'` to search." + ) + resolved_addr = match + target_kind = "string" + else: raise SystemExit(f"Function not found: {args.target!r}") - # Build a Function stub to hand to xrefs_to so backends that *do* - # surface non-function refs (Ghidra via `decompile=True`) can add them. 
- func_stub = Function(addr, 0) - try: - xrefs = client.xrefs_to(func_stub, decompile=bool(args.decompile)) - except Exception as exc: - _l.debug("xrefs_to raised %s; falling back to get_callers", exc) - xrefs = client.get_callers(addr) + + xrefs: List = [] + if target_kind == "function": + func_stub = Function(resolved_addr, 0) + try: + xrefs = client.xrefs_to(func_stub, decompile=bool(args.decompile)) + except Exception as exc: + _l.debug("xrefs_to raised %s; falling back to get_callers", exc) + xrefs = client.get_callers(resolved_addr) + else: + try: + xrefs = client.xrefs_to_addr(resolved_addr) + except Exception as exc: + _l.debug("xrefs_to_addr raised %s; returning empty", exc) + xrefs = [] # Enrich Function entries with names from the light artifact cache, # since some backends only return (addr, 0) stubs from xrefs_to. @@ -515,15 +579,32 @@ def cmd_xref_to(args) -> int: if func is not None: entry["name"] = getattr(func, "name", None) data.append(entry) - _emit_xrefs(args, addr, data, direction="to") + _emit_xrefs(args, resolved_addr, data, direction="to", target_kind=target_kind) return 0 +def _find_string_addr(client, value: str) -> Optional[int]: + """Look up the address of a string literal (exact match, then substring).""" + try: + strings = client.list_strings() or [] + except Exception: + return None + exact = [addr for addr, text in strings if text == value] + if exact: + return exact[0] + contains = [addr for addr, text in strings if value in text] + if contains: + return contains[0] + return None + + def cmd_xref_from(args) -> int: """Return the callees of a function (what the function calls). - Implementation: decompile the function then scan the callgraph for edges leaving - this function. Falls back to parsing `call` instructions in disassembly. + Implementation: + 1. Use the backend's native per-function callee query (`xrefs_from`). + 2. Fall back to parsing `call 0x…` from disassembly when the backend + returns nothing. 
""" with _with_client(args) as client: addr = _resolve_function_addr(client, args.target) @@ -533,17 +614,14 @@ def cmd_xref_from(args) -> int: callees: List[Dict] = [] seen = set() try: - cg = client.get_callgraph(only_names=False) - for caller, callee in cg.out_edges(nbunch=None): # type: ignore[attr-defined] - caller_addr = getattr(caller, "addr", None) - if caller_addr == addr: - callee_addr = getattr(callee, "addr", None) - if callee_addr in seen: - continue - seen.add(callee_addr) - callees.append(_format_xref(callee)) + for callee in client.xrefs_from(addr): + callee_addr = getattr(callee, "addr", None) + if callee_addr in seen: + continue + seen.add(callee_addr) + callees.append(_format_xref(callee)) except Exception as exc: - _l.debug("Callgraph-based xref_from failed (%s); falling back to disasm scan.", exc) + _l.debug("xrefs_from failed (%s); falling back to disasm scan.", exc) if not callees: # Fallback: parse `call 0x...` from disassembly. @@ -565,26 +643,46 @@ def cmd_xref_from(args) -> int: "name": func.name if func else None, }) + # Enrich entries that came back without a name but whose addr is known + # from the light artifact cache. 
+ if callees: + light_funcs = dict(client.functions.items()) + for entry in callees: + if entry.get("kind") == "Function" and not entry.get("name"): + func = light_funcs.get(entry.get("addr")) + if func is not None: + entry["name"] = getattr(func, "name", None) + _emit_xrefs(args, addr, callees, direction="from") return 0 -def _emit_xrefs(args, addr: int, xrefs: List[Dict], *, direction: str) -> None: - payload = {"addr": addr, "direction": direction, "xrefs": xrefs} +def _emit_xrefs( + args, + addr: int, + xrefs: List[Dict], + *, + direction: str, + target_kind: Optional[str] = None, +) -> None: + payload: Dict = {"addr": addr, "direction": direction, "xrefs": xrefs} + if target_kind is not None: + payload["target_kind"] = target_kind if args.json: print(json.dumps(_annotate_addrs(payload), indent=2, default=str)) return if not xrefs: - print(f"No xrefs {direction} 0x{addr:x}") + print(f"No xrefs {direction} {_format_addr_hex(addr)}") return for x in xrefs: a = x.get("addr") n = x.get("name") or "" kind = x.get("kind") or "" - if a is not None: - print(f"0x{a:x}\t{kind}\t{n}" if kind else f"0x{a:x}\t{n}") + a_str = _format_addr_hex(a) if isinstance(a, int) else "?" + if kind: + print(f"{a_str}\t{kind}\t{n}") else: - print(f"?\t{kind}\t{n}" if kind else f"?\t{n}") + print(f"{a_str}\t{n}") # --------------------------------------------------------------------------- @@ -630,60 +728,22 @@ def cmd_rename(args) -> int: # --------------------------------------------------------------------------- def cmd_list_strings(args) -> int: - """List strings. Two data sources: - - 1. The backend's native string detector (default). Fast but fidelity - varies — angr's detector is thin and will miss most of `.rodata`. - 2. A raw-bytes scan of the binary file (`--rescan`). Equivalent to - `strings -n ` plus ELF section labeling. Always enabled - automatically if the native detector returns fewer than `_RESCAN_FLOOR` - entries; pass `--no-rescan` to disable. 
- """ - _RESCAN_FLOOR = 32 + """List strings the decompiler has identified in the binary. + This surfaces exactly what the backend's own string detector produced — + nothing more, nothing less. Decompilers miss things (angr in particular + is thin on `.rodata`), so if this looks sparse, reach for an external + tool (`strings(1)`, `rabin2 -z`, `readelf -p .rodata`) to get the + complete picture. + """ with _with_client(args) as client: - filter_pat = re.compile(args.filter) if args.filter else None native = client.list_strings(filter=args.filter) or [] results: List[Dict] = [] - seen = set() for addr, s in native: if len(s) < args.min_length: continue - seen.add((addr, s)) - results.append({"addr": addr, "string": s, "source": "backend"}) - - should_rescan = args.rescan or ( - not args.no_rescan and len(results) < _RESCAN_FLOOR - ) - if should_rescan: - # Find the binary path via the registry record. - record = _select_server( - server_id=getattr(args, "id", None), - binary_path=getattr(args, "binary", None), - backend=getattr(args, "backend", None), - ) - binary_path = record.get("binary_path") - if binary_path and os.path.exists(binary_path): - data = _read_binary_bytes(binary_path) - if data is not None: - sections = _elf_sections_from_file(binary_path) - for offset, text in _scan_ascii_strings(data, min_length=args.min_length): - if filter_pat and not filter_pat.search(text): - continue - key = (offset, text) - if key in seen: - continue - seen.add(key) - record_entry: Dict = { - "addr": offset, - "string": text, - "source": "rescan", - } - sec = _section_for_offset(sections, offset) - if sec: - record_entry["section"] = sec - results.append(record_entry) + results.append({"addr": addr, "string": s}) # Sort by addr. 
results.sort(key=lambda e: e.get("addr", 0)) @@ -692,9 +752,7 @@ def cmd_list_strings(args) -> int: _emit_list(args, results) else: for entry in results: - sec = entry.get("section") or entry.get("source") or "" - sec_col = f"[{sec}]\t" if sec else "" - print(f"0x{entry['addr']:x}\t{sec_col}{entry['string']}") + print(f"{_format_addr_hex(entry['addr'])}\t{entry['string']}") return 0 @@ -732,35 +790,86 @@ def cmd_get_callers(args) -> int: # install-skill # --------------------------------------------------------------------------- -def _default_skill_dest() -> Path: - return Path(os.path.expanduser("~/.claude/skills")) +_SKILL_AGENT_CHOICES = ("claude", "codex", "all") + + +def _codex_skill_dest() -> Path: + codex_home = os.environ.get("CODEX_HOME") + if codex_home: + return Path(codex_home).expanduser() / "skills" + return Path(os.path.expanduser("~/.codex/skills")) + + +def _skill_dest_for_agent(agent: str) -> Path: + if agent == "claude": + return Path(os.path.expanduser("~/.claude/skills")) + if agent == "codex": + return _codex_skill_dest() + raise ValueError(f"Unknown skill agent: {agent!r}") + + +def _default_skill_agents() -> List[str]: + # Codex sets CODEX_* env vars in its execution environment. Prefer its + # skill directory there, while preserving Claude as the normal shell default. 
+ if any(key.startswith("CODEX_") for key in os.environ): + return ["codex"] + return ["claude"] + + +def _selected_skill_agents(raw_agents: Optional[List[str]]) -> List[str]: + agents = raw_agents or _default_skill_agents() + if "all" in agents: + agents = ["claude", "codex"] + + selected: List[str] = [] + for agent in agents: + if agent not in ("claude", "codex"): + raise SystemExit( + f"Unsupported skill agent {agent!r}; pick one of: claude, codex, all" + ) + if agent not in selected: + selected.append(agent) + return selected + + +def _skill_destinations(args) -> List[Tuple[str, Path]]: + if args.dest: + if args.agent: + raise SystemExit("--dest cannot be combined with --agent") + return [("custom", Path(args.dest).expanduser().resolve())] + + return [ + (agent, _skill_dest_for_agent(agent).expanduser().resolve()) + for agent in _selected_skill_agents(args.agent) + ] def cmd_install_skill(args) -> int: - dest_root = Path(args.dest).expanduser().resolve() if args.dest else _default_skill_dest() names = args.names or skills.available_skills() if not names: raise SystemExit("No bundled skills to install") - dest_root.mkdir(parents=True, exist_ok=True) installed: List[Dict] = [] - for name in names: - src = skills.skill_path(name) - dest = dest_root / name - if dest.exists() and not args.force: - raise SystemExit( - f"Skill already exists at {dest}. Pass --force to overwrite." - ) - if dest.exists() and args.force: - shutil.rmtree(dest) - shutil.copytree(src, dest) - installed.append({"name": name, "path": str(dest)}) + for agent, dest_root in _skill_destinations(args): + dest_root.mkdir(parents=True, exist_ok=True) + for name in names: + src = skills.skill_path(name) + dest = dest_root / name + if dest.exists() and not args.force: + raise SystemExit( + f"Skill already exists at {dest}. Pass --force to overwrite." 
+ ) + if dest.exists() and args.force: + shutil.rmtree(dest) + shutil.copytree(src, dest) + installed.append({"name": name, "agent": agent, "path": str(dest)}) if args.json: print(json.dumps({"installed": installed}, indent=2, default=str)) else: for entry in installed: - print(f"installed {entry['name']} → {entry['path']}") + agent = "" if entry["agent"] == "custom" else f" ({entry['agent']})" + print(f"installed {entry['name']}{agent} -> {entry['path']}") return 0 @@ -785,7 +894,7 @@ def _annotate_addrs(payload): and isinstance(value, int) and f"{key}_hex" not in payload ): - payload[f"{key}_hex"] = f"0x{value:x}" + payload[f"{key}_hex"] = _format_addr_hex(value) for v in payload.values(): _annotate_addrs(v) elif isinstance(payload, list): @@ -794,6 +903,19 @@ def _annotate_addrs(payload): return payload +def _format_addr_hex(value: int) -> str: + """Format an address as `0x`, normalizing negatives to unsigned 64-bit. + + Some backends (Ghidra in particular) can surface java-signed long values + for synthetic addresses. Emitting `0x-100000` in JSON is useless — render + those as their unsigned-64 equivalent so downstream consumers always see + a well-formed hex address. + """ + if value < 0: + value &= (1 << 64) - 1 + return f"0x{value:x}" + + def _emit(args, payload: Dict, *, text_field: Optional[str] = None) -> None: """Emit a response either as JSON or as a human-readable block.""" if args.json: @@ -828,71 +950,6 @@ def _format_function(func) -> Dict: } -def _read_binary_bytes(binary_path: str, max_bytes: int = 32 * 1024 * 1024) -> Optional[bytes]: - """Read up to `max_bytes` from `binary_path`. Returns None on failure.""" - try: - with open(binary_path, "rb") as f: - return f.read(max_bytes) - except OSError as exc: - _l.debug("Could not read binary %s: %s", binary_path, exc) - return None - - -def _scan_ascii_strings(data: bytes, min_length: int = 4) -> List[Tuple[int, str]]: - """strings(1)-equivalent scan over a raw byte buffer. 
- - Returns `(offset_in_buffer, decoded_ascii)` tuples. The caller is - responsible for relocating `offset_in_buffer` into whatever address - space makes sense (e.g. file offset vs mapped vaddr). - """ - results: List[Tuple[int, str]] = [] - start = -1 - for i, b in enumerate(data): - # Printable ASCII (space..tilde) plus tab as an allowed interior byte. - if 0x20 <= b < 0x7f or b == 0x09: - if start < 0: - start = i - else: - if start >= 0 and (i - start) >= min_length: - try: - text = data[start:i].decode("ascii", errors="strict") - except UnicodeDecodeError: - pass - else: - results.append((start, text)) - start = -1 - if start >= 0 and (len(data) - start) >= min_length: - try: - text = data[start:].decode("ascii", errors="strict") - except UnicodeDecodeError: - pass - else: - results.append((start, text)) - return results - - -def _section_for_offset(elf_sections: Iterable, offset: int) -> Optional[str]: - """Return the name of the ELF section a file offset lives in, or None.""" - for name, start, size in elf_sections: - if start <= offset < start + size: - return name - return None - - -def _elf_sections_from_file(binary_path: str): - """Return [(name, file_offset, size), ...] 
for an ELF, or [] if not ELF.""" - try: - from elftools.elf.elffile import ELFFile # type: ignore - except ImportError: - return [] - try: - with open(binary_path, "rb") as f: - elf = ELFFile(f) - return [(sec.name, sec["sh_offset"], sec["sh_size"]) for sec in elf.iter_sections()] - except Exception: - return [] - - # --------------------------------------------------------------------------- # argparse plumbing # --------------------------------------------------------------------------- @@ -929,6 +986,15 @@ def build_parser() -> argparse.ArgumentParser: help="Start a new server even if one already exists for this binary.") p_load.add_argument("--replace", action="store_true", help="Stop the existing server for this binary+backend (if any) before starting.") + p_load.add_argument( + "--project-dir", + dest="project_dir", + help=( + "Where the backend should store its project/database files " + "(Ghidra project, IDA .id*, etc.). Default: a per-binary folder " + "under the user cache dir. Pass '' to drop files next to the binary." + ), + ) _add_output_args(p_load) p_load.set_defaults(func=cmd_load) @@ -1008,18 +1074,15 @@ def build_parser() -> argparse.ArgumentParser: p_ls = sub.add_parser( "list_strings", help=( - "List strings in the binary. Backend detectors vary in fidelity " - "(angr < ghidra < ida); --rescan does a raw strings(1)-like scan " - "of the file as a fallback." + "List strings the decompiler identified in the binary. " + "Fidelity varies by backend (angr < ghidra < ida) and may be " + "incomplete — use external tools (strings(1), rabin2 -z, " + "readelf -p) for an exhaustive scan." 
), ) p_ls.add_argument("--filter", dest="filter", help="Regex to filter strings.") p_ls.add_argument("--min-length", dest="min_length", type=int, default=4, help="Minimum string length to keep (default: 4).") - p_ls.add_argument("--rescan", action="store_true", - help="Force a raw-bytes scan of the binary file on top of the backend result.") - p_ls.add_argument("--no-rescan", action="store_true", - help="Never fall back to the raw scan, even if the backend returns few results.") _add_server_filter_args(p_ls) _add_output_args(p_ls) p_ls.set_defaults(func=cmd_list_strings) @@ -1040,11 +1103,24 @@ def build_parser() -> argparse.ArgumentParser: # install-skill p_sk = sub.add_parser( "install-skill", - help="Install the bundled Agent Skill (SKILL.md) into ~/.claude/skills/.", + help="Install the bundled Agent Skill (SKILL.md) for Claude Code or Codex.", ) p_sk.add_argument("names", nargs="*", help="Specific skill names to install (default: all bundled).") - p_sk.add_argument("--dest", help="Install destination (default: ~/.claude/skills).") + p_sk.add_argument( + "--agent", + action="append", + choices=_SKILL_AGENT_CHOICES, + help=( + "Agent skill directory to install into. Repeat for multiple agents, " + "or use 'all'. Default: codex when CODEX_* env vars are present, " + "otherwise claude." + ), + ) + p_sk.add_argument( + "--dest", + help="Install destination override. 
Cannot be combined with --agent.", + ) p_sk.add_argument("--force", action="store_true", help="Overwrite an existing skill directory.") _add_output_args(p_sk) diff --git a/libbs/decompilers/angr/interface.py b/libbs/decompilers/angr/interface.py index 483a63e..1439a21 100644 --- a/libbs/decompilers/angr/interface.py +++ b/libbs/decompilers/angr/interface.py @@ -48,7 +48,11 @@ def __init__(self, workspace=None, **kwargs): def _init_headless_components(self, *args, **kwargs): super()._init_headless_components(*args, **kwargs) self.project = angr.Project(str(self._binary_path), auto_load_libs=False) - self._cfg = self.project.analyses.CFG(show_progressbar=False, normalize=True, data_references=True) + # cross_references=True populates kb.xrefs so xrefs_to_addr (e.g. + # "who references this string constant?") works. + self._cfg = self.project.analyses.CFG( + show_progressbar=False, normalize=True, data_references=True, cross_references=True, + ) self.project.analyses.CompleteCallingConventions(cfg=self._cfg, recover_variables=True, analyze_callsites=True) def _init_gui_components(self, *args, **kwargs): @@ -136,6 +140,74 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List return xrefs + def xrefs_from(self, func_addr: int) -> List[Function]: + """angr callees: use the kb.callgraph successor set. + + ``kb.callgraph`` is a NetworkX digraph populated during CFG analysis; + its successors of a function address are the direct callees, which is + what we want. Unlike ``Function.transition_graph``, these are + deduplicated per target and come with kb function lookups. 
+ """ + lowered = self.art_lifter.lower_addr(func_addr) + project = self.main_instance.project + callgraph = getattr(project.kb, "callgraph", None) + if callgraph is None or lowered not in callgraph: + return [] + + kb_functions = project.kb.functions + callees: List[Function] = [] + seen = set() + for succ_addr in callgraph.successors(lowered): + if succ_addr in seen: + continue + seen.add(succ_addr) + func_obj = kb_functions.get(succ_addr, None) + name = getattr(func_obj, "name", None) if func_obj is not None else None + header = FunctionHeader(name=name, addr=succ_addr) if name else None + callees.append(self.art_lifter.lift(Function(succ_addr, 0, header=header))) + return callees + + def xrefs_to_addr(self, addr: int, only_code: bool = False) -> List[Artifact]: + """angr data-xref lookup: look up kb.xrefs references to ``addr``. + + Falls back to the default (empty) if the xref manager isn't populated. + """ + lowered = self.art_lifter.lower_addr(addr) + project = self.main_instance.project + xref_manager = getattr(project.kb, "xrefs", None) + if xref_manager is None: + return [] + + try: + xref_set = xref_manager.get_xrefs_by_dst(lowered) + except Exception: + return [] + if not xref_set: + return [] + + program_cfg = project.kb.cfgs.get_most_accurate() + if program_cfg is None: + return [] + + results: List[Artifact] = [] + seen = set() + for xref in xref_set: + node = program_cfg.get_any_node(xref.ins_addr, anyaddr=True) + if node is None or node.function_address is None: + continue + func_addr = node.function_address + if func_addr in seen: + continue + seen.add(func_addr) + name = None + try: + name = project.kb.functions[func_addr].name + except Exception: + pass + header = FunctionHeader(name=name, addr=func_addr) if name else None + results.append(self.art_lifter.lift(Function(func_addr, 0, header=header))) + return results + def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: pattern = re.compile(filter) if filter else None 
try: diff --git a/libbs/decompilers/ghidra/interface.py b/libbs/decompilers/ghidra/interface.py index cd2bf46..52a85c6 100644 --- a/libbs/decompilers/ghidra/interface.py +++ b/libbs/decompilers/ghidra/interface.py @@ -273,8 +273,88 @@ def get_decompilation_object(self, function: Function, do_lower=True) -> Optiona lowered_addr = self.art_lifter.lower_addr(function.addr) if do_lower else function.addr return self._ghidra_decompile(self._get_nearest_function(lowered_addr)) + def xrefs_from(self, func_addr: int) -> List[Function]: + """Ghidra callees: use Function.getCalledFunctions for an O(1) hit per caller.""" + from .compat.imports import ConsoleTaskMonitor + + lowered = self.art_lifter.lower_addr(func_addr) + gfunc = self._get_nearest_function(lowered) + if gfunc is None: + return [] + callees: List[Function] = [] + seen = set() + try: + for called_gfunc in gfunc.getCalledFunctions(ConsoleTaskMonitor()): + entry_addr = int(called_gfunc.getEntryPoint().getOffset()) + if entry_addr in seen: + continue + seen.add(entry_addr) + func = Function( + addr=entry_addr, + size=int(called_gfunc.getBody().getNumAddresses()), + header=FunctionHeader(name=str(called_gfunc.getName()), addr=entry_addr), + ) + callees.append(self.art_lifter.lift(func)) + except Exception as exc: + _l.warning("Ghidra xrefs_from(0x%x) failed: %s", func_addr, exc) + return callees + + def xrefs_to_addr(self, addr: int, only_code: bool = False) -> List[Artifact]: + """Ghidra data-xref lookup: walk ReferenceManager refs to ``addr``. + + Backends' stock ``xrefs_to(Function)`` only fires on function entry + points, so it misses data refs to string constants, globals, etc. + This uses Ghidra's ReferenceManager directly and resolves each + referencing instruction back to its containing function. 
+ """ + lowered = self.art_lifter.lower_addr(addr) + return self._ghidra_refs_to_address(lowered, only_code=only_code) + + def _ghidra_refs_to_address(self, lowered_addr: int, only_code: bool = False) -> List[Artifact]: + refs: List[Artifact] = [] + seen_funcs = set() + try: + gaddr = self._to_gaddr(lowered_addr) + reference_manager = self.currentProgram.getReferenceManager() + function_manager = self.currentProgram.getFunctionManager() + ref_iter = reference_manager.getReferencesTo(gaddr) + while ref_iter.hasNext(): + ref = ref_iter.next() + from_addr_g = ref.getFromAddress() + if only_code: + ref_type = ref.getReferenceType() + try: + is_data = ref_type.isData() + except Exception: + is_data = False + if is_data: + continue + gfunc = function_manager.getFunctionContaining(from_addr_g) + if gfunc is None: + continue + entry_addr = int(gfunc.getEntryPoint().getOffset()) + if entry_addr in seen_funcs: + continue + seen_funcs.add(entry_addr) + func = Function( + addr=entry_addr, + size=int(gfunc.getBody().getNumAddresses()), + header=FunctionHeader(name=str(gfunc.getName()), addr=entry_addr), + ) + refs.append(self.art_lifter.lift(func)) + except Exception as exc: + _l.warning("Ghidra reference lookup at 0x%x failed: %s", lowered_addr, exc) + return refs + def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List[Artifact]: - xrefs = super().xrefs_to(artifact) + if not isinstance(artifact, Function): + raise ValueError("Only functions are supported for xrefs_to") + + # Base function-level xref: who references the entry point. + # Without this, get_callgraph() + xref_from are empty on Ghidra + # because the base class returns `[]`. 
+ lowered = self.art_lifter.lower(artifact) + xrefs = self._ghidra_refs_to_address(lowered.addr, only_code=only_code) if not decompile: return xrefs @@ -299,18 +379,47 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List type_=str(global_sym.getDataType().getPathName()) if global_sym.getDataType() else None, size=int(global_sym.getSize()), ) - new_xrefs.append(gvar) + new_xrefs.append(self.art_lifter.lift(gvar)) - lifted_xrefs = [self.art_lifter.lift(x) for x in xrefs + new_xrefs] - return lifted_xrefs + # xrefs are already lifted by _ghidra_refs_to_address; only new_xrefs need lifting. + return xrefs + new_xrefs def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: pattern = re.compile(filter) if filter else None - results: List[Tuple[int, str]] = [] + found: Dict[int, str] = {} try: program = self.currentProgram listing = program.getListing() - # Iterate all defined data; pull strings. + memory = program.getMemory() + base_addr = self.binary_base_addr + + def _record(gaddr, text: str) -> None: + if not text: + return + block = memory.getBlock(gaddr) if memory is not None else None + if block is None: + return + try: + if not block.isLoaded(): + return + except Exception: + pass + if gaddr.isNonLoadedMemoryAddress(): + return + addr = int(gaddr.getOffset()) + # Java signed longs can surface negative values for synthetic + # addresses (ELF section name tables, overlays, etc.). + if addr < base_addr: + return + if pattern is not None and not pattern.search(text): + return + # First writer wins — defined-data results carry the + # decompiler's own typing / encoding, so we prefer them + # over raw StringSearcher hits at the same address. + found.setdefault(addr, text) + + # Pass 1: strings the decompiler has already committed to a + # defined data type (char[], TerminatedCString, unicode). 
data_iter = listing.getDefinedData(True) while data_iter.hasNext(): data = data_iter.next() @@ -321,17 +430,68 @@ def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: text = str(raw) if raw is not None else "" except Exception: continue - if not text: - continue - addr = int(data.getAddress().getOffset()) - lifted = self.art_lifter.lift_addr(addr) - if pattern is None or pattern.search(text): - results.append((lifted, text)) + _record(data.getAddress(), text) + + # Pass 2: ask Ghidra's own StringSearcher to scan initialized + # memory for ASCII runs. Ghidra's auto-analyzer misses sequences + # that it instead typed as `byte[N]` (e.g. a base64 alphabet + # stored as `uchar[64]`). This uses Ghidra's native detector — + # no parallel byte scanning. + self._scan_strings_via_searcher(program, memory, _record) except Exception as exc: _l.warning("Ghidra list_strings failed: %s", exc) + + results: List[Tuple[int, str]] = [ + (self.art_lifter.lift_addr(addr), text) + for addr, text in found.items() + ] results.sort(key=lambda item: item[0]) return results + def _scan_strings_via_searcher(self, program, memory, record) -> None: + """Run Ghidra's StringSearcher over loaded memory. + + The searcher is the same component Ghidra's "Search > For Strings" + command uses. This catches ASCII runs that Ghidra auto-typed as + ``byte[N]`` / ``uchar[N]`` instead of promoting to a string (e.g. a + base64 alphabet stored as ``uchar[64]``). 
+ """ + try: + from ghidra.program.util.string import StringSearcher, FoundStringCallback + from ghidra.util.task import TaskMonitor + from jpype import JImplements, JOverride + except Exception as exc: + _l.warning("StringSearcher unavailable, skipping supplemental scan: %s", exc) + return + + @JImplements(FoundStringCallback) + class _Collector: + def __init__(self, mem, on_string): + self._mem = mem + self._on_string = on_string + + @JOverride + def stringFound(self, found_string): + try: + text = found_string.getString(self._mem) + except Exception: + return + if text is None: + return + self._on_string(found_string.getAddress(), str(text)) + + try: + # ctor args: program, minStringSize, alignment, allCharSizes, + # requireNullTermination. allCharSizes=False keeps us on ASCII; + # the UTF variants would otherwise inflate results with noise. + searcher = StringSearcher(program, 4, 1, False, False) + scan_set = memory.getLoadedAndInitializedAddressSet() + # TaskMonitor.DUMMY is non-null but does nothing — passing None + # here crashes with NullPointerException inside AbstractStringSearcher. + searcher.search(scan_set, _Collector(memory, record), True, TaskMonitor.DUMMY) + except Exception as exc: + _l.warning("StringSearcher pass failed: %s", exc) + def disassemble(self, addr: int, **kwargs) -> Optional[str]: lowered = self.art_lifter.lower_addr(addr) func = self._get_nearest_function(lowered) diff --git a/libbs/decompilers/ida/compat.py b/libbs/decompilers/ida/compat.py index b987f55..9ed439a 100644 --- a/libbs/decompilers/ida/compat.py +++ b/libbs/decompilers/ida/compat.py @@ -1718,6 +1718,57 @@ def xrefs_to(addr): return list(idautils.XrefsTo(addr)) +@execute_write +def xrefs_from(addr): + """Return the list of code refs originating at ``addr``. + + Filters to code-flow xrefs of kind Near/Far call, so the results line + up with ``Function.getCalledFunctions()`` on Ghidra and angr's + ``kb.callgraph.successors`` — i.e. only direct callees. 
+ """ + out = [] + for xref in idautils.XrefsFrom(addr): + if xref.iscode and xref.type in (idaapi.fl_CN, idaapi.fl_CF): + out.append(int(xref.to)) + return out + + +@execute_write +def list_strings(): + """Return ``(ea, text)`` tuples for every string IDA found. + + Mirrors the Strings window / ``idautils.Strings()``; the caller filters + on text. + """ + results = [] + for s in idautils.Strings(): + try: + text = str(s) + except Exception: + continue + if not text: + continue + results.append((int(s.ea), text)) + return results + + +@execute_write +def disassemble_function(addr): + """Return a single-string disassembly for the function containing ``addr``.""" + func = ida_funcs.get_func(addr) + if func is None: + return None + lines = [] + start, end = func.start_ea, func.end_ea + ea = start + while ea < end and ea != idaapi.BADADDR: + line = idc.generate_disasm_line(ea, 0) + if line is not None: + lines.append(f"{ea:016x} {line}") + ea = idc.next_head(ea, end) + return "\n".join(lines) if lines else None + + @execute_write def wait_for_idc_initialization(): idc.auto_wait() diff --git a/libbs/decompilers/ida/interface.py b/libbs/decompilers/ida/interface.py index 98d25e0..5f6fc6c 100755 --- a/libbs/decompilers/ida/interface.py +++ b/libbs/decompilers/ida/interface.py @@ -45,13 +45,21 @@ def _qt_shortcut_to_ida(shortcut: str) -> str: # class IDAInterface(DecompilerInterface): - def __init__(self, **kwargs): + # idalib (IDA's headless mode) enforces main-thread-only API access and + # raises ``RuntimeError: Function can be called from the main thread only`` + # when called from a worker thread. The DecompilerServer checks this flag + # and routes backend calls through its main-thread dispatcher. 
+ requires_main_thread_dispatch = True + + def __init__(self, project_dir=None, **kwargs): self._ctx_menu_names = [] self._ui_hooks = [] self._artifact_watcher_hooks = [] self._gui_active_context = None self._deleted_artifacts = defaultdict(set) self.cached_ord_to_type_names = {} + # Optional cache directory where the .id* database files should live. + self._project_dir = project_dir super().__init__( name="ida", qt_version=get_ida_gui_version(), artifact_lifter=IDAArtifactLifter(self), @@ -72,9 +80,35 @@ def _init_headless_components(self, *args, **kwargs): This also means that this feature is only supported in IDA versions >= 9.0 """ super()._init_headless_components(*args, **kwargs) - failure = idapro.open_database(str(self.binary_path), True) + binary_path = str(self.binary_path) + extra_args = self._ida_open_args() + failure = idapro.open_database(binary_path, True, extra_args) if failure: - raise RuntimeError(f"Failed to open database {self.binary_path}") + raise RuntimeError(f"Failed to open database {binary_path}") + + def _ida_open_args(self) -> Optional[str]: + """Build the extra args string passed to ``idapro.open_database``. + + When ``project_dir`` is configured we redirect IDA's database sidecar + files (``.id0/.id1/.id2/.nam/.til``) into that directory using IDA's + own ``-o`` command-line flag. The sidecars go into a nested + ``ida/`` subdirectory so they don't collide with anything else the + user / other backends leave in the top-level project_dir (Ghidra's + ``_ghidra/`` project, stale symlinks, etc.). + """ + from pathlib import Path as _Path + + if not self._project_dir: + return None + + project_dir = _Path(self._project_dir).expanduser().resolve() + ida_dir = project_dir / "ida" + ida_dir.mkdir(parents=True, exist_ok=True) + binary_name = _Path(str(self.binary_path)).name + # IDA's -o takes the database base path (no extension); it picks + # .idb / .i64 / .id* itself. 
+ db_base = ida_dir / binary_name + return f"-o{db_base}" def _deinit_headless_components(self): """ @@ -175,22 +209,92 @@ def xrefs_to(self, artifact: Artifact, decompile=False, only_code=False) -> List return [] function: Function = self.art_lifter.lower(artifact) - ida_xrefs = compat.xrefs_to(function.addr) - if not ida_xrefs: - return [] + return self._collect_xrefs_to(function.addr, only_code=only_code) - xrefs = [] - for ida_xref in ida_xrefs: - from_func_addr = compat.ida_func_addr(ida_xref.frm) - if only_code and not ida_xref.iscode: - continue + def xrefs_to_addr(self, addr: int, only_code: bool = False) -> List[Artifact]: + lowered = self.art_lifter.lower_addr(addr) + return self._collect_xrefs_to(lowered, only_code=only_code) - if from_func_addr is None: + def xrefs_from(self, func_addr: int) -> List[Function]: + """Direct callees of ``func_addr`` — just the call targets, no data.""" + lowered = self.art_lifter.lower_addr(func_addr) + func = compat.fast_get_function(lowered, get_rtype=False) + if func is None: + return [] + callees: List[Function] = [] + seen = set() + # Walk every instruction in the function body; cheap because fauxware- + # sized binaries are typical, and this is the same approach the + # ``idautils.CodeRefsFrom`` helpers use under the hood. 
+ import ida_funcs as _ida_funcs # local to keep interface.py clean + ida_func = _ida_funcs.get_func(lowered) + if ida_func is None: + return [] + ea = ida_func.start_ea + while ea < ida_func.end_ea and ea != idaapi.BADADDR: + for callee_ea in compat.xrefs_from(ea): + callee_func_addr = compat.ida_func_addr(callee_ea) or callee_ea + if callee_func_addr in seen: + continue + seen.add(callee_func_addr) + lifted = self.art_lifter.lift_addr(callee_func_addr) + fast_func = self.fast_get_function(lifted) or Function(lifted, 0) + callees.append(fast_func) + ea = idc.next_head(ea, ida_func.end_ea) + return callees + + def list_strings(self, filter: Optional[str] = None) -> List[tuple]: + import re as _re + pattern = _re.compile(filter) if filter else None + out = [] + for ea, text in compat.list_strings(): + if pattern is not None and not pattern.search(text): continue - - fast_func = self.fast_get_function(self.art_lifter.lift_addr(from_func_addr)) - xrefs.append(fast_func) - + out.append((self.art_lifter.lift_addr(ea), text)) + out.sort(key=lambda item: item[0]) + return out + + def disassemble(self, addr: int, **kwargs) -> Optional[str]: + lowered = self.art_lifter.lower_addr(addr) + return compat.disassemble_function(lowered) + + def _collect_xrefs_to(self, lowered_addr: int, only_code: bool, + _max_chase: int = 2) -> List[Artifact]: + """Collect function-level xrefs to ``lowered_addr``. + + PIE binaries route string / global references through indirection + tables (GOT / _RDATA pointer arrays), so a direct + ``idautils.XrefsTo(str_addr)`` only lands on the pointer — not on + the code that dereferences it. We BFS up to ``_max_chase`` levels + of data indirection so ``xrefs_to SOSNEAKY`` can still name the + caller. 
+ """ + visited_targets: set = set() + seen_funcs: set = set() + xrefs: List[Artifact] = [] + + frontier = [(lowered_addr, 0)] + while frontier: + target, depth = frontier.pop(0) + if target in visited_targets: + continue + visited_targets.add(target) + + for ida_xref in compat.xrefs_to(target): + if only_code and not ida_xref.iscode: + continue + from_ea = int(ida_xref.frm) + from_func_addr = compat.ida_func_addr(from_ea) + if from_func_addr is not None: + if from_func_addr in seen_funcs: + continue + seen_funcs.add(from_func_addr) + lifted = self.art_lifter.lift_addr(from_func_addr) + fast_func = self.fast_get_function(lifted) or Function(lifted, 0) + xrefs.append(fast_func) + elif depth < _max_chase and not only_code: + # data-to-data indirection: chase one hop further. + frontier.append((from_ea, depth + 1)) return xrefs def get_decompilation_object(self, function: Function, do_lower=True, **kwargs) -> Optional[object]: diff --git a/libbs/skills/__init__.py b/libbs/skills/__init__.py index f599f10..86e642d 100644 --- a/libbs/skills/__init__.py +++ b/libbs/skills/__init__.py @@ -2,7 +2,7 @@ Each subdirectory holds a SKILL.md (and any optional resources) that an LLM can load to learn how to drive libbs via the `decompiler` CLI. Use -`decompiler install-skill` to copy a skill into the user's `~/.claude/skills/`. +`decompiler install-skill` to copy a skill into Claude Code or Codex. """ from pathlib import Path diff --git a/libbs/skills/decompiler/SKILL.md b/libbs/skills/decompiler/SKILL.md index 995d8f7..b791c92 100644 --- a/libbs/skills/decompiler/SKILL.md +++ b/libbs/skills/decompiler/SKILL.md @@ -36,28 +36,38 @@ to verify the pipeline end-to-end. ## First moves on a new binary +**Always start with `list_functions` and `list_strings`** — the same binary +can have the entry named `main` (angr), `FUN_00101c5c` (Ghidra), or +`sub_101c5c` (IDA). Don't assume `main` exists. 
+ ```bash -decompiler load ./target # start a server (angr by default) -decompiler list_functions # enumerate every function +decompiler load ./target # start a server (angr by default) +decompiler list_functions # enumerate every function — pick a real entry decompiler list_functions --filter 'main|auth' # or narrow by regex -decompiler list_strings --filter 'flag|pass' # find useful string constants +decompiler list_strings --filter 'flag|pass' # find interesting string constants ``` -For stripped binaries `decompile main` often fails — use `list_functions` -first to discover the real entry (`sub_XXXX`, `entry`, etc.) and start from -there. +Typical first-hour workflow on a stripped binary: + +1. `decompiler load ./bin --backend ghidra` (or `angr` if no Ghidra install) +2. `decompiler list_functions` → note non-stub function names + sizes +3. `decompiler list_strings` → look for error messages, user prompts, + format strings — they often point at the interesting code +4. `decompiler xref_to "Welcome"` → jump from a string to its users +5. `decompiler decompile ` on whichever function came out of steps 3–4 ## Core workflow ```bash decompiler load ./fauxware # start a server -decompiler list_functions # enumerate functions -decompiler decompile main # by name +decompiler list_functions # enumerate functions (do this first) +decompiler list_strings --filter 'pass|key' # strings the decompiler identified +decompiler xref_to SOSNEAKY # who references this string? +decompiler decompile authenticate # by name (from list_functions) decompiler disassemble 0x40071d # by absolute address decompiler xref_to authenticate # every code+data reference decompiler get_callers authenticate # call-sites only (subset of xref_to) decompiler xref_from main # what does main call? 
-decompiler list_strings --filter 'pass|key' # backend detector + raw fallback decompiler rename func sub_400662 trampoline # rename a function decompiler rename var v2 auth_result --function main # rename a local decompiler stop --all @@ -101,7 +111,7 @@ same binary. | Subcommand | Purpose | Key flags | |---|---|---| -| `load ` | Start a server on the binary. Idempotent: returns existing unless `--force`/`--replace`. | `--backend`, `--id`, `--force`, `--replace`, `--json` | +| `load ` | Start a server on the binary. Idempotent: returns existing unless `--force`/`--replace`. | `--backend`, `--id`, `--force`, `--replace`, `--project-dir`, `--json` | | `list` | Show all running servers and the registry path. | `--show-registry`, `--json` | | `stop` | Shut down one or all servers. | `--id`, `--binary`, `--all`, `--json` | | `list_functions` | Enumerate every function (ADDR, SIZE, NAME). | `--filter REGEX`, `--json` | @@ -111,29 +121,42 @@ same binary. | `xref_from ` | Functions that `target` calls. | same | | `rename func ` | Rename a function. | same + `--json` | | `rename var --function ` | Rename a local variable inside a function. | same | -| `list_strings` | Strings (backend + raw rescan fallback). | `--filter`, `--min-length N`, `--rescan`, `--no-rescan`, same | +| `list_strings` | Strings the decompiler found (may be incomplete — see below). | `--filter`, `--min-length N`, same | | `get_callers ` | Call-sites only — subset of `xref_to`. | same | -| `install-skill` | Install this file into `~/.claude/skills/`. | `--dest`, `--force`, `--json` | +| `install-skill` | Install this file for Claude Code or Codex. | `--agent`, `--dest`, `--force`, `--json` | ### `xref_to` vs `get_callers` - `xref_to` asks the backend for **every reference** — code *and* data. On Ghidra with `--decompile` this includes global variables and string references. Rows include a `kind` field (`Function`, `GlobalVariable`, - ...). + ...). 
`xref_to` also accepts **strings and raw addresses**: if the + target isn't a function, it's looked up in `list_strings` first, then + queried as a raw-address xref — so you can go straight from + `list_strings --filter "admin"` to `xref_to admin` to find who reads + that constant. - `get_callers` is the narrower call-sites-only view: only functions that contain a `call` to the target. When you want "who calls this?" reach for `get_callers`; when you want "who touches this in any way?" reach for `xref_to`. -### `list_strings` fidelity +### `list_strings` may be incomplete + +`list_strings` returns exactly what the backend's own string detector +surfaced — the CLI does not second-guess the decompiler. Fidelity varies +(`angr < ghidra < ida`); angr in particular misses most of `.rodata`. If +the output looks thin, check the binary file directly with an external +scanner: + +```bash +strings -a -n 4 ./target # classic strings(1) +rabin2 -z ./target # radare2: ASCII data-section scan +readelf -p .rodata ./target # ELF-specific, per section +``` -String detection quality varies by backend: `angr < ghidra < ida`. The CLI -hides this: if the backend returns few results (threshold: 32), we auto-run -a raw `strings(1)`-like scan of the binary file and label those entries -with their ELF section (`.rodata`, `.dynstr`, `.text`, etc.). Use -`--rescan` to force the scan, or `--no-rescan` to skip it and see only the -backend result. `--min-length` defaults to 4. +Use those to confirm a specific constant exists, then come back and +`decompile` / `xref_to` its address inside the CLI. `--min-length` +defaults to 4. 
## Machine-readable output @@ -147,9 +170,7 @@ decompiler list_functions --filter '^main$' --json # [{"addr": 1821, "size": 184, "name": "main", "addr_hex": "0x71d"}] decompiler list_strings --filter 'flag' --json -# [{"addr": 4197168, "string": "flag{...}", "source": "backend", "addr_hex": "0x4008e0"}, -# {"addr": 4197232, "string": "flag_check_ok", "source": "rescan", -# "section": ".rodata", "addr_hex": "0x400900"}] +# [{"addr": 4197168, "string": "flag{...}", "addr_hex": "0x4008e0"}] decompiler decompile main --json # {"addr": 1821, "decompiler": "angr", "text": "void main(...){...}", "addr_hex": "0x71d"} @@ -166,8 +187,11 @@ decompiler decompile main --raw on any failure (including "rename didn't find the old name"). Use `&&` safely. - **Stripped binaries**: use `list_functions` before `decompile` to find - the real entry. `main` may not exist; look for `sub_XXXX` with plausible - sizes and xrefs. + the real entry. `main` may not exist; look for non-default names + (`sub_XXXX`, `FUN_...`, `entry`, etc.) with plausible sizes and xrefs. +- **Backend main-naming varies**: angr promotes the entry to `main`, + Ghidra leaves `FUN_00101c5c`, IDA emits `sub_101c5c`. Always resolve via + `list_functions` or a known entry address, not by assuming `main`. - **Invalid addresses** fail with a clear message distinguishing "no function starts here" from "decompiler engine failed". The CLI does not auto-round-trip invalid addresses. @@ -179,6 +203,10 @@ decompiler decompile main --raw - **Registry path**: `decompiler list --show-registry` prints just the directory so you can clean up manually if you ever need to (e.g. after a `kill -9`). +- **Project/database files**: by default they live in + `/libbs/projects/-/`, not next to the binary. + Pass `--project-dir ` to `load` to override, or `--project-dir ""` + to restore the legacy "write next to the binary" behavior. 
## Library-level API (for Python scripts) diff --git a/tests/test_decompiler_cli.py b/tests/test_decompiler_cli.py index fe833c4..f913bd1 100644 --- a/tests/test_decompiler_cli.py +++ b/tests/test_decompiler_cli.py @@ -1,26 +1,27 @@ """ Tests for the `decompiler` CLI and the new libbs core features it exposes -(list_strings, get_callers, disassemble). +(list_strings, get_callers, disassemble, xref_to_addr, xref_from). -These tests use the angr backend so they work without external installs (IDA, -Ghidra, Binary Ninja). They run the CLI by spawning subprocesses so that the -real entry point and server-registry flow are exercised. +The CLI tests are backend-parametrized: each test method lives on a single +base class, and one subclass per supported decompiler re-runs them with a +different ``backend`` class attribute. Backends whose dependencies aren't +available are skipped. + +Subprocesses are used on purpose so the real entry point + server-registry +flow is exercised end-to-end. """ import json import os -import re import shutil import subprocess import sys import tempfile -import time import unittest from pathlib import Path from libbs.api import server_registry from libbs.api.decompiler_client import DecompilerClient from libbs.api.decompiler_interface import DecompilerInterface -from libbs.api.decompiler_server import DecompilerServer TEST_BINARIES_DIR = Path( @@ -30,6 +31,30 @@ POSIX_SYSCALL_PATH = TEST_BINARIES_DIR / "posix_syscall" +# --------------------------------------------------------------------------- +# Backend availability detection: skip subclasses cleanly when a decompiler +# isn't installed. Keep these tight and cheap — don't actually load a binary. 
+# --------------------------------------------------------------------------- + +def _backend_available(backend: str) -> bool: + try: + if backend == "angr": + import angr # noqa: F401 + elif backend == "ghidra": + import pyghidra # noqa: F401 + if not os.environ.get("GHIDRA_INSTALL_DIR"): + return False + elif backend == "binja": + import binaryninja # noqa: F401 + elif backend == "ida": + import idapro # noqa: F401 + else: + return False + except Exception: + return False + return True + + def _cli_env(): env = os.environ.copy() # Isolate registry per-test so concurrent test runs don't collide and stale @@ -38,10 +63,15 @@ def _cli_env(): return env -def _run_cli(*args, check=True, timeout=600) -> subprocess.CompletedProcess: +def _run_cli(*args, check=True, timeout=600, env_overrides=None) -> subprocess.CompletedProcess: """Run the `decompiler` CLI and return the result.""" cmd = [sys.executable, "-m", "libbs.cli.decompiler_cli", *args] env = _cli_env() + for key, value in (env_overrides or {}).items(): + if value is None: + env.pop(key, None) + else: + env[key] = value return subprocess.run(cmd, capture_output=True, text=True, check=check, timeout=timeout, env=env) @@ -77,49 +107,227 @@ def _stop_all_servers(): pass -@unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") -class TestDecompilerCLI(unittest.TestCase): - """End-to-end tests for the decompiler CLI using angr backend.""" +class _CLIBackendTestBase(unittest.TestCase): + """Base class for backend-parametrized CLI tests. + + Subclasses set ``backend`` to one of ``angr``, ``ghidra``, ``binja``, + ``ida``. Tests that rely on angr-specific quirks are gated inside the + method body rather than being split into separate subclasses, so a + single test method describes "what the CLI should do against any + backend" and the backend-specific allowances live near the asserts. 
+ """ + + backend: str = "angr" @classmethod def setUpClass(cls): + # `_CLIBackendTestBase` itself is abstract; skip it so unittest doesn't + # try to run its inherited methods with the default angr backend. + if cls is _CLIBackendTestBase: + raise unittest.SkipTest("abstract base class") + if not FAUXWARE_PATH.exists(): + raise unittest.SkipTest(f"Missing test binary: {FAUXWARE_PATH}") + if not _backend_available(cls.backend): + raise unittest.SkipTest(f"{cls.backend} backend not available") os.environ["LIBBS_SERVER_REGISTRY"] = _REGISTRY_DIR _stop_all_servers() @classmethod def tearDownClass(cls): _stop_all_servers() - try: - shutil.rmtree(_REGISTRY_DIR, ignore_errors=True) - except Exception: - pass def tearDown(self): _stop_all_servers() - def _load_fauxware(self): - result = _run_cli("load", str(FAUXWARE_PATH), "--backend", "angr", "--json") + # ------------------------------------------------------------------- + # Helpers + # ------------------------------------------------------------------- + + def _load_fauxware(self, *extra_args, project_dir=None): + args = ["load", str(FAUXWARE_PATH), "--backend", self.backend, "--json", *extra_args] + if project_dir is not None: + args.extend(["--project-dir", str(project_dir)]) + result = _run_cli(*args) payload = json.loads(result.stdout) self.assertIn(payload["status"], ("started", "already_loaded")) - self.assertEqual(payload["backend"], "angr") + self.assertEqual(payload["backend"], self.backend) return payload + def _resolve_main_name(self): + """Return whatever the current backend calls the fauxware entry. + + angr promotes the entry to ``main``; Ghidra leaves ``main`` when the + symbol is present (fauxware is not stripped). We scan + ``list_functions`` so the tests don't depend on any particular + backend's naming convention. 
+ """ + result = _run_cli("list_functions", "--json") + entries = json.loads(result.stdout) + preferred = {"main", "_main"} + for e in entries: + if e.get("name") in preferred: + return e["name"] + # Fauxware's `main` entry starts at offset 0x71d (lifted). + for e in entries: + if e.get("addr") == 0x71d: + return e["name"] or f"0x{e['addr']:x}" + self.fail("Couldn't locate main in list_functions output") + + # ------------------------------------------------------------------- + # Shared backend-agnostic tests + # ------------------------------------------------------------------- + def test_load_and_list(self): loaded = self._load_fauxware() server_id = loaded["id"] list_result = _run_cli("list", "--json") payload = json.loads(list_result.stdout) - # Feedback P3.13: list --json should expose the registry path. self.assertIn("registry_dir", payload) - self.assertTrue(payload["registry_dir"]) ids = {s["id"] for s in payload["servers"]} self.assertIn(server_id, ids) - def test_list_show_registry(self): - result = _run_cli("list", "--show-registry") - self.assertTrue(result.stdout.strip()) + def test_list_functions_and_decompile(self): + self._load_fauxware() + lf = _run_cli("list_functions", "--json").stdout + entries = json.loads(lf) + self.assertTrue(entries, "list_functions returned no entries") + for e in entries: + self.assertIn("addr", e) + self.assertIn("addr_hex", e) + self.assertIn("size", e) + self.assertIn("name", e) + + name = self._resolve_main_name() + dec_result = _run_cli("decompile", name, "--json") + payload = json.loads(dec_result.stdout) + self.assertIn("text", payload) + self.assertTrue(payload["text"], "empty decompilation") + self.assertIn("addr_hex", payload) + self.assertTrue(payload["addr_hex"].startswith("0x")) + + def test_disassemble(self): + self._load_fauxware() + name = self._resolve_main_name() + result = _run_cli("disassemble", name, "--json") + payload = json.loads(result.stdout) + self.assertIn("text", payload) + 
self.assertIn("addr_hex", payload) + # Any reasonable disassembler emits at least one of these opcodes for + # main. Compare case-insensitively so Ghidra's uppercase "PUSH" and + # angr/capstone's lowercase "push" both pass. + text = payload["text"].lower() + self.assertTrue(any(op in text for op in ("push", "mov", "call", "sub"))) + + def test_decompile_raw(self): + """--raw should print text directly, not JSON-wrapped.""" + self._load_fauxware() + name = self._resolve_main_name() + raw = _run_cli("decompile", name, "--raw") + self.assertNotIn('\\n', raw.stdout) + self.assertNotIn('{"addr"', raw.stdout) + + def test_list_strings(self): + self._load_fauxware() + # Every supported backend sees this string in fauxware. + result = _run_cli("list_strings", "--filter", "Welcome", "--json") + payload = json.loads(result.stdout) + self.assertTrue(any("Welcome" in s["string"] for s in payload), + f"{self.backend} list_strings missed 'Welcome': {payload!r}") + for entry in payload: + # Regression for negative-address / `0x-100000` formatting — the + # lifted hex rendering must always be a well-formed positive hex. + self.assertTrue(entry["addr_hex"].startswith("0x")) + self.assertNotIn("-", entry["addr_hex"][2:]) + + def test_xref_to_function(self): + self._load_fauxware() + # `authenticate` exists in fauxware and is called from main across + # all backends we support. + result = _run_cli("xref_to", "authenticate", "--json") + payload = json.loads(result.stdout) + self.assertEqual(payload.get("target_kind"), "function") + names = {x.get("name") for x in payload["xrefs"]} + self.assertIn("main", names, f"{self.backend}: 'main' not in xrefs_to(authenticate): {names!r}") + for x in payload["xrefs"]: + self.assertIn("addr_hex", x) + def test_xref_to_string(self): + """Regression: xref_to should accept a string literal as target.""" + self._load_fauxware() + # SOSNEAKY is the magic password constant in fauxware; it's + # referenced from `authenticate`. 
+ result = _run_cli("xref_to", "SOSNEAKY", "--json", check=False) + if result.returncode != 0: + self.skipTest(f"{self.backend} doesn't surface SOSNEAKY: {result.stdout}") + payload = json.loads(result.stdout) + self.assertEqual(payload.get("target_kind"), "string") + xref_names = {x.get("name") for x in payload["xrefs"]} + self.assertIn("authenticate", xref_names, + f"{self.backend}: expected 'authenticate' in xref_to(SOSNEAKY): {xref_names}") + + def test_xref_from(self): + """Regression: xref_from must return non-empty callees on each backend.""" + self._load_fauxware() + name = self._resolve_main_name() + result = _run_cli("xref_from", name, "--json") + payload = json.loads(result.stdout) + addrs = {x.get("addr") for x in payload["xrefs"]} + self.assertGreater(len(addrs), 0, f"{self.backend}: xref_from({name}) empty") + # Backends with debug symbols recognize at least one of these names. + names = {x.get("name") for x in payload["xrefs"] if x.get("name")} + self.assertTrue(names & {"authenticate", "puts", "read", "accepted", "rejected"}, + f"{self.backend}: expected a known callee in {names}") + + def test_get_callers(self): + self._load_fauxware() + result = _run_cli("get_callers", "authenticate", "--json") + payload = json.loads(result.stdout) + names = {c.get("name") for c in payload["callers"]} + self.assertIn("main", names) + for c in payload["callers"]: + self.assertIn("addr_hex", c) + + #: Subclasses set this to True if their backend actually persists files + #: (Ghidra project, IDA database, etc). For in-memory backends like angr + #: it stays False and we only assert "nothing wound up next to the binary". 
+ _persists_project_files: bool = False + + def test_project_dir_keeps_binary_dir_clean(self): + """`--project-dir` should make the backend write its DB outside the binary's dir.""" + with tempfile.TemporaryDirectory() as project_dir, tempfile.TemporaryDirectory() as bin_dir: + # Copy fauxware into an isolated directory so we can verify + # nothing gets written beside it. + local_bin = Path(bin_dir) / "fauxware" + shutil.copyfile(FAUXWARE_PATH, local_bin) + local_bin.chmod(0o755) + before = set(os.listdir(bin_dir)) + + _run_cli("load", str(local_bin), "--backend", self.backend, + "--project-dir", project_dir, "--json") + # Give the backend a beat to finish writing. + _run_cli("list_functions", "--json") + + after = set(os.listdir(bin_dir)) + new_files = after - before + self.assertFalse(new_files, + f"{self.backend} wrote unexpected files beside the binary: {new_files}") + # Backends that actually persist project state (Ghidra, IDA) should + # have written *something* to the override dir; in-memory backends + # (angr) correctly produce no files and that's the whole point — + # there's nothing to place anywhere. + if self._persists_project_files: + project_contents = list(Path(project_dir).rglob("*")) + self.assertTrue(project_contents, + f"{self.backend} wrote nothing to the project_dir") + + +class TestDecompilerCLIAngr(_CLIBackendTestBase): + """angr backend: always available (pure-Python dependency).""" + backend = "angr" + + # angr-specific sanity checks that don't map cleanly to the other + # backends live here. 
def test_load_idempotent(self): first = self._load_fauxware() second = self._load_fauxware() @@ -128,9 +336,8 @@ def test_load_idempotent(self): def test_multi_instance_same_binary_with_force(self): first = self._load_fauxware() - forced = _run_cli( - "load", str(FAUXWARE_PATH), "--backend", "angr", "--force", "--json" - ) + forced = _run_cli("load", str(FAUXWARE_PATH), "--backend", "angr", + "--force", "--json") second = json.loads(forced.stdout) self.assertNotEqual(first["id"], second["id"]) @@ -144,110 +351,37 @@ def test_multi_instance_same_binary_with_force(self): self.assertIn("main", ok.stdout) def test_load_replace_stops_old_server(self): - """`load --replace` should tear the existing server down, not leave two.""" first = self._load_fauxware() - replaced_result = _run_cli( - "load", str(FAUXWARE_PATH), "--backend", "angr", "--replace", "--json" - ) + replaced_result = _run_cli("load", str(FAUXWARE_PATH), "--backend", "angr", + "--replace", "--json") replaced = json.loads(replaced_result.stdout) self.assertEqual(replaced["status"], "started") self.assertNotEqual(replaced["id"], first["id"]) - - # Only one server should remain. listing = _run_cli("list", "--json") servers = json.loads(listing.stdout)["servers"] fauxware_servers = [s for s in servers if s["binary_path"] == str(FAUXWARE_PATH)] self.assertEqual(len(fauxware_servers), 1) self.assertEqual(fauxware_servers[0]["id"], replaced["id"]) - def test_decompile(self): - self._load_fauxware() - result = _run_cli("decompile", "main", "--json") - payload = json.loads(result.stdout) - self.assertIn("text", payload) - self.assertIn("main", payload["text"]) - # Feedback P1.4: JSON should include addr_hex alongside addr. 
- self.assertIn("addr_hex", payload) - self.assertTrue(payload["addr_hex"].startswith("0x")) - - # By address (lifted) - addr_dec = _run_cli("decompile", "0x71d", "--json") - self.assertIn("text", json.loads(addr_dec.stdout)) + def test_client_disconnect_does_not_tear_down_server(self): + """Regression: a client context-exiting must not close the server's project. - def test_decompile_raw(self): - """Feedback P1.7: --raw should print text directly, not JSON-wrapped.""" + Each `decompiler ` spawns a fresh client, uses it via `with`, and + exits. If the client's `shutdown()` sends `shutdown_deci` to the server, + the next invocation hits a closed program (ClosedException on ghidra). + """ self._load_fauxware() - raw = _run_cli("decompile", "main", "--raw") - # Raw output: no literal '\\n' escape or JSON quoting. - self.assertNotIn('\\n', raw.stdout) - self.assertNotIn('{"addr"', raw.stdout) - self.assertIn("main", raw.stdout) + for _ in range(3): + result = _run_cli("decompile", "main", "--json") + payload = json.loads(result.stdout) + self.assertIn("text", payload) def test_decompile_not_a_function_start(self): - """Feedback P1.6: clear error distinguishing 'not a start' from engine failure.""" self._load_fauxware() - # 0x71e is inside main (main starts at 0x71d). 
result = _run_cli("decompile", "0x71e", check=False) self.assertEqual(result.returncode, 1) self.assertIn("No function starts at", result.stdout + result.stderr) - def test_disassemble(self): - self._load_fauxware() - result = _run_cli("disassemble", "main", "--json") - payload = json.loads(result.stdout) - self.assertIn("text", payload) - self.assertIn("addr_hex", payload) - # sanity: some assembly - self.assertTrue(any(op in payload["text"] for op in ("push", "mov", "call"))) - - def test_disassemble_raw(self): - self._load_fauxware() - raw = _run_cli("disassemble", "main", "--raw") - self.assertNotIn('\\n', raw.stdout) - self.assertNotIn('{"addr"', raw.stdout) - - def test_list_functions(self): - """Feedback P0.1: `list_functions` subcommand.""" - self._load_fauxware() - result = _run_cli("list_functions", "--filter", "main", "--json") - entries = json.loads(result.stdout) - names = {e["name"] for e in entries} - self.assertIn("main", names) - # Each entry must carry addr, addr_hex, size, name. - for e in entries: - self.assertIn("addr", e) - self.assertIn("addr_hex", e) - self.assertIn("size", e) - self.assertIn("name", e) - # Text output is tabular. - text = _run_cli("list_functions", "--filter", "main").stdout - self.assertIn("ADDR", text) - self.assertIn("main", text) - - def test_xref_to(self): - self._load_fauxware() - result = _run_cli("xref_to", "authenticate", "--json") - payload = json.loads(result.stdout) - names = {x.get("name") for x in payload["xrefs"]} - self.assertIn("main", names) - # Feedback P1.3: rows should be kind-tagged. - kinds = {x.get("kind") for x in payload["xrefs"]} - self.assertIn("Function", kinds) - # addr_hex present for each xref - for x in payload["xrefs"]: - self.assertIn("addr_hex", x) - - def test_xref_from(self): - self._load_fauxware() - result = _run_cli("xref_from", "main", "--json") - payload = json.loads(result.stdout) - # main should call at least `authenticate`; address is always populated. 
- addrs = {x.get("addr") for x in payload["xrefs"]} - self.assertGreaterEqual(len(addrs), 1) - names = {x.get("name") for x in payload["xrefs"] if x.get("name")} - # At least one named callee (puts/read/authenticate/accepted/rejected) - self.assertTrue(names & {"authenticate", "puts", "read", "accepted", "rejected"}) - def test_rename_func(self): self._load_fauxware() result = _run_cli("rename", "func", "authenticate", "my_auth", "--json") @@ -255,26 +389,19 @@ def test_rename_func(self): self.assertTrue(payload["success"]) def test_rename_func_missing_exits_1(self): - """Feedback P1.5: non-existent rename should exit 1, not 2.""" self._load_fauxware() - result = _run_cli( - "rename", "func", "nonexistent_fn_xyz", "whatever", check=False - ) + result = _run_cli("rename", "func", "nonexistent_fn_xyz", "whatever", + check=False) self.assertEqual(result.returncode, 1) def test_rename_var_missing_exits_1(self): - """Feedback P1.5: var rename with missing old name should exit 1, not 2.""" self._load_fauxware() - result = _run_cli( - "rename", "var", "no_such_var_xyz", "whatever", - "--function", "main", check=False, - ) + result = _run_cli("rename", "var", "no_such_var_xyz", "whatever", + "--function", "main", check=False) self.assertEqual(result.returncode, 1) def test_rename_var(self): self._load_fauxware() - # Fetch an existing local variable name dynamically via the client API - # so this doesn't depend on angr's specific naming. 
record = server_registry.find_servers(binary_path=str(FAUXWARE_PATH))[0] client = DecompilerClient(socket_path=record["socket_path"]) try: @@ -286,78 +413,150 @@ def test_rename_var(self): finally: client.shutdown() - result = _run_cli( - "rename", "var", target, "renamed_var", - "--function", "main", "--json", - ) + result = _run_cli("rename", "var", target, "renamed_var", + "--function", "main", "--json") payload = json.loads(result.stdout) self.assertTrue(payload["success"]) - def test_list_strings(self): - self._load_fauxware() - result = _run_cli("list_strings", "--filter", "Welcome", "--json") - payload = json.loads(result.stdout) - # Each entry has addr, addr_hex, string, source. - hit = next((s for s in payload if "Welcome" in s["string"]), None) - self.assertIsNotNone(hit) - self.assertIn("addr_hex", hit) - self.assertIn("source", hit) - - def test_list_strings_rescan_picks_up_more(self): - """Feedback P0/P1.2: fallback scan should surface more entries than the thin angr detector.""" - self._load_fauxware() - backend_only = _run_cli("list_strings", "--no-rescan", "--json") - with_rescan = _run_cli("list_strings", "--rescan", "--json") - self.assertGreater(len(json.loads(with_rescan.stdout)), - len(json.loads(backend_only.stdout))) - # rescan entries should include section info for ELF binaries. - rescan_entries = [s for s in json.loads(with_rescan.stdout) - if s.get("source") == "rescan"] - self.assertTrue(rescan_entries) - self.assertTrue(any("section" in e for e in rescan_entries)) - def test_list_strings_min_length(self): self._load_fauxware() result = _run_cli("list_strings", "--min-length", "20", "--json") entries = json.loads(result.stdout) - # Every entry must meet the threshold. 
for e in entries: self.assertGreaterEqual(len(e["string"]), 20) - def test_get_callers(self): - self._load_fauxware() - by_name = _run_cli("get_callers", "authenticate", "--json") - payload = json.loads(by_name.stdout) - names = {c.get("name") for c in payload["callers"]} - self.assertIn("main", names) - # Every *_addr field should have a hex sibling (target_addr -> target_addr_hex, etc.) - self.assertIn("target_addr_hex", payload) - for c in payload["callers"]: - self.assertIn("addr_hex", c) - def test_stop(self): loaded = self._load_fauxware() stop = _run_cli("stop", "--id", loaded["id"], "--json") payload = json.loads(stop.stdout) self.assertTrue(payload["stopped"][0]["stopped"]) listing = _run_cli("list", "--json") - servers = json.loads(listing.stdout)["servers"] - ids = {s["id"] for s in servers} + ids = {s["id"] for s in json.loads(listing.stdout)["servers"]} self.assertNotIn(loaded["id"], ids) @unittest.skipUnless(POSIX_SYSCALL_PATH.exists(), f"Missing: {POSIX_SYSCALL_PATH}") def test_two_binaries_concurrent(self): first = self._load_fauxware() - second_result = _run_cli( - "load", str(POSIX_SYSCALL_PATH), "--backend", "angr", "--json" - ) + second_result = _run_cli("load", str(POSIX_SYSCALL_PATH), "--backend", "angr", "--json") second = json.loads(second_result.stdout) self.assertNotEqual(first["id"], second["id"]) - - # Each CLI call with --id should return results from its binary. 
fauxware_strings = _run_cli("list_strings", "--id", first["id"], "--json") - self.assertTrue(any("Welcome" in s["string"] for s in json.loads(fauxware_strings.stdout))) + self.assertTrue(any("Welcome" in s["string"] + for s in json.loads(fauxware_strings.stdout))) + + +@unittest.skipUnless(_backend_available("ghidra"), + "ghidra backend not available (no GHIDRA_INSTALL_DIR or pyghidra missing)") +class TestDecompilerCLIGhidra(_CLIBackendTestBase): + """Ghidra backend: same suite as angr, running against real Ghidra.""" + backend = "ghidra" + _persists_project_files = True # Ghidra writes its project under --project-dir + + def test_list_strings_picks_up_uchar_array(self): + """Regression: Ghidra auto-types the base64 alphabet as `uchar[64]` + rather than a string, so ``getDefinedData`` misses it. The + supplemental StringSearcher pass should surface it anyway. + + Skips when the challenge binary isn't checked in (it only ships in + the repo for local reproduction). Using ``pathlib`` rather than + copying the binary into TEST_BINARIES_DIR keeps the repo tidy. + """ + challenge = Path(__file__).parent.parent / "challenge" / "rpc.out" + if not challenge.exists(): + self.skipTest(f"challenge binary missing: {challenge}") + _run_cli("load", str(challenge), "--backend", "ghidra", "--json") + result = _run_cli("list_strings", "--filter", "ABCDEFGHIJKLMN", "--json") + payload = json.loads(result.stdout) + self.assertTrue( + any("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + in s["string"] for s in payload), + f"Ghidra list_strings missed the base64 alphabet: {payload!r}" + ) + + +@unittest.skipUnless(_backend_available("ida"), + "ida backend not available (idapro module missing)") +class TestDecompilerCLIIDA(_CLIBackendTestBase): + """IDA (via idalib) backend: same suite as angr, running against real IDA. 
class TestArtifactWireSerialization(unittest.TestCase):
    """The client↔server wire format must survive tricky decompilation text.

    Regression for the Ghidra `Reserved escape sequence used` failure: the
    `toml` encoder mangles literal `\\x01` escapes that show up in C char
    literals. The server now emits JSON on the wire; JSON is stricter about
    backslash escaping, so these tests lock that behavior in.
    """

    def test_decompilation_with_backslash_x_roundtrip_json(self):
        from libbs.artifacts import Decompilation
        from libbs.artifacts.formatting import ArtifactFormat

        # Exactly the kind of text Ghidra emits when decompiling code that
        # compares a byte to a control character: `if (c == '\x01')`.
        text = "if (c == '\\x01') { return 42; }"
        original = Decompilation(addr=0x1000, text=text, decompiler="ghidra")
        encoded = original.dumps(fmt=ArtifactFormat.JSON)
        round_tripped = Decompilation.loads(encoded, fmt=ArtifactFormat.JSON)
        self.assertEqual(round_tripped.text, text)
        self.assertEqual(round_tripped.addr, 0x1000)

    def test_decompilation_toml_still_fails_on_backslash_x(self):
        """Document WHY we moved off TOML — if this ever starts working we
        can reconsider, but in the meantime it's load-bearing for the fix."""
        from libbs.artifacts import Decompilation
        from libbs.artifacts.formatting import ArtifactFormat
        import toml

        text = "if (c == '\\x01') { return 42; }"
        encoded = Decompilation(
            addr=0x1000, text=text, decompiler="ghidra"
        ).dumps(fmt=ArtifactFormat.TOML)
        with self.assertRaises(toml.decoder.TomlDecodeError):
            Decompilation.loads(encoded, fmt=ArtifactFormat.TOML)
+ rendered = _format_addr_hex(-0x100000) + self.assertTrue(rendered.startswith("0x")) + self.assertNotIn("-", rendered) + self.assertEqual(rendered, f"0x{((-0x100000) & ((1 << 64) - 1)):x}") + + def test_annotate_addrs_uses_safe_hex(self): + from libbs.cli.decompiler_cli import _annotate_addrs + + payload = {"addr": -0x100000, "target_addr": 0x1000} + annotated = _annotate_addrs(payload) + self.assertNotIn("-", annotated["addr_hex"]) + self.assertEqual(annotated["target_addr_hex"], "0x1000") + + +# --------------------------------------------------------------------------- +# Skill installer tests +# --------------------------------------------------------------------------- class TestSkillInstaller(unittest.TestCase): """The bundled `decompiler` skill should ship with the package and install cleanly.""" @@ -375,10 +574,10 @@ def test_bundled_skill_present(self): def test_install_skill_via_cli(self): with tempfile.TemporaryDirectory() as dest: result = _run_cli("install-skill", "--dest", dest, "--json") - # Feedback P3.12: --json must emit parseable JSON (not Python repr). payload = json.loads(result.stdout) self.assertEqual(len(payload["installed"]), 1) installed_path = Path(payload["installed"][0]["path"]) + self.assertEqual(payload["installed"][0]["agent"], "custom") self.assertTrue((installed_path / "SKILL.md").is_file()) # Re-install without --force should fail helpfully. @@ -390,17 +589,70 @@ def test_install_skill_via_cli(self): self.assertEqual(len(json.loads(forced.stdout)["installed"]), 1) def test_install_skill_text_output_is_parsable(self): - """Text output should be readable (not single-quoted Python repr).""" with tempfile.TemporaryDirectory() as dest: result = _run_cli("install-skill", "--dest", dest) - # No Python-style single quotes around the payload. 
self.assertNotIn("[{'name'", result.stdout) self.assertIn("decompiler", result.stdout) + def test_install_skill_agent_destinations(self): + with tempfile.TemporaryDirectory() as home, tempfile.TemporaryDirectory() as codex_home: + result = _run_cli( + "install-skill", + "--agent", "all", + "--json", + env_overrides={"HOME": home, "CODEX_HOME": codex_home}, + ) + payload = json.loads(result.stdout) + installed = {entry["agent"]: Path(entry["path"]) for entry in payload["installed"]} + self.assertEqual(set(installed), {"claude", "codex"}) + self.assertEqual(installed["claude"], + (Path(home) / ".claude" / "skills" / "decompiler").resolve()) + self.assertEqual(installed["codex"], + (Path(codex_home) / "skills" / "decompiler").resolve()) + + def test_install_skill_default_prefers_codex_under_codex(self): + with tempfile.TemporaryDirectory() as home, tempfile.TemporaryDirectory() as codex_home: + result = _run_cli( + "install-skill", + "--json", + env_overrides={"HOME": home, "CODEX_HOME": codex_home, "CODEX_CI": "1"}, + ) + installed = json.loads(result.stdout)["installed"] + self.assertEqual(len(installed), 1) + self.assertEqual(installed[0]["agent"], "codex") + + def test_install_skill_default_falls_back_to_claude(self): + codex_vars = { + "CODEX_CI": None, "CODEX_HOME": None, "CODEX_MANAGED_BY_NPM": None, + "CODEX_SANDBOX": None, "CODEX_SANDBOX_NETWORK_DISABLED": None, + "CODEX_THREAD_ID": None, + } + with tempfile.TemporaryDirectory() as home: + result = _run_cli( + "install-skill", + "--json", + env_overrides={"HOME": home, **codex_vars}, + ) + installed = json.loads(result.stdout)["installed"] + self.assertEqual(len(installed), 1) + self.assertEqual(installed[0]["agent"], "claude") + + def test_install_skill_dest_and_agent_are_mutually_exclusive(self): + with tempfile.TemporaryDirectory() as dest: + result = _run_cli("install-skill", "--dest", dest, "--agent", "codex", + check=False) + self.assertNotEqual(result.returncode, 0) + self.assertIn("--dest cannot be 
combined with --agent", + result.stdout + result.stderr) + + +# --------------------------------------------------------------------------- +# Direct library-level tests (don't need the CLI + subprocess machinery) +# --------------------------------------------------------------------------- @unittest.skipUnless(FAUXWARE_PATH.exists(), f"Missing test binary: {FAUXWARE_PATH}") class TestNewLibbsFeatures(unittest.TestCase): - """Direct tests (not via CLI) for the new list_strings/get_callers/disassemble APIs.""" + """Direct tests for list_strings, get_callers, disassemble, xref_from, xref_to_addr.""" @classmethod def setUpClass(cls): @@ -410,26 +662,18 @@ def setUpClass(cls): binary_path=str(FAUXWARE_PATH), ) - def test_list_strings_no_filter(self): + def test_list_strings(self): strings = self.deci.list_strings() self.assertGreater(len(strings), 0) - for addr, s in strings: - self.assertIsInstance(addr, int) - self.assertIsInstance(s, str) - def test_list_strings_filter(self): welcome = self.deci.list_strings(filter=r"Welcome") self.assertEqual(len(welcome), 1) self.assertIn("Welcome", welcome[0][1]) - # Ensure non-matching regex yields nothing. 
- self.assertEqual(self.deci.list_strings(filter=r"zzz_no_match_zzz"), []) + self.assertEqual(self.deci.list_strings(filter=r"zzz_no_match"), []) def test_disassemble(self): addrs = [a for a, f in self.deci.functions.items() if f.name == "main"] - self.assertEqual(len(addrs), 1) - main_addr = addrs[0] - text = self.deci.disassemble(main_addr) - self.assertIsNotNone(text) + text = self.deci.disassemble(addrs[0]) self.assertTrue(any(mnem in text for mnem in ("push", "mov", "call"))) def test_get_callers_by_addr_name_and_function(self): @@ -438,14 +682,31 @@ def test_get_callers_by_addr_name_and_function(self): by_addr = self.deci.get_callers(auth_addr) by_name = self.deci.get_callers("authenticate") - self.assertGreater(len(by_addr), 0) - self.assertGreater(len(by_name), 0) self.assertEqual({f.addr for f in by_addr}, {f.addr for f in by_name}) - - # A made-up name raises. with self.assertRaises(ValueError): self.deci.get_callers("no_such_function_xyz") + def test_xrefs_from_returns_callees(self): + """xrefs_from(main) should include authenticate, puts, read, etc.""" + addrs_by_name = {f.name: a for a, f in self.deci.functions.items()} + main_addr = addrs_by_name["main"] + callees = self.deci.xrefs_from(main_addr) + callee_names = {c.name for c in callees if c.name} + self.assertTrue( + callee_names & {"authenticate", "puts", "read", "accepted", "rejected"}, + f"expected a known callee in {callee_names}" + ) + + def test_xrefs_to_addr_on_string(self): + """xrefs_to_addr on the SOSNEAKY constant should point at authenticate.""" + strings = self.deci.list_strings(filter=r"SOSNEAKY") + self.assertTrue(strings, "SOSNEAKY not found in angr strings") + str_addr = strings[0][0] + refs = self.deci.xrefs_to_addr(str_addr) + ref_names = {getattr(r, "name", None) for r in refs} + self.assertIn("authenticate", ref_names, + f"expected 'authenticate' in xrefs_to_addr(SOSNEAKY): {ref_names}") + if __name__ == "__main__": unittest.main() From 2cb7e88a38f12a3f121935667f4b47e2360f7927 
Mon Sep 17 00:00:00 2001 From: mahaloz Date: Fri, 24 Apr 2026 16:13:31 -0700 Subject: [PATCH 08/10] Add read_memory to DecompilerInterface and CLI - Base interface gains read_memory(addr, size) -> Optional[bytes]: None means the backend can't reach the region; short reads are valid and returned as-is. - angr: project.loader.memory.load; IDA: ida_bytes.get_bytes via @execute_write so idalib's main-thread rule holds; Ghidra: Memory.getBytes with a jpype JByte array (signed -> unsigned); Binja: BinaryView.read. - CLI: `decompiler read_memory ` with --format {hexdump,hex,raw} (default: hexdump) and --json (base64-encoded bytes + hex). - Tests: parametrized CLI coverage (angr/ghidra/ida subclasses) and direct library coverage on angr. SKILL.md updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- libbs/api/decompiler_client.py | 4 ++ libbs/api/decompiler_interface.py | 15 ++++ libbs/cli/decompiler_cli.py | 98 +++++++++++++++++++++++++++ libbs/decompilers/angr/interface.py | 12 ++++ libbs/decompilers/binja/interface.py | 12 ++++ libbs/decompilers/ghidra/interface.py | 26 +++++++ libbs/decompilers/ida/compat.py | 15 ++++ libbs/decompilers/ida/interface.py | 6 ++ libbs/skills/decompiler/SKILL.md | 1 + tests/test_decompiler_cli.py | 82 ++++++++++++++++++++++ 10 files changed, 271 insertions(+) diff --git a/libbs/api/decompiler_client.py b/libbs/api/decompiler_client.py index 0989262..4c84348 100644 --- a/libbs/api/decompiler_client.py +++ b/libbs/api/decompiler_client.py @@ -558,6 +558,10 @@ def list_strings(self, filter: Optional[str] = None) -> List: def disassemble(self, addr: int, **kwargs) -> Optional[str]: """Disassemble a function""" return self._send_request({"type": "method_call", "method_name": "disassemble", "args": [addr], "kwargs": kwargs}) + + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + """Read raw bytes from the loaded program.""" + return self._send_request({"type": "method_call", "method_name": "read_memory", "args": [addr, size]}) 
def get_callgraph(self, only_names=False): """Get the call graph""" diff --git a/libbs/api/decompiler_interface.py b/libbs/api/decompiler_interface.py index bec3f01..800a4a2 100644 --- a/libbs/api/decompiler_interface.py +++ b/libbs/api/decompiler_interface.py @@ -513,6 +513,21 @@ def disassemble(self, addr: int, **kwargs) -> Optional[str]: """ return None + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + """Read ``size`` bytes from the loaded program at ``addr``. + + Returns the raw bytes the backend has for the requested span. ``None`` + means "I couldn't satisfy the read at all" — out-of-range, uninitialized, + or the backend can't reach that memory. A short read (fewer bytes than + requested) is still valid and returned as-is; callers should check + ``len(result)`` if they need an exact count. + + @param addr: Lifted address to start reading from. + @param size: Number of bytes to read. Must be > 0. + @return: Bytes read, or ``None`` if the backend can't read this region. + """ + raise NotImplementedError + def get_callgraph(self, only_names=False) -> nx.DiGraph: """ Returns the callgraph of the binary. 
def cmd_read_memory(args) -> int:
    """Read ``size`` bytes from the binary starting at ``addr``.

    Address accepts hex (``0x...``) or decimal. Output defaults to a hex+ascii
    dump; use ``--format hex`` for a single hex blob, ``--format raw`` to write
    raw bytes to stdout, or ``--json`` for a JSON envelope with the bytes
    base64-encoded.

    Fix: the short-read warning was previously emitted only in hexdump mode,
    so ``--format hex`` and ``--format raw`` silently truncated. All text
    modes now warn on stderr; JSON mode reports it via ``size`` vs
    ``requested_size``.

    @return: 0 on success. Raises ``SystemExit`` (exit 1) on a malformed
             address/size or when the backend can't read the region at all.
    """
    import base64

    addr_value, name = _parse_target(args.addr)
    if addr_value is None:
        raise SystemExit(
            f"Invalid address {args.addr!r}; expected hex (0x..) or decimal."
        )
    if args.size <= 0:
        raise SystemExit(f"--size must be > 0 (got {args.size})")

    with _with_client(args) as client:
        data = client.read_memory(addr_value, args.size)
        if data is None:
            raise SystemExit(
                f"Backend could not read 0x{args.size:x} bytes at "
                f"{_format_addr_hex(addr_value)}. The address may be "
                "uninitialized, unmapped, or outside any loaded segment."
            )
        # Some backends return short reads when the request straddles the
        # end of a mapped region; surface that in the JSON output and warn
        # on stderr in every text mode so the caller knows.
        actual_size = len(data)

        if args.format == "raw" and not args.json:
            sys.stdout.buffer.write(data)
            _warn_short_read(actual_size, args.size)
            return 0

        if args.json:
            payload = {
                "addr": addr_value,
                "size": actual_size,
                "requested_size": args.size,
                "bytes_b64": base64.b64encode(data).decode("ascii"),
                "hex": data.hex(),
            }
            print(json.dumps(_annotate_addrs(payload), indent=2, default=str))
            return 0

        if args.format == "hex":
            print(data.hex())
            _warn_short_read(actual_size, args.size)
            return 0

        # Default: hexdump-style output.
        for line in _hexdump(data, base_addr=addr_value):
            print(line)
        _warn_short_read(actual_size, args.size)
        return 0


def _warn_short_read(actual: int, requested: int) -> None:
    """Emit a stderr note when the backend returned fewer bytes than requested."""
    if actual < requested:
        print(
            f"# short read: got {actual} of {requested} requested bytes",
            file=sys.stderr,
        )


def _hexdump(data: bytes, *, base_addr: int = 0, width: int = 16) -> List[str]:
    """Return a list of hexdump lines like ``addr: hh hh ... |ascii|``."""
    lines: List[str] = []
    for offset in range(0, len(data), width):
        chunk = data[offset:offset + width]
        hex_part = " ".join(f"{b:02x}" for b in chunk)
        # Pad short final lines so the ASCII column stays aligned.
        hex_part = hex_part.ljust(width * 3 - 1)
        ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
        lines.append(f"{_format_addr_hex(base_addr + offset)}: {hex_part} |{ascii_part}|")
    return lines
or decimal).") + p_rm.add_argument("size", type=lambda x: int(x, 0), + help="Number of bytes to read (decimal or 0x-prefixed hex).") + p_rm.add_argument("--format", choices=("hexdump", "hex", "raw"), default="hexdump", + help="Text-mode output format. Ignored when --json is set.") + _add_server_filter_args(p_rm) + _add_output_args(p_rm) + p_rm.set_defaults(func=cmd_read_memory) + # install-skill p_sk = sub.add_parser( "install-skill", diff --git a/libbs/decompilers/angr/interface.py b/libbs/decompilers/angr/interface.py index 1439a21..da4b90b 100644 --- a/libbs/decompilers/angr/interface.py +++ b/libbs/decompilers/angr/interface.py @@ -233,6 +233,18 @@ def list_strings(self, filter: Optional[str] = None) -> List[Tuple[int, str]]: results.sort(key=lambda item: item[0]) return results + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + if size <= 0: + return b"" + lowered = self.art_lifter.lower_addr(addr) + loader_memory = self.main_instance.project.loader.memory + try: + data = loader_memory.load(lowered, size) + except (KeyError, ValueError): + # cle's Clemory raises when the address isn't backed by a segment. 
+ return None + return bytes(data) + def disassemble(self, addr: int, **kwargs) -> Optional[str]: lowered = self.art_lifter.lower_addr(addr) func = self.main_instance.project.kb.functions.get(lowered, None) diff --git a/libbs/decompilers/binja/interface.py b/libbs/decompilers/binja/interface.py index d57642d..439f875 100644 --- a/libbs/decompilers/binja/interface.py +++ b/libbs/decompilers/binja/interface.py @@ -327,6 +327,18 @@ def get_decompilation_object(self, function: Function, **kwargs) -> Optional[obj """ return None + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + if size <= 0: + return b"" + lowered = self.art_lifter.lower_addr(addr) + try: + data = self.bv.read(lowered, size) + except Exception: + return None + if data is None: + return None + return bytes(data) + def start_artifact_watchers(self): if not self.artifact_watchers_started: from .hooks import DataMonitor diff --git a/libbs/decompilers/ghidra/interface.py b/libbs/decompilers/ghidra/interface.py index 52a85c6..3f3ee63 100644 --- a/libbs/decompilers/ghidra/interface.py +++ b/libbs/decompilers/ghidra/interface.py @@ -516,6 +516,32 @@ def disassemble(self, addr: int, **kwargs) -> Optional[str]: return None return "\n".join(lines) if lines else None + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + if size <= 0: + return b"" + lowered = self.art_lifter.lower_addr(addr) + try: + import jpype + memory = self.currentProgram.getMemory() + gaddr = self._to_gaddr(lowered) + byte_array = jpype.JArray(jpype.JByte)(size) + # Memory.getBytes returns the count of bytes copied; on partial + # reads it raises MemoryAccessException, which we treat as the + # caller asked for memory we can't reach. 
+ try: + read = int(memory.getBytes(gaddr, byte_array)) + except Exception as exc: + _l.debug("Ghidra read_memory at 0x%x size=%d failed: %s", lowered, size, exc) + return None + if read <= 0: + return b"" + # JByte values arrive as signed Python ints; mask back to unsigned + # so the resulting bytes match what the binary stores on disk. + return bytes(int(b) & 0xFF for b in byte_array[:read]) + except Exception as exc: + _l.warning("Ghidra read_memory failed: %s", exc) + return None + # # Extra API # diff --git a/libbs/decompilers/ida/compat.py b/libbs/decompilers/ida/compat.py index 9ed439a..e1bbd3b 100644 --- a/libbs/decompilers/ida/compat.py +++ b/libbs/decompilers/ida/compat.py @@ -1752,6 +1752,21 @@ def list_strings(): return results +@execute_write +def read_memory(addr, size): + """Read ``size`` bytes from the IDB at ``addr``. + + Uses ``ida_bytes.get_bytes`` which honors loaded segments and patched + bytes. Returns ``None`` when IDA can't satisfy the read at all. + """ + if size <= 0: + return b"" + data = ida_bytes.get_bytes(addr, size) + if data is None: + return None + return bytes(data) + + @execute_write def disassemble_function(addr): """Return a single-string disassembly for the function containing ``addr``.""" diff --git a/libbs/decompilers/ida/interface.py b/libbs/decompilers/ida/interface.py index 5f6fc6c..1fc3650 100755 --- a/libbs/decompilers/ida/interface.py +++ b/libbs/decompilers/ida/interface.py @@ -258,6 +258,12 @@ def disassemble(self, addr: int, **kwargs) -> Optional[str]: lowered = self.art_lifter.lower_addr(addr) return compat.disassemble_function(lowered) + def read_memory(self, addr: int, size: int) -> Optional[bytes]: + if size <= 0: + return b"" + lowered = self.art_lifter.lower_addr(addr) + return compat.read_memory(lowered, size) + def _collect_xrefs_to(self, lowered_addr: int, only_code: bool, _max_chase: int = 2) -> List[Artifact]: """Collect function-level xrefs to ``lowered_addr``. 
diff --git a/libbs/skills/decompiler/SKILL.md b/libbs/skills/decompiler/SKILL.md index b791c92..056c47d 100644 --- a/libbs/skills/decompiler/SKILL.md +++ b/libbs/skills/decompiler/SKILL.md @@ -123,6 +123,7 @@ same binary. | `rename var --function ` | Rename a local variable inside a function. | same | | `list_strings` | Strings the decompiler found (may be incomplete — see below). | `--filter`, `--min-length N`, same | | `get_callers ` | Call-sites only — subset of `xref_to`. | same | +| `read_memory ` | Read raw bytes from the binary at ``. Default output is a hexdump. | `--format {hexdump,hex,raw}`, same + `--json` (base64-encoded bytes) | | `install-skill` | Install this file for Claude Code or Codex. | `--agent`, `--dest`, `--force`, `--json` | ### `xref_to` vs `get_callers` diff --git a/tests/test_decompiler_cli.py b/tests/test_decompiler_cli.py index f913bd1..5afff22 100644 --- a/tests/test_decompiler_cli.py +++ b/tests/test_decompiler_cli.py @@ -75,6 +75,11 @@ def _run_cli(*args, check=True, timeout=600, env_overrides=None) -> subprocess.C return subprocess.run(cmd, capture_output=True, text=True, check=check, timeout=timeout, env=env) +def _format_hex(value: int) -> str: + """Tiny helper: render an int as ``0x...`` for CLI args.""" + return f"0x{value:x}" + + # Shared registry directory for this module's tests _REGISTRY_DIR = tempfile.mkdtemp(prefix="libbs_cli_registry_") @@ -288,6 +293,63 @@ def test_get_callers(self): for c in payload["callers"]: self.assertIn("addr_hex", c) + def test_read_memory(self): + """read_memory should return the bytes at a known location. + + Fauxware's ``Welcome to the admin console, trusted user!`` string + lives at lifted address ``0x8e0`` and the ELF header lives at the + binary's base. Both are stable across every backend we support, so + this is a clean cross-decompiler smoke test. + """ + import base64 + + self._load_fauxware() + + # 1. ELF magic at the binary's base. Lifted address 0x0. 
+ result = _run_cli("read_memory", "0x0", "0x4", "--json") + payload = json.loads(result.stdout) + self.assertEqual(payload["size"], 4) + decoded = base64.b64decode(payload["bytes_b64"]) + self.assertEqual(decoded, b"\x7fELF", + f"{self.backend} read_memory(0x0, 4) returned {decoded!r}") + self.assertEqual(payload["hex"], "7f454c46") + + # 2. The "Welcome" string. Walk list_strings to find it so this + # isn't tied to a specific backend's address representation. + strings = json.loads(_run_cli("list_strings", "--filter", "Welcome", + "--json").stdout) + self.assertTrue(strings, f"{self.backend}: 'Welcome' string not surfaced") + welcome_addr = strings[0]["addr"] + + result = _run_cli("read_memory", _format_hex(welcome_addr), "7", "--json") + payload = json.loads(result.stdout) + self.assertEqual(base64.b64decode(payload["bytes_b64"]), b"Welcome", + f"{self.backend} read_memory at Welcome addr returned wrong bytes") + + def test_read_memory_hexdump_default(self): + """Default text output is a hexdump of the bytes.""" + self._load_fauxware() + result = _run_cli("read_memory", "0x0", "16") + # Hexdump of the ELF header starts with the magic + class + data. + self.assertIn("7f 45 4c 46", result.stdout) + # ASCII column should also be present. + self.assertIn("|.ELF", result.stdout) + + def test_read_memory_hex_format(self): + self._load_fauxware() + result = _run_cli("read_memory", "0x0", "4", "--format", "hex") + self.assertEqual(result.stdout.strip(), "7f454c46") + + def test_read_memory_invalid_address(self): + """An address far outside any segment should error cleanly.""" + self._load_fauxware() + result = _run_cli("read_memory", "0xdeadbeef00", "16", check=False) + self.assertNotEqual(result.returncode, 0) + # Either the backend rejects it, or it raises before responding. + # We just assert the CLI didn't print bytes. 
+ combined = result.stdout + result.stderr + self.assertNotIn("|.ELF", combined) + #: Subclasses set this to True if their backend actually persists files #: (Ghidra project, IDA database, etc). For in-memory backends like angr #: it stays False and we only assert "nothing wound up next to the binary". @@ -707,6 +769,26 @@ def test_xrefs_to_addr_on_string(self): self.assertIn("authenticate", ref_names, f"expected 'authenticate' in xrefs_to_addr(SOSNEAKY): {ref_names}") + def test_read_memory(self): + """read_memory should return the ELF magic at the binary's base.""" + # ELF magic at lifted addr 0 + elf = self.deci.read_memory(0, 4) + self.assertEqual(elf, b"\x7fELF") + + # Welcome string — find via list_strings, then read its bytes. + strings = self.deci.list_strings(filter=r"Welcome") + self.assertTrue(strings, "Welcome string not found") + welcome_addr = strings[0][0] + bytes_ = self.deci.read_memory(welcome_addr, 7) + self.assertEqual(bytes_, b"Welcome") + + # Out-of-range read should return None. + self.assertIsNone(self.deci.read_memory(0xdeadbeef00, 16)) + + # Zero/negative size short-circuit. + self.assertEqual(self.deci.read_memory(0, 0), b"") + self.assertEqual(self.deci.read_memory(0, -5), b"") + if __name__ == "__main__": unittest.main() From 3c77b8bd04dd1d9a68c1b4245467ead25e815219 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Tue, 28 Apr 2026 22:51:44 -0700 Subject: [PATCH 09/10] Fix IDA headless open_database for IDA <= 9.1 The 3-argument form of idapro.open_database (with extra_args) was introduced in IDA 9.2. On IDA 9.0/9.1 it raises TypeError, breaking CI which runs against 9.0. Branch on get_ida_version(): use the legacy 2-argument call on <= 9.1 and warn if a project_dir was configured, since redirecting database sidecars isn't supported there. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- libbs/decompilers/ida/interface.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/libbs/decompilers/ida/interface.py b/libbs/decompilers/ida/interface.py index 1fc3650..b2c3573 100755 --- a/libbs/decompilers/ida/interface.py +++ b/libbs/decompilers/ida/interface.py @@ -82,7 +82,17 @@ def _init_headless_components(self, *args, **kwargs): super()._init_headless_components(*args, **kwargs) binary_path = str(self.binary_path) extra_args = self._ida_open_args() - failure = idapro.open_database(binary_path, True, extra_args) + # IDA <= 9.1 only accepts (path, run_auto_analysis); the extra_args + # parameter was added in 9.2. + if compat.get_ida_version() <= 910: + if extra_args: + _l.warning( + "project_dir/extra open args are only supported on IDA >= 9.2; ignoring %r.", + extra_args, + ) + failure = idapro.open_database(binary_path, True) + else: + failure = idapro.open_database(binary_path, True, extra_args) if failure: raise RuntimeError(f"Failed to open database {binary_path}") From 1ec982f1d9d7ed33a4d189c4a2702a00122d45a3 Mon Sep 17 00:00:00 2001 From: mahaloz Date: Tue, 28 Apr 2026 22:59:45 -0700 Subject: [PATCH 10/10] decompiler skill: prefer IDA, drop plugin install, document read_memory - Tell agents to always prefer IDA Pro (`--backend ida`) and document the `ida -> ghidra -> angr` fallback order when the load fails. - Drop the `libbs --install` / `--single-decompiler-install` step from setup; the headless CLI doesn't need plugins inside the GUIs. - Add a dedicated `read_memory` subsection covering use cases, output formats, short reads, and library-API access. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- libbs/skills/decompiler/SKILL.md | 84 ++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/libbs/skills/decompiler/SKILL.md b/libbs/skills/decompiler/SKILL.md index 056c47d..8979311 100644 --- a/libbs/skills/decompiler/SKILL.md +++ b/libbs/skills/decompiler/SKILL.md @@ -14,16 +14,12 @@ server, so repeated `decompile`/`disassemble`/`xref_*` calls are fast. ```bash pip install libbs # installs the `decompiler` and `libbs` entry points -libbs --install # registers LibBS plugins into detected decompilers ``` -If you only want one backend (for example, Binary Ninja), use: -```bash -libbs --single-decompiler-install binja /Applications/Binary\ Ninja.app -``` - -`angr` needs no host install — it's a Python dependency and the fastest way -to verify the pipeline end-to-end. +That's it — the `decompiler` CLI drives every backend headlessly via LibBS +and does **not** need any plugins installed inside IDA/Ghidra/Binary Ninja +to run. `angr` needs no host tool at all (it's a pure Python dependency) +and is the fastest way to verify the pipeline end-to-end. ## Mental model @@ -36,12 +32,18 @@ to verify the pipeline end-to-end. ## First moves on a new binary +**Always prefer IDA Pro when it's available** (`--backend ida`) — it +generally produces the cleanest decompilation and the most accurate type +recovery. If IDA fails to load the binary (missing license, unsupported +file type, decompiler error), fall back to `--backend ghidra`, then +`--backend angr` as a last resort. + **Always start with `list_functions` and `list_strings`** — the same binary can have the entry named `main` (angr), `FUN_00101c5c` (Ghidra), or `sub_101c5c` (IDA). Don't assume `main` exists. 
```bash -decompiler load ./target # start a server (angr by default) +decompiler load ./target --backend ida # prefer IDA; fall back to ghidra if it fails decompiler list_functions # enumerate every function — pick a real entry decompiler list_functions --filter 'main|auth' # or narrow by regex decompiler list_strings --filter 'flag|pass' # find interesting string constants @@ -49,7 +51,8 @@ decompiler list_strings --filter 'flag|pass' # find interesting string constan Typical first-hour workflow on a stripped binary: -1. `decompiler load ./bin --backend ghidra` (or `angr` if no Ghidra install) +1. `decompiler load ./bin --backend ida` (fall back to `--backend ghidra`, + then `--backend angr`, if IDA can't open the binary) 2. `decompiler list_functions` → note non-stub function names + sizes 3. `decompiler list_strings` → look for error messages, user prompts, format strings — they often point at the interesting code @@ -59,7 +62,7 @@ Typical first-hour workflow on a stripped binary: ## Core workflow ```bash -decompiler load ./fauxware # start a server +decompiler load ./fauxware --backend ida # start a server (prefer IDA) decompiler list_functions # enumerate functions (do this first) decompiler list_strings --filter 'pass|key' # strings the decompiler identified decompiler xref_to SOSNEAKY # who references this string? @@ -96,13 +99,24 @@ second server alongside the existing one). ## Choosing a backend +**Default: IDA Pro.** Use `--backend ida` whenever IDA is installed and +licensed — its decompilation is the most reliable across architectures. +Only switch backends if IDA fails to load the binary (the `load` call +errors, or analysis stalls); fall through in this order: `ida → ghidra +→ angr`. Use `binja` only when explicitly requested. 
+ ```bash -decompiler load ./my-binary --backend ghidra # needs GHIDRA_INSTALL_DIR -decompiler load ./my-binary --backend angr # pure-Python, always available +decompiler load ./my-binary --backend ida # PREFERRED: IDA Pro (needs install + license) +decompiler load ./my-binary --backend ghidra # FALLBACK: needs GHIDRA_INSTALL_DIR +decompiler load ./my-binary --backend angr # LAST RESORT: pure-Python, always available decompiler load ./my-binary --backend binja # Binary Ninja, needs license -decompiler load ./my-binary --backend ida # IDA Pro, needs install ``` +If the IDA `load` fails (e.g. unsupported file format, decompiler error), +re-issue `load` with `--backend ghidra` — `load` is idempotent per +backend, so this leaves any other server alone and just brings up a +Ghidra one alongside. + `--backend` is also accepted on the inspection/mutation subcommands to narrow which server to target when multiple backends are loaded for the same binary. @@ -141,6 +155,41 @@ same binary. for `get_callers`; when you want "who touches this in any way?" reach for `xref_to`. +### `read_memory` — raw bytes at an address + +`read_memory ` reads `` bytes from the loaded binary's +mapped memory starting at ``. It goes through the backend's own +memory accessor, so it returns whatever the decompiler currently has +loaded for that address (post-relocation, post-mapping) — not the raw +bytes from the on-disk ELF/PE/Mach-O. Use it when you need to: + +- Inspect a constant table, jump table, or vtable that the decompiler + rendered as `dword_` / `unk_`. +- Read a string the backend's string detector missed (cross-check + against `list_strings` first; if absent, dump bytes manually). +- Verify the actual bytes behind a global the decompiler shows as an + opaque symbol. +- Pull a magic header / signature out of `.rodata` to confirm a file + format or library version. 
+ +```bash +decompiler read_memory 0x4008e0 64 # default: hexdump +decompiler read_memory 0x4008e0 64 --format hex # one-line hex blob +decompiler read_memory 0x4008e0 64 --format raw > bytes # raw bytes to a file +decompiler read_memory 0x4008e0 64 --json # base64-encoded payload +``` + +JSON output includes both `size` (actual bytes returned) and +`requested_size` — backends may produce **short reads** when the request +straddles the end of a mapped segment. In text mode the CLI prints a +`# short read: ...` notice on stderr in that case. If the address is +unmapped or uninitialized, the CLI exits non-zero with a message saying +the backend couldn't satisfy the read; try a smaller `size` or confirm +the address with `list_functions` / `xref_to`. + +Address formats follow the same rules as everywhere else: hex (`0x4008e0`), +decimal (`4197088`), or lifted (`0x8e0`) all work. + ### `list_strings` may be incomplete `list_strings` returns exactly what the backend's own string detector @@ -223,5 +272,8 @@ for addr, func in client.functions.items(): ``` The new core APIs (`list_strings(filter=...)`, `get_callers(target)`, -`disassemble(addr)`) are on both the local `DecompilerInterface` and the -`DecompilerClient` proxy. +`disassemble(addr)`, `read_memory(addr, size)`) are on both the local +`DecompilerInterface` and the `DecompilerClient` proxy. `read_memory` +returns `bytes` (or `None` if the backend can't satisfy the read), so +you can hexdump, decode, or feed the result straight into struct +parsers without going through the CLI.