diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c2b7591 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +# Keep text files LF on all platforms. Windows runners would otherwise +# autocrlf .md fixtures to CRLF and break snapshot tests. +* text=auto eol=lf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb0a504..474981a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,14 @@ on: jobs: test: - runs-on: ubuntu-latest + # fail-fast: false lets a failure on one OS surface all of the + # others' results in the same run, instead of cancelling siblings. + # Cheaper to read than re-running individually for each. + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} permissions: contents: read steps: @@ -24,4 +31,9 @@ jobs: run: npm ci - name: Run tests + # `npm test` runs `tsx --test tests/*.test.ts`. The glob is + # shell-expanded; cmd.exe (Windows default) doesn't expand it, + # so we pin Git Bash — preinstalled on the windows-latest runner. + # No-op on Linux/macOS where bash is already the default. + shell: bash run: npm test diff --git a/CHANGELOG.md b/CHANGELOG.md index 745d0d2..a599923 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.6.0] - 2026-05-14 + +### Added +- Cross-platform absolute-path validation for `savePath` (MCP). Windows-style paths (`C:\foo`, `C:/foo`, `\\server\share`, `\foo`) are now accepted alongside POSIX absolute paths. The schema delegates to `path.isAbsolute()` so it stays correct on whichever platform the process runs. +- Write sandbox restricting MCP `savePath` writes. By default the allowed set is `realpath(os.tmpdir())` ∪ `realpath(process.cwd())`. Symlinks are resolved via `fs.realpath` before the containment check, so a planted symlink inside the sandbox cannot be used to escape. Violations return the new `save_forbidden` error code; no file is created. The CLI is intentionally unrestricted (human at the shell is the security boundary; the LLM via MCP is the threat surface). +- `MARKFETCH_ALLOWED_WRITE_ROOTS` env var: platform-delimiter-separated list of absolute paths (`:` on POSIX, `;` on Windows). When set, **replaces** the default allowed roots entirely. Validated at startup with fail-fast-on-stderr semantics matching existing env-var conventions (`MARKFETCH_TIMEOUT_MS`, `MARKFETCH_MAX_BYTES`, `MARKFETCH_USER_AGENT`). +- `save_forbidden` error code (8th in the contract): returned when a `savePath` resolves outside the configured allowed write roots. +- CI test job now runs on `ubuntu-latest`, `macos-latest`, and `windows-latest`. `shell: bash` is set on the `npm test` step so the test-glob expands consistently across runners. + ### Changed -- Resolved code smell SonarQube findings (S4325 redundant `Document` casts, S6594 `String#match` → `RegExp#exec`) — no behavior change, all 50 tests pass. ([c993938](https://github.com/vasylenko/markfetch/commit/c9939385edfbe95f7f34a24ba8e33e5a74ac07f4)) +- MCP `savePath` schema replaced the literal `startsWith('/')` constraint with `z.string().refine(path.isAbsolute)`. **Breaking change for MCP callers that previously wrote outside `os.tmpdir()` or `process.cwd()`** — they will now receive `save_forbidden` and must either move the target inside the default roots or set `MARKFETCH_ALLOWED_WRITE_ROOTS`. CLI behavior is unchanged (no sandbox there). +- Resolved code smell SonarQube findings (S4325 redundant `Document` casts, S6594 `String#match` → `RegExp#exec`) — no behavior change, all tests pass. ([c993938](https://github.com/vasylenko/markfetch/commit/c9939385edfbe95f7f34a24ba8e33e5a74ac07f4)) - Documentation and inline comments cleaned up across README, SPEC, source, and test descriptions. Text-only, no runtime change. ([#2](https://github.com/vasylenko/markfetch/pull/2)) ## [0.5.0] - 2026-05-12 diff --git a/README.md b/README.md index 6e5c7fe..e2409b6 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ gemini mcp add -s user markfetch npx -y markfetch | Generic Playwright / Puppeteer | ✓ | – | – | – | | `mcp-server-fetch` (Python) | – | basic | – | – | | CloudFlare `/markdown` | ✓ | ✓ | – | paid | -| **`markfetch`** | **✓** | **✓** | **✓ (7 codes)** | **✓** | +| **`markfetch`** | **✓** | **✓** | **✓ (8 codes)** | **✓** | - **Real-browser HTTP/2 + Chrome fingerprint.** ALPN-negotiated h2, `User-Agent`, `Sec-CH-UA-*`, `Sec-Fetch-*`, `Accept-*`. A Chrome UA with no client hints is a *stronger* automation signal than curl — `markfetch` sends the full coherent set, derived from the UA at startup so an override stays internally consistent. @@ -108,6 +108,19 @@ Flags: Errors go to stderr with the same `[code] message` shape the MCP tool returns (see the table below), and the process exits with a non-zero status. The same env vars (`MARKFETCH_TIMEOUT_MS`, `MARKFETCH_MAX_BYTES`, `MARKFETCH_USER_AGENT`) apply in both modes. +Errors carry one of eight deterministic codes: + +| Code | Meaning | +|---|---| +| `network_error` | DNS / TCP / TLS failure, or an unexpected internal error from the fetcher. | +| `http_error` | Upstream returned a non-2xx status. | +| `timeout` | Per-request budget `MARKFETCH_TIMEOUT_MS` exceeded. | +| `unsupported_content_type` | Response was not `text/html` or `application/xhtml+xml`. | +| `extraction_failed` | Readability returned no article content (typical for pure client-rendered SPAs). | +| `too_large` | Response body or extracted markdown exceeded `MARKFETCH_MAX_BYTES`. | +| `save_failed` | `savePath` was given but `writeFile` failed (parent directory missing, permission denied, etc.). | +| `save_forbidden` | `savePath` resolves outside the allowed write roots — see [Write sandbox](#write-sandbox). MCP-only; the CLI has no sandbox. | + ## What it is not - **Not a crawler.** No recursion, no `robots.txt` parsing, no rate-limit orchestration. One URL in, one document out. @@ -138,9 +151,51 @@ Pass overrides via the `env` block of your MCP client config: } ``` +### Write sandbox + +MCP `savePath` writes are confined to a set of allowed root directories. By default the allowed set is `os.tmpdir()` ∪ `process.cwd()` (each resolved via `fs.realpath` once at startup). A `savePath` outside that set returns `save_forbidden` and no file is created. + +Override the default set with `MARKFETCH_ALLOWED_WRITE_ROOTS` — a list of absolute paths separated by the platform's path delimiter (`:` on POSIX, `;` on Windows). When set, the override **replaces** the defaults entirely — it does not merge. To keep `os.tmpdir()` or `process.cwd()` accessible, list them yourself; the example below shows `/tmp` for that reason. A malformed value (non-absolute entry, or a directory that doesn't exist) fails fast on stderr at startup. + +```json +{ + "mcpServers": { + "markfetch": { + "command": "npx", + "args": ["-y", "markfetch"], + "env": { + "MARKFETCH_ALLOWED_WRITE_ROOTS": "/Users/me/markfetch-out:/tmp" + } + } + } +} +``` + +On Windows, use backslashes and `;` as the delimiter: + +```json +{ + "mcpServers": { + "markfetch": { + "command": "npx", + "args": ["-y", "markfetch"], + "env": { + "MARKFETCH_ALLOWED_WRITE_ROOTS": "C:\\Users\\me\\markfetch-out;C:\\Users\\me\\AppData\\Local\\Temp" + } + } + } +} +``` + +Notes: + +- **The sandbox is MCP-only by design.** The CLI is unrestricted — a human at the shell is the security boundary, and the markfetch CLI doesn't run any sandbox check at all. The asymmetry exists because the MCP tool is driven by a language model, which may be steered by content from a page it just fetched. +- **Symlinks pointing outside are blocked.** Each candidate `savePath` is resolved via `fs.realpath` to its real destination before the containment check, so a symlink planted inside the sandbox cannot be used to escape. +- **Containment is case-insensitive on Windows** (`C:\Users\Bob` and `c:\users\bob` are the same path). + ## Develop -Requires Node.js ≥ 24. +Requires Node.js ≥ 24. Tested on Linux, macOS, and Windows in CI. When iterating on CLI changes, `tsx src/index.ts ` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the built `dist/index.js` — no rebuild needed between edits. diff --git a/docs/SPEC.md b/docs/SPEC.md index 3429baf..322949c 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -13,7 +13,7 @@ URL → caller markdown body, or "Saved N bytes to /path" confirmation ``` -Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `network_error`, `http_error`, `timeout`, `unsupported_content_type`, `extraction_failed`, `too_large`, `save_failed`. CLI emits `[code] message` to stderr and exits 1; MCP emits `{ isError: true, content: [{ text: "[code] message" }] }`. +Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `network_error`, `http_error`, `timeout`, `unsupported_content_type`, `extraction_failed`, `too_large`, `save_failed`; plus `save_forbidden`, emitted by the MCP adapter only (before `fetchMarkdown` runs — see "Asymmetric write sandbox" under Core Decisions). CLI emits `[code] message` to stderr and exits 1; MCP emits `{ isError: true, content: [{ text: "[code] message" }] }`. ## Core Decisions @@ -21,7 +21,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: ` - **Lazy adapter imports.** The dispatcher uses `await import()` to load exactly one adapter. The only `console.log` in the project lives in `cli.ts`; under MCP, `cli.ts` never loads, so stdout-discipline is enforced by the module graph — not by linter or convention. -- **Core throws, adapters translate.** All 7 error codes surface from `core.ts` — five are thrown explicitly as `MarkfetchError`; `network_error`, `timeout`, and (sometimes) `http_error` are translated by `classifyError` from underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change. +- **Core throws, adapters translate.** Seven of the eight error codes surface from `core.ts` — five are thrown explicitly as `MarkfetchError`; `network_error`, `timeout`, and (sometimes) `http_error` are translated by `classifyError` from underlying-API errors (undici TypeErrors, AbortSignal timeouts). The eighth code, `save_forbidden`, is the exception — it's emitted by the MCP adapter before `fetchMarkdown` is invoked (see "Asymmetric write sandbox" below). New core codes need an `ErrorCode` union member + a throw site; adapters don't change. - **HTTP/2 + coherent Chrome fingerprint.** Wire protocol, headers, and UA must agree — a Chrome UA over HTTP/1.1 or without `Sec-CH-UA-*` is *more* suspicious than curl. `Sec-CH-UA-*` is derived from `MARKFETCH_USER_AGENT` at startup so override-coherence is mechanical. @@ -29,9 +29,11 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: ` - **Whole document or `too_large`.** No pagination. Partial content lets the agent reason over truncated bodies without knowing they're truncated. `savePath` / `-o` is the escape valve for genuinely large documents. -- **Asymmetric `savePath`.** MCP requires absolute paths (zod `startsWith("/")`); CLI accepts relative and resolves against `process.cwd()`. CLI has a stable cwd the user typed `cd` into; MCP servers run in whatever cwd the client picks. +- **Asymmetric `savePath`.** MCP requires absolute paths via zod `refine(path.isAbsolute)` — accepts platform-appropriate shapes on POSIX (`/foo`) and Windows (`C:\foo`, `C:/foo`, `\\server\share`, `\foo`). CLI accepts relative and resolves against `process.cwd()`. CLI has a stable cwd the user typed `cd` into; MCP servers run in whatever cwd the client picks. -- **Stderr is fatal-only.** Per-request MCP errors round-trip through `{ isError }`; only startup misconfig / unrecoverable crashes touch stderr. CLI is its own session, so its per-request errors *are* fatal for that session. Regression guard: `tests/server.test.ts:436`. +- **Asymmetric write sandbox.** MCP `savePath` writes are confined to `realpath(os.tmpdir())` ∪ `realpath(process.cwd())` by default; the env var `MARKFETCH_ALLOWED_WRITE_ROOTS` (path-delimiter-separated) replaces the defaults. CLI writes anywhere the human's shell permits — no sandbox check. The asymmetry reflects the threat model: an LLM driving the MCP tool may be steered by content from the page it just fetched; a human typing into a shell is the security boundary. Symlinks are resolved via `fs.realpath` before the containment check, so a planted symlink inside the sandbox cannot escape. Containment compare is case-insensitive on `process.platform === "win32"`. Implementation lives in `src/sandbox.ts` (a leaf module — no imports from siblings — so it's unit-testable without spinning up the MCP server). Known limitation: TOCTOU between `realpath` and `writeFile` is not closed — acceptable for a single-user developer tool. + +- **Stderr is fatal-only.** Per-request MCP errors round-trip through `{ isError }`; only startup misconfig / unrecoverable crashes touch stderr. CLI is its own session, so its per-request errors *are* fatal for that session. Regression guard: `tests/server.test.ts:336`. ## Ideas for future @@ -41,4 +43,3 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: ` - **Cookie reuse across redirects within a single fetch.** Currently none. Trigger: a target serves content only after a session-cookie redirect. - **Proxy support** (`MARKFETCH_PROXY_URL`) and **`Accept-Language` control** (`MARKFETCH_ACCEPT_LANGUAGE`). Trigger: corporate proxy / locale-specific content. - **Single-binary distribution.** Bun's `build --compile`, Node SEA, or similar. Trigger: `npx` first-run latency feedback, or an offline / airgapped need. -- **Windows-friendly `savePath` schema.** Currently Unix-shaped (`startsWith("/")`). Trigger: someone needs this on Windows. diff --git a/package-lock.json b/package-lock.json index 83fb40d..3a1e30b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "markfetch", - "version": "0.5.0", + "version": "0.6.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "markfetch", - "version": "0.5.0", + "version": "0.6.0", "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.29.0", diff --git a/package.json b/package.json index 445f75b..9522e90 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "markfetch", - "version": "0.5.0", + "version": "0.6.0", "description": "Fetch a URL, return clean markdown. MCP server and CLI for AI agents.", "license": "MIT", "author": { diff --git a/src/cli.ts b/src/cli.ts index ac1f785..b8d0579 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -23,7 +23,7 @@ program "Fetch a URL and return clean markdown.\n" + "Run with no arguments to start the MCP stdio server.", ) - .version("0.5.0") + .version("0.6.0") .argument("", "absolute http(s) URL to fetch") .option( "-o, --output ", diff --git a/src/core.ts b/src/core.ts index f6122bc..f2a6d4b 100644 --- a/src/core.ts +++ b/src/core.ts @@ -154,7 +154,8 @@ export type ErrorCode = | "unsupported_content_type" | "extraction_failed" | "too_large" - | "save_failed"; + | "save_failed" + | "save_forbidden"; export class MarkfetchError extends Error { constructor( @@ -405,6 +406,9 @@ function convertToMarkdown(article: { // extraction_failed, too_large, save_failed // (The first three may also come from underlying APIs and be translated by // classifyError — adapters MUST run classifyError(err) in their catch blocks.) +// Note: the MCP adapter additionally emits save_forbidden (the 8th code in +// the contract) before fetchMarkdown is invoked — this function never throws +// it. See src/sandbox.ts and src/mcp.ts. export async function fetchMarkdown(input: { url: string; savePath?: string; diff --git a/src/mcp.ts b/src/mcp.ts index 94f0d7c..82e54e2 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -13,6 +13,12 @@ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { z } from "zod"; import { fetchMarkdown, classifyError, type ErrorCode } from "./core.js"; +import { isAbsolute } from "node:path"; +import { buildAllowedRoots, checkPath } from "./sandbox.js"; + +// Built once at startup. Bad config throws and surfaces on stderr (same +// fail-fast convention as intEnv() in core.ts). +const ALLOWED_ROOTS = await buildAllowedRoots(process.env); function errorResult(code: ErrorCode, message: string) { return { @@ -21,13 +27,13 @@ function errorResult(code: ErrorCode, message: string) { }; } -const server = new McpServer({ name: "markfetch", version: "0.5.0" }); +const server = new McpServer({ name: "markfetch", version: "0.6.0" }); server.registerTool( "fetch_markdown", { description: - "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. Non-HTML responses return `unsupported_content_type`. Pure client-rendered SPAs with no extractable static HTML return `extraction_failed`; SPAs that ship server-rendered or SEO-prerendered HTML will extract whatever static content they expose. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.", + "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. Non-HTML responses return `unsupported_content_type`. Pure client-rendered SPAs with no extractable static HTML return `extraction_failed`; SPAs that ship server-rendered or SEO-prerendered HTML will extract whatever static content they expose. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later. Saved files must land inside the allowed write roots (defaults: system temp dir and the server's working directory; configurable via `MARKFETCH_ALLOWED_WRITE_ROOTS`); paths outside return `save_forbidden`.", inputSchema: { url: z .string() @@ -37,14 +43,22 @@ server.registerTool( ), savePath: z .string() - .startsWith("/") + .refine(isAbsolute, "savePath must be an absolute filesystem path") .optional() .describe( - "Optional. When provided, the fetched markdown is written to this absolute filesystem path and the response becomes a small confirmation. Use this when the markdown might exceed your client's tool-result inline cap. Must be an absolute path starting with '/'; relative paths and tilde-paths ('~/...') are rejected by the schema. Existing files are overwritten; the parent directory must exist (caller's responsibility). The file is written only on fetch success — fetch / extraction / size-cap errors return a [code] string and never touch the file.", + "Optional. When provided, the fetched markdown is written to this absolute filesystem path and the response becomes a small confirmation. Use this when the markdown might exceed your client's tool-result inline cap. Must be an absolute path on the host platform (e.g., `/foo/bar.md` on POSIX; `C:\\foo\\bar.md` or `\\\\server\\share\\bar.md` on Windows); relative paths and tilde paths (`~/...`) are rejected by the schema. Writes are confined to an allow-listed sandbox — defaults are the system temp dir (`os.tmpdir()`) and the server's working directory; operators can override with `MARKFETCH_ALLOWED_WRITE_ROOTS` (path-delimiter-separated). A `savePath` outside the allowed roots returns `save_forbidden` and no file is created. Existing files are overwritten; the parent directory must exist (caller's responsibility). The file is written only on fetch success — fetch / extraction / size-cap errors return a `[code]` string and never touch the file.", ), }, }, async ({ url, savePath }) => { + // Sandbox gate (MCP-only; CLI is intentionally unbounded). Runs before + // fetchMarkdown so a forbidden path short-circuits the fetch. + if (savePath !== undefined) { + const check = await checkPath(savePath, ALLOWED_ROOTS); + if (!check.ok) { + return errorResult("save_forbidden", check.reason); + } + } try { const { markdown, bytes, savedTo } = await fetchMarkdown({ url, diff --git a/src/sandbox.ts b/src/sandbox.ts new file mode 100644 index 0000000..e3de74a --- /dev/null +++ b/src/sandbox.ts @@ -0,0 +1,126 @@ +// Write-path containment for the MCP adapter. MCP's caller is a language +// model — possibly steered by the page it just fetched — so this module +// bounds the filesystem paths it can write to. CLI is intentionally +// unbounded (human at the shell is the security boundary); only MCP uses +// this module. +// +// Invariants: +// - Leaf module: no imports from siblings, unit-testable in isolation. +// - No console.* — buildAllowedRoots throws (escapes module init in +// mcp.ts, surfaces on stderr); checkPath returns a discriminated union. +// - No hardcoded platform paths; every platform-dependent value comes +// from a Node API. + +import { realpath, stat } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { + delimiter, + dirname, + isAbsolute, + join, + parse, + relative, + resolve, +} from "node:path"; + +const ENV_VAR = "MARKFETCH_ALLOWED_WRITE_ROOTS"; + +export type CheckResult = + | { ok: true; resolved: string } + | { ok: false; reason: string }; + +// Defaults: realpath(os.tmpdir()) ∪ realpath(process.cwd()). +// MARKFETCH_ALLOWED_WRITE_ROOTS REPLACES the defaults (no merge) — deliberate; +// setting it is asserting a policy, so additive defaults would weaken it. +// Callers who want tmpdir/cwd back must list them explicitly. Every entry +// must be absolute, resolvable via realpath, and a directory. Bad config +// throws at module init — same fail-fast contract as intEnv(). +export async function buildAllowedRoots( + env: NodeJS.ProcessEnv, +): Promise { + const raw = env[ENV_VAR]; + if (raw != null && raw !== "") { + const resolved: string[] = []; + for (const entry of raw.split(delimiter)) { + if (!isAbsolute(entry)) { + throw new Error( + `Invalid ${ENV_VAR} entry ${JSON.stringify(entry)} — every entry must be an absolute path.`, + ); + } + let resolvedEntry: string; + try { + resolvedEntry = await realpath(entry); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + throw new Error( + `Invalid ${ENV_VAR} entry ${JSON.stringify(entry)} — could not resolve: ${message}`, + ); + } + const stats = await stat(resolvedEntry); + if (!stats.isDirectory()) { + throw new Error( + `Invalid ${ENV_VAR} entry ${JSON.stringify(entry)} — resolved to ${JSON.stringify(resolvedEntry)} which is not a directory.`, + ); + } + resolved.push(resolvedEntry); + } + return resolved; + } + return [await realpath(tmpdir()), await realpath(process.cwd())]; +} + +// Resolve savePath through fs.realpath (defeating symlink escape) and check +// containment against allowed roots. Walks up to the deepest extant ancestor +// because the leaf usually doesn't exist yet — that's the point of "save". +export async function checkPath( + savePath: string, + roots: string[], +): Promise { + const normalized = resolve(savePath); + + let ancestor = normalized; + const trailing: string[] = []; + while (true) { + try { + await stat(ancestor); + break; + } catch { + const parent = dirname(ancestor); + if (parent === ancestor) { + // Reached filesystem root with no extant ancestor — fail closed. + return { + ok: false, + reason: `cannot resolve any extant ancestor for '${savePath}'`, + }; + } + trailing.unshift(parse(ancestor).base); + ancestor = parent; + } + } + + const resolvedAncestor = await realpath(ancestor); + const reattached = + trailing.length === 0 + ? resolvedAncestor + : join(resolvedAncestor, ...trailing); + + // Win32 case-fold: filesystem is case-insensitive and fs.realpath doesn't + // reliably canonicalize case, so compare both sides lowercased. + const fold = + process.platform === "win32" + ? (s: string) => s.toLowerCase() + : (s: string) => s; + const foldedTarget = fold(reattached); + + for (const root of roots) { + const rel = relative(fold(root), foldedTarget); + if (rel === "") return { ok: true, resolved: reattached }; + if (!rel.startsWith("..") && !isAbsolute(rel)) { + return { ok: true, resolved: reattached }; + } + } + return { + ok: false, + reason: `'${reattached}' is outside the allowed write roots: [${roots.map((r) => `'${r}'`).join(", ")}]`, + }; +} diff --git a/tests/_helpers.ts b/tests/_helpers.ts index 626e1a3..c12996c 100644 --- a/tests/_helpers.ts +++ b/tests/_helpers.ts @@ -10,9 +10,21 @@ import { type IncomingMessage, type ServerResponse, } from "node:http"; +import { resolve } from "node:path"; +import { pathToFileURL } from "node:url"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; +// Absolute file:// URL for tsx's loader. Spawn the CLI via +// `node --import src/index.ts` to bypass the `./node_modules/.bin/tsx` +// shim (a .cmd on Windows that child_process.spawn can't launch without +// shell:true). Absolute so tests that override the child cwd still work. +// spawnClient stays on `command: "tsx"` — StdioClientTransport handles +// cross-platform spawn itself. +export const TSX_LOADER_URL = pathToFileURL( + resolve("./node_modules/tsx/dist/loader.mjs"), +).href; + export async function startMock( handler: (req: IncomingMessage, res: ServerResponse) => void, ): Promise<{ url: string; close: () => Promise }> { @@ -58,11 +70,11 @@ export function textOf(result: { content: unknown }): string { return content[0]?.text ?? ""; } -// Matches any of the seven [code] error prefixes the tool emits. Used by +// Matches any of the eight [code] error prefixes the tool emits. Used by // schema-rejection assertions to prove the handler did NOT run — a [code] // prefix would mean the call escaped Zod and reached core. export const ERROR_CODE_PREFIX_RE = - /^\[(network_error|http_error|timeout|unsupported_content_type|extraction_failed|too_large|save_failed)\]/; + /^\[(network_error|http_error|timeout|unsupported_content_type|extraction_failed|too_large|save_failed|save_forbidden)\]/; // Asserts that a tool call is rejected at the Zod schema boundary, not by the // handler. The SDK either throws (some versions) or returns isError:true with @@ -103,10 +115,14 @@ export async function spawnAndCaptureExit( args: string[], env: Record, ): Promise<{ exitCode: number; stderr: string }> { - const child = spawn("./node_modules/.bin/tsx", args, { - env: { ...process.env, ...env } as Record, - stdio: ["pipe", "pipe", "pipe"], - }); + const child = spawn( + process.execPath, + ["--import", TSX_LOADER_URL, ...args], + { + env: { ...process.env, ...env } as Record, + stdio: ["pipe", "pipe", "pipe"], + }, + ); let stderr = ""; child.stderr.on("data", (d: Buffer) => { stderr += d.toString(); diff --git a/tests/cli.test.ts b/tests/cli.test.ts index 603bdae..ebdcdd5 100644 --- a/tests/cli.test.ts +++ b/tests/cli.test.ts @@ -9,15 +9,11 @@ import { promisify } from "node:util"; import { mkdtemp, readFile, rm, stat } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join, resolve as resolvePath } from "node:path"; -import { startMock, HAPPY_FIXTURE } from "./_helpers.js"; +import { startMock, HAPPY_FIXTURE, TSX_LOADER_URL } from "./_helpers.js"; const execFileAsync = promisify(execFile); -// Resolved at module load against the test runner's cwd (the project root). -// Tests that override `cwd` to a tmpdir still need to find the tsx CLI -// and the source entry — passing relative paths would resolve against the -// new cwd and produce a confusing ENOENT instead of the behavior under test. -const TSX_CLI = resolvePath("./node_modules/.bin/tsx"); +// Absolute so tests that override the child cwd still locate the entry. const ENTRY = resolvePath("src/index.ts"); type RunResult = { code: number; stdout: string; stderr: string }; @@ -38,8 +34,8 @@ async function runCli( ): Promise { try { const { stdout, stderr } = await execFileAsync( - TSX_CLI, - [ENTRY, ...args], + process.execPath, + ["--import", TSX_LOADER_URL, ENTRY, ...args], { env: { ...process.env, ...env } as Record, cwd, @@ -176,8 +172,9 @@ test("CLI: timeout when MARKFETCH_TIMEOUT_MS is small and server hangs", async ( assert.notEqual(code, 0); assert.equal(stdout, ""); assert.match(stderr, /^\[timeout\]/); - // 1500ms allows for tsx cold-start; the timeout itself fires at 200ms. - assert.ok(elapsed < 1500, `timeout should fire fast; took ${elapsed}ms`); + // 3000ms is generous headroom for node + tsx ESM-loader cold-start + // (especially slow Windows runners); the timeout itself fires at 200ms. + assert.ok(elapsed < 3000, `timeout should fire fast; took ${elapsed}ms`); } finally { await mock.close(); } @@ -196,7 +193,7 @@ test("CLI: --version prints version to stdout, exit 0", async () => { const { code, stdout, stderr } = await runCli(["--version"]); assert.equal(code, 0); assert.equal(stderr, ""); - assert.equal(stdout, "0.5.0\n"); + assert.equal(stdout, "0.6.0\n"); }); // This test documents a deliberate asymmetry vs MCP: a malformed URL is NOT diff --git a/tests/e2e.test.ts b/tests/e2e.test.ts index 22553eb..09367bb 100644 --- a/tests/e2e.test.ts +++ b/tests/e2e.test.ts @@ -58,7 +58,7 @@ test("e2e: built output boots, exposes fetch_markdown, pins version", async () = try { const info = client.getServerVersion(); assert.equal(info?.name, "markfetch"); - assert.equal(info?.version, "0.5.0"); + assert.equal(info?.version, "0.6.0"); const { tools } = await client.listTools(); assert.equal(tools.length, 1); assert.equal(tools[0].name, "fetch_markdown"); @@ -166,5 +166,39 @@ test("e2e: built output --version prints package version, exit 0", async () => { { timeout: 10_000 }, ); assert.equal(stderr, ""); - assert.equal(stdout, "0.5.0\n"); + assert.equal(stdout, "0.6.0\n"); +}); + +// Live E2E. Exercises the full pipeline against real production URLs. +// Property-based assertions only — pages rewrite copy over time. + +test("e2e live: en.wikipedia.org/wiki/Markup_language extracts title, sections, and strips chrome", async () => { + const { stdout, stderr } = await execFileAsync( + "node", + [BUILT_JS, "https://en.wikipedia.org/wiki/Markup_language"], + { timeout: 30_000, maxBuffer: 10_000_000 }, + ); + assert.equal(stderr, "", "stderr must stay empty on a successful fetch"); + assert.match(stdout, /markup\s+language/i, "extracted markdown must mention the article title"); + const h2Count = (stdout.match(/^## /gm) ?? []).length; + assert.ok(h2Count >= 3, `expected ≥3 H2 sections; got ${h2Count}`); + assert.ok(!stdout.includes(" 5000, `markdown should be substantial; got ${stdout.length} chars`); +}); + +test("e2e live: code.claude.com/docs/en/commands extracts content as clean markdown", async () => { + const { stdout, stderr } = await execFileAsync( + "node", + [BUILT_JS, "https://code.claude.com/docs/en/commands"], + { timeout: 30_000, maxBuffer: 10_000_000 }, + ); + assert.equal(stderr, "", "stderr must stay empty on a successful fetch"); + assert.match(stdout, /command/i, "docs page must mention 'command' somewhere"); + assert.ok(stdout.length > 500, `markdown should be non-trivial; got ${stdout.length} chars`); + assert.ok(!stdout.includes("(fn: (dir: string) => Promise): Promise { + const dir = await realpath(await mkdtemp(join(tmpdir(), "sandbox-test-"))); + try { + return await fn(dir); + } finally { + await rm(dir, { recursive: true, force: true }); + } +} + +test("buildAllowedRoots: multi-entry env split by path.delimiter", async () => { + await withSandboxTmpDir(async (a) => { + await withSandboxTmpDir(async (b) => { + const roots = await buildAllowedRoots({ + MARKFETCH_ALLOWED_WRITE_ROOTS: `${a}${delimiter}${b}`, + }); + assert.deepEqual(roots, [a, b]); + }); + }); +}); + +test("buildAllowedRoots: non-existent entry throws fail-fast (realpath fails)", async () => { + await withSandboxTmpDir(async (dir) => { + const nope = join(dir, "does-not-exist"); + await assert.rejects( + () => buildAllowedRoots({ MARKFETCH_ALLOWED_WRITE_ROOTS: nope }), + /could not resolve/, + ); + }); +}); + +test("buildAllowedRoots: empty entry from leading/trailing/consecutive delimiter throws fail-fast", async () => { + await assert.rejects( + () => + buildAllowedRoots({ + MARKFETCH_ALLOWED_WRITE_ROOTS: `${delimiter}/some/path`, + }), + /every entry must be an absolute path/, + ); +}); + +test("buildAllowedRoots: regular-file entry throws fail-fast (must be a directory)", async () => { + await withSandboxTmpDir(async (dir) => { + const filePath = join(dir, "regular-file.txt"); + await writeFile(filePath, ""); + await assert.rejects( + () => buildAllowedRoots({ MARKFETCH_ALLOWED_WRITE_ROOTS: filePath }), + /not a directory/, + ); + }); +}); + +test("checkPath: ../ traversal that escapes a root is not ok", async () => { + await withSandboxTmpDir(async (dir) => { + // dir/sub/../../escape resolves to dir/../escape — a sibling of dir. + // path.resolve normalizes the ..; path.relative then rejects. + const escape = join(dir, "sub", "..", "..", "escape.md"); + const result = await checkPath(escape, [dir]); + assert.equal(result.ok, false); + }); +}); + +test("checkPath: prefix-overlap trap (root /tmp vs target /tmp-evil)", async () => { + await withSandboxTmpDir(async (root) => { + // Sibling sharing root's prefix. A naive target.startsWith(root) would + // pass this incorrectly; path.relative returns "..//trap.md". + const target = join(`${root}-evil`, "trap.md"); + const result = await checkPath(target, [root]); + assert.equal(result.ok, false); + }); +}); + +test( + "checkPath: case-variant paths compare equal on win32", + { skip: process.platform !== "win32" }, + async () => { + await withSandboxTmpDir(async (dir) => { + const variant = dir.charAt(0) + dir.slice(1).toLowerCase(); + const target = join(dir.toUpperCase(), "out.md"); + const result = await checkPath(target, [variant]); + assert.equal(result.ok, true); + }); + }, +); diff --git a/tests/server.test.ts b/tests/server.test.ts index 35d0c27..4b7c9e4 100644 --- a/tests/server.test.ts +++ b/tests/server.test.ts @@ -9,9 +9,11 @@ import { access, writeFile, rm, + mkdir, + symlink, } from "node:fs/promises"; import { tmpdir } from "node:os"; -import { join } from "node:path"; +import { join, parse } from "node:path"; import { startMock, textOf, @@ -26,7 +28,7 @@ test("server boots over stdio and completes the initialize handshake", async () try { const info = client.getServerVersion(); assert.equal(info?.name, "markfetch"); - assert.equal(info?.version, "0.5.0"); + assert.equal(info?.version, "0.6.0"); } finally { await client.close(); } @@ -514,25 +516,31 @@ test("savePath: writeFile rejection surfaces as [save_failed] with errno; file i res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); }); - const savePath = "/nonexistent-parent-zzz-savepath-test/out.md"; - const client = await spawnClient(); - try { - const result = await client.callTool({ - name: "fetch_markdown", - arguments: { url: mock.url, savePath }, - }); - assert.equal(result.isError, true); - assert.match(textOf(result), /^\[save_failed\]/); - assert.match(textOf(result), /ENOENT/); - await assert.rejects( - access(savePath), - /ENOENT/, - "file must not have been created on save failure", - ); - } finally { - await client.close(); - await mock.close(); - } + // Path lives inside the default sandbox (os.tmpdir() via mkdtemp), so the + // sandbox check passes and writeFile actually runs. The "nonexistent-subdir" + // intermediate doesn't exist, so writeFile fails with ENOENT — exercising + // the save_failed branch of core.ts. + await withTmpDir(async (dir) => { + const savePath = join(dir, "nonexistent-subdir", "out.md"); + const client = await spawnClient(); + try { + const result = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath }, + }); + assert.equal(result.isError, true); + assert.match(textOf(result), /^\[save_failed\]/); + assert.match(textOf(result), /ENOENT/); + await assert.rejects( + access(savePath), + /ENOENT/, + "file must not have been created on save failure", + ); + } finally { + await client.close(); + await mock.close(); + } + }); }); // T6 — THE Invariant. The file at savePath is only ever the markdown of the URL (per README and SPEC.md). @@ -628,3 +636,188 @@ test("savePath: existing file is overwritten", async () => { } }); }); + +// T9 — savePath outside the default sandbox → [save_forbidden]; no file. +test("savePath sandbox: path outside default roots → [save_forbidden] and file not written", async () => { + const mock = await startMock((_req, res) => { + res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); + res.end(HAPPY_FIXTURE); + }); + // Construct a forbidden path without hardcoding "/etc" or "C:\\Windows". + // Filesystem root + sentinel filename is outside any mkdtemp dir under + // tmpdir, and outside the test runner's cwd (the repo root). On POSIX + // this is "/markfetch-sandbox-forbidden.md"; on Windows it is + // "C:\\markfetch-sandbox-forbidden.md". + const forbiddenPath = join( + parse(tmpdir()).root, + "markfetch-sandbox-forbidden.md", + ); + const client = await spawnClient(); + try { + const result = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath: forbiddenPath }, + }); + assert.equal(result.isError, true); + const text = textOf(result); + assert.match(text, /^\[save_forbidden\]/); + assert.ok( + text.includes("outside the allowed write roots"), + `error must explain the rule for caller recovery; got: ${text}`, + ); + await assert.rejects( + access(forbiddenPath), + /ENOENT/, + "save_forbidden must not create the file", + ); + } finally { + await client.close(); + await mock.close(); + } +}); + +// T10 — symlink inside sandbox pointing outside is blocked by realpath. +// POSIX-gated: Windows symlink creation typically requires elevation, and +// the property under test is platform-independent in src/sandbox.ts, so +// POSIX coverage is sufficient. +test( + "savePath sandbox: symlink escape from inside sandbox is blocked", + { skip: process.platform === "win32" }, + async () => { + const mock = await startMock((_req, res) => { + res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); + res.end(HAPPY_FIXTURE); + }); + await withTmpDir(async (sandboxDir) => { + // Plant a symlink inside the sandbox dir pointing to filesystem root + // (definitely outside tmpdir). Then write through the symlink and + // expect rejection — realpath in checkPath resolves the symlink and + // the containment check sees the outside-the-sandbox target. + const innerDir = join(sandboxDir, "inner"); + await mkdir(innerDir); + const escapeLink = join(innerDir, "escape"); + const outsideRoot = parse(tmpdir()).root; + await symlink(outsideRoot, escapeLink); + const target = join(escapeLink, "markfetch-symlink-escape.md"); + const realTarget = join(outsideRoot, "markfetch-symlink-escape.md"); + const client = await spawnClient(); + try { + const result = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath: target }, + }); + assert.equal(result.isError, true); + assert.match(textOf(result), /^\[save_forbidden\]/); + await assert.rejects( + access(realTarget), + /ENOENT/, + "symlink-escape attempt must not write to the symlink target", + ); + } finally { + await client.close(); + await mock.close(); + } + }); + }, +); + +// T11 — MARKFETCH_ALLOWED_WRITE_ROOTS REPLACES the defaults (not merges). +test("savePath sandbox: MARKFETCH_ALLOWED_WRITE_ROOTS replaces the defaults", async () => { + const mock = await startMock((_req, res) => { + res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); + res.end(HAPPY_FIXTURE); + }); + await withTmpDir(async (customRoot) => { + // Spawn the MCP server with the override pointing at customRoot only. + // If the override merged with defaults instead of replacing them, a + // sibling tmpdir path would still be allowed — the second assertion + // proves it doesn't. + const client = await spawnClient({ + env: { MARKFETCH_ALLOWED_WRITE_ROOTS: customRoot }, + }); + try { + // (a) Inside the override root → success. + const insideCustom = join(customRoot, "inside.md"); + const ok = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath: insideCustom }, + }); + assert.equal( + ok.isError, + false, + `write inside override root must succeed; got: ${textOf(ok)}`, + ); + + // (b) Inside os.tmpdir() but outside the override root → forbidden. + // A sibling mkdtemp dir is inside tmpdir but outside customRoot. + await withTmpDir(async (siblingDir) => { + const outsideCustom = join(siblingDir, "outside-override.md"); + const denied = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath: outsideCustom }, + }); + assert.equal( + denied.isError, + true, + "path inside tmpdir-but-outside-override must be rejected (proves replace, not merge)", + ); + assert.match(textOf(denied), /^\[save_forbidden\]/); + }); + } finally { + await client.close(); + await mock.close(); + } + }); +}); + +// T12 — bad MARKFETCH_ALLOWED_WRITE_ROOTS fails fast at startup. +// Mirrors the existing intEnv fail-fast tests for MARKFETCH_TIMEOUT_MS / +// MARKFETCH_MAX_BYTES / MARKFETCH_USER_AGENT. +test("env-var validation: non-absolute MARKFETCH_ALLOWED_WRITE_ROOTS fails fast at startup", async () => { + const { exitCode, stderr } = await spawnAndCaptureExit(["src/index.ts"], { + MARKFETCH_ALLOWED_WRITE_ROOTS: "relative/path", + }); + assert.notEqual( + exitCode, + 0, + "subprocess must exit non-zero when sandbox env var is malformed", + ); + assert.match(stderr, /MARKFETCH_ALLOWED_WRITE_ROOTS/); + assert.match(stderr, /absolute path/); +}); + +// T13 — Windows-shaped absolute path accepted at the schema and writes. +// Win32-gated. Regression guard against reverting the schema from +// path.isAbsolute back to startsWith("/"). The Windows shape must flow +// schema → sandbox check (tmpdir is in defaults) → writeFile. +test( + "savePath: Windows-shaped absolute path is accepted at the schema and writes (win32-gated)", + { skip: process.platform !== "win32" }, + async () => { + const mock = await startMock((_req, res) => { + res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); + res.end(HAPPY_FIXTURE); + }); + await withTmpDir(async (dir) => { + // On Windows, join() with a Windows-shaped tmpdir produces a + // backslash path like C:\Users\...\Temp\xxxxx\win-shape.md. + const savePath = join(dir, "win-shape.md"); + const client = await spawnClient(); + try { + const result = await client.callTool({ + name: "fetch_markdown", + arguments: { url: mock.url, savePath }, + }); + assert.equal( + result.isError, + false, + `Windows-shaped path must be accepted; got: ${textOf(result)}`, + ); + await access(savePath); + } finally { + await client.close(); + await mock.close(); + } + }); + }, +);