diff --git a/.dscode/benchmarks.example.txt b/.dscode/benchmarks.example.txt index 9cdfe79..ec7a537 100644 --- a/.dscode/benchmarks.example.txt +++ b/.dscode/benchmarks.example.txt @@ -252,6 +252,15 @@ isolate_workdir = true notes = "GitHub Action bridge patch-mode benchmark over an isolated Rust fixture" budget = 6 +name = "fixture-github-action-patch-trigger-exact-replacement-rust-mini" +task = "GitHub Action issue_comment request text: `@deepseek patch change src/lib.rs so a - b becomes a + b`. The PR head is already checked out locally; replace `a - b` with `a + b` in src/lib.rs and validate with cargo test." +category = "pr_workflow" +assertion_bundle = "write_validate_ok" +workdir = "fixtures/rust-write-mini" +isolate_workdir = true +notes = "GitHub Action bridge patch-mode benchmark for exact `change file so before becomes after` hosted workflow requests" +budget = 6 + name = "seeded-pr-ci-lint" task = "CI job `lint-rust` (run #556) on PR #42 failed at step `cargo clippy`. Reproduce locally, fix the root cause, and rerun the failing check. Failed log tail follows." category = "pr_workflow" diff --git a/.dscode/benchmarks.txt b/.dscode/benchmarks.txt index df74de7..39b35bd 100644 --- a/.dscode/benchmarks.txt +++ b/.dscode/benchmarks.txt @@ -287,6 +287,15 @@ isolate_workdir = true notes = "GitHub Action bridge patch-mode benchmark over an isolated Rust fixture" budget = 6 +name = "fixture-github-action-patch-trigger-exact-replacement-rust-mini" +task = "GitHub Action issue_comment request text: `@deepseek patch change src/lib.rs so a - b becomes a + b`. The PR head is already checked out locally; replace `a - b` with `a + b` in src/lib.rs and validate with cargo test." +category = "pr_workflow" +assertion_bundle = "write_validate_ok" +workdir = "fixtures/rust-write-mini" +isolate_workdir = true +notes = "GitHub Action bridge patch-mode benchmark for exact `change file so before becomes after` hosted workflow requests" +budget = 6 + name = "fixture-pr-fix-rust-cli-mini" task = "CI job `test-rust` (run #555) on PR #42 failed at step `cargo test`. Reproduce locally, fix the root cause, and rerun the failing test. Failed log tail follows." category = "pr_workflow" diff --git a/docs/current-status.md b/docs/current-status.md index e5119b6..1667a54 100644 --- a/docs/current-status.md +++ b/docs/current-status.md @@ -37,13 +37,14 @@ DeepSeekCode 的目标是成为一个 DeepSeek-first 的 code agent CLI:用户 - 本轮新增:shell-supervisor terminal event log 已为 PTY output/input 记录可选 `raw_base64`,supervisor `attach` JSON 响应会透出结构化 `terminal_raw_outputs`,`exec_shell_attach` 摘要仍带兼容的 `terminal_raw_base64` section。human `--follow` 现在走同一个 Unix socket 上的 `attach_stream` newline-JSON frame stream,`--follow` / `--interactive` 会优先解码 output raw bytes;`deepseek agents shell attach --raw` 可为脚本直接输出 PTY bytes。新增 `deepseek agents shell byte-stream ` / supervisor `byte_stream`,可在一个 socket stream 里处理初始 stdin/resize、后续 newline-JSON stdin/resize/close/detach control frames,并以 `byte_outputs[].bytes_base64` 输出 raw PTY bytes;`--raw-proxy` / `raw_proxy=true` 会在初始 JSON 后切到原始 socket bytes:socket input 直接进 PTY stdin,PTY output bytes 直接写回 socket。`deepseek agents shell proxy ` 现在是面向人的 raw-proxy wrapper,会进入本地 raw mode、同步终端尺寸、转发 key/paste/resize、直接写回 PTY bytes,并用 `Ctrl-]` detach。Linux 上还新增 supervisor `pty_fd` / `deepseek agents shell fd-proxy `:通过 SCM_RIGHTS 把 native-supervisor PTY master fd 临时交给本地 Unix client,handoff 期间暂停 supervisor replay reader,detach 后恢复事件记录;测试已覆盖 fd 交还后普通 supervisor `resize`、`stdin`、terminal `replay` 继续可用,Ctrl-D EOF 触发目标 PTY 关闭时 fd-proxy 把 Linux `EIO` 当作正常 EOF 成功退出,运行中本地终端 SIGWINCH resize 会同步到目标 PTY,Ctrl-C 会中断目标 PTY 的 foreground process group,以及 fd-proxy 被 SIGKILL 后 handoff lease 释放并恢复普通 supervisor 控制。HTTP SSE、ACP `session/shell/subscribe` 和 MCP `exec_shell_terminal_events` progress metadata 也会透出 raw bytes。 - 本轮新增:`deepseek agents shell-fixture-smoke --json` 本地 Shell/PTY gate。它创建短路径临时 workspace,启动当前二进制的 `agents shell-supervisor --json`,验证 `health`、start/wait/attach/replay、Linux native PTY stdin/resize/replay/cancel,并把 `byte_stream` duplex control frames、`raw_proxy=true` 原始 socket bytes、Linux `pty_fd` fd handoff 和 human `agents shell proxy` raw-mode wrapper 纳入同一可复跑 smoke。当前实测 `blockers=0`、`warnings=0`,且 `shell_control` 摘要包含 `byte_stream duplex/raw_proxy/fd_handoff/human_proxy smoke passed`;`service-smoke` 的 shell-supervisor control smoke 也同步覆盖这些 byte-stream/proxy/fd-handoff 切片。聚焦测试还验证了 direct `pty_fd` 和 CLI `fd-proxy` detach 后,普通 supervisor 控制面可继续 resize/stdin/replay 同一个 PTY job。 - 本轮新增:VS Code `DeepseekCode Agent` panel 的 native workbench 继续推进。侧栏 webview 现在直接启动 `deepseek exec --json`,在 panel 内流式显示 assistant delta、reasoning log、tool call/result、permission request、stderr 和完成状态,并把 active file、selection、diagnostics、dirty-buffer marker、Git status/diff summary 注入任务上下文;post-run 控件已支持 active-file `Review Diff`、panel 内 `Accept` 标记、确认后的 `Revert File`、`Refresh Diff`、workspace validation command 输出捕获、`Resume Latest` 继续最近 runtime session、workspace changed-file queue 的 open diff / mark reviewed / confirmed revert,以及 generated patch artifact queue。模型 final/tool result 里的 unified diff 会进入独立队列,单文件 patch 可打开 VS Code diff,pending patch 可确认后 `git apply --check` + `git apply`,也可 reject;`apply_patch` tool call 的 patch 会以 captured 状态记录,避免和已写入的 Git changes 混在一起。扩展目录也新增了 mocked DeepSeekCode binary 的 extension-host smoke harness,以及可在当前机器运行的 headless panel fixture;后者用临时 Git repo 证明 diagnostic context injection -> generated patch capture -> single-file diff opening -> checked `git apply` -> workspace queue refresh -> validation pass 的闭环,并修掉了 generated patch `-p0` fallback 和 `git status --short` 前导空格解析问题。当前机器没有 `code`/`codium` CLI,所以真实 VS Code runner 执行证据仍缺;这仍不是完整 Phase 12B,manual GUI fixture 证据还需要继续补。 -- 本轮新增:GitHub Action bridge 第一片。`deepseek github action` 会读取 `GITHUB_EVENT_PATH` / `GITHUB_EVENT_NAME`,把 `pull_request`、`issue_comment`、`pull_request_review`、`pull_request_review_comment` 事件解析成 `owner/repo#PR`,comment/review 事件默认要求 `@deepseek` trigger。`--mode auto|review|fix|patch` 会复用现有 `deepseek pr review/fix/patch` 路径,`@deepseek fix` / `@deepseek patch` 可自动路由到 CI-log repair / patch workflow;`--background-task` 可改为创建 `deepseek task start` 背景 worktree 记录,`--task-id` / `--task-no-run` 支持稳定 workflow id 和无凭据 gate;`--dry-run` 只输出解析目标,`--github-output` 会把 target fields 写入 `$GITHUB_OUTPUT`,`--require-mode` 可让 write workflow 对非 fix/patch trigger fail fast,`--post` 通过现有 `gh pr comment` 发布 review summary。`deepseek github pr-head` 会把 PR head owner/ref 解析为 CLI-tested step output,并在 write-capable checkout 前拒绝 fork-owned PR branch。`deepseek github fixture-smoke` 现在可本地 no-network 验证 review/write trigger、同仓库 PR head guard、fork guard、临时 Git remote 上的 checkout/commit/push、pushed-head 校验和 background-task worktree/record 创建;CI 和 Release Matrix 也会用 debug/release binary 跑这个 smoke。仓库新增 disabled-by-default 的 `.github/workflows/deepseek-code-review.yml` 示例,需设置 `DEEPSEEK_CODE_REVIEW_ENABLED=true` 和 `DEEPSEEK_API_KEY` 后才运行;示例 workflow 固定 `--mode review` 作为安全默认。仓库也新增 disabled-by-default 的 `.github/workflows/deepseek-code-write.yml` 写入示例,需设置 `DEEPSEEK_CODE_WRITE_ENABLED=true`,它只响应 `@deepseek fix/patch`,先用 CLI dry-run 解析 PR 并输出 step outputs,再通过 `deepseek github pr-head` 解析并校验同仓库 PR head,运行 `--mode auto`,最后 commit/push 工作区改动。默认 benchmark manifest 也补到 `25` 条 `pr_workflow` cases,新增 action-labeled review-comment-plan、`@deepseek fix`、`@deepseek patch` 三条覆盖。 +- 本轮新增:GitHub Action bridge 第一片。`deepseek github action` 会读取 `GITHUB_EVENT_PATH` / `GITHUB_EVENT_NAME`,把 `pull_request`、`issue_comment`、`pull_request_review`、`pull_request_review_comment` 事件解析成 `owner/repo#PR`,comment/review 事件默认要求 `@deepseek` trigger。`--mode auto|review|fix|patch` 会复用现有 `deepseek pr review/fix/patch` 路径,`@deepseek fix` / `@deepseek patch` 可自动路由到 CI-log repair / patch workflow;`--background-task` 可改为创建 `deepseek task start` 背景 worktree 记录,`--task-id` / `--task-no-run` 支持稳定 workflow id 和无凭据 gate;`--dry-run` 只输出解析目标,`--github-output` 会把 target fields 写入 `$GITHUB_OUTPUT`,`--require-mode` 可让 write workflow 对非 fix/patch trigger fail fast,`--post` 通过现有 `gh pr comment` 发布 review summary。`deepseek github pr-head` 会把 PR head owner/ref 解析为 CLI-tested step output,并在 write-capable checkout 前拒绝 fork-owned PR branch。`deepseek github fixture-smoke` 现在可本地 no-network 验证 review/write trigger、同仓库 PR head guard、fork guard、临时 Git remote 上的 checkout/commit/push、pushed-head 校验和 background-task worktree/record 创建;CI 和 Release Matrix 也会用 debug/release binary 跑这个 smoke。仓库新增 disabled-by-default 的 `.github/workflows/deepseek-code-review.yml` 示例,需设置 `DEEPSEEK_CODE_REVIEW_ENABLED=true` 和 `DEEPSEEK_API_KEY` 后才运行;示例 workflow 固定 `--mode review` 作为安全默认。仓库也新增 disabled-by-default 的 `.github/workflows/deepseek-code-write.yml` 写入示例,需设置 `DEEPSEEK_CODE_WRITE_ENABLED=true`,它只响应 `@deepseek fix/patch`,先用 CLI dry-run 解析 PR 并输出 step outputs,再通过 `deepseek github pr-head` 解析并校验同仓库 PR head,运行 `--mode auto`,最后 commit/push 工作区改动。默认 benchmark manifest 也补到 `26` 条 `pr_workflow` cases,新增 action-labeled review-comment-plan、`@deepseek fix`、`@deepseek patch` 和 hosted exact replacement request 覆盖。 +- 本轮新增:真实 hosted GitHub workflow 证据。PR #10 验证了 hosted write bridge 修复,`@deepseek patch change ... becomes ...` 由 GitHub Actions 成功生成并推送 `6fd5010 deepseek: apply requested PR update`;PR #11 在修复合入默认分支后重新做 post-merge smoke,同一路径再次生成并推送 `f0fe9a7 deepseek: apply requested PR update`;PR #12 删除了两份临时 evidence/smoke fixture,merge commit 为 `9423126`。三个临时分支已清理,证据保留在 PR 历史和对应 workflow run 记录里。 - 本轮新增:Phase 12D 的 MCP fixture smoke 第一片。`deepseek mcp fixture-smoke --json` 会创建临时 fixture workspace 和 MCP config,用当前二进制的 `serve --mcp` 验证 stdio discovery/call,再启动本地 loopback HTTP/SSE MCP fixtures 验证 HTTP/SSE discovery/call,并通过默认 agent tool registry 验证动态 `mcp__server__tool` 暴露和 input schema cache。当前实测 stdio 工具发现 `57` 个、HTTP/SSE 各 `1` 个,三类 call 均通过,动态 schema cache 覆盖 `stdio-self/read_file`、`http-fixture/echo`、`sse-fixture/echo`。fixture 现在还会配置一个故意失败的 `broken-stdio` server,并证明它不会隐藏或破坏健康 server 的动态工具发现;同时验证 generic `mcp_call` 和动态 `mcp__stdio-self__read_file` 的 permission request、allowlisted allow、allowlist deny 三条 policy 路径。最新 JSON 字段 `bad_server_isolated`、`mcp_call_permission_ok`、`dynamic_permission_ok`、`mcp_call_allow_ok`、`mcp_call_allowlist_deny_ok`、`dynamic_allow_ok`、`dynamic_allowlist_deny_ok` 均为 `true`。本轮还把 prompts/resources/templates 纳入同一个 fixture smoke:stdio/HTTP/SSE 的 prompt discovery/get、resource discovery/read、resource template discovery 均通过,最新 JSON 中 `stdio_prompt_ok`、`http_prompt_ok`、`sse_prompt_ok`、`stdio_resource_ok`、`http_resource_ok`、`sse_resource_ok` 均为 `true`,template counts 为 `3/1/1`。这补上了 completion gate 中 MCP stdio/HTTP/SSE tool discovery/call/schema 注入、prompt/resource/template、bad-server isolation、MCP tool approval/allowlist 的本地证据。 - 本轮新增:Phase 12D hooks fixture smoke。新增 `deepseek hooks fixture-smoke --json`,它创建临时 hook root 和 workspace,安装结构化 `{"decision":"allow","add_context":"..."}` recorder scripts,然后通过真实 `AgentLoop::run_with_client` 触发 `list_files` 工具调用。当前实测 JSON 为 `deepseek.hooks_fixture_smoke.v1`,`session_start_ok`、`user_prompt_submit_ok`、`pre_tool_ok`、`post_tool_ok`、`session_stop_ok`、`hook_contexts_ok`、`tool_ran_ok` 全为 `true`,事件顺序为 `session_start -> user_prompt_submit -> pre_tool_use -> post_tool_use -> session_stop`。这给 Phase 12D hooks prompt/session/tool lifecycle 和 structured allow/add_context 提供了单命令本地 gate。 - 本轮新增:Phase 12D skills/custom command validation。新增 `deepseek skills list [--json]` 与 `deepseek skills validate [--strict] [--json]`,按运行时同一优先级扫描 bundled skills 和 `workspace.user_skills_dir`,报告同名覆盖、loader 错误、空核心元数据、空 `allowed_tools` 和未知工具名。当前 `cargo run --quiet -- skills validate --strict --json --dir skills` 实测 bundled `skills/` 共 `16` 个 skill,`valid_files=16`、`error_count=0`、`warning_count=0`、`ok=true`。`docs/skills-and-profiles.md` 也补了 PR review、release-check、security-lite 三类 skill 示例。这给 Phase 12D skill metadata validation 和 command/skill discovery 提供了单命令本地 gate。 - 本轮新增:Phase 12D subagent fixture smoke。`dispatch_subagent` / `dispatch_subagents` 现在支持 `write_scope` / `write_set` 元数据,child prompt 会带 assigned write scope,parallel summary 会输出 `meta.parallel_blocked_children`、`meta.parallel_readback_required`、`meta.parallel_next_action`、`meta.parallel_child_N_files`、`meta.parallel_child_N_write_scope` 和 `meta.parallel_write_scope_conflicts`。父 planner 现在会同时消费 `dispatch_subagent` 与 `dispatch_subagents` 汇总,看到 child files 后先 `read_file` 回读再继续。新增 `deepseek agents subagent-fixture-smoke --json` 本地 gate,当前实测 `parser_ok`、`disjoint_write_scope_ok`、`readback_required_ok`、`blocker_summary_ok`、`conflict_summary_ok`、`artifact_ok` 全为 `true`,`child_count=2`。默认 benchmark manifest 已有 `20` 条 `subagent` category cases;benchmark runner 现在支持 `--category ` 与可重复 `--case `,filtered run 只生成 report,不推进 history,也不强制全量 trend/live gate。当前 targeted subagent benchmark 实测 `20/20`,报告在 `/tmp/deepseek-subagent-benchmark.md`,trend/live 均标记为 filtered selection skip。本轮补上了并发 subagent 汇总的 readback 单测、fixture gate 和 targeted benchmark evidence。 - 本轮新增:MCP 模型规划 benchmark 第一片。benchmark runner 现在支持 per-case 自举 `stdio-self` MCP fixture,可在 isolated workdir 写入 `.dscode/mcp.json`、按 case 开关动态 MCP 工具暴露,并设置 case-local `mcp_call_allowlist`。默认 manifest 新增 `fixture-mcp-dynamic-readme`、`fixture-mcp-generic-call-readme`、`fixture-mcp-allowlist-deny-recovery` 三条 MCP cases,分别覆盖动态 `mcp__stdio-self__read_file`、generic `mcp_call`、allowlist deny 后通过 `mcp_list_tools` 恢复。本轮 targeted MCP manifest 从 `/tmp` 运行实测 `3/3`,报告在 `/tmp/deepseek-mcp-benchmark.md`;完整默认 manifest 已扩到 `82` cases,并已刷新通过 `82/82`,最新完整报告在 `.dscode/benchmarks/latest.md`。 -- 本轮新增:benchmark PR planner hardening。离线 planner 现在会保留 `github_pr_context` / `review` / `pr_review_comment_plan` 这类结构化观察,不会因为同属 `Other` kind 被后续工具压缩成 superseded stub;PR comment 失败恢复在成功重建 plan 后不再重复重建第二次;skill auto-select 也会避免把远程 PR review/comment 任务降级到隐藏 `github_pr_context` / `review` / `pr_review_comment_plan` 的 debug skill。`run_shell` 现在会把 `pytest` / `python -m pytest` 缺失以及 profile 生成的 `uv run pytest` 安全标准化到 `uv --with pytest` fallback,并会自动发现 `~/.local/toolchains/go/*/bin` / `~/sdk/go*/bin` 这类用户级 Go toolchain。当前完整离线 benchmark 实测 `82/82`,`25` 条 `pr_workflow` 中 planner/action/comment-plan 项全绿,新增 `mcp` category `3/3`;当前 82-case trend gate 处于 comparable warmup,`found 2`,live gate 基于本机 dogfood ledger 从 `runs=5` 到 `runs=20` 通过。 +- 本轮新增:benchmark PR planner hardening。离线 planner 现在会保留 `github_pr_context` / `review` / `pr_review_comment_plan` 这类结构化观察,不会因为同属 `Other` kind 被后续工具压缩成 superseded stub;PR comment 失败恢复在成功重建 plan 后不再重复重建第二次;skill auto-select 也会避免把远程 PR review/comment 任务降级到隐藏 `github_pr_context` / `review` / `pr_review_comment_plan` 的 debug skill。`run_shell` 现在会把 `pytest` / `python -m pytest` 缺失以及 profile 生成的 `uv run pytest` 安全标准化到 `uv --with pytest` fallback,并会自动发现 `~/.local/toolchains/go/*/bin` / `~/sdk/go*/bin` 这类用户级 Go toolchain。当前完整离线 benchmark 历史实测 `82/82`;本轮新增 hosted exact patch request targeted benchmark 通过,`26` 条 `pr_workflow` 中 planner/action/comment-plan/exact-request 项均有覆盖,新增 `mcp` category `3/3`;当前 trend gate 处于 comparable warmup,`found 2`,live gate 基于本机 dogfood ledger 从 `runs=5` 到 `runs=20` 通过。 - 本轮新增:offline dogfood replay 覆盖补强。`cargo run --quiet -- dogfood replay-benchmark --category pr_workflow --limit 12 --benchmark-gate` 新增了 12 条 `pr_workflow` replay,覆盖 GitHub Action `@deepseek fix` / `@deepseek patch`、JS/Rust/Python/Go PR CI repair、PR retry validate、second-round feedback 和 Go patch validate,全部成功;该批次把 ledger 推到 `17` runs 后暴露 live coverage gate 还缺 `recovery` slice。随后 `cargo run --quiet -- dogfood replay-benchmark --category recovery --limit 3 --benchmark-gate` 新增 3 条 recovery replay,`recovery` 为 `3/3`,最终 `.dscode/dogfood/latest.md` 为 `20` runs、`19/20` success、`1` historical failed、`0` stuck、`0` manual;post-replay default benchmark 为 `82/82`,live gate `pass against previous dogfood snapshot (runs 5 -> 20)`。这些仍是 `offline` transport,不能替代后续真实 model-backed dogfood。 - 本轮新增:`deepseek dogfood live-plan` 的推荐命令改为 `deepseek dogfood live-run ...`,文本和 JSON 都同时输出 dry-run 与 `--execute` 命令,避免 release operator 为 model-backed 证据误走 offline-friendly `replay-benchmark` 路径。`deepseek dogfood live-plan` 和 `deepseek dogfood live-run --json` 现在还输出 `post_run_report_command` / `evidence_gate`,直接给出 `dogfood report --require-live-runs ... --require-live-category ...` 的后置验收命令,让真实 online 执行后的 model-backed 证据可以 fail closed。`deepseek dogfood live-run --json` 保持机器可读 dry-run plan,包含 selected cases、online readiness、execute blocker 和 follow-up `--execute` command;它故意不和 `--execute` 混用,避免在线执行日志污染 JSON。`dogfood live-run` 还支持 `--api-key-file`/`--key-file` 指向仓库外 key 文件,只把 key 注入当前进程的 `model.api_key_env` 并在返回时恢复,JSON 只记录 `credential_source` 和文件路径,不输出 key 值。`dogfood live-run --execute --evidence-out ` 现在会在批次结束或首个失败后写出 `deepseek.dogfood.live_run_evidence.v1` JSON,记录 before/after ledger live counts、每个 case 追加的 model-backed ledger 行、benchmark gate 结果、同一条 post-run report gate,以及当前 ledger 文件的 `fnv1a64` fingerprint,仍不写入 API key 值。`deepseek dogfood live-evidence --file ` 现在可验证该 evidence 文件,默认要求 completed、online、至少 1 条 appended model-backed row;`--require-benchmark-gate` 可把 benchmark gate 也纳入 release fail-closed 检查,`--require-report-gate` 会读取 evidence 的 structured `evidence_gate` 和 ledger path,用 `dogfood report` 同一套 live requirement 逻辑验证 full live gate,重新计算 ledger fingerprint 并逐条核对 evidence 中 appended case 的 timestamp/outcome/model_transport/category 能在 ledger 中找到匹配记录,而不是执行 JSON 里的 shell command;`--json` 输出 `deepseek.dogfood.live_evidence_verification.v1`,`--out ` 可把 verification JSON 落盘作为 release evidence artifact。`dogfood external-fixture` 真实执行现在也默认要求 `model_transport=online`,离线只能 dry-run 或显式 `--allow-offline` 做 rehearsal,避免把 offline disposable repo 样本误计为 release evidence;`--evidence-out` 会写出 `deepseek.dogfood.external_fixture_evidence.v1`,包含 appended external fixture row、release-evidence readiness 和 ledger fingerprint,便于上传发布证据。 - 本轮新增:在线 DeepSeek dogfood 从 smoke 推进到完整 release gate。使用当前进程注入的 DeepSeek key 执行 `dogfood live-run --execute --evidence-out ...`,最终 `deepseek dogfood report --limit 100 --require-live-runs 100 --require-live-success-rate 90 --require-live-category write_validate:25:90 --require-live-category recovery:25:90 --require-live-category pr_workflow:25:90` 通过;外部 fixture 跑完后 `live-plan` 显示 `105` 条 online run、`99` 条 success,分类为 `write_validate 29/30`、`recovery 23/25`、`pr_workflow 47/50`。执行过程中又修掉两类真实模型卡点:Python pytest retry readback 现在能识别 `def test_` / `assert ` 测试文件,并从错误的 `a * b` 回退到 `a + b`;空搜索恢复任务在看到 no matches 后完成 repository layout inspection 会 clean finish,不再重复列目录。release evidence verification 落在 `.dscode/dogfood/live-evidence-final-total-pr-4-release-verification.json`,`report_gate_passed=true`。 @@ -90,9 +91,9 @@ deepseek agents shell-fixture-smoke --json 4. 产品打磨 - TUI 已能用,但还需要更多真实工作流下的性能、长输出、失败恢复、窗口 resize、旧终端兼容性验证。 - VS Code 已有 native panel、resume、active-file review、workspace changed-file queue、generated patch queue/apply/reject、extension-host smoke harness、headless diagnostic patch fixture 和 validation 第一片,但完整 IDE agent workbench 仍缺真实 VS Code CLI runner 证据和 manual GUI fixture 证据。 - - GitHub automation 已有 event bridge、review/fix/patch mode routing、review workflow 示例、写入型 PR-head checkout workflow 示例、本地 workflow fixture smoke、`25` 条 `pr_workflow` benchmark cases,以及 `14/14` offline `pr_workflow` dogfood replay;当前本机已有 `50` 条 online model-backed `pr_workflow` 样本、`47` 条 success,分类 release gate 已通过;还缺真实远端 fixture repo 上的 hosted workflow run 证据。 + - GitHub automation 已有 event bridge、review/fix/patch mode routing、review workflow 示例、写入型 PR-head checkout workflow 示例、本地 workflow fixture smoke、真实 hosted write workflow 证据、`26` 条 `pr_workflow` benchmark cases,以及 `14/14` offline `pr_workflow` dogfood replay;当前本机已有 `50` 条 online model-backed `pr_workflow` 样本、`47` 条 success,分类 release gate 已通过。后续重点是把这类远端证据沉淀成更稳定的周期性 smoke,而不是继续保留临时 fixture 文件。 - MCP 已有 stdio/HTTP/SSE 本地 fixture smoke、动态工具暴露、schema cache、prompt/resource/template 单命令 smoke、bad-server isolation、generic/dynamic MCP approval/allowlist policy 证据,也已有三条模型规划型 MCP benchmark 和完整默认 benchmark `82/82` 证据;当前没有已知 MCP-specific Phase 12D smoke 缺口。 - - Hooks 已有 prompt submit、session start/stop、pre/post tool use 和 structured allow/add_context 的 `deepseek hooks fixture-smoke --json` 本地 gate;skills/custom command 已有 `deepseek skills validate --strict --json` 元数据 gate 和 discovery 文档;subagent 已有 `deepseek agents subagent-fixture-smoke --json` gate、20 条 subagent benchmark cases、targeted subagent benchmark `20/20`、完整默认 benchmark `82/82`、并发 child readback 和 write-scope conflict metadata。Phase 12D 本地 extension gates 已基本齐备,offline dogfood coverage gate 和 online dogfood release gate 都已通过;下一步应继续外部兼容性、hosted workflow 和真实 demo 证据。 + - Hooks 已有 prompt submit、session start/stop、pre/post tool use 和 structured allow/add_context 的 `deepseek hooks fixture-smoke --json` 本地 gate;skills/custom command 已有 `deepseek skills validate --strict --json` 元数据 gate 和 discovery 文档;subagent 已有 `deepseek agents subagent-fixture-smoke --json` gate、20 条 subagent benchmark cases、targeted subagent benchmark `20/20`、完整默认 benchmark `82/82`、并发 child readback 和 write-scope conflict metadata。Phase 12D 本地 extension gates 已基本齐备,offline dogfood coverage gate 和 online dogfood release gate 都已通过;下一步应继续外部兼容性和真实 demo 证据。 - 文档需要继续压缩成新用户能快速理解的安装、配置、试用、故障排查路径。 - 和上游 DeepSeek-TUI 的新变化需要持续周期性 refresh。 @@ -133,4 +134,4 @@ DeepSeekCode 现在已经是一个可以实际使用的 code agent CLI,尤其 最准确的公开表述是: -> DeepSeekCode is usable today for dogfooding and repository work, with a full-screen TUI, durable runtime, permissioned tools, release binaries, cross-platform entrypoint smoke, a 100-run online dogfood release gate, initial external disposable-repo write-fixture evidence, and a committed real model-backed README demo SVG. The remaining work is hosted IDE/GitHub evidence, Windows/service proof, optional richer demo media, and public package-channel publishing. +> DeepSeekCode is usable today for dogfooding and repository work, with a full-screen TUI, durable runtime, permissioned tools, release binaries, cross-platform entrypoint smoke, a 100-run online dogfood release gate, initial external disposable-repo write-fixture evidence, real hosted GitHub workflow evidence, and a committed real model-backed README demo SVG. The remaining work is hosted IDE evidence, Windows/service proof, optional richer demo media, and public package-channel publishing. diff --git a/docs/superpowers/specs/2026-05-22-github-action-bridge.md b/docs/superpowers/specs/2026-05-22-github-action-bridge.md index bf5b471..d5680af 100644 --- a/docs/superpowers/specs/2026-05-22-github-action-bridge.md +++ b/docs/superpowers/specs/2026-05-22-github-action-bridge.md @@ -92,8 +92,10 @@ convert GitHub event payloads into a PR review run. - action-labeled review comment planning case - action-labeled `@deepseek fix` JavaScript write/validate fixture - action-labeled `@deepseek patch` Rust write/validate fixture - - default `.dscode/benchmarks.txt` now has `25` `pr_workflow` cases, matching - the Phase 12C benchmark-count target. + - action-labeled exact replacement request fixture for + `@deepseek patch change ... becomes ...` + - default `.dscode/benchmarks.txt` now has `26` `pr_workflow` cases, + exceeding the Phase 12C benchmark-count target. ## Verification @@ -115,6 +117,9 @@ convert GitHub event payloads into a PR review run. - Python `pytest`/`uv run pytest` fixtures are covered by `run_shell` fallback - Go write-validate fixtures are covered by user-level Go toolchain discovery - live gate passes after offline dogfood coverage reached `runs=20` +- `cargo run --quiet -- benchmark --case fixture-github-action-patch-trigger-exact-replacement-rust-mini --out /tmp/deepseek-hosted-exact-patch-benchmark.md` + - targeted result: `1/1` + - filtered run did not update benchmark history - `cargo run --quiet -- github action --event --event-name issue_comment --dry-run --trigger @deepseek` - `cargo run --quiet -- github pr-head owner/repo#11 --repo-owner owner --github-output --json-file ` - `cargo run --quiet -- github fixture-smoke --json` @@ -139,16 +144,21 @@ convert GitHub event payloads into a PR review run. - dry-run samples for `@deepseek fix`, `@deepseek patch`, and explicit `--mode review` - YAML parse checks for both workflow examples +- Hosted workflow evidence: + - PR #10 proved the hosted write bridge and produced + `6fd5010 deepseek: apply requested PR update` from GitHub Actions. + - PR #11 repeated the same write flow after the fixes were merged to the + default branch and produced `f0fe9a7 deepseek: apply requested PR update`. + - PR #12 removed the temporary evidence fixtures after the proof was + preserved in PR history. ## Remaining -- Execute the workflow in a fixture repository and capture evidence that it - reads PR diff and posts a review comment. -- Execute the write workflow in a fixture repository and capture evidence that - it checks out PR head, writes a fix/patch, and pushes back to the PR branch. +- Promote or schedule a stable periodic hosted workflow smoke if this should + stay continuously monitored instead of relying on PR #10/#11 evidence. - Promote the refreshed benchmark report/history once the surrounding worktree - is ready. + is ready and a full unfiltered run is desired. - Collect online/model-backed `pr_workflow` dogfood evidence for the new action-labeled cases. The current `14/14` `pr_workflow` replay evidence uses offline transport and is useful for deterministic coverage, not a substitute - for hosted workflow/model-backed proof. + for model-backed proof. diff --git a/src/cli/commands/benchmark.rs b/src/cli/commands/benchmark.rs index 55e88c7..8d59ccb 100644 --- a/src/cli/commands/benchmark.rs +++ b/src/cli/commands/benchmark.rs @@ -2866,8 +2866,8 @@ seed_observations = "search_text:failed:no matches || recovery_hint:ok:after=sea .filter(|case| case.category == "pr_workflow") .count(); assert!( - pr_workflow_cases >= 25, - "default manifest should keep Phase 12C pr_workflow coverage at or above 25 cases" + pr_workflow_cases >= 26, + "default manifest should keep Phase 12C pr_workflow coverage at or above 26 cases" ); let review_case = cases @@ -2910,6 +2910,22 @@ seed_observations = "search_text:failed:no matches || recovery_hint:ok:after=sea patch_case.expect_last_tool_output_contains.as_deref(), Some("meta.result=ok") ); + + let exact_patch_case = cases + .iter() + .find(|case| { + case.name == "fixture-github-action-patch-trigger-exact-replacement-rust-mini" + }) + .expect("default manifest should include exact action patch replacement fixture"); + assert_eq!(exact_patch_case.category, "pr_workflow"); + assert!(exact_patch_case.isolate_workdir); + assert!(exact_patch_case.task.contains("@deepseek patch change")); + assert!(exact_patch_case.task.contains(" becomes ")); + assert_eq!(exact_patch_case.expect_tool.as_deref(), Some("apply_patch")); + assert_eq!( + exact_patch_case.expect_last_tool_output_contains.as_deref(), + Some("meta.result=ok") + ); } #[test]