From 1e1acdba1295f7e6fab34ff55ebf3a3dac1b3911 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 17 May 2026 22:04:40 +0200 Subject: [PATCH] fix(compress-report-section): tolerate trailing characters from small-model structured output Adds a LenientJsonModel base class (a BaseModel subclass) that catches Pydantic's 'json_invalid' trailing-characters failure and extracts the first balanced JSON object via json.JSONDecoder().raw_decode before validating. Small structured-output LLMs (notably the default openrouter-gemini-2.5-flash-lite-preview-09-2025) occasionally emit a valid JSON object followed by extra tokens; the strict validator was rejecting the whole payload despite a correct prefix, causing compress_premortem to fail repeatedly for plans with multi-clause failure-mode tables. Recovery here is local: the extracted prefix is validated against the same schema; if the prefix fails schema validation, the ValidationError raised for the prefix propagates (the original error is re-raised only when no JSON value can be decoded from the input), so genuine schema problems are not masked. Includes 4 unit tests covering the trailing-object, trailing-prose, well-formed, and schema-error paths. Also includes the failure-analysis doc captured against 20251110_4DWW_India. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../20260517_compress_premortem_failure.md | 119 ++++++++++++++++++ .../compress_report_section.py | 63 ++++++++-- .../tests/test_compress_report_section.py | 33 +++++ 3 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 experiments/napkin_math/docs/20260517_compress_premortem_failure.md diff --git a/experiments/napkin_math/docs/20260517_compress_premortem_failure.md b/experiments/napkin_math/docs/20260517_compress_premortem_failure.md new file mode 100644 index 00000000..9126705b --- /dev/null +++ b/experiments/napkin_math/docs/20260517_compress_premortem_failure.md @@ -0,0 +1,119 @@ +# `compress_report_section` premortem failure — analysis (2026-05-17) + +## Symptom + +Running `prepare_extract_input.py` against +`/Users/neoneye/git/PlanExe-web/20251110_4DWW_India` with the default LLM +(`openrouter-gemini-2.5-flash-lite-preview-09-2025`) fails to produce +`compress_premortem.md`. The driver logs: + +``` +ERROR __main__ [premortem] GIVE UP after 3 attempts. Last error: + Bucket 'gates_and_thresholds' failed after 3 attempts. Last error: + ValidationError: 1 validation error for GatesAndThresholdsOnly + Invalid JSON: trailing characters at line 1 column 413 + [type=json_invalid, input_value='{"line_english":"If SME ...0% across all cohorts"}', input_type=str] +WARNING: compressed section 'premortem' not produced; skipping +``` + +Two consecutive runs reproduced the failure. The other three compressed +sections (`selected_scenario`, `review_plan`, `expert_criticism`) emitted +clean output on the same runs. 
+ +## Failure pattern from the logs + +Inside the premortem section the driver retried twice (six bucket attempts +total per outer attempt × three outer attempts = 18 LLM calls), and the +buckets that failed were always the same two: + +| Outer attempt | First failing bucket | Trailing-character column | +|---:|---|---:| +| 1 | `gates_and_thresholds` | 366, 366, 413 | +| 2 | `numeric_values` | 434, 443, line 8 col 4 | +| 3 | `gates_and_thresholds` | 498, 425, 413 | + +Every failure is the same pydantic verdict — +`Invalid JSON: trailing characters at line N column M` — and the captured +`input_value` always starts with a valid-looking `{"line_english": "..."`. +The model is emitting a syntactically valid JSON object, then continuing +with extra characters that break the strict single-object schema validator. + +The other compressed sections hit isolated retries on the same buckets +(notably the small-model `Could not extract json string from output` +fallback on `section_summary`) but recovered within three attempts. Only +premortem exhausts the budget. + +## Why premortem specifically + +The premortem source has more compress-resistant structure than the other +three sections: + +- **More entities to fit in one bucket.** The source defines 9 assumptions + (A1–A9) plus 9 failure modes (FM1–FM9) with 5×5 risk-level tuples; the + `gates_and_thresholds` bucket has to triage these into ≤8 items, each + with a quoted source string and per-field metadata. +- **Multi-clause if/then sentences.** Premortem rows nest a binding + condition, a stakeholder, and a consequence in one sentence (e.g. A1 + "agree to a binding PMO casting vote *despite political pressures*"). + The bucket prompt asks for one-line `If X, then Y` + rewrites — the model often glues two well-formed JSON objects together + trying to keep both halves. 
+- **Quote-rich source.** Assumption text contains single quotes, em-dashes, + parenthetical asides, and embedded percentages; structured-output models + occasionally close one object and start another instead of escaping + properly. + +Other sections that share the same multi-bucket schema do not have this +combination, so the failure does not surface there. + +## Root-cause hypothesis + +The default LLM is a small Gemini Flash Lite preview that already produces +trailing-character noise on the other sections (recoverable within three +retries). Premortem content pushes the noise rate above the retry budget. +The proximate cause is not the source text — it is the model's tendency to +emit `{...}` then continue writing instead of stopping, combined with a +strict single-object validator that refuses to take the prefix. + +## Is it fixable? + +Yes, at three levels. From cheapest to deepest: + +1. **Post-process the raw response before pydantic parses it.** Strip + anything after the first balanced JSON object using a brace counter, + then hand the prefix to the validator. The model already produces a + valid prefix; the validator just refuses the suffix. This is a small + patch in `compress_report_section.py` — wrap the `sllm.chat` call so + trailing characters are trimmed before `chat_response.raw` is read. + Risk: low. Removes the specific failure mode without changing prompts + or models. + +2. **Tighten the bucket prompts.** Add one explicit guardrail line to + `GATES_AND_THRESHOLDS_BUCKET_PROMPT` and `NUMERIC_VALUES_BUCKET_PROMPT` + (and the others) saying: *"Emit one JSON object and stop. Do not append + any prose, code fence, or second object."* Small models often respect + that line when it appears late in the prompt. Risk: low. Does not fix + the underlying brittleness, but reduces the failure rate. + +3. 
**Raise the model floor for compress_report_section.** Override + `COMPRESS_FULL_LLM` to a model whose structured-output handler is more + disciplined (`openrouter-gemini-2.0-flash-001`, + `openrouter-openai-gpt-4o-mini`, or higher). The script already + supports this via env var. Costs more per run but eliminates the + single-section gap. + +The right combination for production is probably (1) + (2): the trimmer +removes the known failure mode regardless of which model is in use, and +the prompt guardrail shrinks the population of failures the trimmer has to +handle. (3) becomes a fallback for plans where (1) and (2) still leave a +section uncompressed. + +## Operational note for this plan + +For `20251110_4DWW_India`, the bundled digest currently lacks the +premortem section. The remaining seven sections (executive_summary, +project_plan, selected_scenario, assumptions, review_plan, expert_criticism, +data_collection) carry enough overlap that parameter extraction is still +useful; the cost is reduced signal on unmodelled existential gates that +premortem typically surfaces. If a third retry with the default model +still fails, escalating to option (3) above is the appropriate next step. 
diff --git a/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py index 0065ecc6..d6c2c58d 100644 --- a/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py +++ b/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py @@ -32,6 +32,7 @@ PROMPT> python -m worker_plan_internal.parameter_extraction.run_compress_full """ import json +import json import logging import re import time @@ -43,11 +44,59 @@ from llama_index.core.llms import ChatMessage, MessageRole from llama_index.core.llms.llm import LLM -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ValidationError logger = logging.getLogger(__name__) +class LenientJsonModel(BaseModel): + """BaseModel whose `model_validate_json` tolerates trailing characters. + + Small structured-output LLMs (notably Gemini Flash Lite) occasionally + emit a valid JSON object followed by extra tokens. Pydantic's strict + validator rejects the whole payload with `json_invalid` ("trailing + characters"), wasting retries on a response whose prefix is already + correct. This override falls back to `json.JSONDecoder().raw_decode` + to peel off the first balanced JSON value and validates that. If the + extracted value still fails schema validation, the original error is + re-raised so genuine schema problems are not hidden. 
+ """ + + @classmethod + def model_validate_json(cls, json_data, **kwargs): + try: + return super().model_validate_json(json_data, **kwargs) + except ValidationError as primary_error: + if not _is_trailing_characters_error(primary_error): + raise + text = _coerce_to_str(json_data) + if text is None: + raise + try: + first_value, _ = json.JSONDecoder().raw_decode(text.lstrip()) + except json.JSONDecodeError: + raise primary_error + return cls.model_validate(first_value, **kwargs) + + +def _is_trailing_characters_error(err: ValidationError) -> bool: + for error in err.errors(): + if error.get("type") == "json_invalid" and "trailing" in error.get("msg", "").lower(): + return True + return False + + +def _coerce_to_str(json_data: Any) -> Optional[str]: + if isinstance(json_data, str): + return json_data + if isinstance(json_data, (bytes, bytearray)): + try: + return json_data.decode("utf-8") + except UnicodeDecodeError: + return None + return None + + class ReportSectionTypeEnum(str, Enum): SELECTED_SCENARIO = "selected_scenario" REVIEW_PLAN = "review_plan" @@ -283,35 +332,35 @@ class CompressedReportSection(BaseModel): BUCKET_FIELD_DESC = "See the bucket prompt in the user message for the expected content." 
-class SectionSummaryOnly(BaseModel): +class SectionSummaryOnly(LenientJsonModel): section_summary: str = Field(description=BUCKET_FIELD_DESC) -class NumericValuesOnly(BaseModel): +class NumericValuesOnly(LenientJsonModel): numeric_values: list[ScoredItem] = Field( default_factory=list, description=BUCKET_FIELD_DESC, ) -class LoadBearingAssumptionsOnly(BaseModel): +class LoadBearingAssumptionsOnly(LenientJsonModel): load_bearing_assumptions: list[ScoredItem] = Field( default_factory=list, description=BUCKET_FIELD_DESC, ) -class GatesAndThresholdsOnly(BaseModel): +class GatesAndThresholdsOnly(LenientJsonModel): gates_and_thresholds: list[ScoredItem] = Field( default_factory=list, description=BUCKET_FIELD_DESC, ) -class RisksAndShocksOnly(BaseModel): +class RisksAndShocksOnly(LenientJsonModel): risks_and_shocks: list[ScoredItem] = Field( default_factory=list, description=BUCKET_FIELD_DESC, ) -class MissingDataOnly(BaseModel): +class MissingDataOnly(LenientJsonModel): missing_data_to_estimate: list[ScoredItem] = Field( default_factory=list, description=BUCKET_FIELD_DESC, ) diff --git a/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py index bc0a0d64..114206c1 100644 --- a/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py +++ b/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py @@ -1,8 +1,14 @@ +import pytest +from pydantic import ValidationError + from worker_plan_internal.parameter_extraction.compress_report_section import ( COMPRESS_REPORT_SECTION_SYSTEM_PROMPT, CompressedReportSection, CompressReportSection, + GatesAndThresholdsOnly, + NumericValuesOnly, PublicScoredItem, + SectionSummaryOnly, build_user_prompt, infer_section_type_from_path, normalize_section_type, @@ -81,6 +87,33 @@ def test_pydantic_schema_shape() -> None: ) +def 
test_lenient_model_validate_json_strips_trailing_object() -> None: + text = ( + '{"section_summary": "summary text"}' + '{"section_summary": "second object the validator must ignore"}' + ) + obj = SectionSummaryOnly.model_validate_json(text) + assert obj.section_summary == "summary text" + + +def test_lenient_model_validate_json_strips_trailing_prose() -> None: + text = '{"numeric_values": []}\n\nHere is some commentary the model added.' + obj = NumericValuesOnly.model_validate_json(text) + assert obj.numeric_values == [] + + +def test_lenient_model_validate_json_keeps_well_formed_input() -> None: + text = '{"gates_and_thresholds": []}' + obj = GatesAndThresholdsOnly.model_validate_json(text) + assert obj.gates_and_thresholds == [] + + +def test_lenient_model_validate_json_preserves_schema_errors() -> None: + text = '{"section_summary": 123}' + with pytest.raises(ValidationError): + SectionSummaryOnly.model_validate_json(text) + + def _si( line: str, *,