From 1e1acdba1295f7e6fab34ff55ebf3a3dac1b3911 Mon Sep 17 00:00:00 2001
From: Simon Strandgaard <neoneye@gmail.com>
Date: Sun, 17 May 2026 22:04:40 +0200
Subject: [PATCH] fix(compress-report-section): tolerate trailing characters
 from small-model structured output

Adds a LenientJsonModel base class (a pydantic BaseModel subclass) whose model_validate_json override catches Pydantic's 'json_invalid' trailing-characters failure, extracts the first complete JSON value via json.JSONDecoder().raw_decode, and validates that instead.

Small structured-output LLMs (notably the default openrouter-gemini-2.5-flash-lite-preview-09-2025) occasionally emit a valid JSON object followed by extra tokens; the strict validator was rejecting the whole payload despite a correct prefix, causing compress_premortem to fail repeatedly for plans with multi-clause failure-mode tables. Recovery here is local: the extracted prefix is validated against the same schema. If no JSON prefix can be decoded, the original error is re-raised; if the prefix itself violates the schema, that validation error propagates, so genuine schema problems are not masked.

Includes 4 unit tests covering the trailing-object, trailing-prose, well-formed, and schema-error paths. Also includes the failure-analysis doc captured against 20251110_4DWW_India.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../20260517_compress_premortem_failure.md    | 119 ++++++++++++++++++
 .../compress_report_section.py                |  63 ++++++++--
 .../tests/test_compress_report_section.py     |  33 +++++
 3 files changed, 208 insertions(+), 7 deletions(-)
 create mode 100644 experiments/napkin_math/docs/20260517_compress_premortem_failure.md

diff --git a/experiments/napkin_math/docs/20260517_compress_premortem_failure.md b/experiments/napkin_math/docs/20260517_compress_premortem_failure.md
new file mode 100644
index 00000000..9126705b
--- /dev/null
+++ b/experiments/napkin_math/docs/20260517_compress_premortem_failure.md
@@ -0,0 +1,119 @@
+# `compress_report_section` premortem failure — analysis (2026-05-17)
+
+## Symptom
+
+Running `prepare_extract_input.py` against
+`/Users/neoneye/git/PlanExe-web/20251110_4DWW_India` with the default LLM
+(`openrouter-gemini-2.5-flash-lite-preview-09-2025`) fails to produce
+`compress_premortem.md`. The driver logs:
+
+```
+ERROR __main__ [premortem] GIVE UP after 3 attempts. Last error:
+  Bucket 'gates_and_thresholds' failed after 3 attempts. Last error:
+  ValidationError: 1 validation error for GatesAndThresholdsOnly
+  Invalid JSON: trailing characters at line 1 column 413
+  [type=json_invalid, input_value='{"line_english":"If SME ...0% across all cohorts"}', input_type=str]
+WARNING: compressed section 'premortem' not produced; skipping
+```
+
+Two consecutive runs reproduced the failure. The other three compressed
+sections (`selected_scenario`, `review_plan`, `expert_criticism`) emitted
+clean output on the same runs.
+
+## Failure pattern from the logs
+
+Inside the premortem section the driver retried twice (six bucket attempts
+total per outer attempt × three outer attempts = 18 LLM calls), and the
+buckets that failed were always the same two:
+
+| Outer attempt | First failing bucket | Trailing-character column |
+|---:|---|---:|
+| 1 | `gates_and_thresholds` | 366, 366, 413 |
+| 2 | `numeric_values`        | 434, 443, line 8 col 4 |
+| 3 | `gates_and_thresholds` | 498, 425, 413 |
+
+Every failure is the same pydantic verdict —
+`Invalid JSON: trailing characters at line N column M` — and the captured
+`input_value` always starts with a valid-looking `{"line_english": "..."`.
+The model is emitting a syntactically valid JSON object, then continuing
+with extra characters that break the strict single-object schema validator.
+
+The other compressed sections hit isolated retries on the same buckets
+(notably the small-model `Could not extract json string from output`
+fallback on `section_summary`) but recovered within three attempts. Only
+premortem exhausts the budget.
+
+## Why premortem specifically
+
+The premortem source has more compress-resistant structure than the other
+three sections:
+
+- **More entities to fit in one bucket.** The source defines 9 assumptions
+  (A1–A9) plus 9 failure modes (FM1–FM9) with 5×5 risk-level tuples; the
+  `gates_and_thresholds` bucket has to triage these into ≤8 items, each
+  with a quoted source string and per-field metadata.
+- **Multi-clause if/then sentences.** Premortem rows nest a binding
+  condition, a stakeholder, and a consequence in one sentence (e.g. A1
+  "agree to a binding PMO casting vote *despite political pressures*").
+  The bucket prompt asks for one-line `If <failure>, then <consequence>`
+  rewrites — the model often glues two well-formed JSON objects together
+  trying to keep both halves.
+- **Quote-rich source.** Assumption text contains single quotes, em-dashes,
+  parenthetical asides, and embedded percentages; structured-output models
+  occasionally close one object and start another instead of escaping
+  properly.
+
+Other sections that share the same multi-bucket schema do not have this
+combination, so the failure does not surface there.
+
+## Root-cause hypothesis
+
+The default LLM is a small Gemini Flash Lite preview that already produces
+trailing-character noise on the other sections (recoverable within three
+retries). Premortem content pushes the noise rate above the retry budget.
+The proximate cause is not the source text — it is the model's tendency to
+emit `{...}` then continue writing instead of stopping, combined with a
+strict single-object validator that refuses to take the prefix.
+
+## Is it fixable?
+
+Yes, at three levels. From cheapest to deepest:
+
+1. **Post-process the raw response before pydantic parses it.** Strip
+   anything after the first complete JSON object (use
+   `json.JSONDecoder().raw_decode`; a bare brace counter miscounts braces
+   inside strings), then hand the prefix to the validator. The model already
+   produces a valid prefix; the validator just refuses the suffix. A small
+   patch in `compress_report_section.py`: wrap the `sllm.chat` call, or the
+   bucket models' `model_validate_json`, so the suffix is trimmed first.
+   Risk: low. Removes this failure mode without changing prompts or models.
+
+2. **Tighten the bucket prompts.** Add one explicit guardrail line to
+   `GATES_AND_THRESHOLDS_BUCKET_PROMPT` and `NUMERIC_VALUES_BUCKET_PROMPT`
+   (and the others) saying: *"Emit one JSON object and stop. Do not append
+   any prose, code fence, or second object."* Small models often respect
+   that line when it appears late in the prompt. Risk: low. Does not fix
+   the underlying brittleness, but reduces the failure rate.
+
+3. **Raise the model floor for compress_report_section.** Override
+   `COMPRESS_FULL_LLM` to a model whose structured-output handler is more
+   disciplined (`openrouter-gemini-2.0-flash-001`,
+   `openrouter-openai-gpt-4o-mini`, or higher). The script already
+   supports this via env var. Costs more per run but eliminates the
+   single-section gap.
+
+The right combination for production is probably (1) + (2): the trimmer
+removes the known failure mode regardless of which model is in use, and
+the prompt guardrail shrinks the population of failures the trimmer has to
+handle. (3) becomes a fallback for plans where (1) and (2) still leave a
+section uncompressed.
+
+## Operational note for this plan
+
+For `20251110_4DWW_India`, the bundled digest currently lacks the
+premortem section. The remaining seven sections (executive_summary,
+project_plan, selected_scenario, assumptions, review_plan, expert_criticism,
+data_collection) carry enough overlap that parameter extraction is still
+useful; the cost is reduced signal on unmodelled existential gates that
+premortem typically surfaces. If a third retry with the default model
+still fails, escalating to option (3) above is the appropriate next step.
diff --git a/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py
index 0065ecc6..d6c2c58d 100644
--- a/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py
+++ b/worker_plan/worker_plan_internal/parameter_extraction/compress_report_section.py
@@ -32,6 +32,7 @@
 PROMPT> python -m worker_plan_internal.parameter_extraction.run_compress_full
 """
 import json
+import json
 import logging
 import re
 import time
@@ -43,11 +44,59 @@
 
 from llama_index.core.llms import ChatMessage, MessageRole
 from llama_index.core.llms.llm import LLM
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 
 logger = logging.getLogger(__name__)
 
 
+class LenientJsonModel(BaseModel):
+    """BaseModel whose `model_validate_json` tolerates trailing characters.
+
+    Small structured-output LLMs (notably Gemini Flash Lite) occasionally
+    emit a valid JSON object followed by extra tokens, which pydantic
+    rejects with `json_invalid` ("trailing characters"). This override
+    locates the end of the first JSON value with `raw_decode` and re-runs
+    the JSON validator on that prefix only, so JSON-mode (incl. strict)
+    semantics are unchanged. If no prefix can be decoded the original error
+    is raised; a prefix that fails the schema raises its own ValidationError.
+    """
+
+    @classmethod
+    def model_validate_json(cls, json_data, **kwargs):
+        try:
+            return super().model_validate_json(json_data, **kwargs)
+        except ValidationError as primary_error:
+            if not _is_trailing_characters_error(primary_error):
+                raise
+            text = _coerce_to_str(json_data)
+            if text is None:
+                raise
+            text = text.lstrip()  # raw_decode does not skip leading whitespace
+            try:
+                _, end = json.JSONDecoder().raw_decode(text)
+            except json.JSONDecodeError:
+                raise primary_error
+            return super().model_validate_json(text[:end], **kwargs)
+
+
+def _is_trailing_characters_error(err: ValidationError) -> bool:
+    """Report whether pydantic rejected the input for trailing characters."""
+    return any(
+        item.get("type") == "json_invalid" and "trailing" in item.get("msg", "").lower()
+        for item in err.errors())
+
+
+def _coerce_to_str(json_data: Any) -> Optional[str]:
+    """Return *json_data* as text, or None if it is not str/UTF-8 bytes."""
+    if isinstance(json_data, (bytes, bytearray)):
+        try:
+            return json_data.decode("utf-8")
+        except UnicodeDecodeError:
+            # Undecodable payload: signal "no text" instead of raising.
+            return None
+    return json_data if isinstance(json_data, str) else None
+
+
 class ReportSectionTypeEnum(str, Enum):
     SELECTED_SCENARIO = "selected_scenario"
     REVIEW_PLAN = "review_plan"
@@ -283,35 +332,35 @@ class CompressedReportSection(BaseModel):
 BUCKET_FIELD_DESC = "See the bucket prompt in the user message for the expected content."
 
 
-class SectionSummaryOnly(BaseModel):
+class SectionSummaryOnly(LenientJsonModel):
     section_summary: str = Field(description=BUCKET_FIELD_DESC)
 
 
-class NumericValuesOnly(BaseModel):
+class NumericValuesOnly(LenientJsonModel):
     numeric_values: list[ScoredItem] = Field(
         default_factory=list, description=BUCKET_FIELD_DESC,
     )
 
 
-class LoadBearingAssumptionsOnly(BaseModel):
+class LoadBearingAssumptionsOnly(LenientJsonModel):
     load_bearing_assumptions: list[ScoredItem] = Field(
         default_factory=list, description=BUCKET_FIELD_DESC,
     )
 
 
-class GatesAndThresholdsOnly(BaseModel):
+class GatesAndThresholdsOnly(LenientJsonModel):
     gates_and_thresholds: list[ScoredItem] = Field(
         default_factory=list, description=BUCKET_FIELD_DESC,
     )
 
 
-class RisksAndShocksOnly(BaseModel):
+class RisksAndShocksOnly(LenientJsonModel):
     risks_and_shocks: list[ScoredItem] = Field(
         default_factory=list, description=BUCKET_FIELD_DESC,
     )
 
 
-class MissingDataOnly(BaseModel):
+class MissingDataOnly(LenientJsonModel):
     missing_data_to_estimate: list[ScoredItem] = Field(
         default_factory=list, description=BUCKET_FIELD_DESC,
     )
diff --git a/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py b/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py
index bc0a0d64..114206c1 100644
--- a/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py
+++ b/worker_plan/worker_plan_internal/parameter_extraction/tests/test_compress_report_section.py
@@ -1,8 +1,14 @@
+import pytest
+from pydantic import ValidationError
+
 from worker_plan_internal.parameter_extraction.compress_report_section import (
     COMPRESS_REPORT_SECTION_SYSTEM_PROMPT,
     CompressedReportSection,
     CompressReportSection,
+    GatesAndThresholdsOnly,
+    NumericValuesOnly,
     PublicScoredItem,
+    SectionSummaryOnly,
     build_user_prompt,
     infer_section_type_from_path,
     normalize_section_type,
@@ -81,6 +87,33 @@ def test_pydantic_schema_shape() -> None:
         )
 
 
+def test_lenient_model_validate_json_strips_trailing_object() -> None:
+    """Two JSON objects glued together: only the first one is validated."""
+    first_object = '{"section_summary": "summary text"}'
+    second_object = '{"section_summary": "second object the validator must ignore"}'
+    glued = first_object + second_object
+    parsed = SectionSummaryOnly.model_validate_json(glued)
+    assert parsed.section_summary == "summary text"
+
+
+def test_lenient_model_validate_json_strips_trailing_prose() -> None:
+    """Free-form commentary after the JSON object must be discarded."""
+    payload = '{"numeric_values": []}\n\nHere is some commentary the model added.'
+    assert NumericValuesOnly.model_validate_json(payload).numeric_values == []
+
+
+def test_lenient_model_validate_json_keeps_well_formed_input() -> None:
+    """A clean single-object payload validates without needing the fallback."""
+    parsed = GatesAndThresholdsOnly.model_validate_json('{"gates_and_thresholds": []}')
+    assert parsed.gates_and_thresholds == []
+
+
+def test_lenient_model_validate_json_preserves_schema_errors() -> None:
+    for text in ('{"section_summary": 123}', '{"section_summary": 123} extra'):
+        with pytest.raises(ValidationError):  # 2nd input hits the lenient fallback
+            SectionSummaryOnly.model_validate_json(text)
+
+
 def _si(
     line: str,
     *,