From 54f23a3f1d922905e08282418e8734fcbe6e138a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 18:02:25 -0400 Subject: [PATCH 1/5] Compose by_path / paths_of_interest with survey_design (Wave 4 #10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lift the gate at chaisemartin_dhaultfoeuille.py:1233-1239 so per-path event-study effects compose with survey_design under analytical Binder TSL SE and replicate-weight bootstrap variance. Multiplier bootstrap (n_bootstrap > 0) under survey + by_path remains gated; the survey-aware perturbation pivot for path-restricted IFs is methodologically underived and deferred to a future wave. Per-path SE routes through the existing _survey_se_from_group_if cell-period allocator. The per-period IF (U_pp_l_path) with non-path switcher contributions zeroed at both group and cell levels (the row-sum identity U_pp.sum(axis=1) == U is preserved trivially under group-level zeroing) is cohort-recentered via _cohort_recenter_per_period, then expanded to observations as psi_i = U_pp[g_i, t_i] * (w_i / W_{g_i, t_i}). Replicate-weight designs unconditionally use the cell allocator (Class A contract, PR #323). New _refresh_path_inference helper post-call refreshes safe_inference on every populated entry across multi_horizon_inference, placebo_horizon_inference, path_effects, and path_placebos so all four surfaces reflect the same final df_survey after per-path replicate fits append n_valid to the shared accumulator. Path-enumeration ranking under survey_design remains unweighted (group-cardinality, not population-weight mass). Lonely-PSU policy stays sample-wide. Telescope invariant holds bit-exactly: on a single-path panel, per-path SE matches the global non-by_path survey SE. No R parity — R did_multiplegt_dyn does not support survey weighting; this is a Python-only methodology extension. 
14 new tests across two test classes: - TestByPathSurveyDesignAnalytical: gate dispatch, anti-regression on global TSL+bootstrap (locks per-path-only gate scope), per-path analytical SE, single-path telescope, replicate-weight SE, df_survey propagation, per-path placebos, trends_linear cumulated SE inheritance, unobserved-path warnings under survey. - TestByPathSurveyDesignTelescope: single-path telescoping invariant for analytical TSL. Documentation: REGISTRY.md "Per-path survey-design SE" sub-paragraph; by_path / paths_of_interest docstrings updated; CHANGELOG entry; docs/api/chaisemartin_dhaultfoeuille.rst and llms-full.txt updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + diff_diff/chaisemartin_dhaultfoeuille.py | 202 ++++++++- diff_diff/guides/llms-full.txt | 4 +- docs/api/chaisemartin_dhaultfoeuille.rst | 6 +- docs/methodology/REGISTRY.md | 4 +- tests/test_chaisemartin_dhaultfoeuille.py | 491 ++++++++++++++++++++++ 6 files changed, 682 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44eabb6a..25f967b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **`ChaisemartinDHaultfoeuille.by_path` and `paths_of_interest` now compose with `survey_design`** for analytical Binder TSL SE and replicate-weight bootstrap variance. The `NotImplementedError` gate at `chaisemartin_dhaultfoeuille.py:1233-1239` is replaced by a per-path multiplier-bootstrap-only gate (`survey_design + n_bootstrap > 0` under by_path / paths_of_interest still raises, since the survey-aware perturbation pivot for path-restricted IFs is methodologically underived). 
Per-path SE routes through the existing `_survey_se_from_group_if` cell-period allocator: the per-period IF (`U_pp_l_path`) is built with non-path switcher-side contributions skipped (control contributions are unchanged, matching the joiners/leavers IF convention; preserves the row-sum identity `U_pp.sum(axis=1) == U`), cohort-recentered via `_cohort_recenter_per_period`, then expanded to observations as `psi_i = U_pp[g_i, t_i] · (w_i / W_{g_i, t_i})`. Replicate-weight designs unconditionally use the cell allocator (Class A contract from PR #323). New `_refresh_path_inference` helper post-call refreshes `safe_inference` on every populated entry across `multi_horizon_inference`, `placebo_horizon_inference`, `path_effects`, and `path_placebos` so all four surfaces use the same final `df_survey` after per-path replicate fits append `n_valid` to the shared accumulator. Path-enumeration ranking under `survey_design` remains unweighted (group-cardinality, not population-weight mass). Lonely-PSU policy stays sample-wide, not per-path. Telescope invariant: on a single-path panel, per-path SE matches the global non-by_path survey SE bit-exactly. **No R parity** — R `did_multiplegt_dyn` does not support survey weighting; this is a Python-only methodology extension. The global non-by_path TSL multiplier-bootstrap path is unaffected (anti-regression test `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathSurveyDesignAnalytical::test_global_survey_plus_n_bootstrap_still_works` locks the per-path-only scope of the new gate). Cross-surface invariants regression-tested at `TestByPathSurveyDesignAnalytical` (~17 tests across gate / dispatch / analytical SE / replicate-weight SE / per-path placebos / `trends_linear` composition / unobserved-path warnings / final-df refresh regressions) and `TestByPathSurveyDesignTelescope`. See `docs/methodology/REGISTRY.md` §`ChaisemartinDHaultfoeuille` `Note (Phase 3 by_path ...)` → "Per-path survey-design SE" for the full contract. 
- **Inference-field aliases on staggered result classes** for adapter / external-consumer compatibility. Read-only `@property` aliases expose the flat `att` / `se` / `conf_int` / `p_value` / `t_stat` names (matching `DiDResults` / `TROPResults` / `SyntheticDiDResults` / `HeterogeneousAdoptionDiDResults`) on every result class that previously only carried prefixed canonical fields: `CallawaySantAnnaResults`, `StackedDiDResults`, `EfficientDiDResults`, `ChaisemartinDHaultfoeuilleResults`, `StaggeredTripleDiffResults`, `WooldridgeDiDResults`, `SunAbrahamResults`, `ImputationDiDResults`, `TwoStageDiDResults` (mapping to `overall_*`); `ContinuousDiDResults` (mapping to `overall_att_*`, ATT-side as the headline, ACRT-side accessible unchanged via `overall_acrt_*`); `MultiPeriodDiDResults` (mapping to `avg_*`). `ContinuousDiDResults` additionally exposes `overall_se` / `overall_conf_int` / `overall_p_value` / `overall_t_stat` aliases for naming consistency with the rest of the staggered family. Aliases are pure read-throughs over the canonical fields — no recomputation, no behavior change — so the `safe_inference()` joint-NaN contract (per CLAUDE.md "Inference computation") is inherited automatically (NaN canonical → NaN alias, locked at `tests/test_result_aliases.py::test_pattern_b_aliases_propagate_nan`). The native `overall_*` / `overall_att_*` / `avg_*` fields remain canonical for documentation and computation. Motivated by the `balance.interop.diff_diff.as_balance_diagnostic()` adapter (`facebookresearch/balance` PR #465) which calls `getattr(res, "se", None)` / `getattr(res, "conf_int", None)` without a fallback chain — pre-alias, every staggered result class returned `None` on those keys, silently dropping `se` and `conf_int` from the adapter's diagnostic dict. 23 alias-mechanic + balance-adapter regression tests at `tests/test_result_aliases.py`. Patch-level (additive on stable surfaces). 
- **`ChaisemartinDHaultfoeuille.by_path` + non-binary integer treatment** — `by_path=k` now accepts integer-coded discrete treatment (D in Z, e.g. ordinal `{0, 1, 2}`); path tuples become integer-state tuples like `(0, 2, 2, 2)`. The previous `NotImplementedError` gate at `chaisemartin_dhaultfoeuille.py:1870` is replaced by a `ValueError` for continuous D (e.g. `D=1.5`) at fit-time per the no-silent-failures contract — the existing `int(round(float(v)))` cast in `_enumerate_treatment_paths` is now defensive (no-op for integer-coded D). Validated against R `did_multiplegt_dyn(..., by_path)` for D in `{0, 1, 2}` via the new `multi_path_reversible_by_path_non_binary` golden-value scenario (78 switchers, 3 paths, single-baseline custom DGP, F_g >= 4): per-path point estimates match R bit-exactly (rtol ~1e-9 on event horizons; rtol+atol envelope for placebo near-zero values), per-path SE inherits the documented cross-path cohort-sharing deviation (~5% rtol observed; SE_RTOL=0.15 envelope). **Deviation from R for D >= 10:** R's `did_multiplegt_by_path` derives the per-path baseline via `path_index$baseline_XX <- substr(path_index$path, 1, 1)`, which captures only the first character of the comma-separated path string (e.g. for `path = "12,12,..."` it captures `"1"` instead of `"12"`); this mis-allocates R's per-path control-pool subset for D >= 10. Python's tuple-key matching is correct in this regime — the per-path point estimates we compute are correct; R's per-path subset for the same path is buggy. The shipped parity scenario stays in `D in {0, 1, 2}` to avoid the R bug. R-parity test at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathNonBinary`; cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathNonBinary`. - **New `paths_of_interest` kwarg on `ChaisemartinDHaultfoeuille`** for user-specified treatment-path subsets, alternative to `by_path=k`'s top-k automatic ranking. 
Mutually exclusive with `by_path`; setting both raises `ValueError` at `__init__` and `set_params` time. Each path tuple must be a list/tuple of `int` of length `L_max + 1` (uniformity validated at `__init__`; length match against `L_max + 1` validated at fit-time); `bool` and `np.bool_` are explicitly rejected, `np.integer` accepted and canonicalized to Python `int` for tuple-key consistency. Duplicates emit a `UserWarning` and are deduplicated; paths not observed in the panel emit a `UserWarning` and are omitted from `path_effects`. Paths appear in `results.path_effects` in the user-specified order, modulo deduplication and unobserved-path filtering. Composes with non-binary D and all downstream `by_path` surfaces (bootstrap, per-path placebos, per-path joint sup-t bands, `controls`, `trends_linear`, `trends_nonparam`) — mechanical filter on observed paths via the same `_enumerate_treatment_paths` call site, no methodology change. **Python-only API extension; no R equivalent** — R's `did_multiplegt_dyn(..., by_path=k)` only accepts a positive int (top-k) or `-1` (all paths). The `by_path` precondition gate at `chaisemartin_dhaultfoeuille.py:1118` (drop_larger_lower / L_max / `heterogeneity` / `design2` / `honest_did` / `survey_design` mutex) and the 11 `self.by_path is not None` activation branches in `fit()` were rerouted to fire under either selector. Validation + behavior + cross-feature regressions at `tests/test_chaisemartin_dhaultfoeuille.py::TestPathsOfInterest`. diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 51c4339a..5d05b991 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -458,9 +458,20 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): treatment (D in Z); path tuples become integer-state tuples like ``(0, 2, 2, 2)``. 
D values must be integer-valued (``D == round(D)``); a ``ValueError`` is raised at fit-time on - continuous D. Incompatible with ``heterogeneity``, ``design2``, - ``honest_did``, and ``survey_design`` (each combination raises - ``NotImplementedError`` in the current release). + continuous D. Compatible with ``survey_design`` for analytical + Binder TSL SE and replicate-weight bootstrap; per-path SE + routes through the cell-period allocator, with non-path + switcher-side contributions skipped (control contributions + remain unchanged, matching the joiners/leavers IF convention). + ``n_bootstrap > 0`` (multiplier bootstrap) under + ``survey_design`` is not yet supported and raises + ``NotImplementedError``. Top-k path ranking under + ``survey_design`` remains group-cardinality-based (unweighted), + not population-weight-based — survey weights do not affect + which paths are selected as "top-k". Incompatible with + ``heterogeneity``, ``design2``, and ``honest_did`` (each + combination raises ``NotImplementedError`` in the current + release). Mutually exclusive with ``paths_of_interest`` — use ``by_path=k`` for top-k automatic ranking by frequency, or @@ -622,8 +633,10 @@ class ChaisemartinDHaultfoeuille(ChaisemartinDHaultfoeuilleBootstrapMixin): Compatible with all downstream surfaces inherited by ``by_path``: bootstrap, per-path placebos, per-path joint sup-t bands, ``controls``, ``trends_linear``, - ``trends_nonparam``. Mechanical extension to path - enumeration; no methodology change. + ``trends_nonparam``, and ``survey_design`` (analytical Binder + TSL + replicate-weight; multiplier bootstrap under survey + remains gated, same as ``by_path=k``). Mechanical extension + to path enumeration; no methodology change. **Order semantics**: paths appear in ``results.path_effects`` in the user-specified order, modulo @@ -1230,12 +1243,17 @@ def fit( "(HonestDiD sensitivity analysis) is deferred to a " "future release." 
) - if survey_design is not None: + if survey_design is not None and self.n_bootstrap > 0: raise NotImplementedError( - "by_path / paths_of_interest combined with " - "survey_design is deferred to a future release: the " - "cell-period IF allocator under path subsets has not " - "been derived." + "by_path / paths_of_interest combined with both " + "survey_design and n_bootstrap>0 (multiplier " + "bootstrap) is not yet supported (the survey-aware " + "perturbation pivot for path-restricted IFs has " + "not been derived). Use n_bootstrap=0 for " + "analytical Binder TSL SE under survey_design, or " + "use replicate weights " + "(SurveyDesign(..., replicate_weights=...)) for " + "design-based bootstrap variance." ) # ------------------------------------------------------------------ @@ -2215,6 +2233,15 @@ def fit( eligible_mask_var = np.array( [g not in singleton_baseline_set for g in all_groups], dtype=bool ) + # Lift eligible_groups_var once for downstream by_path / + # paths_of_interest call sites; mirrors the inline + # `_elig_groups_l` construction at the global per-horizon + # path so both surfaces share the same variance-eligibility + # ordering. Used to align per-group IF entries with the + # cell-period allocator's `eligible_groups` argument. 
+ eligible_groups_var: List[Any] = [ + all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g] + ] multi_horizon_se = {} multi_horizon_inference = {} @@ -2351,6 +2378,9 @@ def fit( alpha=self.alpha, df_inference=_inference_df(_df_s_bp, resolved_survey), set_ids=set_ids_arr, + obs_survey_info=_obs_survey_info, + eligible_groups=eligible_groups_var, + replicate_n_valid_list=_replicate_n_valid_list, ) # NOTE: per-path cumulated layer is computed AFTER the # bootstrap propagation block below (search for @@ -2517,6 +2547,34 @@ def fit( alpha=self.alpha, df_inference=_inference_df(_df_s_bp_pl, resolved_survey), set_ids=set_ids_arr, + obs_survey_info=_obs_survey_info, + eligible_groups=eligible_groups_var, + replicate_n_valid_list=_replicate_n_valid_list, + ) + + # Post-per-path inference refresh under replicate-weight + # designs. Per-path effects/placebos snapshot df_inference + # BEFORE their own n_valid contributions append to the + # shared list, and the global per-horizon / placebo + # surfaces took their snapshots before per-path runs. After + # all per-path fits complete, recompute the final df and + # re-run safe_inference on every populated entry so all + # four surfaces (multi_horizon_inference, + # placebo_horizon_inference, path_effects, path_placebos) + # reflect the same df. No-op under TSL / non-survey fits. 
+ if ( + resolved_survey is not None + and getattr(resolved_survey, "uses_replicate_variance", False) + and (self.by_path is not None or self.paths_of_interest is not None) + ): + _df_s_final = _effective_df_survey(resolved_survey, _replicate_n_valid_list) + _refresh_path_inference( + path_effects=path_effects, + path_placebos=path_placebos, + multi_horizon_inference=multi_horizon_inference, + placebo_horizon_inference=placebo_horizon_inference, + alpha=self.alpha, + df_final=_inference_df(_df_s_final, resolved_survey), ) # Normalized effects DID^n_l (suppressed under trends_linear @@ -5788,6 +5846,9 @@ def _compute_path_effects( df_inference: Optional[int] = None, set_ids: Optional[np.ndarray] = None, paths_of_interest: Optional[List[Tuple[int, ...]]] = None, + obs_survey_info: Optional[Dict[str, Any]] = None, + eligible_groups: Optional[List[Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Optional[Dict[Tuple[int, ...], Dict[str, Any]]]: """ Compute per-path event-study effects using the joiners/leavers IF pattern. @@ -5906,13 +5967,13 @@ def _compute_path_effects( T_g=T_g, L_max=L_max, set_ids=set_ids, - compute_per_period=False, + compute_per_period=(obs_survey_info is not None), switcher_subset_mask=switcher_mask, ) horizons: Dict[int, Dict[str, Any]] = {} for l_h in range(1, L_max + 1): - U_l_path, _ = per_path_if[l_h] + U_l_path, U_pp_l_path = per_path_if[l_h] # N_l_path: path-restricted count of eligible switchers at # horizon l. Mirror _compute_multi_horizon_dids' eligibility @@ -5938,10 +5999,30 @@ def _compute_path_effects( # Point estimate: within-path mean DID effect_path = float(U_l_path.sum() / n_l_path) - # SE: cohort-recenter with ORIGINAL cohort structure, then - # plug-in with path-specific divisor (joiners/leavers pattern). + # SE: cohort-recenter with ORIGINAL cohort structure. Under + # survey, route through _survey_se_from_group_if (cell-period + # allocator). 
Otherwise plug-in with path-specific divisor + # (joiners/leavers pattern). U_centered_path = _cohort_recenter(U_l_path_elig, cohort_id_eligible) - se_path = _plugin_se(U_centered=U_centered_path, divisor=n_l_path) + if obs_survey_info is None: + se_path = _plugin_se(U_centered=U_centered_path, divisor=n_l_path) + else: + assert U_pp_l_path is not None + assert eligible_groups is not None + U_pp_l_path_elig = U_pp_l_path[eligible_mask_var] + U_centered_pp_path = _cohort_recenter_per_period( + U_pp_l_path_elig, cohort_id_eligible + ) + U_scaled = U_centered_path / n_l_path + U_pp_scaled = U_centered_pp_path / n_l_path + se_path, n_valid_replicates = _survey_se_from_group_if( + U_centered=U_scaled, + eligible_groups=eligible_groups, + obs_survey_info=obs_survey_info, + U_centered_per_period=U_pp_scaled, + ) + if n_valid_replicates is not None and replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_replicates) # Path-scoped degenerate-cohort warning. Mirrors the overall- # path surface (`Cohort-recentered analytical variance is @@ -6131,6 +6212,9 @@ def _compute_path_placebos( df_inference: Optional[int] = None, set_ids: Optional[np.ndarray] = None, paths_of_interest: Optional[List[Tuple[int, ...]]] = None, + obs_survey_info: Optional[Dict[str, Any]] = None, + eligible_groups: Optional[List[Any]] = None, + replicate_n_valid_list: Optional[List[int]] = None, ) -> Optional[Dict[Tuple[int, ...], Dict[int, Dict[str, Any]]]]: """ Compute per-path backward-horizon placebos ``DID^{pl}_{path, l}``. 
@@ -6221,13 +6305,13 @@ def _compute_path_placebos( T_g=T_g, L_max=L_max, set_ids=set_ids, - compute_per_period=False, + compute_per_period=(obs_survey_info is not None), switcher_subset_mask=switcher_mask, ) horizons: Dict[int, Dict[str, Any]] = {} for lag_l in range(1, L_max + 1): - U_pl_l_path, _ = per_path_pl_if[lag_l] + U_pl_l_path, U_pp_pl_l_path = per_path_pl_if[lag_l] pl_data = multi_horizon_placebos.get(lag_l) if pl_data is None: @@ -6254,7 +6338,25 @@ def _compute_path_placebos( effect_pl_path = float(U_pl_l_path.sum() / n_pl_l_path) U_centered_pl_path = _cohort_recenter(U_pl_l_path_elig, cohort_id_eligible) - se_pl_path = _plugin_se(U_centered=U_centered_pl_path, divisor=n_pl_l_path) + if obs_survey_info is None: + se_pl_path = _plugin_se(U_centered=U_centered_pl_path, divisor=n_pl_l_path) + else: + assert U_pp_pl_l_path is not None + assert eligible_groups is not None + U_pp_pl_l_path_elig = U_pp_pl_l_path[eligible_mask_var] + U_centered_pp_pl_path = _cohort_recenter_per_period( + U_pp_pl_l_path_elig, cohort_id_eligible + ) + U_pl_scaled = U_centered_pl_path / n_pl_l_path + U_pp_pl_scaled = U_centered_pp_pl_path / n_pl_l_path + se_pl_path, n_valid_pl_replicates = _survey_se_from_group_if( + U_centered=U_pl_scaled, + eligible_groups=eligible_groups, + obs_survey_info=obs_survey_info, + U_centered_per_period=U_pp_pl_scaled, + ) + if n_valid_pl_replicates is not None and replicate_n_valid_list is not None: + replicate_n_valid_list.append(n_valid_pl_replicates) if np.isnan(se_pl_path) and U_centered_pl_path.size > 0 and n_pl_l_path > 0: warnings.warn( @@ -6326,9 +6428,12 @@ def _collect_path_bootstrap_inputs( ``_bootstrap_one_target`` downstream. Returns a nested dict ``{path: {horizon: (U_centered, n, effect, None)}}``; - the 4th slot is always ``None`` because per-path survey-cell IFs - are a future wave item (the ``by_path + survey_design`` combination - is gated out before this helper runs). 
+ the 4th slot is always ``None`` because the multiplier-bootstrap + path under ``survey_design + by_path`` is gated out at fit-time + (``n_bootstrap > 0`` + ``survey_design`` + per-path selectors raises + ``NotImplementedError``); analytical and replicate-weight survey + SE under per-path selectors flows through ``_compute_path_effects`` + directly and does not reach this helper. ``_enumerate_treatment_paths`` is called again here (the analytical pass already called it inside ``_compute_path_effects``). The @@ -7577,6 +7682,61 @@ def _validate_cell_constant_strata_psu( ) +def _refresh_path_inference( + path_effects: Optional[Dict[Tuple[int, ...], Dict[str, Any]]], + path_placebos: Optional[Dict[Tuple[int, ...], Dict[int, Dict[str, Any]]]], + multi_horizon_inference: Optional[Dict[int, Dict[str, Any]]], + placebo_horizon_inference: Optional[Dict[int, Dict[str, Any]]], + alpha: float, + df_final: Optional[int], +) -> None: + """Refresh inference fields (t_stat, p_value, conf_int) with the + final ``df_final`` after per-path replicate fits have appended to + the shared ``_replicate_n_valid_list``. + + Under replicate-weight designs, every IF site contributes an + ``n_valid`` count and the effective ``df_survey`` is + ``min(...) - 1``. Per-path fits run AFTER the global per-horizon + and global placebo loops snapshot their df from + ``_replicate_n_valid_list``; per-path entries themselves use a + snapshot taken BEFORE per-path replicate appends. This helper + re-runs ``safe_inference(effect, se, alpha, df=df_final)`` on every + populated entry so all four surfaces reflect the final df. + + No-op under TSL (analytical) or non-survey fits — they skip + replicate-n_valid bookkeeping entirely. Mutates dicts in place. 
+ """ + from diff_diff.utils import safe_inference + + def _refresh_entry(entry: Dict[str, Any]) -> None: + eff = entry.get("effect") + se = entry.get("se") + if eff is None or se is None: + return + if not np.isfinite(se): + return + t_new, p_new, ci_new = safe_inference(eff, se, alpha=alpha, df=df_final) + entry["t_stat"] = t_new + entry["p_value"] = p_new + entry["conf_int"] = ci_new + + if multi_horizon_inference is not None: + for entry in multi_horizon_inference.values(): + _refresh_entry(entry) + if placebo_horizon_inference is not None: + for entry in placebo_horizon_inference.values(): + _refresh_entry(entry) + if path_effects is not None: + for path_data in path_effects.values(): + horizons = path_data.get("horizons", {}) + for entry in horizons.values(): + _refresh_entry(entry) + if path_placebos is not None: + for path_horizons in path_placebos.values(): + for entry in path_horizons.values(): + _refresh_entry(entry) + + def _inference_df( effective_df: Optional[int], resolved_survey: Any, diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index f81f7d57..6245f62b 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -242,8 +242,8 @@ ChaisemartinDHaultfoeuille( placebo: bool = True, # Auto-compute single-lag placebo twfe_diagnostic: bool = True, # Auto-compute Theorem 1 TWFE decomposition drop_larger_lower: bool = True, # Drop multi-switch groups (matches R DIDmultiplegtDYN) - by_path: int | None = None, # Top-k per-path event study; requires drop_larger_lower=False, L_max>=1; supports binary or integer-coded discrete D (D in Z); mutex with paths_of_interest - paths_of_interest: list[tuple[int, ...]] | None = None, # User-specified path subset, alternative to by_path=k (Python-only API; mutex with by_path) + by_path: int | None = None, # Top-k per-path event study; requires drop_larger_lower=False, L_max>=1; supports binary or integer-coded discrete D (D in Z); composes with survey_design 
(analytical TSL + replicate-weight; multiplier bootstrap n_bootstrap>0 still gated under survey); mutex with paths_of_interest + paths_of_interest: list[tuple[int, ...]] | None = None, # User-specified path subset, alternative to by_path=k (Python-only API; mutex with by_path; composes with survey_design same as by_path=k) rank_deficient_action: str = "warn", # Used by TWFE diagnostic OLS ) ``` diff --git a/docs/api/chaisemartin_dhaultfoeuille.rst b/docs/api/chaisemartin_dhaultfoeuille.rst index 3a240da7..611bd0e7 100644 --- a/docs/api/chaisemartin_dhaultfoeuille.rst +++ b/docs/api/chaisemartin_dhaultfoeuille.rst @@ -22,7 +22,11 @@ simultaneous bands when ``n_bootstrap > 0`` — Python-only extension beyond R, which provides no joint bands at any surface) or via ``paths_of_interest=[(...), ...]`` for an explicit user-specified path subset (Python-only API; mutex with ``by_path``). ``by_path`` -supports binary or integer-coded discrete (D in Z) treatment. +supports binary or integer-coded discrete (D in Z) treatment, and +composes with ``survey_design`` for analytical Binder TSL SE and +replicate-weight bootstrap variance (multiplier bootstrap under +survey + by_path remains gated; no R parity since R +``did_multiplegt_dyn`` does not support survey weighting). The estimator: diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 91c33f70..353d2fb0 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -640,7 +640,7 @@ The guard is fired by `_survey_se_from_group_if` (analytical and replicate) and - **Note (Phase 3 Design-2 switch-in/switch-out):** Convenience wrapper for Web Appendix Section 1.6 (Assumption 16). Identifies groups with exactly 2 treatment changes (join then leave), reports switch-in and switch-out mean effects. This is a descriptive summary, not a full re-estimation with specialized control pools as described in the paper. 
**Always uses raw (unadjusted) outcomes** regardless of active `controls`, `trends_linear`, or `trends_nonparam` options - those adjustments apply to the main estimator surface but not to the Design-2 descriptive block. For full adjusted Design-2 estimation with proper control pools, the paper recommends "running the command on a restricted subsample and using `trends_nonparam` for the entry-timing grouping." Activated via `design2=True` in `fit()`, requires `drop_larger_lower=False` to retain 2-switch groups. -- **Note (Phase 3 `by_path` per-path event-study disaggregation):** Per-path disaggregation of the multi-horizon event study, mirroring R `did_multiplegt_dyn(..., by_path=k)`. Activated via `ChaisemartinDHaultfoeuille(by_path=k, drop_larger_lower=False)` where `k` is a positive integer (top-k most common observed paths by switcher-group frequency). **Window convention:** the path tuple for a switcher group `g` is `(D_{g, F_g-1}, D_{g, F_g}, ..., D_{g, F_g-1+L_max})` — length `L_max + 1`, matching R's window `[F_{g-1}, F_{g-1+l}]`. **Ranking:** paths are ranked by descending frequency; ties are broken lexicographically on the path tuple for deterministic ordering, so every selected path has a unique `frequency_rank`. If `by_path` exceeds the number of observed paths, all observed paths are returned with a `UserWarning`. **Per-path SE convention (joiners/leavers precedent):** the per-path influence function follows the joiners-only / leavers-only IF construction at `chaisemartin_dhaultfoeuille.py:5495-5504`: the switcher-side contribution `+S_g * (Y_{g,out} - Y_{g,ref})` is zeroed for groups whose observed trajectory is NOT the selected path; control contributions and the full cohort structure `(D_{g,1}, F_g, S_g)` are unchanged. 
After applying the singleton-baseline eligible mask and cohort-recentering with the original cohort IDs, the plug-in SE uses the path-specific divisor `N_l_path` (count of path switchers eligible at horizon `l`) — same pattern as `joiners_se` using `joiner_total`. This gives the **within-path mean** estimand `DID_{path,l}` as the within-path average of `DID_{g,l}`. **Degenerate-cohort behavior per path:** when a path's centered IF at some horizon is identically zero (every variance-eligible path switcher forms its own `(D_{g,1}, F_g, S_g)` cohort, or the path has a single contributing group), SE / t_stat / p_value / conf_int are NaN-consistent and a `UserWarning` is emitted scoped to `(path, horizon)`. This mirrors the overall-path degenerate-cohort surface and is common for rare paths with few contributing groups. **Empty-state contract:** `results.path_effects` distinguishes "not requested" (`None`) from "requested but empty" (`{}` — all switchers have windows outside the panel or unobserved cells). The empty-dict case emits a `UserWarning` at fit-time and renders as an explicit "no observed paths" notice in `summary()`; `to_dataframe(level="by_path")` returns an empty DataFrame with the canonical column set (mirrors the `linear_trends` pattern when `trends_linear=True` but no horizons survive). **Requirements:** `drop_larger_lower=False` (multi-switch groups are the object of interest; default `True` filters them out) and `L_max >= 1` (path window depends on the horizon). **Scope:** combinations with `heterogeneity`, `design2`, `honest_did`, and `survey_design` remain gated behind explicit `NotImplementedError` (deferred to follow-up wave PRs). `n_bootstrap > 0` is now supported — see the **Bootstrap SE** paragraph below. `placebo=True` is now supported per-path — see the **Per-path placebos** paragraph below. **TWFE diagnostic** remains a sample-level summary (not computed per path) in this release. 
Results are exposed on `results.path_effects` as `Dict[Tuple[int, ...], Dict[str, Any]]` with nested `horizons` dicts per horizon `l`, and on `results.to_dataframe(level="by_path")` as a long-format table with columns `[path, frequency_rank, n_groups, horizon, effect, se, t_stat, p_value, conf_int_lower, conf_int_upper, n_obs, cband_lower, cband_upper, cumulated_effect, cumulated_se]` (the `cband_*` columns are added by the joint sup-t Note below, populated for positive-horizon rows of paths with a finite sup-t crit and NaN otherwise; the `cumulated_*` columns are added by the per-path linear-trends Note below, populated for positive-horizon rows when `trends_linear=True` is set and NaN otherwise). Gated tests live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathGates` / `::TestByPathBehavior` / `::TestByPathEdgeCases`. **R-parity** against `DIDmultiplegtDYN 2.3.3` is confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPath` via two scenarios: `mixed_single_switch_by_path` (2 paths, `by_path=2`) and `multi_path_reversible_by_path` (4 paths, `by_path=3`; path-assignment deterministic on `F_g` so each `(D_{g,1}, F_g, S_g)` cohort contains switchers from a single path). Per-path point estimates and per-path switcher counts match R exactly; per-path SE matches within the Phase 2 multi-horizon SE envelope (observed rtol ≤ 10.2% on the 2-path mixed scenario, ≤ 4.2% on the 4-path cohort-clean scenario). **Deviation from R (cross-path cohort-sharing SE):** our analytical SE is the marginal variance of the path-contribution estimator cohort-centered on the *full-panel* cohort structure (joiners/leavers precedent — non-path switchers contribute to cohort means via their zeroed switcher row). R's `did_multiplegt_dyn(..., by_path=k)` re-runs the estimator per path, so cohort means are computed over the path's own switchers only. 
When a cohort `(D_{g,1}, F_g, S_g)` spans multiple observed paths, Python and R SE diverge materially (our empirical probes with random post-window toggling saw rtol > 100%); when every cohort is single-path (scenario 13 by design, scenario 14 by construction), the two approaches coincide up to the documented Phase 2 envelope. Practitioners with cohort structures that mix paths should interpret the per-path SE as a within-full-panel marginal variance, not a per-path conditional variance. **Bootstrap SE:** when `n_bootstrap > 0` is set, the top-k paths are enumerated once on the observed data (R-faithful: matches `did_multiplegt_dyn(..., by_path=k, bootstrap=B)`'s path-stability convention — verified empirically against DIDmultiplegtDYN 2.3.3) and the multiplier bootstrap (`bootstrap_weights ∈ {"rademacher", "mammen", "webb"}`) runs per `(path, horizon)` target via the shared `_bootstrap_one_target` / `compute_effect_bootstrap_stats` helpers. Point estimates are unchanged from the analytical path. Bootstrap SE replaces the analytical SE in `path_effects[path]["horizons"][l]["se"]`, and `p_value` / `conf_int` are taken as the **bootstrap percentile** statistics, matching the Round-10 library convention for overall / joiners / leavers / multi-horizon bootstrap (see the `Note (bootstrap inference surface)` elsewhere in this file and the pinned regression `test_bootstrap_p_value_and_ci_propagated_to_top_level`). `t_stat` is SE-derived via `safe_inference` per the anti-pattern rule. Interpretation: inference is *conditional on the observed path set*. 
**SE inherits the analytical cross-path cohort-sharing deviation:** the bootstrap input is the exact same full-panel cohort-centered path IF that the analytical path computes (`_collect_path_bootstrap_inputs` reuses the same enumeration / cohort IDs / IF construction), so the bootstrap SE is a Monte Carlo analog of the analytical SE — it inherits the same cross-path cohort-sharing deviation from R's per-path re-run convention documented above. On single-path-cohort panels (scenarios 13 and 14 of the R-parity fixture, and any DGP where `(D_{g,1}, F_g, S_g)` cohorts never span multiple observed paths), bootstrap SE tracks analytical SE up to Monte Carlo noise and both coincide with R up to the Phase 2 envelope. On cross-path cohort panels, bootstrap SE inherits the >100% rtol divergence from R that analytical already has. **Deviation from R (CI method):** R's per-path CI is normal-theory around the bootstrap SE (half-width ≈ `1.96·se`); ours is the bootstrap percentile CI, intentionally diverging from R to keep the dCDH inference surface internally consistent across all bootstrap targets. Practitioners who want *unconditional* inference capturing path-selection uncertainty need a pairs-bootstrap (deferred — no R precedent). Positive regressions live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathBootstrap` (gated `@pytest.mark.slow`): point-estimate invariance, finite positive SE on non-degenerate panels, SE-within-30%-rtol of analytical on cohort-clean fixtures, degenerate-cohort NaN propagation, Rademacher/Mammen/Webb parity, seed reproducibility, and percentile-vs-normal-theory CI pinning. 
**Per-path placebos:** when `placebo=True` (and `L_max >= 1`) is combined with `by_path=k`, per-path backward-horizon placebos `DID^{pl}_{path, l}` for `l = 1..L_max` are computed using the same joiners/leavers IF precedent applied to `_compute_per_group_if_placebo_horizon` (with the new `switcher_subset_mask` parameter): switcher contributions are zeroed for groups not in the path; the control pool and the variance-eligible cohort structure `(D_{g,1}, F_g, S_g)` are unchanged. Plug-in SE uses the path-specific divisor `N^{pl}_{l, path}` (count of path switchers eligible at backward lag `l`). Surfaced on `results.path_placebo_event_study[path][-l]` with the same `{effect, se, t_stat, p_value, conf_int, n_obs}` shape as `placebo_event_study` (negative-int inner keys parallel the existing per-path event-study positive-int keys, so a unified forward+backward view is well-formed). **Inherits the cross-path cohort-sharing SE deviation from R** documented above for `path_effects` (same convention applied backward); tracks R within numerical tolerance on single-path-cohort panels and diverges on cohort-mixed panels. Multiplier bootstrap (when `n_bootstrap > 0`) runs per `(path, lag)` target via the same `_bootstrap_one_target` dispatch used for the per-path event-study, with the canonical NaN-on-invalid contract. The bootstrap SE is a Monte Carlo analog of the analytical placebo SE — same per-path centered IF input — and inherits the same deviation. Surfaced through `summary()` (negative-keyed rows rendered alongside positive-keyed event-study rows under each path block) and `to_dataframe(level="by_path")` (`horizon` column takes negative ints for placebo rows). **Empty-state contract:** `results.path_placebo_event_study` mirrors `path_effects` — `None` when `by_path + placebo` was not requested, `{}` when requested but no observed path has a complete window within the panel (same regime that returns `{}` for `path_effects`, with the same fit-time `UserWarning`). 
R-parity is confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathPlacebo` on the `multi_path_reversible_by_path_placebo` scenario; positive analytical + bootstrap invariants live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPlacebo` (with the gated `::TestByPathPlacebo::TestBootstrap` subclass). **Per-path covariate residualization (DID^X):** when `controls=[...]` is set with `by_path=k`, the per-baseline OLS residualization (Web Appendix Section 1.2) runs once on the first-differenced outcome BEFORE path enumeration. All four downstream surfaces — analytical per-path SE, bootstrap SE, per-path placebos, and per-path joint sup-t bands — consume the residualized `Y_mat` automatically (Frisch-Waugh-Lovell). Per-period effects remain unadjusted, consistent with the existing `controls` + per-period DID contract (per-period DID does not support residualization). Failed-stratum baselines (rank-deficient X) zero out `N_mat` for affected groups, which the path enumeration treats as ineligible per its existing convention. **Deviation from R on multi-baseline switcher panels (point estimates):** R `did_multiplegt_dyn(..., by_path, controls)` re-runs the per-baseline residualization on each path's restricted subsample (`R/R/did_multiplegt_dyn.R` lines 401-405: rows of the path's switchers OR rows where `yet_to_switch=1 AND baseline matches the path's baseline`). The first-stage residualization sample R uses for path B equals: pre-switch rows of all switchers with matching baseline + all rows of never-switchers with matching baseline — bit-identical to our global first-stage sample under single-baseline switcher panels (every switcher shares the same `D_{g,1}`, regardless of how `F_g` or path identity varies across switchers). 
Per-path point estimates therefore coincide with R on those panels up to the existing **DID^X first-stage cell-weighting deviation** documented above in `Note (Phase 3 DID^X covariate adjustment)` (Python's first-stage OLS uses equal cell weights — one observation per `(g, t)` cell, consistent with the library's cell-aggregated input convention; R weights by `N_gt`). On panels with one observation per `(g, t)` cell (the common case after the cell-aggregation step in `fit()`), Python matches R bit-exactly: the `multi_path_reversible_by_path_controls` parity fixture has 4 paths with switcher `F_g` values spanning [0..6] under `D_{g,1}=0` and Python matches R to rtol ~1e-11. On multi-baseline switcher panels (some switchers have `D_{g,1}=0`, others have `D_{g,1}=1`) R's per-path subset drops switchers whose baseline differs from the path's baseline, so the per-baseline regression coefficients diverge per path under R and point estimates can diverge between Python and R — a `UserWarning` is emitted at fit-time when this configuration is detected so practitioners do not silently consume estimates that disagree with R. The warning filters to switcher groups only; never-switchers (never-treated + always-treated controls) at multiple baseline values do NOT trigger the warning because they don't affect R's per-path subset construction. **Inherits the cross-path cohort-sharing SE deviation from R** documented above for `path_effects` — bootstrap SE, placebo SE, and sup-t crit are Monte Carlo / joint-distribution analogs of the same residualized analytical IF and carry the same deviation. 
R-parity is confirmed against `did_multiplegt_dyn(..., by_path=3, controls="X1")` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathControls` on the `multi_path_reversible_by_path_controls` scenario (single-baseline DGP, exact point-estimate match measured rtol ~1e-11); cross-surface inheritance and the multi-baseline warning are regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathControls` (analytical + bootstrap + placebo + sup-t + `to_dataframe(level="by_path")` cband columns + multi-baseline `UserWarning`). **Per-path linear-trends DID^{fd}:** when `trends_linear=True` is set with `by_path=k`, the first-differencing transform at `chaisemartin_dhaultfoeuille.py:1599-1630` runs once globally BEFORE path enumeration (replaces `Y_mat` with `Z_mat = Y_t - Y_{t-1}` and shrinks the time axis by one), so per-path raw second-differences `DID^{fd}_{path, l}` surface on `path_effects[path]["horizons"][l]` automatically. Per-path cumulated level effects `delta_{path, l} = sum_{l'=1..l} DID^{fd}_{path, l'}` (the quantity R returns under `did_multiplegt_dyn(..., by_path, trends_lin)` per the existing parity test pivot at `tests/test_chaisemartin_dhaultfoeuille_parity.py:403-409`) surface on the new `results.path_cumulated_event_study[path][l]` field — a per-group running sum of `DID^{fd}_{g, l'}` averaged over the path's switchers eligible at horizon `l`, mirroring the global `linear_trends_effects` cumulation logic at `chaisemartin_dhaultfoeuille.py:3340-3398`. SE on the cumulated layer is the conservative upper bound (sum of per-horizon component SEs from `path_effects[path]["horizons"][l]["se"]`, NaN-consistent: any non-finite component yields a NaN cumulated SE). **Post-bootstrap recomputation:** the cumulated layer is built AFTER the bootstrap propagation block at `chaisemartin_dhaultfoeuille.py:3034-3081` so it reads the FINAL post-bootstrap per-horizon SEs (mirrors the global `linear_trends_effects` placement). 
When `n_bootstrap > 0`, cumulated SE / t / p / CI are derived from bootstrap per-horizon SEs; when bootstrap produces non-finite SE (e.g., `n_bootstrap=1` degenerate distribution), the cumulated layer's full inference tuple is NaN per the library-wide NaN-on-invalid bootstrap contract. `to_dataframe(level="by_path")` exposes `cumulated_effect` and `cumulated_se` columns (always present, NaN-when-None — mirrors the `cband_*` always-present convention from PR #374). `summary()` renders a `Cumulated Level Effects (DID^{fd}, trends_linear)` sub-section under each per-path block. **Path enumeration uses the post-first-differenced `N_mat_fd`**: switchers with `F_g==2` fail the window-eligibility check and are dropped from path enumeration entirely (the existing global `F_g >= 3` warning at line 1620 surfaces the issue), so a path whose switchers all have `F_g < 3` is silently absent from `path_effects` rather than present-with-NaN. **F_g=3 boundary-case divergence (`by_path + trends_linear`):** `F_g=3` switchers have exactly 2 pre-switch periods, which after first-differencing and the `time==1` filter leaves only 1 valid pre-window Z value. R's per-path full-pipeline call handles this single-pre-period regime differently from Python's global-then-disaggregate architecture, producing 30%+ relative divergence on point estimates for paths whose switchers include `F_g=3` (empirically observed on the parity fixture's earlier `F_g=3` variant). A separate `UserWarning` fires at fit-time when the panel includes any `F_g=3` switcher AND `by_path + trends_linear` is set, mirroring the `F_g < 3` exclusion warning. The shipped parity fixture (`single_baseline_multi_path_by_path_trends_lin`) restricts to `F_g >= 4` exclusively to avoid this regime; per-path R parity is asserted only there. 
**Placebo under `trends_linear` returns RAW per-horizon values** (no per-path placebo cumulation surface) — verified empirically against the existing `joiners_only_trends_lin` parity fixture: R's per-path Placebo_l matches Python's `path_placebo_event_study[path][-l]` (raw) bit-exactly under non-`by_path` trends_lin. **Deviation from R on multi-baseline switcher panels (point estimates):** R `did_multiplegt_dyn(..., by_path, trends_lin)` re-runs the full pipeline (including first-differencing) on each path's restricted subsample, so it operates on different switcher samples per path when switchers have different baseline values `D_{g,1}`. Python first-differences once globally before path enumeration. On single-baseline switcher panels the two architectures coincide; on multi-baseline switcher panels per-path point estimates can diverge — a `UserWarning` is emitted at fit-time when this configuration is detected so practitioners do not silently consume estimates that disagree with R (mirroring the analogous `by_path + controls` warning). Per-path R parity is confirmed against `did_multiplegt_dyn(..., by_path=3, trends_lin=TRUE, placebo=1)` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathTrendsLinear` on the `single_baseline_multi_path_by_path_trends_lin` scenario (single-baseline + cohort-single-path + `F_g >= 4` DGP designed to eliminate the multi-baseline divergence, the cross-path cohort-sharing deviation, and the F_g=3 boundary case under R's per-path full-pipeline call). Per-path cumulated point estimates match R bit-exactly (rtol ~1e-9) on event horizons under those conditions; cumulated SE_RTOL is widened to `0.20` (vs `0.12` used for non-cumulated by_path parity) because the conservative upper-bound SE compounds the cross-path cohort-sharing deviation under summation. 
**Placebo parity is intentionally skipped for `trends_linear`**: R's per-path placebo computation re-runs on the path-restricted subsample with different control eligibility than Python's global-then-disaggregate architecture surfaces, producing a sign-and-magnitude divergence on paths whose switchers have minimal pre-window depth (e.g., `F_g=4` switchers). Placebo under `by_path + trends_linear` is exercised via internal regression in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathTrendsLinear` (finite values, bootstrap inheritance) but not pinned to R bit-by-bit. Cross-surface invariants (analytical + bootstrap + placebo + sup-t + `path_cumulated_event_study` + `to_dataframe` columns + `summary()` rendering) are regression-tested at `TestByPathTrendsLinear`. **Per-path state-set trends:** when `trends_nonparam="state_col"` is set with `by_path=k`, the set membership column is validated and stored once globally as `set_ids_arr` (time-invariance, NaN rejection, partition-coarseness checks unchanged from the non-by_path path). The `set_ids` parameter is threaded through the four per-path IF helpers (`_compute_path_effects`, `_compute_path_placebos`, `_collect_path_bootstrap_inputs`, `_collect_path_placebo_bootstrap_inputs`) so per-path analytical SE, bootstrap, placebos, and sup-t bands all consume the set-restricted control pool automatically. R does NOT first-difference and does NOT cumulate under `trends_nonparam` (unlike `trends_lin`); per-horizon `Effect_l` is a normal DID with set-restricted controls. Per-path R parity is confirmed against `did_multiplegt_dyn(..., by_path=3, trends_nonparam="state", placebo=1)` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathTrendsNonparam` on the `multi_path_reversible_by_path_trends_nonparam` scenario; per-path point estimates AND placebos match R bit-exactly (rtol ~1e-9), per-path SE matches within the Phase 2 envelope (~13% rtol observed). 
Cross-surface invariants are regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathTrendsNonparam`. **Per-path non-binary treatment:** integer-coded discrete treatment (D in Z, e.g. ordinal {0, 1, 2}) is supported under `by_path=k` and `paths_of_interest`. Path tuples become integer-state tuples (`(0, 2, 2, 2)`) keyed bit-for-bit against R's comma-separated path strings (`"0,2,2,2"`) for D in {0..9}. Continuous D (e.g. `1.5`) raises `ValueError` at fit-time per the no-silent-failures contract — the existing `int(round(float(v)))` cast in `_enumerate_treatment_paths` is now defensive (no-op for integer-coded D). **Deviation from R for D >= 10:** R's `did_multiplegt_by_path` derives the per-path baseline via `path_index$baseline_XX <- substr(path_index$path, 1, 1)` (extracted 2026-05-03 via `Rscript -e 'cat(paste(deparse(DIDmultiplegtDYN:::did_multiplegt_by_path), collapse="\n"))'`), capturing only the first character of the comma-separated path string. For D >= 10 this captures `"1"` instead of `"12"` for `path = "12,12,..."`, mis-allocating R's per-path control-pool subset. Python's tuple-key matching is correct in this regime; the per-path point estimates we compute are correct, R's per-path subset for the same path is buggy. The shipped parity scenario stays in `D in {0, 1, 2}` to avoid the R bug; R-parity for D in {0..9} is asserted at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathNonBinary` on the `multi_path_reversible_by_path_non_binary` scenario (78 switchers, 3 paths, single-baseline custom DGP, F_g >= 4) — per-path point estimates match R bit-exactly (rtol ~1e-9 events; rtol+atol envelope for placebo near-zero values), SE inherits the documented cross-path cohort-sharing deviation (~5% rtol observed; SE_RTOL=0.15 envelope). Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathNonBinary`. 
**Per-path user-specified path selection (`paths_of_interest`):** Python-only API extension — R's `did_multiplegt_dyn(..., by_path=k)` only accepts a positive int (top-k automatic ranking) or `-1` (all observed paths) and provides no list-based selection. Activated via `ChaisemartinDHaultfoeuille(paths_of_interest=[(0, 1, 1, 1), (0, 1, 0, 0)], drop_larger_lower=False)` as an alternative to `by_path=k`; the two are **mutually exclusive** (setting both raises `ValueError` at `__init__` and `set_params` time). Each path tuple must have length `L_max + 1`; the type / element / non-empty / length-uniformity checks fire at `__init__`, the length-vs-L_max check fires at fit-time. `bool` and `np.bool_` are explicitly rejected; `np.integer` is accepted and canonicalized to Python `int` for tuple-key consistency. Duplicates emit a `UserWarning` and are deduplicated; paths not observed in the panel emit a `UserWarning` and are omitted from `path_effects`. Paths appear in `results.path_effects` in the user-specified order, modulo deduplication and unobserved-path filtering. Composes with non-binary D and all downstream `by_path` surfaces (bootstrap, per-path placebos, per-path joint sup-t bands, `controls`, `trends_linear`, `trends_nonparam`) — mechanical filter on observed paths, no methodology change. Behavior + cross-feature regressions live at `tests/test_chaisemartin_dhaultfoeuille.py::TestPathsOfInterest`. +- **Note (Phase 3 `by_path` per-path event-study disaggregation):** Per-path disaggregation of the multi-horizon event study, mirroring R `did_multiplegt_dyn(..., by_path=k)`. Activated via `ChaisemartinDHaultfoeuille(by_path=k, drop_larger_lower=False)` where `k` is a positive integer (top-k most common observed paths by switcher-group frequency). **Window convention:** the path tuple for a switcher group `g` is `(D_{g, F_g-1}, D_{g, F_g}, ..., D_{g, F_g-1+L_max})` — length `L_max + 1`, matching R's window `[F_g - 1, F_g - 1 + l]` with `l = L_max`.
**Ranking:** paths are ranked by descending frequency; ties are broken lexicographically on the path tuple for deterministic ordering, so every selected path has a unique `frequency_rank`. If `by_path` exceeds the number of observed paths, all observed paths are returned with a `UserWarning`. **Per-path SE convention (joiners/leavers precedent):** the per-path influence function follows the joiners-only / leavers-only IF construction at `chaisemartin_dhaultfoeuille.py:5495-5504`: the switcher-side contribution `+S_g * (Y_{g,out} - Y_{g,ref})` is zeroed for groups whose observed trajectory is NOT the selected path; control contributions and the full cohort structure `(D_{g,1}, F_g, S_g)` are unchanged. After applying the singleton-baseline eligible mask and cohort-recentering with the original cohort IDs, the plug-in SE uses the path-specific divisor `N_l_path` (count of path switchers eligible at horizon `l`) — same pattern as `joiners_se` using `joiner_total`. This gives the **within-path mean** estimand `DID_{path,l}` as the within-path average of `DID_{g,l}`. **Degenerate-cohort behavior per path:** when a path's centered IF at some horizon is identically zero (every variance-eligible path switcher forms its own `(D_{g,1}, F_g, S_g)` cohort, or the path has a single contributing group), SE / t_stat / p_value / conf_int are NaN-consistent and a `UserWarning` is emitted scoped to `(path, horizon)`. This mirrors the overall-path degenerate-cohort surface and is common for rare paths with few contributing groups. **Empty-state contract:** `results.path_effects` distinguishes "not requested" (`None`) from "requested but empty" (`{}` — all switchers have windows outside the panel or unobserved cells). 
The empty-dict case emits a `UserWarning` at fit-time and renders as an explicit "no observed paths" notice in `summary()`; `to_dataframe(level="by_path")` returns an empty DataFrame with the canonical column set (mirrors the `linear_trends` pattern when `trends_linear=True` but no horizons survive). **Requirements:** `drop_larger_lower=False` (multi-switch groups are the object of interest; default `True` filters them out) and `L_max >= 1` (path window depends on the horizon). **Scope:** combinations with `heterogeneity`, `design2`, and `honest_did` remain gated behind explicit `NotImplementedError` (deferred to follow-up wave PRs). `n_bootstrap > 0` is now supported — see the **Bootstrap SE** paragraph below. `survey_design` is supported under analytical Binder TSL and replicate-weight bootstrap — see the **Per-path survey-design SE** paragraph below; multiplier bootstrap (`n_bootstrap > 0`) under `survey_design + by_path/paths_of_interest` remains gated. `placebo=True` is now supported per-path — see the **Per-path placebos** paragraph below. **TWFE diagnostic** remains a sample-level summary (not computed per path) in this release. Results are exposed on `results.path_effects` as `Dict[Tuple[int, ...], Dict[str, Any]]` with nested `horizons` dicts per horizon `l`, and on `results.to_dataframe(level="by_path")` as a long-format table with columns `[path, frequency_rank, n_groups, horizon, effect, se, t_stat, p_value, conf_int_lower, conf_int_upper, n_obs, cband_lower, cband_upper, cumulated_effect, cumulated_se]` (the `cband_*` columns are added by the joint sup-t Note below, populated for positive-horizon rows of paths with a finite sup-t crit and NaN otherwise; the `cumulated_*` columns are added by the per-path linear-trends Note below, populated for positive-horizon rows when `trends_linear=True` is set and NaN otherwise). Gated tests live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathGates` / `::TestByPathBehavior` / `::TestByPathEdgeCases`. 
**R-parity** against `DIDmultiplegtDYN 2.3.3` is confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPath` via two scenarios: `mixed_single_switch_by_path` (2 paths, `by_path=2`) and `multi_path_reversible_by_path` (4 paths, `by_path=3`; path-assignment deterministic on `F_g` so each `(D_{g,1}, F_g, S_g)` cohort contains switchers from a single path). Per-path point estimates and per-path switcher counts match R exactly; per-path SE matches within the Phase 2 multi-horizon SE envelope (observed rtol ≤ 10.2% on the 2-path mixed scenario, ≤ 4.2% on the 4-path cohort-clean scenario). **Deviation from R (cross-path cohort-sharing SE):** our analytical SE is the marginal variance of the path-contribution estimator cohort-centered on the *full-panel* cohort structure (joiners/leavers precedent — non-path switchers contribute to cohort means via their zeroed switcher row). R's `did_multiplegt_dyn(..., by_path=k)` re-runs the estimator per path, so cohort means are computed over the path's own switchers only. When a cohort `(D_{g,1}, F_g, S_g)` spans multiple observed paths, Python and R SE diverge materially (our empirical probes with random post-window toggling saw rtol > 100%); when every cohort is single-path (scenario 13 by design, scenario 14 by construction), the two approaches coincide up to the documented Phase 2 envelope. Practitioners with cohort structures that mix paths should interpret the per-path SE as a within-full-panel marginal variance, not a per-path conditional variance. 
**Bootstrap SE:** when `n_bootstrap > 0` is set, the top-k paths are enumerated once on the observed data (R-faithful: matches `did_multiplegt_dyn(..., by_path=k, bootstrap=B)`'s path-stability convention — verified empirically against DIDmultiplegtDYN 2.3.3) and the multiplier bootstrap (`bootstrap_weights ∈ {"rademacher", "mammen", "webb"}`) runs per `(path, horizon)` target via the shared `_bootstrap_one_target` / `compute_effect_bootstrap_stats` helpers. Point estimates are unchanged from the analytical path. Bootstrap SE replaces the analytical SE in `path_effects[path]["horizons"][l]["se"]`, and `p_value` / `conf_int` are taken as the **bootstrap percentile** statistics, matching the Round-10 library convention for overall / joiners / leavers / multi-horizon bootstrap (see the `Note (bootstrap inference surface)` elsewhere in this file and the pinned regression `test_bootstrap_p_value_and_ci_propagated_to_top_level`). `t_stat` is SE-derived via `safe_inference` per the anti-pattern rule. Interpretation: inference is *conditional on the observed path set*. **SE inherits the analytical cross-path cohort-sharing deviation:** the bootstrap input is the exact same full-panel cohort-centered path IF that the analytical path computes (`_collect_path_bootstrap_inputs` reuses the same enumeration / cohort IDs / IF construction), so the bootstrap SE is a Monte Carlo analog of the analytical SE — it inherits the same cross-path cohort-sharing deviation from R's per-path re-run convention documented above. On single-path-cohort panels (scenarios 13 and 14 of the R-parity fixture, and any DGP where `(D_{g,1}, F_g, S_g)` cohorts never span multiple observed paths), bootstrap SE tracks analytical SE up to Monte Carlo noise and both coincide with R up to the Phase 2 envelope. On cross-path cohort panels, bootstrap SE inherits the >100% rtol divergence from R that analytical already has. 
**Deviation from R (CI method):** R's per-path CI is normal-theory around the bootstrap SE (half-width ≈ `1.96·se`); ours is the bootstrap percentile CI, intentionally diverging from R to keep the dCDH inference surface internally consistent across all bootstrap targets. Practitioners who want *unconditional* inference capturing path-selection uncertainty need a pairs-bootstrap (deferred — no R precedent). Positive regressions live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathBootstrap` (gated `@pytest.mark.slow`): point-estimate invariance, finite positive SE on non-degenerate panels, SE-within-30%-rtol of analytical on cohort-clean fixtures, degenerate-cohort NaN propagation, Rademacher/Mammen/Webb parity, seed reproducibility, and percentile-vs-normal-theory CI pinning. **Per-path placebos:** when `placebo=True` (and `L_max >= 1`) is combined with `by_path=k`, per-path backward-horizon placebos `DID^{pl}_{path, l}` for `l = 1..L_max` are computed using the same joiners/leavers IF precedent applied to `_compute_per_group_if_placebo_horizon` (with the new `switcher_subset_mask` parameter): switcher contributions are zeroed for groups not in the path; the control pool and the variance-eligible cohort structure `(D_{g,1}, F_g, S_g)` are unchanged. Plug-in SE uses the path-specific divisor `N^{pl}_{l, path}` (count of path switchers eligible at backward lag `l`). Surfaced on `results.path_placebo_event_study[path][-l]` with the same `{effect, se, t_stat, p_value, conf_int, n_obs}` shape as `placebo_event_study` (negative-int inner keys parallel the existing per-path event-study positive-int keys, so a unified forward+backward view is well-formed). **Inherits the cross-path cohort-sharing SE deviation from R** documented above for `path_effects` (same convention applied backward); tracks R within numerical tolerance on single-path-cohort panels and diverges on cohort-mixed panels. 
Multiplier bootstrap (when `n_bootstrap > 0`) runs per `(path, lag)` target via the same `_bootstrap_one_target` dispatch used for the per-path event-study, with the canonical NaN-on-invalid contract. The bootstrap SE is a Monte Carlo analog of the analytical placebo SE — same per-path centered IF input — and inherits the same deviation. Surfaced through `summary()` (negative-keyed rows rendered alongside positive-keyed event-study rows under each path block) and `to_dataframe(level="by_path")` (`horizon` column takes negative ints for placebo rows). **Empty-state contract:** `results.path_placebo_event_study` mirrors `path_effects` — `None` when `by_path + placebo` was not requested, `{}` when requested but no observed path has a complete window within the panel (same regime that returns `{}` for `path_effects`, with the same fit-time `UserWarning`). R-parity is confirmed at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathPlacebo` on the `multi_path_reversible_by_path_placebo` scenario; positive analytical + bootstrap invariants live in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathPlacebo` (with the gated `::TestByPathPlacebo::TestBootstrap` subclass). **Per-path covariate residualization (DID^X):** when `controls=[...]` is set with `by_path=k`, the per-baseline OLS residualization (Web Appendix Section 1.2) runs once on the first-differenced outcome BEFORE path enumeration. All four downstream surfaces — analytical per-path SE, bootstrap SE, per-path placebos, and per-path joint sup-t bands — consume the residualized `Y_mat` automatically (Frisch-Waugh-Lovell). Per-period effects remain unadjusted, consistent with the existing `controls` + per-period DID contract (per-period DID does not support residualization). Failed-stratum baselines (rank-deficient X) zero out `N_mat` for affected groups, which the path enumeration treats as ineligible per its existing convention. 
**Deviation from R on multi-baseline switcher panels (point estimates):** R `did_multiplegt_dyn(..., by_path, controls)` re-runs the per-baseline residualization on each path's restricted subsample (`R/R/did_multiplegt_dyn.R` lines 401-405: rows of the path's switchers OR rows where `yet_to_switch=1 AND baseline matches the path's baseline`). The first-stage residualization sample R uses for path B equals: pre-switch rows of all switchers with matching baseline + all rows of never-switchers with matching baseline — bit-identical to our global first-stage sample under single-baseline switcher panels (every switcher shares the same `D_{g,1}`, regardless of how `F_g` or path identity varies across switchers). Per-path point estimates therefore coincide with R on those panels up to the existing **DID^X first-stage cell-weighting deviation** documented above in `Note (Phase 3 DID^X covariate adjustment)` (Python's first-stage OLS uses equal cell weights — one observation per `(g, t)` cell, consistent with the library's cell-aggregated input convention; R weights by `N_gt`). On panels with one observation per `(g, t)` cell (the common case after the cell-aggregation step in `fit()`), Python matches R bit-exactly: the `multi_path_reversible_by_path_controls` parity fixture has 4 paths with switcher `F_g` values spanning [0..6] under `D_{g,1}=0` and Python matches R to rtol ~1e-11. On multi-baseline switcher panels (some switchers have `D_{g,1}=0`, others have `D_{g,1}=1`) R's per-path subset drops switchers whose baseline differs from the path's baseline, so the per-baseline regression coefficients diverge per path under R and point estimates can diverge between Python and R — a `UserWarning` is emitted at fit-time when this configuration is detected so practitioners do not silently consume estimates that disagree with R. 
The warning filters to switcher groups only; never-switchers (never-treated + always-treated controls) at multiple baseline values do NOT trigger the warning because they don't affect R's per-path subset construction. **Inherits the cross-path cohort-sharing SE deviation from R** documented above for `path_effects` — bootstrap SE, placebo SE, and sup-t crit are Monte Carlo / joint-distribution analogs of the same residualized analytical IF and carry the same deviation. R-parity is confirmed against `did_multiplegt_dyn(..., by_path=3, controls="X1")` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathControls` on the `multi_path_reversible_by_path_controls` scenario (single-baseline DGP, exact point-estimate match measured rtol ~1e-11); cross-surface inheritance and the multi-baseline warning are regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathControls` (analytical + bootstrap + placebo + sup-t + `to_dataframe(level="by_path")` cband columns + multi-baseline `UserWarning`). **Per-path linear-trends DID^{fd}:** when `trends_linear=True` is set with `by_path=k`, the first-differencing transform at `chaisemartin_dhaultfoeuille.py:1599-1630` runs once globally BEFORE path enumeration (replaces `Y_mat` with `Z_mat = Y_t - Y_{t-1}` and shrinks the time axis by one), so per-path raw second-differences `DID^{fd}_{path, l}` surface on `path_effects[path]["horizons"][l]` automatically. 
Per-path cumulated level effects `delta_{path, l} = sum_{l'=1..l} DID^{fd}_{path, l'}` (the quantity R returns under `did_multiplegt_dyn(..., by_path, trends_lin)` per the existing parity test pivot at `tests/test_chaisemartin_dhaultfoeuille_parity.py:403-409`) surface on the new `results.path_cumulated_event_study[path][l]` field — a per-group running sum of `DID^{fd}_{g, l'}` averaged over the path's switchers eligible at horizon `l`, mirroring the global `linear_trends_effects` cumulation logic at `chaisemartin_dhaultfoeuille.py:3340-3398`. SE on the cumulated layer is the conservative upper bound (sum of per-horizon component SEs from `path_effects[path]["horizons"][l]["se"]`, NaN-consistent: any non-finite component yields a NaN cumulated SE). **Post-bootstrap recomputation:** the cumulated layer is built AFTER the bootstrap propagation block at `chaisemartin_dhaultfoeuille.py:3034-3081` so it reads the FINAL post-bootstrap per-horizon SEs (mirrors the global `linear_trends_effects` placement). When `n_bootstrap > 0`, cumulated SE / t / p / CI are derived from bootstrap per-horizon SEs; when bootstrap produces non-finite SE (e.g., `n_bootstrap=1` degenerate distribution), the cumulated layer's full inference tuple is NaN per the library-wide NaN-on-invalid bootstrap contract. `to_dataframe(level="by_path")` exposes `cumulated_effect` and `cumulated_se` columns (always present, NaN-when-None — mirrors the `cband_*` always-present convention from PR #374). `summary()` renders a `Cumulated Level Effects (DID^{fd}, trends_linear)` sub-section under each per-path block. **Path enumeration uses the post-first-differenced `N_mat_fd`**: switchers with `F_g==2` fail the window-eligibility check and are dropped from path enumeration entirely (the existing global `F_g >= 3` warning at line 1620 surfaces the issue), so a path whose switchers all have `F_g < 3` is silently absent from `path_effects` rather than present-with-NaN. 
**F_g=3 boundary-case divergence (`by_path + trends_linear`):** `F_g=3` switchers have exactly 2 pre-switch periods, which after first-differencing and the `time==1` filter leaves only 1 valid pre-window Z value. R's per-path full-pipeline call handles this single-pre-period regime differently from Python's global-then-disaggregate architecture, producing 30%+ relative divergence on point estimates for paths whose switchers include `F_g=3` (empirically observed on the parity fixture's earlier `F_g=3` variant). A separate `UserWarning` fires at fit-time when the panel includes any `F_g=3` switcher AND `by_path + trends_linear` is set, mirroring the `F_g < 3` exclusion warning. The shipped parity fixture (`single_baseline_multi_path_by_path_trends_lin`) restricts to `F_g >= 4` exclusively to avoid this regime; per-path R parity is asserted only there. **Placebo under `trends_linear` returns RAW per-horizon values** (no per-path placebo cumulation surface) — verified empirically against the existing `joiners_only_trends_lin` parity fixture: R's per-path Placebo_l matches Python's `path_placebo_event_study[path][-l]` (raw) bit-exactly under non-`by_path` trends_lin. **Deviation from R on multi-baseline switcher panels (point estimates):** R `did_multiplegt_dyn(..., by_path, trends_lin)` re-runs the full pipeline (including first-differencing) on each path's restricted subsample, so it operates on different switcher samples per path when switchers have different baseline values `D_{g,1}`. Python first-differences once globally before path enumeration. On single-baseline switcher panels the two architectures coincide; on multi-baseline switcher panels per-path point estimates can diverge — a `UserWarning` is emitted at fit-time when this configuration is detected so practitioners do not silently consume estimates that disagree with R (mirroring the analogous `by_path + controls` warning). 
Per-path R parity is confirmed against `did_multiplegt_dyn(..., by_path=3, trends_lin=TRUE, placebo=1)` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathTrendsLinear` on the `single_baseline_multi_path_by_path_trends_lin` scenario (single-baseline + cohort-single-path + `F_g >= 4` DGP designed to eliminate the multi-baseline divergence, the cross-path cohort-sharing deviation, and the F_g=3 boundary case under R's per-path full-pipeline call). Per-path cumulated point estimates match R bit-exactly (rtol ~1e-9) on event horizons under those conditions; cumulated SE_RTOL is widened to `0.20` (vs `0.12` used for non-cumulated by_path parity) because the conservative upper-bound SE compounds the cross-path cohort-sharing deviation under summation. **Placebo parity is intentionally skipped for `trends_linear`**: R's per-path placebo computation re-runs on the path-restricted subsample with different control eligibility than Python's global-then-disaggregate architecture surfaces, producing a sign-and-magnitude divergence on paths whose switchers have minimal pre-window depth (e.g., `F_g=4` switchers). Placebo under `by_path + trends_linear` is exercised via internal regression in `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathTrendsLinear` (finite values, bootstrap inheritance) but not pinned to R bit-by-bit. Cross-surface invariants (analytical + bootstrap + placebo + sup-t + `path_cumulated_event_study` + `to_dataframe` columns + `summary()` rendering) are regression-tested at `TestByPathTrendsLinear`. **Per-path state-set trends:** when `trends_nonparam="state_col"` is set with `by_path=k`, the set membership column is validated and stored once globally as `set_ids_arr` (time-invariance, NaN rejection, partition-coarseness checks unchanged from the non-by_path path). 
The `set_ids` parameter is threaded through the four per-path IF helpers (`_compute_path_effects`, `_compute_path_placebos`, `_collect_path_bootstrap_inputs`, `_collect_path_placebo_bootstrap_inputs`) so per-path analytical SE, bootstrap, placebos, and sup-t bands all consume the set-restricted control pool automatically. R does NOT first-difference and does NOT cumulate under `trends_nonparam` (unlike `trends_lin`); per-horizon `Effect_l` is a normal DID with set-restricted controls. Per-path R parity is confirmed against `did_multiplegt_dyn(..., by_path=3, trends_nonparam="state", placebo=1)` at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathTrendsNonparam` on the `multi_path_reversible_by_path_trends_nonparam` scenario; per-path point estimates AND placebos match R bit-exactly (rtol ~1e-9), per-path SE matches within the Phase 2 envelope (~13% rtol observed). Cross-surface invariants are regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathTrendsNonparam`. **Per-path non-binary treatment:** integer-coded discrete treatment (D in Z, e.g. ordinal {0, 1, 2}) is supported under `by_path=k` and `paths_of_interest`. Path tuples become integer-state tuples (`(0, 2, 2, 2)`) keyed bit-for-bit against R's comma-separated path strings (`"0,2,2,2"`) for D in {0..9}. Continuous D (e.g. `1.5`) raises `ValueError` at fit-time per the no-silent-failures contract — the existing `int(round(float(v)))` cast in `_enumerate_treatment_paths` is now defensive (no-op for integer-coded D). **Deviation from R for D >= 10:** R's `did_multiplegt_by_path` derives the per-path baseline via `path_index$baseline_XX <- substr(path_index$path, 1, 1)` (extracted 2026-05-03 via `Rscript -e 'cat(paste(deparse(DIDmultiplegtDYN:::did_multiplegt_by_path), collapse="\n"))'`), capturing only the first character of the comma-separated path string. 
For D >= 10 this captures `"1"` instead of `"12"` for `path = "12,12,..."`, mis-allocating R's per-path control-pool subset. Python's tuple-key matching is correct in this regime; the per-path point estimates we compute are correct, R's per-path subset for the same path is buggy. The shipped parity scenario stays in `D in {0, 1, 2}` to avoid the R bug; R-parity for D in {0..9} is asserted at `tests/test_chaisemartin_dhaultfoeuille_parity.py::TestDCDHDynRParityByPathNonBinary` on the `multi_path_reversible_by_path_non_binary` scenario (78 switchers, 3 paths, single-baseline custom DGP, F_g >= 4) — per-path point estimates match R bit-exactly (rtol ~1e-9 events; rtol+atol envelope for placebo near-zero values), SE inherits the documented cross-path cohort-sharing deviation (~5% rtol observed; SE_RTOL=0.15 envelope). Cross-surface invariants regression-tested at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathNonBinary`. **Per-path survey-design SE** (analytical Binder TSL + replicate-weight bootstrap): under `by_path` / `paths_of_interest` + `survey_design`, the per-path per-horizon SE routes through `_survey_se_from_group_if` using the cell-period allocator. The per-path influence function `U_pp_l_path` is the per-period IF with non-path switcher-side contributions skipped — control contributions remain unchanged, matching the joiners/leavers IF convention from the **Per-path SE convention** paragraph above (the `switcher_subset_mask` zeroes the switcher row of the per-group IF, which trivially zeroes the corresponding row of the per-cell IF, preserving the row-sum identity `U_pp.sum(axis=1) == U`). The IF is cohort-recentered via `_cohort_recenter_per_period` and expanded to observations as `psi_i = U_pp[g_i, t_i] · (w_i / W_{g_i, t_i})`. Replicate-weight designs unconditionally route through the cell allocator (Class A contract, PR #323). 
Multiplier bootstrap (`n_bootstrap > 0`) under `survey_design + by_path/paths_of_interest` raises `NotImplementedError` at fit-time — the survey-aware perturbation pivot for path-restricted IFs is methodologically underived and deferred to a future wave; the global non-by_path TSL multiplier bootstrap is unaffected and continues to ship. **Path-enumeration ranking is unweighted** under `survey_design`: top-k selection uses group cardinality (`path_to_count[p]` = number of groups), not population-weight mass — survey weights do not affect which paths are selected as "top-k". A weighted-ranking variant (sum of survey weights per path) is deferred until concrete demand. **`df_survey` propagation:** under replicate weights, every per-path per-horizon fit contributes an `n_valid` count to the shared `_replicate_n_valid_list` accumulator and the final `_effective_df_survey = min(...) - 1` reflects all per-path replicate fits. A post-call `_refresh_path_inference` helper re-runs `safe_inference` on every populated entry so `multi_horizon_inference`, `placebo_horizon_inference`, `path_effects`, and `path_placebos` all use the same final df after per-path appends complete. **Lonely-PSU policy is sample-wide, not per-path** — the `lonely_psu` policy (`remove`/`certainty`/`adjust`) operates on the full design-level PSU/strata structure, not on path-restricted subsamples. **Telescope invariant:** on a single-path panel where every switcher follows the same trajectory and `eligible_groups` matches between by_path and non-by_path, per-path SE equals the global non-by_path survey SE bit-exactly — pinned at `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathSurveyDesignTelescope::test_telescope_analytical_TSL`. **Deviation from R:** none — R `did_multiplegt_dyn` does not support survey weighting, so this is a Python-only methodology extension (no R parity available; no R parity test class). 
Regression test anchor: `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathSurveyDesignAnalytical` covering analytical SE, replicate-weight SE, the `n_bootstrap` gate, the global anti-regression, per-path placebos, `trends_linear` composition, and unobserved-path warnings under survey. **Per-path user-specified path selection (`paths_of_interest`):** Python-only API extension — R's `did_multiplegt_dyn(..., by_path=k)` only accepts a positive int (top-k automatic ranking) or `-1` (all observed paths) and provides no list-based selection. Activated via `ChaisemartinDHaultfoeuille(paths_of_interest=[(0, 1, 1, 1), (0, 1, 0, 0)], drop_larger_lower=False)` as an alternative to `by_path=k`; the two are **mutually exclusive** (setting both raises `ValueError` at `__init__` and `set_params` time). Each path tuple must have length `L_max + 1`; the type / element / non-empty / length-uniformity checks fire at `__init__`, the length-vs-L_max check fires at fit-time. `bool` and `np.bool_` are explicitly rejected; `np.integer` is accepted and canonicalized to Python `int` for tuple-key consistency. Duplicates emit a `UserWarning` and are deduplicated; paths not observed in the panel emit a `UserWarning` and are omitted from `path_effects`. Paths appear in `results.path_effects` in the user-specified order, modulo deduplication and unobserved-path filtering. Composes with non-binary D and all downstream `by_path` surfaces (bootstrap, per-path placebos, per-path joint sup-t bands, `controls`, `trends_linear`, `trends_nonparam`) — mechanical filter on observed paths, no methodology change. Behavior + cross-feature regressions live at `tests/test_chaisemartin_dhaultfoeuille.py::TestPathsOfInterest`. - **Note (Phase 3 `by_path` per-path joint sup-t bands):** When `n_bootstrap > 0` is set with `by_path=k`, per-path joint sup-t simultaneous confidence bands are computed across horizons `1..L_max` within each path. 
**Methodology:** a single `(n_bootstrap, n_eligible)` multiplier weight matrix (using the estimator's configured `bootstrap_weights` — Rademacher / Mammen / Webb) is drawn per path and broadcast across all horizons of that path, producing correlated bootstrap distributions across horizons within the path. The path-specific critical value `c_p = quantile(max_l |t_l|, 1 - α)` is then used to construct symmetric joint bands `effect_l ± c_p · se_l` per horizon, surfaced in `path_effects[path]["horizons"][l]["cband_conf_int"]` and at top-level `results.path_sup_t_bands[path] = {"crit_value", "alpha", "n_bootstrap", "method", "n_valid_horizons"}`. **Gates:** a path must have `>= 2` valid horizons (finite bootstrap SE > 0) AND a strict majority (more than 50%) of finite sup-t draws to receive a band; otherwise the path is absent from `path_sup_t_bands`. Both gates mirror the OVERALL `event_study_sup_t_bands` semantics at `chaisemartin_dhaultfoeuille_bootstrap.py:605,612`: `len(valid_horizons) >= 2` AND `finite_mask.sum() > 0.5 * n_bootstrap`. Exactly half-finite draws are NOT enough — the gate is strictly greater than half. **Empty-state contract:** `path_sup_t_bands is None` when not requested (no bootstrap, or both `by_path` and `paths_of_interest` are `None`); `{}` when requested but no path passes both gates. **`to_dataframe(level="by_path")` integration:** the table now includes `cband_lower` / `cband_upper` columns for parity with OVERALL `level="event_study"`; populated for positive-horizon rows of paths with a finite sup-t crit, NaN for placebo rows / unbanded paths / the requested-but-empty fallback DataFrame. **Methodology asymmetry vs OVERALL:** OVERALL sup-t reuses the same multi-horizon shared-draw distribution for both the SE in the t-stat denominator and the bootstrap distribution in the numerator. 
The per-path sup-t draws a fresh shared weight matrix per path AFTER the per-path SE bootstrap block has already populated `results.path_ses` via independent per-(path, horizon) draws — numerator: fresh shared draws, denominator: bootstrap SEs from the earlier independent draws. Asymptotically equivalent to OVERALL's self-consistent reuse, but NOT bit-identical. The fresh draw is intentional: it preserves RNG-state isolation and keeps every existing per-path SE seed-reproducibility test bit-stable post-implementation. **Inherited deviation from R:** the bootstrap SE used as the t-stat denominator carries the cross-path cohort-sharing SE deviation from R documented for `path_effects` above; the per-path sup-t crit therefore inherits the same deviation. **Interpretation:** the band covers joint inference *within a single path across horizons*; it does NOT provide simultaneous coverage *across paths* (a different inference target requiring a `path × horizon` re-derivation, deferred to a future wave). **Deviation from R:** `did_multiplegt_dyn` provides no joint / sup-t / simultaneous bands at any surface — this is a Python-only methodology extension, consistent with the existing OVERALL `event_study_sup_t_bands` (also Python-only). Regression test anchor: `tests/test_chaisemartin_dhaultfoeuille.py::TestByPathSupTBands`. 
@@ -670,7 +670,7 @@ The guard is fired by `_survey_se_from_group_if` (analytical and replicate) and - [x] State-set-specific trends via control-pool restriction (Web Appendix Section 1.4) - [x] Heterogeneity testing via saturated OLS (Web Appendix Section 1.5, Lemma 7) - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) -- [x] `by_path` per-path event-study disaggregation (binary or integer-coded discrete treatment, joiners/leavers IF precedent; mirrors R `did_multiplegt_dyn(..., by_path=k)`); plus `paths_of_interest=[(...), ...]` for user-specified path subsets (Python-only API; mutex with `by_path`) +- [x] `by_path` per-path event-study disaggregation (binary or integer-coded discrete treatment, joiners/leavers IF precedent; mirrors R `did_multiplegt_dyn(..., by_path=k)`); plus `paths_of_interest=[(...), ...]` for user-specified path subsets (Python-only API; mutex with `by_path`); composes with `survey_design` for analytical Binder TSL and replicate-weight bootstrap SE (multiplier-bootstrap path under survey gated, deferred) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. - **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). 
The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." 
Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=group` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=` argument or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=group` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. 
**Post-period attribution extends to heterogeneity (Binder TSL branch only):** the heterogeneity WLS coefficient IF `ψ_g = inv(X'WX)[1,:] @ x_g * W_g * r_g` is attributed in full to the single post-period cell `(g, out_idx)` at each horizon (same single-cell convention as DID_l), then expanded as `ψ_i = ψ_g * (w_i / W_{g, out_idx})`, and fed through `compute_survey_if_variance`. Under PSU=group the PSU-level aggregate telescopes to `ψ_g`, so Binder variance is byte-identical relative to the pre-cell-period release; under within-group-varying PSU, mass lands in the post-period PSU. **Replicate-weight branch keeps the legacy group-level allocator** `ψ_i = ψ_g * (w_i / W_g)` because `compute_replicate_if_variance` computes `θ_r = sum_i ratio_ir * ψ_i` at observation level and is therefore not PSU-telescoping: redistributing mass onto the post-period cell would silently change the replicate SE whenever a replicate column's ratios vary within a group (the library accepts arbitrary per-row replicate matrices, not just PSU-aligned ones). The legacy allocator preserves byte-identity of the replicate SE for every previously-supported fit. Replicate + within-group-varying PSU is unreachable by construction (`SurveyDesign` rejects `replicate_weights` combined with explicit `strata/psu/fpc`). 
diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 086aaa79..06a50478 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -9112,3 +9112,494 @@ def test_per_path_placebos_with_paths_of_interest_present(self): ) assert res.path_placebo_event_study is not None assert len(res.path_placebo_event_study) == 2 + + +# ============================================================================= +# by_path / paths_of_interest + survey_design (Wave 4 #10) +# ============================================================================= + + +def _by_path_survey_data(seed: int = 44) -> pd.DataFrame: + """Panel for `by_path` + `survey_design` tests. + + Three paths, single baseline D=0, all switchers have F_g=4 with + L_max+1=4 window fully inside an 8-period panel (so per-path / + global telescope holds at every horizon). 30 switchers split across + 3 paths + 30 never-treated controls. Strata are within-group- + constant (4 strata cycling); PSU = group (one PSU per group, no + within-group variation). 
+ """ + rng = np.random.default_rng(seed) + n_periods = 8 + rows: list = [] + paths = [(0, 1, 1, 1), (0, 1, 0, 0), (0, 1, 1, 0)] + for g in range(30): + F_g = 4 + path = paths[g % 3] + stratum = g % 4 + weight = 1.0 + 0.1 * (g % 5) + for t in range(n_periods): + if F_g - 1 <= t < F_g - 1 + len(path): + d = path[t - (F_g - 1)] + else: + d = 0 + y = 0.5 * d + rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": d, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + for g in range(30, 60): + stratum = (g - 30) % 4 + weight = 1.0 + 0.1 * ((g - 30) % 5) + for t in range(n_periods): + y = rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": 0, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + return pd.DataFrame(rows) + + +def _by_path_survey_data_single_path(seed: int = 44) -> pd.DataFrame: + """Single-path variant of `_by_path_survey_data` for telescope tests. + + All 30 switchers follow the same path `(0, 1, 1, 1)` with F_g=4 + in a 7-period panel — last path cell at ``t = F_g - 1 + L_max = 6`` + coincides with the panel end, so treatment doesn't switch back to + 0 (no multi-switch trigger under default ``drop_larger_lower=True``). + Per-path SE on the lone path equals the global non-by_path SE. 
+ """ + rng = np.random.default_rng(seed) + n_periods = 7 + rows: list = [] + path = (0, 1, 1, 1) + for g in range(30): + F_g = 4 + stratum = g % 4 + weight = 1.0 + 0.1 * (g % 5) + for t in range(n_periods): + if F_g - 1 <= t < F_g - 1 + len(path): + d = path[t - (F_g - 1)] + else: + d = 0 + y = 0.5 * d + rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": d, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + for g in range(30, 60): + stratum = (g - 30) % 4 + weight = 1.0 + 0.1 * ((g - 30) % 5) + for t in range(n_periods): + y = rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": 0, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + return pd.DataFrame(rows) + + +class TestByPathSurveyDesignAnalytical: + """`by_path` / `paths_of_interest` compose with `survey_design`. + + Analytical Binder TSL routes per-path SE through + ``_survey_se_from_group_if`` using the cell-period allocator with + non-path switcher contributions zeroed at both group and cell + levels. Multiplier-bootstrap (`n_bootstrap > 0`) under survey + + by_path remains gated. 
+ """ + + # ----- Gate + dispatch ----- + + def test_no_longer_raises_on_survey(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_effects is not None + assert len(res.path_effects) >= 1 + + def test_paths_of_interest_with_survey_no_longer_raises(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille( + paths_of_interest=[(0, 1, 1, 1), (0, 1, 0, 0)], + drop_larger_lower=False, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_effects is not None + assert (0, 1, 1, 1) in res.path_effects + assert (0, 1, 0, 0) in res.path_effects + + def test_survey_design_plus_n_bootstrap_raises(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille( + by_path=2, n_bootstrap=50, seed=42, drop_larger_lower=False + ) + with pytest.raises(NotImplementedError, match="n_bootstrap.*multiplier"): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + + def test_survey_design_plus_paths_of_interest_plus_n_bootstrap_raises(self): + from diff_diff.survey import SurveyDesign + + df = 
_by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille( + paths_of_interest=[(0, 1, 1, 1)], + n_bootstrap=50, + seed=42, + drop_larger_lower=False, + ) + with pytest.raises(NotImplementedError, match="paths_of_interest"): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + + def test_global_survey_plus_n_bootstrap_still_works(self): + """Anti-regression: the new gate is per-path-only. + + Locks the per-path-only scope of the multiplier-bootstrap gate + added in this PR. Global TSL + n_bootstrap is supported and + regression-tested in tests/test_survey_dcdh.py — confirm the + new gate doesn't accidentally fire on the no-by_path path. + + Uses ``_by_path_survey_data_single_path`` because the multi- + path fixture's reversible paths get filtered by the default + ``drop_larger_lower=True`` policy. 
+ """ + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data_single_path() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=42) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert np.isfinite(res.overall_se) + assert res.path_effects is None + + # ----- Analytical SE correctness ----- + + def test_per_path_analytical_se_finite_under_survey(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille(by_path=3, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_effects is not None + for path, entry in res.path_effects.items(): + for l_h, vals in entry["horizons"].items(): + if vals["n_obs"] > 0: + assert np.isfinite(vals["effect"]), f"{path} l={l_h} effect non-finite" + assert np.isfinite(vals["se"]), f"{path} l={l_h} se non-finite" + + def test_per_path_se_telescope_to_global_on_single_path(self): + """Single-path panel: per-path SE == global SE (telescope). + + Preconditions baked into ``_by_path_survey_data_single_path``: + (a) all switchers follow exactly one path, + (b) all switchers have F_g=4 (full L_max=3 window), + (c) >=3 cohorts represented (cohort recentering non-degenerate), + (d) >=2 strata, >=1 PSU per group (lonely-PSU not triggered), + (e) survey weights non-constant (so test isn't a no-op telescope). 
+ """ + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data_single_path() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est_g = ChaisemartinDHaultfoeuille(drop_larger_lower=False) + est_p = ChaisemartinDHaultfoeuille(by_path=1, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res_g = est_g.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + res_p = est_p.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res_p.path_effects is not None + assert len(res_p.path_effects) == 1 + path = next(iter(res_p.path_effects.keys())) + for l_h in range(1, 4): + assert res_p.path_effects[path]["horizons"][l_h]["n_obs"] > 0 + np.testing.assert_allclose( + res_p.path_effects[path]["horizons"][l_h]["effect"], + res_g.event_study_effects[l_h]["effect"], + atol=1e-12, + err_msg=f"l={l_h} effect mismatch", + ) + np.testing.assert_allclose( + res_p.path_effects[path]["horizons"][l_h]["se"], + res_g.event_study_effects[l_h]["se"], + atol=1e-12, + err_msg=f"l={l_h} se mismatch", + ) + + def test_per_path_se_within_envelope_of_unweighted(self): + """Constant weights + single PSU per group: survey SE matches plug-in SE.""" + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + df["survey_weights"] = 1.0 + df["strata"] = 0 # single stratum + # PSU = group already + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est_p = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + est_p_no_survey = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res_survey = est_p.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + res_plain = 
est_p_no_survey.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, + ) + assert res_survey.path_effects is not None and res_plain.path_effects is not None + # Effects must match exactly (point estimate is design-agnostic). + for path in res_survey.path_effects: + if path not in res_plain.path_effects: + continue + for l_h in range(1, 4): + if res_survey.path_effects[path]["horizons"][l_h]["n_obs"] == 0: + continue + np.testing.assert_allclose( + res_survey.path_effects[path]["horizons"][l_h]["effect"], + res_plain.path_effects[path]["horizons"][l_h]["effect"], + atol=1e-12, + ) + + # ----- Replicate-weight SE correctness (slow) ----- + + @pytest.mark.slow + def test_per_path_replicate_se_finite(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + n_obs = len(df) + rng = np.random.default_rng(0) + # Synthetic replicate weights (declared as JK1 below): 20 columns, + # each the base survey weight times an independent 1 + 0.05*N(0, 1) draw.
+ rep_cols = [f"rep_{i}" for i in range(20)] + for i, col in enumerate(rep_cols): + df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs)) + sd = SurveyDesign( + weights="survey_weights", + replicate_weights=rep_cols, + replicate_method="JK1", + replicate_scale=1.0, + ) + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_effects is not None + any_finite = False + for path, entry in res.path_effects.items(): + for l_h, vals in entry["horizons"].items(): + if vals["n_obs"] > 0 and np.isfinite(vals["se"]): + any_finite = True + assert any_finite + + @pytest.mark.slow + def test_per_path_replicate_n_valid_propagates_to_df_survey(self): + """`results.df_survey` reflects min(n_valid) across per-path replicate fits.""" + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + n_obs = len(df) + rng = np.random.default_rng(1) + rep_cols = [f"rep_{i}" for i in range(15)] + for i, col in enumerate(rep_cols): + df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs)) + sd = SurveyDesign( + weights="survey_weights", + replicate_weights=rep_cols, + replicate_method="JK1", + replicate_scale=1.0, + ) + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + # df_survey reflects replicate columns; cap is 15 - 1 = 14. 
+ assert res.survey_metadata is not None + df_s = res.survey_metadata.df_survey + assert df_s is not None + assert df_s <= 14 + + # ----- Per-path placebo ----- + + def test_per_path_placebo_se_finite_under_survey(self): + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille( + by_path=2, drop_larger_lower=False, placebo=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_placebo_event_study is not None + any_finite = False + for path, lags in res.path_placebo_event_study.items(): + for lag_l, vals in lags.items(): + if vals["n_obs"] > 0 and np.isfinite(vals["se"]): + any_finite = True + assert any_finite, "no finite placebo SE under survey + by_path" + + # ----- trends_linear composition ----- + + @pytest.mark.slow + def test_per_path_cumulated_se_inherits_survey(self): + from diff_diff.survey import SurveyDesign + + # Need wider F_g window for trends_linear (F_g >= 4 to dodge boundary). 
+ rng = np.random.default_rng(45) + n_periods = 10 + rows = [] + path_choices = [(0, 1, 1, 1), (0, 1, 0, 0)] + for g in range(30): + F_g = 5 + path = path_choices[g % 2] + stratum = g % 4 + weight = 1.0 + 0.1 * (g % 5) + trend = 0.05 * g # group-specific linear trend + for t in range(n_periods): + if F_g - 1 <= t < F_g - 1 + len(path): + d = path[t - (F_g - 1)] + else: + d = 0 + y = 0.5 * d + trend * t + rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": d, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + for g in range(30, 60): + stratum = (g - 30) % 4 + weight = 1.0 + 0.1 * ((g - 30) % 5) + trend = 0.05 * g + for t in range(n_periods): + y = trend * t + rng.normal(0, 0.5) + rows.append({ + "group": g, "period": t, "treatment": 0, "outcome": y, + "survey_weights": weight, "strata": stratum, "psu": g, + }) + df = pd.DataFrame(rows) + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + trends_linear=True, + ) + # path_cumulated_event_study should populate under trends_linear + assert res.path_cumulated_event_study is not None + + # ----- Edge cases ----- + + def test_path_unobserved_under_survey_warns_omits(self): + """POI unobserved-path warning composes with survey.""" + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est = ChaisemartinDHaultfoeuille( + paths_of_interest=[(0, 1, 1, 1), (0, 9, 9, 9)], # second is unobserved + drop_larger_lower=False, + ) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", 
time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.path_effects is not None + assert (0, 1, 1, 1) in res.path_effects + assert (0, 9, 9, 9) not in res.path_effects + # Unobserved-path warning must have fired + assert any( + "zero observed" in str(w.message) and "(0, 9, 9, 9)" in str(w.message) + for w in caught + ) + + +class TestByPathSurveyDesignTelescope: + """Single-path telescope invariants — by_path SE matches global SE.""" + + def test_telescope_analytical_TSL(self): + """Single-path analytical TSL: per-path SE == global SE.""" + from diff_diff.survey import SurveyDesign + + df = _by_path_survey_data_single_path() + sd = SurveyDesign(weights="survey_weights", strata="strata", psu="psu") + est_g = ChaisemartinDHaultfoeuille(drop_larger_lower=False) + est_p = ChaisemartinDHaultfoeuille(by_path=1, drop_larger_lower=False) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res_g = est_g.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + res_p = est_p.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res_p.path_effects is not None + path = next(iter(res_p.path_effects.keys())) + for l_h in range(1, 4): + assert res_p.path_effects[path]["horizons"][l_h]["n_obs"] > 0 + np.testing.assert_allclose( + res_p.path_effects[path]["horizons"][l_h]["se"], + res_g.event_study_effects[l_h]["se"], + atol=1e-12, + ) From 14cd6ceefd7280ffcdf3ad85ba38e4ef6b28d3a9 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 18:19:51 -0400 Subject: [PATCH 2/5] Address PR #408 R0 review (1 P2 test gap) R0 P2: test_per_path_se_within_envelope_of_unweighted claimed an SE invariant in its name but only asserted point-estimate equality. Add finite-SE rtol=0.10 envelope assertion alongside the existing effect equality, matching the test's documented contract. 
Under unit weights + single stratum + PSU=group, Binder TSL contributes a Bessel n/(n-1) factor relative to plug-in SE's plain 1/n divisor, so SEs differ by O(1/n) but track within a few percent on cohort-clean panels. Empirical observation on the test fixture: max rtol ~0.84% (well under the 10% envelope). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_chaisemartin_dhaultfoeuille.py | 30 +++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 06a50478..3f2ec7df 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -9372,7 +9372,18 @@ def test_per_path_se_telescope_to_global_on_single_path(self): ) def test_per_path_se_within_envelope_of_unweighted(self): - """Constant weights + single PSU per group: survey SE matches plug-in SE.""" + """Constant weights + single PSU per group: survey SE within Bessel- + envelope of plug-in SE. + + Under unit weights + single stratum + PSU=group, the survey path's + cell-period allocator reduces to a group-level allocator and Binder + TSL contributes a `n/(n-1)` Bessel factor relative to the plug-in + SE's plain `1/n` divisor. SE values therefore differ by O(1/n) but + track within a few percent on cohort-clean panels — the named + envelope. This test confirms (a) point estimates are bit-equal + (design-agnostic) and (b) survey SE is within a 10% rtol envelope + of plug-in SE on every (path, horizon) entry where both are finite. + """ from diff_diff.survey import SurveyDesign df = _by_path_survey_data() @@ -9393,7 +9404,7 @@ def test_per_path_se_within_envelope_of_unweighted(self): treatment="treatment", L_max=3, ) assert res_survey.path_effects is not None and res_plain.path_effects is not None - # Effects must match exactly (point estimate is design-agnostic). 
+ any_se_compared = False for path in res_survey.path_effects: if path not in res_plain.path_effects: continue @@ -9405,6 +9416,21 @@ def test_per_path_se_within_envelope_of_unweighted(self): res_plain.path_effects[path]["horizons"][l_h]["effect"], atol=1e-12, ) + se_survey = res_survey.path_effects[path]["horizons"][l_h]["se"] + se_plain = res_plain.path_effects[path]["horizons"][l_h]["se"] + if np.isfinite(se_survey) and np.isfinite(se_plain): + np.testing.assert_allclose( + se_survey, se_plain, rtol=0.10, + err_msg=( + f"path={path} l={l_h}: survey SE outside 10% " + f"rtol envelope of plug-in SE" + ), + ) + any_se_compared = True + assert any_se_compared, ( + "No (path, horizon) entry had finite SE on both surfaces — " + "constant-weight SE envelope was not actually exercised." + ) # ----- Replicate-weight SE correctness (slow) ----- From ad0943bc2948e427a7ea46021a6517d08a432545 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 18:31:48 -0400 Subject: [PATCH 3/5] Address PR #408 R1 review (1 P1 final-df refresh on per-path) R1 P1: per-path event-study and placebo helpers snapshot df_inference BEFORE appending their own n_valid contributions to the shared _replicate_n_valid_list. The early _refresh_path_inference call then ran immediately after per-path runs but BEFORE later IF sites (overall / joiners / leavers / heterogeneity) appended their own n_valid values. If a later append further reduced the effective df, per-path t_stat / p_value / conf_int silently used a larger df than the global surfaces and survey_metadata.df_survey, which the existing final R2 P1b refresh block already updates with the FINAL df. Fix: relocate _refresh_path_inference to the final R2 P1b block at the bottom of fit() so per-path entries are refreshed alongside the global event-study / placebo / heterogeneity / overall / joiners / leavers / normalized surfaces with the final _final_inf_df. Narrow the helper to per-path only (the existing block handles globals inline). 
Add a dedicated regression test asserting that every populated per-path entry's t_stat / p_value / conf_int reproduces safe_inference(effect, se, df=results.survey_metadata.df_survey) under a JK1 replicate-weight design with placebo enabled. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 77 +++++++++----------- tests/test_chaisemartin_dhaultfoeuille.py | 85 +++++++++++++++++++++++ 2 files changed, 118 insertions(+), 44 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 5d05b991..55bfb883 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -2552,30 +2552,14 @@ def fit( replicate_n_valid_list=_replicate_n_valid_list, ) - # Post-per-path inference refresh under replicate-weight - # designs. Per-path effects/placebos snapshot df_inference - # BEFORE their own n_valid contributions append to the - # shared list, and the global per-horizon / placebo - # surfaces took their snapshots before per-path runs. After - # all per-path fits complete, recompute the final df and - # re-run safe_inference on every populated entry so all - # four surfaces (multi_horizon_inference, - # placebo_horizon_inference, path_effects, path_placebos) - # reflect the same df. No-op under TSL / non-survey fits. 
- if ( - resolved_survey is not None - and getattr(resolved_survey, "uses_replicate_variance", False) - and (self.by_path is not None or self.paths_of_interest is not None) - ): - _df_s_final = _effective_df_survey(resolved_survey, _replicate_n_valid_list) - _refresh_path_inference( - path_effects=path_effects, - path_placebos=path_placebos, - multi_horizon_inference=multi_horizon_inference, - placebo_horizon_inference=placebo_horizon_inference, - alpha=self.alpha, - df_final=_inference_df(_df_s_final, resolved_survey), - ) + # Per-path inference for replicate-weight designs is + # refreshed in the final R2 P1b block below (alongside + # global event-study / placebo / heterogeneity surfaces), + # so it reflects the FINAL `_replicate_n_valid_list` after + # heterogeneity / overall / joiners / leavers IF sites + # have appended their own `n_valid` values. Computing it + # here would only see per-path appends and miss any later + # df shrinkage from those subsequent IF sites. # Normalized effects DID^n_l (suppressed under trends_linear # because event_study_effects holds second-differences DID^{fd}_l, @@ -4037,6 +4021,19 @@ def fit( _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 _info_r2["conf_int"] = _ci_r2 + # Per-path event-study and placebo surfaces: their helpers + # snapshotted df_inference BEFORE appending their own n_valid + # contributions, and the global event-study / placebo / + # heterogeneity / overall / joiners / leavers IF sites + # appended their n_valid AFTER per-path runs. Refresh per-path + # inference with the final df so it agrees with the global + # surfaces and `survey_metadata.df_survey`. 
+ _refresh_path_inference( + path_effects=path_effects, + path_placebos=path_placebos, + alpha=self.alpha, + df_final=_final_inf_df, + ) # Persist the final effective df_survey into survey_metadata so # downstream consumers — HonestDiD bounds (honest_did.py:973 @@ -7685,23 +7682,21 @@ def _validate_cell_constant_strata_psu( def _refresh_path_inference( path_effects: Optional[Dict[Tuple[int, ...], Dict[str, Any]]], path_placebos: Optional[Dict[Tuple[int, ...], Dict[int, Dict[str, Any]]]], - multi_horizon_inference: Optional[Dict[int, Dict[str, Any]]], - placebo_horizon_inference: Optional[Dict[int, Dict[str, Any]]], alpha: float, df_final: Optional[int], ) -> None: - """Refresh inference fields (t_stat, p_value, conf_int) with the - final ``df_final`` after per-path replicate fits have appended to - the shared ``_replicate_n_valid_list``. - - Under replicate-weight designs, every IF site contributes an - ``n_valid`` count and the effective ``df_survey`` is - ``min(...) - 1``. Per-path fits run AFTER the global per-horizon - and global placebo loops snapshot their df from - ``_replicate_n_valid_list``; per-path entries themselves use a - snapshot taken BEFORE per-path replicate appends. This helper - re-runs ``safe_inference(effect, se, alpha, df=df_final)`` on every - populated entry so all four surfaces reflect the final df. + """Refresh per-path inference fields (t_stat, p_value, conf_int) with + the final ``df_final`` so they agree with the global surfaces and + ``results.survey_metadata.df_survey`` after all replicate-weight + ``n_valid`` appends complete. + + Per-path event-study and placebo helpers compute inference using a + snapshot of ``_replicate_n_valid_list`` taken at fit-time BEFORE + they append their own ``n_valid`` contributions. 
The final R2 P1b + block in ``fit()`` already refreshes the global surfaces (overall / + joiners / leavers / multi_horizon_inference / placebo_horizon_inference + / heterogeneity / normalized) with the final df; this helper is its + per-path counterpart, called from the same final block. No-op under TSL (analytical) or non-survey fits — they skip replicate-n_valid bookkeeping entirely. Mutates dicts in place. @@ -7720,12 +7715,6 @@ def _refresh_entry(entry: Dict[str, Any]) -> None: entry["p_value"] = p_new entry["conf_int"] = ci_new - if multi_horizon_inference is not None: - for entry in multi_horizon_inference.values(): - _refresh_entry(entry) - if placebo_horizon_inference is not None: - for entry in placebo_horizon_inference.values(): - _refresh_entry(entry) if path_effects is not None: for path_data in path_effects.values(): horizons = path_data.get("horizons", {}) diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 3f2ec7df..b30134dd 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -9467,6 +9467,91 @@ def test_per_path_replicate_se_finite(self): any_finite = True assert any_finite + @pytest.mark.slow + def test_per_path_inference_uses_final_df_after_all_appends(self): + """Per-path t/p/CI must use `results.survey_metadata.df_survey`. + + Per-path event-study and placebo helpers snapshot + ``df_inference`` BEFORE appending their own ``n_valid`` + contributions to ``_replicate_n_valid_list``; later in fit() + the global overall / joiners / leavers / heterogeneity sites + append more ``n_valid`` values that may further reduce the + effective df. After the final R2 P1b refresh block runs, + ``_refresh_path_inference`` must update per-path entries so + their ``t_stat`` / ``p_value`` / ``conf_int`` agree with + ``results.survey_metadata.df_survey`` and the global event- + study / placebo surfaces (which the same final block already + refreshes). 
Regression for PR #408 R1 P1. + """ + from diff_diff.survey import SurveyDesign + from diff_diff.utils import safe_inference + + df = _by_path_survey_data() + n_obs = len(df) + rng = np.random.default_rng(2) + rep_cols = [f"rep_{i}" for i in range(12)] + for i, col in enumerate(rep_cols): + df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs)) + sd = SurveyDesign( + weights="survey_weights", + replicate_weights=rep_cols, + replicate_method="JK1", + replicate_scale=1.0, + ) + est = ChaisemartinDHaultfoeuille( + by_path=2, drop_larger_lower=False, placebo=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + assert res.survey_metadata is not None + df_final = res.survey_metadata.df_survey + assert df_final is not None + # Per-path event-study: every populated finite-SE entry must + # reproduce safe_inference(effect, se, df=df_final). + assert res.path_effects is not None + any_checked = False + for path, entry in res.path_effects.items(): + for l_h, vals in entry["horizons"].items(): + if vals["n_obs"] == 0 or not np.isfinite(vals["se"]): + continue + t_exp, p_exp, ci_exp = safe_inference( + vals["effect"], vals["se"], alpha=est.alpha, df=df_final, + ) + np.testing.assert_allclose( + vals["t_stat"], t_exp, atol=1e-12, + err_msg=f"path={path} l={l_h} t_stat stale", + ) + np.testing.assert_allclose( + vals["p_value"], p_exp, atol=1e-12, + err_msg=f"path={path} l={l_h} p_value stale", + ) + np.testing.assert_allclose( + vals["conf_int"], ci_exp, atol=1e-12, + err_msg=f"path={path} l={l_h} conf_int stale", + ) + any_checked = True + # Per-path placebo: same invariant on negative-keyed entries. 
+ if res.path_placebo_event_study is not None: + for path, lags in res.path_placebo_event_study.items(): + for lag_l, vals in lags.items(): + if vals["n_obs"] == 0 or not np.isfinite(vals["se"]): + continue + t_exp, p_exp, ci_exp = safe_inference( + vals["effect"], vals["se"], alpha=est.alpha, df=df_final, + ) + np.testing.assert_allclose(vals["t_stat"], t_exp, atol=1e-12) + np.testing.assert_allclose(vals["p_value"], p_exp, atol=1e-12) + np.testing.assert_allclose(vals["conf_int"], ci_exp, atol=1e-12) + any_checked = True + assert any_checked, ( + "No populated per-path entry was checked — replicate-df " + "invariant was not actually exercised." + ) + @pytest.mark.slow def test_per_path_replicate_n_valid_propagates_to_df_survey(self): """`results.df_survey` reflects min(n_valid) across per-path replicate fits.""" From 9b8741c1445c5a1cb727ac0c4524f9bef46396f4 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 18:47:03 -0400 Subject: [PATCH 4/5] Address PR #408 R2 review (1 P2 strengthen replicate-df regression) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R2 P2: the previous regression test asserted per-path inference matches `safe_inference(..., df=results.survey_metadata.df_survey)`, but under uniform-valid replicate fixtures every IF site reports the same `n_valid` so the snapshot df and final df happen to coincide and the assertion passes vacuously even when the bug is present (per-path uses a stale snapshot df that incidentally equals the final df). 
Add `test_refresh_path_inference_called_from_final_block`: wraps the helper with `mock.patch.object` to capture call_args, asserts (a) helper is invoked exactly once, (b) the `df_final` it received equals `results.survey_metadata.df_survey` — a relationship that holds by construction when invoked from the final R2 P1b block (which uses `_final_eff_df = _effective_df_survey(resolved_survey, _replicate_n_valid_list)` AFTER all appends), but can only coincide by chance when invoked from an earlier block on a fixture where snapshot equals final. Update the existing test's docstring to acknowledge it documents the contract on a uniform-valid fixture and points readers at the new mock-based test for direct call-site verification. Use `importlib.import_module` to access the dCDH module: the top-level `diff_diff` package re-exports the convenience function `chaisemartin_dhaultfoeuille`, shadowing the module of the same name in attribute lookup. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_chaisemartin_dhaultfoeuille.py | 87 ++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index b30134dd..2e231759 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -9467,6 +9467,86 @@ def test_per_path_replicate_se_finite(self): any_finite = True assert any_finite + @pytest.mark.slow + def test_refresh_path_inference_called_from_final_block(self): + """Pin the helper's call site to the final R2 P1b block. + + Regression for PR #408 R1 P1: an earlier implementation + invoked ``_refresh_path_inference`` immediately after per-path + runs, BEFORE the global overall / joiners / leavers / + heterogeneity IF sites appended their ``n_valid`` contributions + — leaving per-path inference using a stale snapshot df that + could exceed the final ``survey_metadata.df_survey``. 
+ + Pure-fixture detection is unreliable: under uniform-valid + replicate weights, every IF site reports the same ``n_valid``, + so the snapshot df and the final df happen to coincide and a + match-against-final-df assertion would pass even with the bug + present. Instead we wrap the helper with ``mock.patch.object`` + and assert the ``df_final`` it receives equals the final + ``survey_metadata.df_survey`` — a relationship that holds by + construction when invoked from the final block (which uses + ``_final_eff_df = _effective_df_survey(resolved_survey, + _replicate_n_valid_list)`` AFTER all appends), but can only + coincide by chance from an earlier block. + """ + import importlib + import unittest.mock as _mock + + from diff_diff.survey import SurveyDesign + + # The top-level `diff_diff` package re-exports + # `chaisemartin_dhaultfoeuille` as the convenience function, + # shadowing the module of the same name. Use importlib to + # access the module object explicitly so mock.patch.object + # operates on the correct namespace. + _cd_mod = importlib.import_module("diff_diff.chaisemartin_dhaultfoeuille") + + df = _by_path_survey_data() + n_obs = len(df) + rng = np.random.default_rng(3) + rep_cols = [f"rep_{i}" for i in range(10)] + for col in rep_cols: + df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs)) + sd = SurveyDesign( + weights="survey_weights", + replicate_weights=rep_cols, + replicate_method="JK1", + replicate_scale=1.0, + ) + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + + with _mock.patch.object( + _cd_mod, + "_refresh_path_inference", + wraps=_cd_mod._refresh_path_inference, + ) as m: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + + # Helper called exactly once from the final R2 P1b block. 
+ assert m.call_count == 1, ( + f"_refresh_path_inference should be called exactly once " + f"under replicate-weight + by_path; got {m.call_count}" + ) + # Under replicate variance with defined effective df, + # _inference_df returns the effective df unchanged, and + # survey_metadata.df_survey persists the same value. Equality + # proves the helper received the FINAL df, not an earlier + # snapshot taken before the global IF sites appended. + df_final_passed = m.call_args.kwargs["df_final"] + assert res.survey_metadata is not None + assert df_final_passed == res.survey_metadata.df_survey, ( + f"Helper invoked with df_final={df_final_passed!r}, but " + f"results.survey_metadata.df_survey={res.survey_metadata.df_survey!r}. " + f"This indicates the helper ran from a stale earlier " + f"call site instead of the final R2 P1b block." + ) + @pytest.mark.slow def test_per_path_inference_uses_final_df_after_all_appends(self): """Per-path t/p/CI must use `results.survey_metadata.df_survey`. @@ -9481,7 +9561,12 @@ def test_per_path_inference_uses_final_df_after_all_appends(self): their ``t_stat`` / ``p_value`` / ``conf_int`` agree with ``results.survey_metadata.df_survey`` and the global event- study / placebo surfaces (which the same final block already - refreshes). Regression for PR #408 R1 P1. + refreshes). Companion test + ``test_refresh_path_inference_called_from_final_block`` pins + the helper's call site directly via mock.patch (the + match-against-final-df assertion below is satisfied vacuously + under uniform-valid replicates where snapshot df coincides + with final df). Regression for PR #408 R1 P1. 
""" from diff_diff.survey import SurveyDesign from diff_diff.utils import safe_inference From fb594275a0b3c7ff2e9f8906310adc0cf168a25a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 18:55:31 -0400 Subject: [PATCH 5/5] Address PR #408 R3 review (1 P2 deterministic stale-vs-final df forcing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R3 P2: the previous mock-based regression checks the helper is called with the final df, but if no later IF site reduces n_valid relative to per-path snapshots, snapshot df coincides with final df and the assertion is vacuous. Add `test_per_path_inference_refreshes_to_lower_final_df`: deter- ministic forcing function via `mock.patch.object` on `_compute_se`, gated by a flag that flips after `_compute_path_effects` returns. After the flag is set, every subsequent `_compute_se` call returns a hardcoded low `n_valid=5` — so global placebo / overall / joiners / leavers all append 5, while per-path effects already snapshotted a high df from the unmodified pre-flag calls. Final `survey_metadata.df_survey = 5 - 1 = 4` is strictly less than the per-path snapshot df, forcing the refresh to demonstrably move per-path inference from the high snapshot df to the low final df. Sanity-checked: temporarily replacing `_refresh_path_inference` with a no-op causes the new test to fail with a stale-p_value assertion, confirming bug-detection. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_chaisemartin_dhaultfoeuille.py | 125 ++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/tests/test_chaisemartin_dhaultfoeuille.py b/tests/test_chaisemartin_dhaultfoeuille.py index 2e231759..ccaba2e9 100644 --- a/tests/test_chaisemartin_dhaultfoeuille.py +++ b/tests/test_chaisemartin_dhaultfoeuille.py @@ -9467,6 +9467,131 @@ def test_per_path_replicate_se_finite(self): any_finite = True assert any_finite + @pytest.mark.slow + def test_per_path_inference_refreshes_to_lower_final_df(self): + """Deterministic stale-vs-final df regression. + + Forces a later IF site to return a smaller ``n_valid`` than the + per-path snapshot via monkeypatch on ``_compute_se``: a flag is + set after ``_compute_path_effects`` returns, and any subsequent + ``_compute_se`` call (global placebo / overall / joiners / + leavers) returns a hardcoded low ``n_valid``. Per-path effects + therefore snapshot a HIGH df at their call site, while the + final ``_replicate_n_valid_list`` is bounded by the lowered + post-per-path appends, producing a strictly smaller final df. + + Without ``_refresh_path_inference()`` running from the final + block, per-path effect inference would retain the stale high + df. This test asserts every populated per-path entry's + ``t_stat`` / ``p_value`` / ``conf_int`` matches + ``safe_inference(effect, se, df=results.survey_metadata.df_survey)`` + (the LOW final df), proving the refresh moved the values to + the post-append df. + + Regression for PR #408 R1 P1 / R3 P2 (deterministic version). 
+ """ + import importlib + import unittest.mock as _mock + + from diff_diff.survey import SurveyDesign + from diff_diff.utils import safe_inference + + _cd_mod = importlib.import_module("diff_diff.chaisemartin_dhaultfoeuille") + + df = _by_path_survey_data() + n_obs = len(df) + rng = np.random.default_rng(7) + # Use enough replicate columns so the natural n_valid is large + # and our forced low n_valid is detectably smaller. + rep_cols = [f"rep_{i}" for i in range(20)] + for col in rep_cols: + df[col] = df["survey_weights"] * (1.0 + 0.05 * rng.standard_normal(n_obs)) + sd = SurveyDesign( + weights="survey_weights", + replicate_weights=rep_cols, + replicate_method="JK1", + replicate_scale=1.0, + ) + est = ChaisemartinDHaultfoeuille(by_path=2, drop_larger_lower=False) + + real_compute_se = _cd_mod._compute_se + real_path_effects = _cd_mod._compute_path_effects + post_path_flag = [False] + forced_low_n_valid = 5 + + def wrapped_path_effects(*args, **kwargs): + result = real_path_effects(*args, **kwargs) + post_path_flag[0] = True + return result + + def wrapped_compute_se(*args, **kwargs): + se, n_valid = real_compute_se(*args, **kwargs) + if post_path_flag[0] and n_valid is not None and n_valid > forced_low_n_valid: + return se, forced_low_n_valid + return se, n_valid + + with _mock.patch.object( + _cd_mod, "_compute_path_effects", + side_effect=wrapped_path_effects, + ), _mock.patch.object( + _cd_mod, "_compute_se", + side_effect=wrapped_compute_se, + ): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res = est.fit( + df, outcome="outcome", group="group", time="period", + treatment="treatment", L_max=3, survey_design=sd, + ) + + # The forced low n_valid (5) at later IF sites bounds the final + # effective df at 5 - 1 = 4. JK1 / replicate convention: + # df_survey = min(n_valid) - 1. 
+ expected_low_df = forced_low_n_valid - 1 + assert res.survey_metadata is not None + assert res.survey_metadata.df_survey == expected_low_df, ( + f"Expected forced final df={expected_low_df}, got " + f"{res.survey_metadata.df_survey}. The monkeypatch did not " + f"force a divergence — adjust forced_low_n_valid or fixture." + ) + + # Per-path effects entries snapshot df at fit-time BEFORE the + # forced lowering kicked in (so their snapshot df > final df). + # If `_refresh_path_inference` runs from the final block, every + # entry's t_stat / p_value / conf_int is recomputed at the low + # final df. If the helper is called from an earlier block (the + # bug), per-path effects keep the stale high-df inference. + assert res.path_effects is not None + any_compared = False + for path, entry in res.path_effects.items(): + for l_h, vals in entry["horizons"].items(): + if vals["n_obs"] == 0 or not np.isfinite(vals["se"]): + continue + t_final, p_final, ci_final = safe_inference( + vals["effect"], vals["se"], + alpha=est.alpha, df=expected_low_df, + ) + np.testing.assert_allclose( + vals["t_stat"], t_final, atol=1e-12, + err_msg=( + f"path={path} l={l_h}: t_stat reflects stale " + f"snapshot df, not final df={expected_low_df}" + ), + ) + np.testing.assert_allclose( + vals["p_value"], p_final, atol=1e-12, + err_msg=f"path={path} l={l_h}: p_value stale", + ) + np.testing.assert_allclose( + vals["conf_int"], ci_final, atol=1e-12, + err_msg=f"path={path} l={l_h}: conf_int stale", + ) + any_compared = True + assert any_compared, ( + "No per-path effects entry had finite SE — forcing function " + "did not exercise the refresh path." + ) + @pytest.mark.slow def test_refresh_path_inference_called_from_final_block(self): """Pin the helper's call site to the final R2 P1b block.