Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
27c5c09
feat: implement PHIX validation for schools and daycares
eswarchandravidyasagar Jan 14, 2026
3c74056
removed list
eswarchandravidyasagar Jan 16, 2026
56a93ee
Refactor PHIX validation to enforce strict exact matching
eswarchandravidyasagar Jan 21, 2026
3a73f99
feat: enhance PHIX validation with configurable sheet name and output…
eswarchandravidyasagar Feb 5, 2026
4793e46
When available, use PHIX ID for validation; only considered exact mat…
TiaTuinstra May 7, 2026
3e628b9
requires higher version of python for union operand. Increment packag…
jangevaare Jan 20, 2026
081d379
uv lock --upgrade and uv sync
jangevaare Jan 20, 2026
ed544e4
fix over_16 in tests
jangevaare Jan 20, 2026
48989b9
Adding in option to remove border around client information box. Upda…
kassyray Jan 20, 2026
fa12d2e
type checker fixes
jangevaare Jan 20, 2026
9269230
Bump the minor-and-patch group across 1 directory with 4 updates (#166)
dependabot[bot] Feb 2, 2026
5ce9403
Update typst to ver 0.14.2 in GH actions (#159)
jangevaare Feb 2, 2026
6eb6af7
Bump the minor-and-patch group across 1 directory with 3 updates (#168)
dependabot[bot] Feb 17, 2026
ddde781
Bump the minor-and-patch group with 2 updates (#169)
dependabot[bot] Feb 23, 2026
ccf5488
Bump the minor-and-patch group across 1 directory with 4 updates (#171)
dependabot[bot] Mar 16, 2026
76af415
Bump the minor-and-patch group with 4 updates (#172)
dependabot[bot] Mar 23, 2026
b95764a
Bump the minor-and-patch group with 3 updates (#174)
dependabot[bot] Mar 30, 2026
acfbf81
Bump the minor-and-patch group across 1 directory with 5 updates (#176)
dependabot[bot] Apr 14, 2026
7cb92c8
Bump codecov/codecov-action from 5 to 6 in /.github/workflows
dependabot[bot] Mar 27, 2026
d8a746e
Update setuptools requirement from >=45 to >=82.0.1
dependabot[bot] Apr 20, 2026
08ca3f0
Bump the minor-and-patch group across 1 directory with 3 updates
dependabot[bot] Apr 27, 2026
99f6789
Merge branch 'main' into feat/PHIX-validation
TiaTuinstra May 7, 2026
173ebcc
updated tests with new warning wording
TiaTuinstra May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ coverage.xml
coverage.json
!input/rodent_dataset.xlsx
input/*
PHIX Reference*.xlsx
PHIX Reference*.xls
*PHIX*Reference*List*.xlsx
*PHIX*Reference*List*.xlsm
phu_templates/*
!phu_templates/README.md
!phu_templates/.gitkeep
.gitmodules
.gitmodules
27 changes: 27 additions & 0 deletions config/parameters.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,33 @@
bundling:
bundle_size: 100
group_by: null

# Step 2: Preprocessing - PHIX Reference Validation
# Validates school/daycare names against the official PHIX reference list
phix_validation:
enabled: true
# Path to PHIX reference Excel file (relative to project root)
# IMPORTANT: We cannot redistribute the PHIX workbook. Provide your own copy
# and update this path (absolute path or relative to repo root).
reference_file: BYO_PHIX_REFERENCE.xlsx
# Name of the Excel sheet containing facility data
# Default: "Schools & Day Cares"
reference_sheet_name: "Schools & Day Cares"
# Prefix for output column names (e.g., "PHIX_ID", "PHIX_MATCHED_PHU", etc.)
# Default: "PHIX_"
column_prefix: "PHIX_"
# Optional mapping file that converts PHIX PHU column names to canonical
# template codes. Required when target_phu_code or --template is provided.
phu_mapping_file: config/phu_aliases.yaml
# Optional default PHU scope when running without --template.
# Accepts a single code or list of codes. Leave null to accept all PHUs.
target_phu_code: null
# How to handle unmatched facilities: 'warn', 'error', or 'skip'
# - warn: Log warning, continue processing all records
# - error: Fail pipeline if any facilities don't match
# - skip: Filter out records with unmatched facilities
unmatched_behavior: warn

chart_diseases_header:
- Diphtheria
- Tetanus
Expand Down
30 changes: 30 additions & 0 deletions config/phu_aliases.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Maps canonical PHU acronyms (matching template folder names) to every
# alias that may appear in the PHIX reference workbook. Customize this file
# so each Public Health Unit is represented with all historic spellings,
# ensuring PHIX validation can restrict facilities to the correct PHU.
#
# Example usage:
# phu_aliases:
# wdgph:
# display_name: Wellington-Dufferin-Guelph Public Health
# aliases:
# - Wellington Dufferin Guelph Public Health
# - WDGPH

phu_aliases:
wdgph:
display_name: Wellington-Dufferin-Guelph Public Health
aliases:
- Wellington Dufferin Guelph Public Health
- Wellington-Dufferin-Guelph Health Unit
- WDGPH
peel:
display_name: Peel Public Health
aliases:
- Region of Peel Public Health
- PEEL HEALTH UNIT
ottawa:
display_name: Ottawa Public Health
aliases:
- City of Ottawa Health Unit
- OTTAWA PHU
19 changes: 18 additions & 1 deletion docs/PDF_VALIDATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ This markerless approach is also suitable for checks like:
In step 6 (validation), the orchestrator:
1. Loads `preprocessed_clients_{run_id}.json` from `output/artifacts/`.
2. Builds a mapping: `filename -> expected_value` (e.g., client ID, sequence number).
3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map)`.
3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map, client_metadata_map=client_metadata_map)`.
4. `client_metadata_map` currently carries PHIX validation context (target PHU, matched PHU, school name) so every per-PDF log records which facility/PHU was validated upstream.

Rules then validate against the mapping using artifact data as the source of truth.

Expand All @@ -165,6 +166,22 @@ Current rule: Searches for any 10-digit number in the PDF text and compares to t

This ensures every generated PDF contains the correct client ID, catching generation errors or data drift early.

### Example: PHIX facility scope tracking

- Preprocessing stores PHIX validation metadata (`phix_validation`) in each client's artifact entry.
- The orchestrator passes this data via `client_metadata_map`.
- `validate_pdf_layout` records `phix_target_phu_code`, `phix_matched_phu_code`, and `phix_match_confidence` in each PDF's `measurements`.
- If a PDF's matched PHU does not align with the template's target PHU, the validator emits a `phix_target_phu` warning per file.

This gives auditors a traceable link from every generated PDF back to the PHIX reference data used during preprocessing.

### PHIX reference workbook is bring-your-own (BYO)

- The official PHIX reference workbook is licensed and cannot be redistributed in this repository.
- `.gitignore` explicitly ignores `PHIX Reference*.xls*` so accidental copies never enter history.
- `config/parameters.yaml` ships with the placeholder `BYO_PHIX_REFERENCE.xlsx`; operators must point it at their local copy before running Step 2.
- Document the location internally (outside git) and ensure CI/CD environments mount the workbook securely (e.g., secrets storage or mounted volume).

## Why we prefer template‑emitted measurements over PDF distance math

We strongly prefer emitting precise measurements from the Typst template (via `measure()` and `MEASURE_...` markers) instead of inferring sizes by computing distances between two markers in extracted PDF text. Reasons:
Expand Down
82 changes: 81 additions & 1 deletion pipeline/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Import pipeline steps
from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs
from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs, validate_phix
from . import (
encrypt_notice,
generate_notices,
Expand All @@ -50,6 +51,7 @@
preprocess,
)
from .config_loader import load_config
from .data_models import PreprocessResult
from .enums import Language

SCRIPT_DIR = Path(__file__).resolve().parent
Expand Down Expand Up @@ -123,6 +125,7 @@ def validate_args(args: argparse.Namespace) -> None:
f"Input file not found: {args.input_dir / args.input_file}"
)

args.template_key = None
# Resolve template directory
if args.template_dir is None:
# No custom template specified; use default
Expand All @@ -136,6 +139,7 @@ def validate_args(args: argparse.Namespace) -> None:
f"Expected a simple name like 'wdgph' or 'my_phu', not a path."
)

args.template_key = args.template_dir.strip()
phu_template_path = DEFAULT_PHU_TEMPLATES_DIR / args.template_dir
if not phu_template_path.exists():
raise FileNotFoundError(
Expand Down Expand Up @@ -209,6 +213,8 @@ def run_step_2_preprocess(
output_dir: Path,
language: str,
run_id: str,
config_dir: Path,
template_code: Optional[str] = None,
) -> int:
"""Step 2: Preprocessing.

Expand All @@ -220,6 +226,10 @@ def run_step_2_preprocess(
# Configure logging
log_path = preprocess.configure_logging(output_dir, run_id)

# Load configuration for PHIX validation
config = load_config(config_dir / "parameters.yaml")
project_root = config_dir.parent

# Load and process input data
input_path = input_dir / input_file
df_raw = preprocess.read_input(input_path)
Expand All @@ -230,6 +240,55 @@ def run_step_2_preprocess(
# Check that addresses are complete, return only complete rows
df = preprocess.check_addresses_complete(df)

# Validate schools/daycares against PHIX reference list
phix_config = config.get("phix_validation", {})
phix_warnings: list[str] = []
if phix_config.get("enabled", False):
reference_file = phix_config.get("reference_file", "")
mapping_file = phix_config.get("phu_mapping_file")
mapping_path: Optional[Path] = None
if mapping_file:
mapping_path = Path(mapping_file)
if not mapping_path.is_absolute():
mapping_path = (project_root / mapping_file).resolve()
if not mapping_path.exists():
raise FileNotFoundError(f"PHU alias mapping file not found: {mapping_path}")

target_phu_codes: set[str] = set()
configured_target = phix_config.get("target_phu_code")
if isinstance(configured_target, str):
configured_target = [configured_target]
if isinstance(configured_target, (list, tuple, set)):
for code in configured_target:
if code and str(code).strip():
target_phu_codes.add(str(code))
if template_code:
target_phu_codes.add(template_code)

if reference_file:
reference_path = Path(reference_file)
# If relative path, resolve from project root
if not reference_path.is_absolute():
reference_path = (project_root / reference_file).resolve()
if reference_path.exists():
sheet_name = phix_config.get("reference_sheet_name", "Schools & Day Cares")
column_prefix = phix_config.get("column_prefix", "PHIX_")
df, phix_warnings = validate_phix.validate_facilities(
df=df,
reference_path=reference_path,
output_dir=output_dir,
unmatched_behavior=phix_config.get("unmatched_behavior", "warn"),
target_phu_codes=target_phu_codes or None,
phu_mapping_path=mapping_path,
reference_sheet_name=sheet_name,
column_prefix=column_prefix,
)
print(f"🏫 PHIX validation complete: {len(df)} records validated")
else:
print(f"⚠️ PHIX reference file not found: {reference_path}")
else:
print("⚠️ PHIX validation enabled but no reference_file configured")

# Load configuration
vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))
Expand All @@ -239,6 +298,14 @@ def run_step_2_preprocess(
df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED
)

# Merge PHIX validation warnings into result
if phix_warnings:
combined_warnings = list(result.warnings) + phix_warnings
result = PreprocessResult(
clients=result.clients,
warnings=combined_warnings,
)

# Write artifact
artifact_path = preprocess.write_artifact(
output_dir / "artifacts", language, run_id, result
Expand Down Expand Up @@ -397,6 +464,7 @@ def run_step_6_validate_pdfs(

# Load preprocessed clients to build client ID mapping
client_id_map = {}
client_metadata_map = {}
import json

with open(preprocessed_json, "r", encoding="utf-8") as f:
Expand All @@ -406,11 +474,20 @@ def run_step_6_validate_pdfs(
# Filename format: {language}_notice_{sequence:05d}_{client_id}.pdf
for idx, client in enumerate(clients, start=1):
client_id = str(client.get("client_id", ""))
metadata = client.get("metadata", {}) or {}
phix_meta = metadata.get("phix_validation", {}) or {}
school = client.get("school", {}) or {}
# Try to match any expected filename format
for ext in [".pdf"]:
for lang_prefix in ["en", "fr"]:
filename = f"{lang_prefix}_notice_{idx:05d}_{client_id}{ext}"
client_id_map[filename] = client_id
client_metadata_map[filename] = {
"phix_validation": phix_meta,
"phix_target_phu_code": metadata.get("phix_target_phu_code"),
"phix_target_phu_label": metadata.get("phix_target_phu_label"),
"school_name": school.get("name"),
}

# Validate PDFs (module loads validation rules from config_dir)
validate_pdfs.main(
Expand All @@ -419,6 +496,7 @@ def run_step_6_validate_pdfs(
json_output=validation_json,
client_id_map=client_id_map,
config_dir=config_dir,
client_metadata_map=client_metadata_map,
)


Expand Down Expand Up @@ -574,6 +652,8 @@ def main() -> int:
output_dir,
args.language,
run_id,
config_dir,
args.template_key,
)
step_duration = time.time() - step_start
step_times.append(("Preprocessing", step_duration))
Expand Down
61 changes: 57 additions & 4 deletions pipeline/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,8 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS):
# Normalize input columns for matching
normalized_input_cols = [normalize(c) for c in input_cols]

best_matches = {}

# Check each input column against required columns
for input_col in normalized_input_cols:
col_name, score, index = process.extractOne(
Expand All @@ -455,11 +457,26 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS):
if score >= THRESHOLD: # adjustable threshold
# Map the original column name, not the normalized one
actual_in_col = next(c for c in input_cols if normalize(c) == input_col)
col_map[actual_in_col] = best_match
#col_map[actual_in_col] = best_match

# print colname and score for debugging
print(f"Matching '{input_col}' to '{best_match}' with score {score}")

# Check if column already has an assigned mapping
if best_match not in col_map.values():
print(f"The value {best_match} does not exist in the dictionary - adding value.")
col_map[actual_in_col] = best_match
best_matches[best_match] = {"actual_in_col": actual_in_col, "score": score}

# Replace if higher score
elif score > best_matches[best_match]["score"]:
print(f"{input_col} has a higher score than current best match in the dictionary - replacing value.")

col_map = col_map.pop(best_matches[best_match]["actual_in_col"], None)
col_map[actual_in_col] = best_match

best_matches[best_match] = {"actual_in_col": actual_in_col, "score": score}

return df.rename(columns=col_map), col_map


Expand Down Expand Up @@ -743,6 +760,16 @@ def build_preprocess_result(
warnings: set[str] = set()
working = normalize_dataframe(df)

def clean_optional(value: Any) -> Any:
    """Return *value*, or ``None`` when pandas considers it missing.

    Normalizes ``None`` and any scalar pandas treats as NA (float NaN,
    ``NaT``, ``pd.NA``) to ``None`` so downstream artifacts serialize
    clean nulls instead of NaN placeholders.

    NOTE(review): assumes scalar input — ``pd.isna`` on a list/array
    returns an array, which would make the ``if`` ambiguous (the
    original behaved the same way); confirm callers only pass scalars.
    """
    if value is None:
        return None
    # pd.isna already covers numeric NaN for any scalar, so the former
    # separate isinstance(float/int) branch was redundant and is removed.
    if pd.isna(value):
        return None
    return value

# Load parameters for date_notice_delivery and chart_diseases_header
params = {}
if PARAMETERS_PATH.exists():
Expand Down Expand Up @@ -845,6 +872,34 @@ def build_preprocess_result(
"postal_code": postal_code,
}

phix_id = clean_optional(getattr(row, "PHIX_ID", None))
phix_match_type = getattr(row, "PHIX_MATCH_TYPE", "none")
phix_match_conf = getattr(row, "PHIX_MATCH_CONFIDENCE", 0)
if pd.isna(phix_match_conf):
phix_match_conf = 0
phix_match_conf = int(phix_match_conf)
phix_phu_name = clean_optional(getattr(row, "PHIX_MATCHED_PHU", None))
phix_phu_code = clean_optional(getattr(row, "PHIX_MATCHED_PHU_CODE", None))
phix_target_code = clean_optional(getattr(row, "PHIX_TARGET_PHU_CODE", None))
phix_target_label = clean_optional(getattr(row, "PHIX_TARGET_PHU_LABEL", None))

metadata: Dict[str, Any] = {
"unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined]
}
metadata["phix_validation"] = {
"id": phix_id,
"match_type": phix_match_type or "none",
"confidence": phix_match_conf,
"phu_name": phix_phu_name,
"phu_code": phix_phu_code,
"target_phu_code": phix_target_code,
"target_phu_label": phix_target_label,
}
if phix_target_code:
metadata["phix_target_phu_code"] = phix_target_code
if phix_target_label:
metadata["phix_target_phu_label"] = phix_target_label

client = ClientRecord(
sequence=sequence,
client_id=client_id,
Expand All @@ -856,9 +911,7 @@ def build_preprocess_result(
vaccines_due=vaccines_due if vaccines_due else None,
vaccines_due_list=vaccines_due_list if vaccines_due_list else None,
received=received if received else None,
metadata={
"unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined]
},
metadata=metadata,
)

clients.append(client)
Expand Down
Loading