From cac6fdea56fd5b4add9ca7fc8bd2fe5e6261cda4 Mon Sep 17 00:00:00 2001 From: blalterman <12834389+blalterman@users.noreply.github.com> Date: Tue, 17 Feb 2026 14:50:19 -0500 Subject: [PATCH 1/3] Remove non-functional joblib parallelization from TrendFit (#428) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs(plans): add private-dev + public-release repo plan Documents the two-repo architecture for private development with public releases via rsync-based export script. Co-Authored-By: Claude Opus 4.6 * refactor(fitfunctions): remove joblib parallelization from TrendFit Joblib's loky backend deadlocks on macOS with Python 3.12's spawn start method. The multiprocessing backend fails due to unpicklable closures, and the threading backend provides no CPU-bound speedup. Remove the non-functional parallel code path, related tests, benchmarks, and the performance optional dependency. All Phase 4 features in core.py (residuals use_all, in-place mask ops, xoutside/youtside) are preserved — they have zero joblib dependency. Generated with Claude Code Co-Authored-By: Claude --------- Co-authored-by: Claude Opus 4.6 --- benchmarks/fitfunctions_performance.py | 179 --------- plans/private-dev-public-release-repos.md | 347 ++++++++++++++++++ pyproject.toml | 3 - solarwindpy/fitfunctions/trend_fits.py | 146 +------- .../fitfunctions/test_trend_fits_advanced.py | 222 +---------- 5 files changed, 357 insertions(+), 540 deletions(-) delete mode 100644 benchmarks/fitfunctions_performance.py create mode 100644 plans/private-dev-public-release-repos.md diff --git a/benchmarks/fitfunctions_performance.py b/benchmarks/fitfunctions_performance.py deleted file mode 100644 index 863c01e2..00000000 --- a/benchmarks/fitfunctions_performance.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python -"""Benchmark Phase 4 performance optimizations.""" - -import time -import numpy as np -import pandas as pd -import sys -import os - -# Add the parent directory to sys.path to import solarwindpy -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) - -from solarwindpy.fitfunctions import Gaussian -from solarwindpy.fitfunctions.trend_fits import TrendFit - - -def benchmark_trendfit(n_fits=50): - """Compare sequential vs parallel TrendFit performance.""" - print(f"\nBenchmarking with {n_fits} fits...") - - # Create synthetic data that's realistic for fitting - np.random.seed(42) - x = np.linspace(0, 10, 100) - data = pd.DataFrame({ - f'col_{i}': 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - for i in range(n_fits) - }, index=x) - - # Sequential execution - print(" Running sequential...") - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Parallel execution - print(" Running parallel...") - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=-1) - par_time = time.perf_counter() - start - - speedup = seq_time / par_time - print(f" Sequential: {seq_time:.2f}s") - print(f" Parallel: {par_time:.2f}s") - print(f" Speedup: {speedup:.1f}x") - - # Verify results match - print(" Verifying results match...") - successful_fits = 0 - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], par_popt[param], - rtol=1e-10, atol=1e-10 - ) - successful_fits += 1 - - print(f" ✓ {successful_fits} fits verified identical") - - return speedup, successful_fits - - -def benchmark_single_fitfunction(): - """Benchmark single FitFunction to understand baseline performance.""" - print("\nBenchmarking single FitFunction...") - - np.random.seed(42) - x = np.linspace(0, 10, 100) - y = 5 * np.exp(-(x-5)**2/2) + np.random.normal(0, 0.1, 100) - - # Time creation and fitting - start = time.perf_counter() - ff = Gaussian(x, y) - creation_time = time.perf_counter() - start - - start = time.perf_counter() - ff.make_fit() - fit_time = time.perf_counter() - start - - total_time = creation_time + fit_time - - print(f" Creation time: {creation_time*1000:.1f}ms") - print(f" Fitting time: {fit_time*1000:.1f}ms") - print(f" Total time: {total_time*1000:.1f}ms") - - return total_time - - -def check_joblib_availability(): - """Check if joblib is available for parallel processing.""" - try: - import joblib - print(f"✓ joblib {joblib.__version__} available") - - # Check number of cores - import os - n_cores = os.cpu_count() - print(f"✓ {n_cores} CPU cores detected") - return True - except ImportError: - print("✗ joblib not available - only sequential benchmarks will run") - return False - - -if __name__ == "__main__": - print("FitFunctions Phase 4 Performance Benchmark") - print("=" * 50) - - # Check system capabilities - has_joblib = check_joblib_availability() - - # Single fit baseline - single_time = benchmark_single_fitfunction() - - # TrendFit scaling benchmarks - speedups = [] - fit_counts = [] - - test_sizes = [10, 25, 50, 100] - if has_joblib: - # Only run larger tests if joblib is available - test_sizes.extend([200]) - - for n in test_sizes: - expected_seq_time = single_time * n - print(f"\nExpected sequential time for {n} fits: {expected_seq_time:.1f}s") - - try: - speedup, n_successful = benchmark_trendfit(n) - speedups.append(speedup) - fit_counts.append(n_successful) - except Exception as e: - print(f" ✗ Benchmark failed: {e}") - speedups.append(1.0) - fit_counts.append(0) - - # Summary report - print("\n" + "=" * 50) - print("BENCHMARK SUMMARY") - print("=" * 50) - - print(f"Single fit baseline: {single_time*1000:.1f}ms") - - if speedups: - print("\nTrendFit Scaling Results:") - print("Fits | Successful | Speedup") - print("-" * 30) - for i, n in enumerate(test_sizes): - if i < len(speedups): - print(f"{n:4d} | {fit_counts[i]:10d} | {speedups[i]:7.1f}x") - - if has_joblib: - avg_speedup = np.mean(speedups) - best_speedup = max(speedups) - print(f"\nAverage speedup: {avg_speedup:.1f}x") - print(f"Best speedup: {best_speedup:.1f}x") - - # Efficiency analysis - if avg_speedup > 1.5: - print("✓ Parallelization provides significant benefit") - else: - print("⚠ Parallelization benefit limited (overhead or few cores)") - else: - print("\nInstall joblib for parallel processing:") - print(" pip install joblib") - print(" or") - print(" pip install solarwindpy[performance]") - - print("\nTo use parallel fitting in your code:") - print(" tf.make_1dfits(n_jobs=-1) # Use all cores") - print(" tf.make_1dfits(n_jobs=4) # Use 4 cores") \ No newline at end of file diff --git a/plans/private-dev-public-release-repos.md b/plans/private-dev-public-release-repos.md new file mode 100644 index 00000000..c1030769 --- /dev/null +++ b/plans/private-dev-public-release-repos.md @@ -0,0 +1,347 @@ +# Plan: Private Development Repo + Public Release Repo + +## Summary + +Create a private development repo (`blalterman/SolarWindPy-dev`) as the primary working +environment, while keeping the existing public repo (`blalterman/SolarWindPy`) as-is for +releases. All current public content stays public. The private repo is a **superset** of +the public repo, adding research code, private data, and experimental features. + +## Architecture + +``` +GitHub: + blalterman/SolarWindPy (public, existing) <- Release repo + blalterman/SolarWindPy-dev (private, new) <- Development repo + +Local: + ~/observatories/code/ + |-- SolarWindPy/ -> public repo clone (releases only) + +-- SolarWindPy-dev/ -> private repo clone (daily work) +``` + +**Relationship**: Private repo is a superset. It contains everything the public repo has, +plus additional private directories. Development happens in private; curated releases flow +to public via an export script. + +## Evidence & Justification + +### Why two repos instead of branch-based separation? +- Git has no per-remote file filtering. Pushing `master` to two remotes sends ALL files. +- Branch-based approaches (e.g., `develop` with private content, `master` without) require + careful merge hygiene that is error-prone for a solo developer. +- Two repos with an export script is the standard pattern used by Linux, Android, and + corporate open-source projects for private-dev -> public-release workflows. +- Zero risk of accidental private content leakage. + +### Why copy history rather than start fresh? +- The private repo should have full commit history for `git blame`, bisect, and reference. +- Starting from a bare clone means both repos share the same initial history (1,383 commits). +- setuptools_scm works identically in both (same tags, same version derivation). + +### Why keep the public repo unchanged? +- User's explicit requirement: "Everything that currently is public can stay public." +- No external link breakage (PyPI, conda-forge, ReadTheDocs, 4 stars, 3 forks). +- No CI/CD reconfiguration needed for the public repo. +- `.claude/`, `plans/`, `paper/` remain public as they already are. + +## Phase 1: Create Private Repository (~15 min) + +### Step 1.1: Create private repo on GitHub +```bash +gh repo create blalterman/SolarWindPy-dev \ + --private \ + --description "Private development repository for SolarWindPy" +``` + +### Step 1.2: Push full history to private repo +```bash +# Clone the public repo as a bare mirror +git clone --bare git@github.com:blalterman/SolarWindPy.git /tmp/swp-mirror + +# Push everything to the new private repo +cd /tmp/swp-mirror +git push --mirror git@github.com:blalterman/SolarWindPy-dev.git + +# Cleanup +rm -rf /tmp/swp-mirror +``` + +### Step 1.3: Clone private repo locally +```bash +cd ~/observatories/code +git clone git@github.com:blalterman/SolarWindPy-dev.git SolarWindPy-dev +``` + +### Step 1.4: Add public repo as a remote in the private clone +```bash +cd ~/observatories/code/SolarWindPy-dev +git remote add public git@github.com:blalterman/SolarWindPy.git +git fetch public +``` + +**Verification**: `git remote -v` shows both `origin` (private) and `public` (public). + +## Phase 2: Configure Private Repo Structure (~30 min) + +### Step 2.1: Add private-only directories +```bash +cd ~/observatories/code/SolarWindPy-dev +mkdir -p research/{notebooks,analysis,experiments} +mkdir -p private/{data,configs} +``` + +### Step 2.2: Create private content manifest +Create `.private-manifest` (tracked in private repo, excluded from export): +``` +# Paths that exist only in the private repo. +# The export script excludes these when syncing to public. +research/ +private/ +.private-manifest +``` + +### Step 2.3: Update private repo .gitignore +Add to `.gitignore`: +``` +# Private data files (never commit even to private repo) +private/data/**/*.h5 +private/data/**/*.hdf5 +private/configs/secrets.* +``` + +### Step 2.4: Initial commit of private structure +```bash +git add research/ private/ .private-manifest +git commit -m "feat: add private research and data directories + +Establishes private-only directories for research code, analysis +notebooks, experimental features, and private data/configs. +These directories are excluded from public repo syncs. + +Co-Authored-By: Claude " +git push origin master +``` + +## Phase 3: Create Export Script (~1 hour) + +### Step 3.1: Create `scripts/export-to-public.sh` in private repo + +This script: +1. Reads `.private-manifest` for paths to exclude +2. Uses `rsync` to sync everything else to the local public clone +3. Commits and optionally tags in the public clone +4. Pushes to the public GitHub repo + +```bash +#!/bin/bash +# export-to-public.sh -- Sync public-safe content from private to public repo +# +# Usage: +# ./scripts/export-to-public.sh # Sync without tagging +# ./scripts/export-to-public.sh v0.4.0 # Sync and tag for release +# ./scripts/export-to-public.sh --dry-run # Show what would be synced + +set -euo pipefail + +PRIVATE_REPO="$(cd "$(dirname "$0")/.." && pwd)" +PUBLIC_REPO="${PRIVATE_REPO}/../SolarWindPy" +MANIFEST="${PRIVATE_REPO}/.private-manifest" + +# Parse args +DRY_RUN=false +VERSION_TAG="" +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + v*) VERSION_TAG="$arg" ;; + *) echo "Unknown arg: $arg"; exit 1 ;; + esac +done + +# Validate +[ -d "$PUBLIC_REPO/.git" ] || { echo "Error: Public repo not found at $PUBLIC_REPO"; exit 1; } +[ -f "$MANIFEST" ] || { echo "Error: .private-manifest not found"; exit 1; } + +# Build rsync exclude list from manifest +EXCLUDES=() +while IFS= read -r line; do + [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue + EXCLUDES+=(--exclude="$line") +done < "$MANIFEST" + +# Always exclude git directory and build artifacts +EXCLUDES+=(--exclude=".git" --exclude="dist/" --exclude="*.egg-info/" + --exclude="htmlcov/" --exclude="__pycache__/" --exclude=".pytest_cache/" + --exclude=".eggs/" --exclude="build/" --exclude="tmp/" + --exclude="staged-recipes*/" --exclude="*.pyc") + +echo "=== Export to Public Repo ===" +echo "Private: $PRIVATE_REPO" +echo "Public: $PUBLIC_REPO" +echo "Excludes: ${EXCLUDES[*]}" + +if [ "$DRY_RUN" = true ]; then + echo "--- DRY RUN ---" + rsync -avn --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + exit 0 +fi + +# Sync +rsync -av --delete "${EXCLUDES[@]}" "$PRIVATE_REPO/" "$PUBLIC_REPO/" + +# Commit changes in public repo +cd "$PUBLIC_REPO" +if [ -n "$(git status --porcelain)" ]; then + git add -A + git commit -m "$(cat < +EOF +)" + echo "Committed changes to public repo" +else + echo "No changes to commit" +fi + +# Tag if requested +if [ -n "$VERSION_TAG" ]; then + git tag -a "$VERSION_TAG" -m "SolarWindPy $VERSION_TAG" + echo "Tagged as $VERSION_TAG" + echo "" + echo "Ready to push. Run:" + echo " cd $PUBLIC_REPO && git push origin master --tags" +fi + +echo "=== Export complete ===" +``` + +### Step 3.2: Make executable and commit +```bash +chmod +x scripts/export-to-public.sh +git add scripts/export-to-public.sh +git commit -m "feat(scripts): add export-to-public sync script + +Provides one-command sync from private dev repo to public release +repo, reading .private-manifest for paths to exclude. + +Co-Authored-By: Claude " +``` + +## Phase 4: CI/CD Configuration (~30 min) + +### Private repo CI/CD +No file changes needed. Existing CI/CD workflows work as-is because the private +repo is a superset of the public repo (same structure, same files, same tests). + +### Public repo CI/CD stays as-is +The `publish.yml` workflow already triggers on `v*` tags and publishes to PyPI. +The `docs.yml` workflow already builds documentation. No changes needed. + +### Optional: Disable publishing from private repo +To prevent accidental publishing from the wrong repo, remove the `PYPI_API_TOKEN` +secret from the private repo's GitHub settings. The private repo's `publish.yml` +would fail on tag push (harmless), ensuring you never accidentally publish from +the wrong source. + +**Action**: GitHub Settings (no file changes) -> Private repo -> Settings -> Secrets -> Remove `PYPI_API_TOKEN` + +## Phase 5: Development Workflow (Reference) + +### Daily development +```bash +cd ~/observatories/code/SolarWindPy-dev # Work in private repo +git checkout -b feature/my-feature # Feature branch +# ... develop, test, commit ... +git push origin feature/my-feature # Push to private +# Create PR on private repo (for your own tracking) +``` + +### Research & experiments +```bash +cd ~/observatories/code/SolarWindPy-dev +# Research code goes in research/ -- never synced to public +vim research/notebooks/ion_temperature_analysis.ipynb +vim research/experiments/new_fitting_approach.py +git add research/ +git commit -m "research: preliminary ion temperature analysis" +git push origin master +``` + +### Release workflow +```bash +cd ~/observatories/code/SolarWindPy-dev +# 1. Ensure master is clean and tests pass +pytest -q +# 2. Update CHANGELOG.md +# 3. Dry-run the export +./scripts/export-to-public.sh --dry-run +# 4. Export and tag +./scripts/export-to-public.sh v0.4.0 +# 5. Push to public +cd ~/observatories/code/SolarWindPy +git push origin master --tags +# 6. publish.yml fires -> PyPI -> conda-forge +``` + +### Pulling community contributions (if any) +```bash +cd ~/observatories/code/SolarWindPy-dev +git fetch public +git merge public/master # Pull any external PRs into private +``` + +## Phase 6: Keeping Repos in Sync (Reference) + +### When to sync +- **Always sync before release**: Run export script before tagging +- **Don't sync on every commit**: The public repo receives batch updates at release time +- **Pull from public after external PRs**: If someone contributes to the public repo + +### Conflict prevention +- All development happens in private repo -> public is never ahead of private + (except for external contributions, which are rare for a solo project) +- The export script uses `rsync --delete`, so public repo always matches the + private repo's public-safe content exactly + +### Private content safety +- `.private-manifest` lists all private-only paths +- `rsync --delete` with exclusions ensures private paths never appear in public +- No complex git gymnastics needed -- it's just a file sync + +## Risk Analysis + +| Risk | Severity | Probability | Mitigation | +|------|----------|-------------|------------| +| Forget to sync before release | Medium | Medium | Add to CHANGELOG/release checklist | +| Private content leaks to public | High | Very Low | `.private-manifest` + `rsync` excludes | +| setuptools_scm version mismatch | Medium | Low | Both repos have same tags; verify with `python -m setuptools_scm` | +| Divergent histories after external PR | Low | Low | `git fetch public && git merge public/master` | +| Accidental PyPI publish from private | Medium | Low | Remove PYPI_API_TOKEN from private repo secrets | + +## Cost Analysis (GitHub Organization Question) + +If an org is desired later, these are the costs: +- **Free tier**: Unlimited public + private repos, 2,000 CI min/month. No branch protection on private repos. +- **Team**: $4/user/month. Adds branch protection, required reviewers. +- **Recommendation**: Not needed now. Revisit if the project grows to multiple contributors. + +## Verification Checklist + +After implementation, verify: +- [ ] `git remote -v` in private clone shows both `origin` and `public` +- [ ] `pytest -q` passes in private repo +- [ ] `./scripts/export-to-public.sh --dry-run` shows correct file list +- [ ] Private-only directories (`research/`, `private/`) do NOT appear in dry-run output +- [ ] `python -m setuptools_scm` reports correct version in both repos +- [ ] `.claude/` infrastructure works normally in private repo +- [ ] Public repo CI passes after first sync + +## Critical Files + +- `SolarWindPy/.gitignore` -- needs private-data patterns (in private clone only) +- `SolarWindPy/pyproject.toml` -- URLs stay as-is (public repo) +- `SolarWindPy/.pre-commit-config.yaml` -- works in both repos unchanged +- New: `scripts/export-to-public.sh` -- the sync script (private repo only) +- New: `.private-manifest` -- private content manifest (private repo only) diff --git a/pyproject.toml b/pyproject.toml index 2a4b2e0a..57f3e5fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,9 +103,6 @@ dev = [ # Code analysis tools (ast-grep via MCP server, not Python package) "pre-commit>=3.5", # Git hook framework ] -performance = [ - "joblib>=1.3.0", # Parallel execution for TrendFit -] analysis = [ # Interactive analysis environment "jupyterlab>=4.0", diff --git a/solarwindpy/fitfunctions/trend_fits.py b/solarwindpy/fitfunctions/trend_fits.py index 1e389be7..a98326cc 100644 --- a/solarwindpy/fitfunctions/trend_fits.py +++ b/solarwindpy/fitfunctions/trend_fits.py @@ -6,22 +6,12 @@ """ -# import warnings import logging # noqa: F401 -import warnings import numpy as np import pandas as pd import matplotlib as mpl from collections import namedtuple -# Parallel processing support -try: - from joblib import Parallel, delayed - - JOBLIB_AVAILABLE = True -except ImportError: - JOBLIB_AVAILABLE = False - from ..plotting import subplots from . import core from . import gaussians @@ -159,146 +149,24 @@ def make_ffunc1ds(self, **kwargs): ffuncs = pd.Series(ffuncs) self._ffuncs = ffuncs - def make_1dfits(self, n_jobs=1, verbose=0, backend="loky", **kwargs): - r""" - Execute fits for all 1D functions, optionally in parallel. + def make_1dfits(self, **kwargs): + r"""Execute fits for all 1D functions. - Each FitFunction instance represents a single dataset to fit. - TrendFit creates many such instances (one per column), making - this ideal for parallelization. + Removes bad fits from `ffuncs` and saves them in `bad_fits`. Parameters ---------- - n_jobs : int, default=1 - Number of parallel jobs: - - 1: Sequential execution (default, backward compatible) - - -1: Use all available CPU cores - - n>1: Use n cores - Requires joblib: pip install joblib - verbose : int, default=0 - Joblib verbosity level (0=silent, 10=progress) - backend : str, default='loky' - Joblib backend ('loky', 'threading', 'multiprocessing') **kwargs Passed to each FitFunction.make_fit() - - Examples - -------- - >>> # TrendFit creates one FitFunction per column - >>> tf = TrendFit(agg_data, Gaussian, ffunc1d=Gaussian) - >>> tf.make_ffunc1ds() # Creates instances - >>> - >>> # Fit all instances sequentially (default) - >>> tf.make_1dfits() - >>> - >>> # Fit in parallel using all cores - >>> tf.make_1dfits(n_jobs=-1) - >>> - >>> # With progress display - >>> tf.make_1dfits(n_jobs=-1, verbose=10) - - Notes - ----- - Parallel execution returns complete fitted FitFunction objects from worker - processes, which incurs serialization overhead. This overhead typically - outweighs parallelization benefits for simple fits. Parallelization is - most beneficial for: - - - Complex fitting functions with expensive computations - - Large datasets (>1000 points per fit) - - Batch processing of many fits (>50) - - Systems with many CPU cores and sufficient memory - - For typical Gaussian fits on moderate data, sequential execution (n_jobs=1) - may be faster due to Python's GIL and serialization overhead. - - Removes bad fits from `ffuncs` and saves them in `bad_fits`. """ # Successful fits return None, which pandas treats as NaN. return_exception = kwargs.pop("return_exception", True) - # Filter out parallelization parameters from kwargs before passing to make_fit() - # These are specific to make_1dfits() and should not be passed to individual fits - fit_kwargs = { - k: v for k, v in kwargs.items() if k not in ["n_jobs", "verbose", "backend"] - } - - # Check if parallel execution is requested and possible - if n_jobs != 1 and len(self.ffuncs) > 1: - if not JOBLIB_AVAILABLE: - warnings.warn( - f"joblib not installed. Install with 'pip install joblib' " - f"for parallel processing of {len(self.ffuncs)} fits. " - f"Falling back to sequential execution.", - UserWarning, - ) - n_jobs = 1 - else: - # Parallel execution - return fitted objects to preserve TrendFit architecture - def fit_single_from_data( - column_name, x_data, y_data, ffunc_class, ffunc_kwargs - ): - """Create and fit FitFunction, return both result and fitted object.""" - # Create new FitFunction instance in worker process - ffunc = ffunc_class(x_data, y_data, **ffunc_kwargs) - fit_result = ffunc.make_fit( - return_exception=return_exception, **fit_kwargs - ) - # Return tuple: (fit_result, fitted_object) - return (fit_result, ffunc) - - # Prepare minimal data for each fit - fit_tasks = [] - for col_name, ffunc in self.ffuncs.items(): - x_data = ffunc.observations.raw.x - y_data = ffunc.observations.raw.y - ffunc_class = type(ffunc) - # Extract constructor kwargs from ffunc (constraints, etc.) - ffunc_kwargs = { - "xmin": getattr(ffunc, "xmin", None), - "xmax": getattr(ffunc, "xmax", None), - "ymin": getattr(ffunc, "ymin", None), - "ymax": getattr(ffunc, "ymax", None), - "xoutside": getattr(ffunc, "xoutside", None), - "youtside": getattr(ffunc, "youtside", None), - } - # Remove None values - ffunc_kwargs = { - k: v for k, v in ffunc_kwargs.items() if v is not None - } - - fit_tasks.append( - (col_name, x_data, y_data, ffunc_class, ffunc_kwargs) - ) - - # Run fits in parallel and get both results and fitted objects - parallel_output = Parallel( - n_jobs=n_jobs, verbose=verbose, backend=backend - )( - delayed(fit_single_from_data)( - col_name, x_data, y_data, ffunc_class, ffunc_kwargs - ) - for col_name, x_data, y_data, ffunc_class, ffunc_kwargs in fit_tasks - ) - - # Separate results and fitted objects, update self.ffuncs with fitted objects - fit_results = [] - for idx, (result, fitted_ffunc) in enumerate(parallel_output): - fit_results.append(result) - # CRITICAL: Replace original with fitted object to preserve TrendFit architecture - col_name = self.ffuncs.index[idx] - self.ffuncs[col_name] = fitted_ffunc - - # Convert to Series for bad fit handling - fit_success = pd.Series(fit_results, index=self.ffuncs.index) - - if n_jobs == 1: - # Original sequential implementation (unchanged) - fit_success = self.ffuncs.apply( - lambda x: x.make_fit(return_exception=return_exception, **fit_kwargs) - ) + fit_success = self.ffuncs.apply( + lambda x: x.make_fit(return_exception=return_exception, **kwargs) + ) - # Handle failed fits (original code, unchanged) + # Handle failed fits bad_idx = fit_success.dropna().index bad_fits = self.ffuncs.loc[bad_idx] self._bad_fits = bad_fits diff --git a/tests/fitfunctions/test_trend_fits_advanced.py b/tests/fitfunctions/test_trend_fits_advanced.py index 3e42b31c..1baf9708 100644 --- a/tests/fitfunctions/test_trend_fits_advanced.py +++ b/tests/fitfunctions/test_trend_fits_advanced.py @@ -1,14 +1,12 @@ -"""Test Phase 4 performance optimizations.""" +"""Test TrendFit advanced features.""" import time -import warnings import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd import pytest -from unittest.mock import patch from solarwindpy.fitfunctions import Gaussian, Line from solarwindpy.fitfunctions.trend_fits import TrendFit @@ -16,219 +14,6 @@ matplotlib.use("Agg") # Non-interactive backend for testing -class TestTrendFitParallelization: - """Test TrendFit parallel execution.""" - - def setup_method(self): - """Create test data for reproducible tests.""" - np.random.seed(42) - x = np.linspace(0, 10, 50) - self.data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 50) - for i in range(10) - }, - index=x, - ) - - def test_backward_compatibility(self): - """Verify default behavior unchanged.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work without n_jobs parameter (default behavior) - tf.make_1dfits() - assert len(tf.ffuncs) > 0 - assert hasattr(tf, "_bad_fits") - - def test_parallel_sequential_equivalence(self): - """Verify parallel gives same results as sequential.""" - # Sequential execution - tf_seq = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - tf_seq.make_1dfits(n_jobs=1) - - # Parallel execution - tf_par = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - tf_par.make_1dfits(n_jobs=2) - - # Should have same number of successful fits - assert len(tf_seq.ffuncs) == len(tf_par.ffuncs) - - # Compare all fit parameters - for key in tf_seq.ffuncs.index: - assert ( - key in tf_par.ffuncs.index - ), f"Fit {key} missing from parallel results" - - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - - # Parameters should match within numerical precision - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_parallel_execution_correctness(self): - """Verify parallel execution works correctly, acknowledging Python GIL limitations.""" - # Check if joblib is available - if not, test falls back gracefully - try: - import joblib # noqa: F401 - - joblib_available = True - except ImportError: - joblib_available = False - - # Create test dataset - focus on correctness rather than performance - x = np.linspace(0, 10, 100) - data = pd.DataFrame( - { - f"col_{i}": 5 * np.exp(-((x - 5) ** 2) / 2) - + np.random.normal(0, 0.1, 100) - for i in range(20) # Reasonable number of fits - }, - index=x, - ) - - # Time sequential execution - tf_seq = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_seq.make_ffunc1ds() - start = time.perf_counter() - tf_seq.make_1dfits(n_jobs=1) - seq_time = time.perf_counter() - start - - # Time parallel execution with threading - tf_par = TrendFit(data, Gaussian, ffunc1d=Gaussian) - tf_par.make_ffunc1ds() - start = time.perf_counter() - tf_par.make_1dfits(n_jobs=4, backend="threading") - par_time = time.perf_counter() - start - - speedup = seq_time / par_time if par_time > 0 else float("inf") - - print( - f"Sequential time: {seq_time:.3f}s, fits: {len(tf_seq.ffuncs)}" # noqa: E231 - ) - print( - f"Parallel time: {par_time:.3f}s, fits: {len(tf_par.ffuncs)}" # noqa: E231 - ) - print( - f"Speedup achieved: {speedup:.2f}x (joblib available: {joblib_available})" # noqa: E231 - ) - - if joblib_available: - # Main goal: verify parallel execution works and produces correct results - # Note: Due to Python GIL and serialization overhead, speedup may be minimal - # or even negative for small/fast workloads. This is expected behavior. - assert ( - speedup > 0.05 - ), f"Parallel execution extremely slow, got {speedup:.2f}x" # noqa: E231 - print( - "NOTE: Python GIL and serialization overhead may limit speedup for small workloads" - ) - else: - # Without joblib, both should be sequential (speedup ~1.0) - # Widen tolerance to 1.5 for timing variability across platforms - assert ( - 0.5 <= speedup <= 1.5 - ), f"Expected ~1.0x speedup without joblib, got {speedup:.2f}x" # noqa: E231 - - # Most important: verify both produce the same number of successful fits - assert len(tf_seq.ffuncs) == len( - tf_par.ffuncs - ), "Parallel and sequential should have same success rate" - - # Verify results are equivalent (this is the key correctness test) - for key in tf_seq.ffuncs.index: - if key in tf_par.ffuncs.index: # Both succeeded - seq_popt = tf_seq.ffuncs[key].popt - par_popt = tf_par.ffuncs[key].popt - for param in seq_popt: - np.testing.assert_allclose( - seq_popt[param], - par_popt[param], - rtol=1e-10, - atol=1e-10, - err_msg=f"Parameter {param} differs between sequential and parallel", - ) - - def test_joblib_not_installed_fallback(self): - """Test graceful fallback when joblib unavailable.""" - # Mock joblib as unavailable - with patch.dict("sys.modules", {"joblib": None}): - # Force reload to simulate joblib not being installed - import solarwindpy.fitfunctions.trend_fits as tf_module - - # Temporarily mock JOBLIB_AVAILABLE - original_available = tf_module.JOBLIB_AVAILABLE - tf_module.JOBLIB_AVAILABLE = False - - try: - tf = tf_module.TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - tf.make_1dfits(n_jobs=-1) # Request parallel - - # Should warn about joblib not being available - assert len(w) == 1 - assert "joblib not installed" in str(w[0].message) - assert "parallel processing" in str(w[0].message) - - # Should still complete successfully with sequential execution - assert len(tf.ffuncs) > 0 - finally: - # Restore original state - tf_module.JOBLIB_AVAILABLE = original_available - - def test_n_jobs_parameter_validation(self): - """Test different n_jobs parameter values.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test various n_jobs values - for n_jobs in [1, 2, -1]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - tf_test.make_1dfits(n_jobs=n_jobs) - assert len(tf_test.ffuncs) > 0, f"n_jobs={n_jobs} failed" - - def test_verbose_parameter(self): - """Test verbose parameter doesn't break execution.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Should work with verbose output (though we can't easily test the output) - tf.make_1dfits(n_jobs=2, verbose=0) - assert len(tf.ffuncs) > 0 - - def test_backend_parameter(self): - """Test different joblib backends.""" - tf = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf.make_ffunc1ds() - - # Test different backends (may not all be available in all environments) - for backend in ["loky", "threading"]: - tf_test = TrendFit(self.data, Gaussian, ffunc1d=Gaussian) - tf_test.make_ffunc1ds() - try: - tf_test.make_1dfits(n_jobs=2, backend=backend) - assert len(tf_test.ffuncs) > 0, f"Backend {backend} failed" - except ValueError: - # Some backends may not be available in all environments - pytest.skip( - f"Backend {backend} not available in this environment" # noqa: E713 - ) - - class TestResidualsEnhancement: """Test residuals use_all parameter.""" @@ -400,7 +185,7 @@ def test_complete_workflow(self): * np.exp(-((x - (10 + i * 0.2)) ** 2) / (2 * (2 + i * 0.1) ** 2)) + np.random.normal(0, 0.05, 200) ) - for i in range(25) # 25 measurements for good parallelization test + for i in range(25) }, index=x, ) @@ -409,9 +194,8 @@ def test_complete_workflow(self): tf = TrendFit(data, Gaussian, ffunc1d=Gaussian) tf.make_ffunc1ds() - # Fit with parallelization start_time = time.perf_counter() - tf.make_1dfits(n_jobs=-1, verbose=0) + tf.make_1dfits() execution_time = time.perf_counter() - start_time # Verify results From 2769a19a9684f6d8bb0fb4faccb9c1297f319be9 Mon Sep 17 00:00:00 2001 From: blalterman Date: Sun, 26 Apr 2026 11:56:01 -0400 Subject: [PATCH 2/3] chore: commit to push --- .claude/compacted_state.md | 161 +++- ...2-core-rewrite-characterization-testing.md | 881 ++++++++++++++++++ 2 files changed, 993 insertions(+), 49 deletions(-) create mode 100644 plans/v2-core-rewrite-characterization-testing.md diff --git a/.claude/compacted_state.md b/.claude/compacted_state.md index 5f0035e2..dfdb1fc4 100644 --- a/.claude/compacted_state.md +++ b/.claude/compacted_state.md @@ -1,62 +1,125 @@ -# Compacted State: FitFunctions Phase 6 Execution +# Compacted Context State - 2026-02-10T16:37:25Z -## Branch: plan/fitfunctions-audit-execution @ e0ca3659 +## Compaction Metadata +- **Timestamp**: 2026-02-10T16:37:25Z +- **Branch**: master +- **Plan**: tests-audit +- **Pre-Compaction Context**: ~7,116 tokens (1,514 lines) +- **Target Compression**: light (20% reduction) +- **Target Tokens**: ~5,692 tokens +- **Strategy**: light compression with prose focus -## Current Status -| Stage | Status | Notes | -|-------|--------|-------| -| 1. Merge | ✅ DONE | Bug fix committed e0ca3659 | -| 2. Environment | 🔧 BLOCKED | Editable install wrong dir | -| 3-7 | ⏳ Pending | After env fix | +## Content Analysis +- **Files Analyzed**: 6 +- **Content Breakdown**: + - Code: 337 lines + - Prose: 390 lines + - Tables: 13 lines + - Lists: 314 lines + - Headers: 179 lines +- **Token Estimates**: + - Line-based: 4,542 + - Character-based: 12,568 + - Word-based: 7,900 + - Content-weighted: 3,456 + - **Final estimate**: 7,116 tokens -## Critical Blocker -**Problem**: Tests run against wrong installation +## Git State +### Current Branch: master +### Last Commit: 51cff45f - feat(core): Update ReferenceAbundances to Asplund 2021 with year selection (#424) (blalterman, 2 weeks ago) + +### Recent Commits: ``` -pip show solarwindpy | grep Editable -# Returns: SolarWindPy-2 (WRONG) -# Should be: SolarWindPy (current directory) +51cff45f (HEAD -> master, origin/master, origin/HEAD) feat(core): Update ReferenceAbundances to Asplund 2021 with year selection (#424) +a910f985 feat(solar_activity): Add ICMECAT class for HELIO4CAST ICME catalog access (#425) +df7a4708 chore: dead code cleanup - remove ~2,450 lines of commented code (#419) +487fbccc feat(data): add Asplund 2009 photospheric reference abundances +296ac07c feat(fitfunctions): add hinge, composite, and Heaviside fit functions (#422) ``` -**Solution**: -```bash -pip uninstall -y solarwindpy -pip install -e ".[dev,performance]" -pytest tests/fitfunctions/test_phase4_performance.py -v +### Working Directory Status: +``` +M .claude/compacted_state.md ``` -## Bug Fix (COMMITTED e0ca3659) -File: `solarwindpy/fitfunctions/trend_fits.py` -- Line 221-223: Filter n_jobs/verbose/backend from kwargs -- Line 241, 285: Use `**fit_kwargs` instead of `**kwargs` - -## Phase 6 Coverage Targets -| Module | Current | Target | Priority | -|--------|---------|--------|----------| -| gaussians.py | 73% | 96% | CRITICAL | -| exponentials.py | 82% | 96% | CRITICAL | -| core.py | 90% | 95% | HIGH | -| trend_fits.py | 80% | 91% | MEDIUM | -| plots.py | 90% | 95% | MEDIUM | -| moyal.py | 86% | 95% | LOW | - -## Parallel Agent Strategy -After Stage 2, launch 6 TestEngineer agents in parallel: -```python -Task(TestEngineer, "gaussians tests", run_in_background=True) -Task(TestEngineer, "exponentials tests", run_in_background=True) -# ... (all 6 modules simultaneously) +### Uncommitted Changes Summary: ``` -Time: 4-5 hrs sequential → 1.5 hrs parallel +.claude/compacted_state.md | 161 +++++++++++++++++++++++++++++++-------------- + 1 file changed, 112 insertions(+), 49 deletions(-) +``` + +## Critical Context Summary + +### Active Tasks (Priority Focus) +- No active tasks identified + +### Recent Key Decisions +- No recent decisions captured + +### Blockers & Issues +⚠️ - **Process Issues**: None - agent coordination worked smoothly throughout +⚠️ - [x] **Document risk assessment matrix** (Est: 25 min) - Create risk ratings for identified issues (Critical, High, Medium, Low) +⚠️ ### Blockers & Issues + +### Immediate Next Steps +➡️ - Notes: Show per-module coverage changes and remaining gaps +➡️ - [x] **Generate recommendations summary** (Est: 20 min) - Provide actionable next steps for ongoing test suite maintenance +➡️ - [x] Recommendations summary providing actionable next steps + +## Session Context Summary -## Key Files -- Plan: `/Users/balterma/.claude/plans/gentle-hugging-sundae.md` -- Handoff: `plans/fitfunctions-audit/phase6-session-handoff.md` +### Active Plan: tests-audit +## Plan Metadata +- **Plan Name**: Physics-Focused Test Suite Audit +- **Created**: 2025-08-21 +- **Branch**: plan/tests-audit +- **Implementation Branch**: feature/tests-hardening +- **PlanManager**: UnifiedPlanCoordinator +- **PlanImplementer**: UnifiedPlanCoordinator with specialized agents +- **Structure**: Multi-Phase +- **Total Phases**: 6 +- **Dependencies**: None +- **Affects**: tests/*, plans/tests-audit/artifacts/, documentation files +- **Estimated Duration**: 12-18 hours +- **Status**: Completed -## Next Actions -1. Fix environment (Stage 2) -2. Verify tests pass -3. Run coverage analysis (Stage 3) -4. Launch parallel agents (Stage 4) + +### Plan Progress Summary +- Plan directory: plans/tests-audit +- Last modified: 2025-08-24 20:27 + +## Session Resumption Instructions + +### 🚀 Quick Start Commands +```bash +# Restore session environment +cd plans/tests-audit && ls -la +git status +pwd # Verify working directory +conda info --envs # Check active environment +``` + +### 🎯 Priority Actions for Next Session +1. Review plan status: cat plans/tests-audit/0-Overview.md +2. Resolve: - **Process Issues**: None - agent coordination worked smoothly throughout +3. Resolve: - [x] **Document risk assessment matrix** (Est: 25 min) - Create risk ratings for identified issues (Critical, High, Medium, Low) +4. Review uncommitted changes and decide on commit strategy + +### 🔄 Session Continuity Checklist +- [ ] **Environment**: Verify correct conda environment and working directory +- [ ] **Branch**: Confirm on correct git branch (master) +- [ ] **Context**: Review critical context summary above +- [ ] **Plan**: Check plan status in plans/tests-audit +- [ ] **Changes**: Review uncommitted changes + +### 📊 Efficiency Metrics +- **Context Reduction**: 20.0% (7,116 → 5,692 tokens) +- **Estimated Session Extension**: 12 additional minutes of productive work +- **Compaction Strategy**: light compression focused on prose optimization --- -*Updated: 2025-12-31 - FitFunctions Phase 6 Execution* +*Automated intelligent compaction - 2026-02-10T16:37:25Z* + +## Compaction File +Filename: `compaction-2026-02-10-163725-20pct.md` - Unique timestamp-based compaction file +No git tags created - using file-based state preservation diff --git a/plans/v2-core-rewrite-characterization-testing.md b/plans/v2-core-rewrite-characterization-testing.md new file mode 100644 index 00000000..1f2650aa --- /dev/null +++ b/plans/v2-core-rewrite-characterization-testing.md @@ -0,0 +1,881 @@ +# SolarWindPy v2.0: Core Rewrite via Characterization Testing + +## Strategy: Test v1 First, Then Rewrite to Pass Same Tests + +``` +Phase A-Core: Write characterization tests against v1 core (A.0-A.6) + Tests pass with v1. Tests ARE the spec. + +Phase A-NonCore: Write characterization tests against v1 non-core modules (A.7-A.10) + Runs IN PARALLEL with Phase B. Pure value for v1. Phase C prerequisites. + +Phase B: Rewrite core/ to xarray, passing Phase A-Core tests. + When v2 passes all A-Core tests → behavioral equivalence guaranteed. + +Phase C: (Future) Incrementally migrate fitting/plotting/activity/instabilities. + Phase A-NonCore tests already exist → same TDD guarantee as core. +``` + +``` +Timeline: + Phase A-Core ──────→ Phase B (must pass A-Core) + │ + └── Phase A-NonCore ──→ (runs in parallel with B, tests v1) + ↓ + Phase C (future, already has tests) +``` + +**Why this approach:** +- Tests capture ACTUAL v1 behavior, including innovations we might miss +- v1 gets better tested immediately (pure value, zero risk) +- v2 correctness is guaranteed by passing the same tests +- No guessing about physics formulas — v1 output IS ground truth +- Core rewrite scope: 4,317 lines (21%), non-core tested but untouched: 15,924 lines +- Non-core characterization tests are Phase C prerequisites — future rewrites get TDD "for free" +- A-NonCore runs in parallel with Phase B — no added wall-clock time on the critical path + +--- + +## Decision Record + +| Question | Answer | +|----------|--------| +| Package name | Same `solarwindpy`, version 2.0 | +| Rewrite scope | **Core only** (4,317 lines, 21% of codebase) | +| Fitting/plotting/activity/instabilities | **Stay as-is** (15,924 lines untouched) | +| Data backend | xarray (replaces pandas MultiIndex) | +| AI-friendly | First-class design goal | +| Test strategy | **Characterization tests against v1 → rewrite v2 to pass them** | +| Non-core test strategy | **Characterization tests for all modules** (A-NonCore ∥ Phase B) | +| Parallel agents | Yes — phases with independent work parallelized | +| Compatibility bridge | v2 DataArray → `.values`/`.to_series()` for existing modules | +| Species string spec | PRESERVED — tested against v1 behavior | +| `__call__` component access | PRESERVED — `velocity("x")`, `thermal_speed("par")` | +| All 15 core innovations | PRESERVED — characterization tests capture them | +| All 32 non-core innovations | **TESTED + PRESERVED** — characterization tests written, modules untouched | + +--- + +## Innovation Registry + +### Scope A: Core Innovations (15 — tested and reimplemented) + +| # | Innovation | v1 Location | Phase A Test Strategy | +|---|-----------|-------------|----------------------| +| 1 | Species string spec DSL | `base.py:136-175`, `plasma.py:505-528` | Test all 3 calling patterns with known data | +| 2 | `__getattr__` ion access | `plasma.py:208-212` | Test `plasma.p1` returns correct Ion | +| 3 | `__call__` component access | `vector.py:33-46`, `tensor.py:30-48` | Test `velocity("x")` == `velocity.x` | +| 4 | Calculation chain orchestration | `plasma.py` (full) | Test intermediate + final values match | +| 5 | Adaptive species return types | `plasma.py:798-895` | Test shape/type for single/multi/combined | +| 6 | Scalar w auto-derivation | `plasma.py:744-762` | Test `sqrt((par^2 + 2*per^2)/3)` at init | +| 7 | Tensor par/per/scalar contract | `tensor.py:9-91` | Test rejection of wrong components | +| 8 | Polytropic index context | `units_constants.py:37` | Test cs uses `{par:3, per:2, scalar:5/3}` | +| 9 | lnlambda/nuc/nc validation chain | `plasma.py:1353-1552` | Test rejection of invalid species combos | +| 10 | `estimate_electrons` | `plasma.py:1639-1727` | Test charge neutrality + T_e=T_p | +| 11 | `,` syntax for turbulence | `plasma.py:1776-1822` | Test `"p1,a"` velocity/density separation | +| 12 | `raffaella_version` flag | `alfvenic_turbulence.py:61-62` | Test GSE→RTN + Parker spiral projection | +| 13 | Save/load modifier functions | `plasma.py:296-397` | Test roundtrip with transform | +| 14 | Data quality logging | `plasma.py:678-722` | Test statistics output | +| 15 | BField minimal specialization | `vector.py:294-329` | Test Vector + pressure property | + +### Scope B: Non-Core Innovations (32 — characterization tested via A-NonCore, modules untouched) + +These modules stay as-is but gain comprehensive characterization tests. Phase A-NonCore tests: +- **Fitting (8):** TeXinfo, named tuple tracking, scipy integration, 6 piecewise, composites, joblib, FFPlot, metaclass docstring inheritance +- **Plotting (7):** Species label DSL, aggregation pipeline, Numba spiral, cell filtering, OrbitPlot mixin, 7 normalization modes, bin edge strategies +- **Solar Activity (7):** Dual-layer cache, JSON metadata, extrema calculator, IntervalIndex cycle assignment, Rise/Fall labeling, ICMECAT fallbacks, vectorized containment +- **Instabilities (4):** Verscharen parametric table, sentinel validation, NaN-safe thresholds, table legend +- **Engineering (6):** Import aliases, pandas strict mode, year-selectable abundances, dynamically constructed Ion dict, Ion reconstruction design, view-based memory model + +**Current test coverage audit (what A-NonCore addresses):** + +| Module | LOC | Methods | Existing Tests | Coverage | Priority | +|--------|-----|---------|---------------|----------|----------| +| instabilities/ | 727 | 22 | 0 | **0%** | CRITICAL | +| solar_activity/ | 2,096 | 92 | 50 | ~30% | CRITICAL | +| plotting/ | 7,362 | ~280 | 207 | ~35-45% | HIGH | +| fitfunctions/ | 5,563 | 188 | 240+ | ~75% | GAP-FILL | + +--- + +## Phase A: Characterization Tests Against v1 + +**Goal:** Write tests comprehensive enough that if v2 passes them all, we trust it. + +**Principle:** Tests assert on INTERFACE and NUMERICAL OUTPUT, not implementation details. +- YES: `assert result.values == expected_array` (numerical equivalence) +- YES: `assert result.name == "p1+a"` (interface contract) +- YES: `assert isinstance(plasma.p1, Ion)` (type contract) +- NO: `assert isinstance(result, pd.Series)` (implementation detail — will change) +- NO: `assert result.index.name == "Epoch"` (pandas-specific) + +**Test fixtures:** Create synthetic plasma data with known values so we can predict exact outputs. + +```python +# Fixture: known-value plasma for characterization +@pytest.fixture +def known_plasma(): + """Plasma with hand-calculated values for verification. + + Species: p1 (protons), a (alphas) + Time points: 5 + All values chosen so derived quantities are exact (no floating point ambiguity). + """ + # ... construct v1 Plasma with known data ... + return plasma +``` + +### A.0: Species Spec Tests + +**File:** `tests/v2_characterization/test_species_spec.py` + +```python +class TestConformSpecies: + def test_single(self): ... # "p1" -> ("p1",) + def test_plus_splits_and_sorts(self): ... # "p1+a" -> ("a", "p1") + def test_multi_args_sorted(self): ... # "p1", "a" -> ("a", "p1") + def test_rejects_plus_with_multi(self): ... # "p1+a", "p2" -> ValueError + def test_rejects_comma(self): ... # "a,p1" -> ValueError + def test_single_species_returns_len1(self): ... # "p1+a" -> len == 1 + +class TestChkSpecies: + def test_valid_species(self): ... # "p1" when plasma has p1 + def test_invalid_species_error(self): ... # "o6" when not in plasma + def test_error_message_helpful(self): ... # lists available species +``` + +**v1 reference:** `core/base.py:136-175`, `core/plasma.py:505-528` + +### A.1: Vector3D + Tensor3D Tests + +**File:** `tests/v2_characterization/test_vector_tensor.py` + +```python +class TestVector: + # Core operations — capture v1 numerical output + def test_magnitude(self, known_vector): ... + def test_unit_vector(self, known_vector): ... + def test_dot_product(self, v1, v2): ... + def test_cross_product(self, v1, v2): ... + def test_rho_lat_lon(self, known_vector): ... + def test_cos_theta(self, v1, v2): ... + + # Callable component access — innovation #3 + def test_call_x(self, known_vector): ... # vector("x") + def test_call_equivalent_to_attr(self, v): ... # vector("x") == vector.x + + # Projection — captures exact decomposition + def test_project_parallel(self, velocity, bfield): ... + def test_project_perpendicular(self, velocity, bfield): ... + def test_project_reconstructs_original(self, v, b): ... # v_par + v_perp == v + +class TestTensor: + def test_par_per_scalar_access(self, known_tensor): ... + def test_call_par(self, known_tensor): ... # tensor("par") + def test_scalar_formula(self, known_tensor): ... # sqrt((par^2 + 2*per^2)/3) + def test_magnitude(self, known_tensor): ... + def test_anisotropy(self, known_tensor): ... # per/par ratio + def test_rejects_wrong_components(self): ... # must be par/per/scalar + +class TestBField: + def test_inherits_vector(self, bfield): ... + def test_pressure(self, bfield): ... # B^2/(2*mu_0), exact values + def test_call_component(self, bfield): ... # bfield("x") +``` + +**v1 reference:** `core/vector.py` (328 lines), `core/tensor.py` (90 lines) + +### A.2: Ion Tests + +**File:** `tests/v2_characterization/test_ion.py` + +```python +class TestIonProperties: + # Each test captures v1's exact numerical output for known inputs + def test_n(self, known_ion): ... # number density passthrough + def test_velocity_is_vector(self, known_ion): ... # returns Vector type + def test_thermal_speed_is_tensor(self, known_ion): ... + def test_rho(self, known_ion): ... # n * m, exact values + def test_temperature_par(self, known_ion): ... # T = m/(2*k_B) * w^2 + def test_temperature_per(self, known_ion): ... + def test_temperature_scalar(self, known_ion): ... + def test_pth_par(self, known_ion): ... # p = 0.5 * rho * w^2 + def test_pth_per(self, known_ion): ... + def test_pth_scalar(self, known_ion): ... + def test_sound_speed_par(self, known_ion): ... # gamma_par = 3.0 + def test_sound_speed_per(self, known_ion): ... # gamma_per = 2.0 + def test_sound_speed_scalar(self, known_ion): ... # gamma_scalar = 5/3 + def test_anisotropy(self, known_ion): ... # w_per^2 / w_par^2 + def test_specific_entropy(self, known_ion): ... + def test_kinetic_energy_flux(self, known_ion): ... + +class TestIonCaching: + def test_velocity_cached(self, ion): ... # same object on repeated access + def test_thermal_speed_cached(self, ion): ... +``` + +**v1 reference:** `core/ions.py` (311 lines) + +### A.3: Plasma Species Spec Tests (THE critical test suite) + +**File:** `tests/v2_characterization/test_plasma.py` + +```python +class TestPlasmaConstruction: + def test_species_list(self, plasma): ... # plasma.species -> ["a", "p1"] + def test_scalar_w_auto_calculated(self, plasma): ... # captures v1 init behavior + +class TestPlasmaIonAccess: + def test_getattr_returns_ion(self, plasma): ... # plasma.p1 -> Ion + def test_getattr_cached(self, plasma): ... # plasma.p1 is plasma.p1 + def test_getattr_unknown_raises(self, plasma): ... # helpful error + +class TestPlasmaNumberDensity: + # Single species + def test_n_single_values(self, plasma): ... # exact numerical output + def test_n_single_name(self, plasma): ... # result labeled "p1" + + # Combined (+ combinator) + def test_n_combined_values(self, plasma): ... # sum of n_p1 + n_a + def test_n_combined_name(self, plasma): ... # labeled "p1+a" + + # Multiple (comparison) + def test_n_multi_shape(self, plasma): ... # has species dimension + def test_n_multi_values(self, plasma): ... # both species present + +class TestPlasmaBeta: + def test_beta_single_values(self, plasma): ... # pth / p_B, exact + def test_beta_combined_values(self, plasma): ... # summed pth / p_B + def test_beta_multi_values(self, plasma): ... + def test_beta_has_thermal_components(self, plasma): ... # par, per, scalar + +class TestPlasmaVelocity: + def test_v_single_values(self, plasma): ... + def test_v_combined_is_com(self, plasma): ... # mass-weighted CoM, exact + def test_v_multi_values(self, plasma): ... + +class TestPlasmaAlfvenSpeed: + def test_ca_single(self, plasma): ... + def test_ca_combined(self, plasma): ... # uses total mass density + +class TestPlasmaSoundSpeed: + def test_cs_single(self, plasma): ... + def test_cs_uses_polytropic_context(self, plasma): ... # par=3, per=2, scalar=5/3 + +class TestPlasmaThermalPressure: + def test_pth_single(self, plasma): ... + def test_pth_combined(self, plasma): ... # summed across species + +class TestPlasmaTwoSpecies: + def test_dv_values(self, plasma): ... # v_a - v_p, exact + def test_dv_with_com(self, plasma): ... # v_a - v_CoM + def test_lnlambda_values(self, plasma): ... # Coulomb log, exact + def test_lnlambda_rejects_plus(self, plasma): ... # physics-aware validation + def test_nuc_values(self, plasma): ... # collision frequency, exact + def test_nc_requires_spacecraft(self, plasma): ... # physics-aware validation + +class TestPlasmaEdgeCases: + def test_plus_with_multi_raises(self, plasma): ... # "p1+a", "p2" -> error + def test_thermal_speed_rejects_plus(self, plasma): ... + def test_nan_handling(self, plasma_with_nans): ... # skipna behavior + def test_single_species_plasma(self, single_species_plasma): ... + +class TestPlasmaEstimateElectrons: + def test_requires_protons(self, plasma): ... + def test_electron_density(self, plasma): ... # n_e = sum(Z_i * n_i), exact + def test_electron_temperature(self, plasma): ... # T_e = T_p, exact + +class TestPlasmaDataQuality: + def test_logging_statistics(self, plasma): ... + +class TestPlasmaIO: + def test_save_load_roundtrip(self, plasma, tmp_path): ... + def test_save_with_modifier(self, plasma, tmp_path): ... +``` + +**v1 reference:** `core/plasma.py` (1,901 lines) + +### A.4: Spacecraft Tests + +**File:** `tests/v2_characterization/test_spacecraft.py` + +```python +class TestSpacecraft: + def test_position_is_vector(self, sc): ... + def test_distance_to_sun(self, sc): ... # exact values + def test_carrington_coords(self, sc): ... +``` + +**v1 reference:** `core/spacecraft.py` (256 lines) + +### A.5: Alfvenic Turbulence Tests + +**File:** `tests/v2_characterization/test_turbulence.py` + +```python +class TestAlfvenicTurbulence: + def test_elsasser_zplus(self, turb): ... # z+ = dv + db_A, exact + def test_elsasser_zminus(self, turb): ... # z- = dv - db_A, exact + def test_cross_helicity(self, turb): ... # sigma_c + def test_residual_energy(self, turb): ... # sigma_r + def test_alfven_ratio(self, turb): ... + +class TestCommaSyntax: + def test_vel_from_p1_density_from_a(self, plasma): ... # "p1,a" + def test_comma_with_plus(self, plasma): ... # "p1+p2,p1+p2+a" + +class TestRaffaellaVersion: + def test_gse_to_rtn(self, turb_raff): ... + def test_parker_spiral_projection(self, turb_raff): ... + +class TestAlfvenUnitsConversion: + def test_b_converted_before_averaging(self, turb): ... # order matters +``` + +**v1 reference:** `core/alfvenic_turbulence.py` (804 lines) + +### A.6: Units & Constants Tests + +**File:** `tests/v2_characterization/test_units.py` + +```python +class TestPolytropic: + def test_par_is_3(self): ... + def test_per_is_2(self): ... + def test_scalar_is_5_over_3(self): ... + +class TestConversionFactors: + # Capture v1's exact conversion factors + def test_bfield_to_si(self): ... # 1e-9 + def test_velocity_to_si(self): ... # 1e3 + def test_density_to_si(self): ... # 1e6 + # ... all conversion factors from units_constants.py +``` + +**v1 reference:** `core/units_constants.py` (199 lines) + +### Phase A Execution Strategy + +**Parallel agents within Phase A:** +- Agent 1: A.0 (species spec) + A.6 (units) — foundational, small +- Agent 2: A.1 (vector/tensor) + A.2 (ion) — data containers +- Agent 3: A.3 (plasma) — the big one, can start once fixtures exist + +**Known-value fixtures:** Construct ONE set of synthetic plasma data with hand-calculable values. All characterization tests use this fixture. Example: +```python +# Known values chosen for exact arithmetic: +# n_p1 = 5.0 cm^-3, n_a = 0.2 cm^-3 +# v_p1 = [400, 0, 0] km/s, v_a = [420, 10, 0] km/s +# w_p1 = {par: 30, per: 20} km/s, w_a = {par: 40, per: 25} km/s +# B = [3, -2, 1] nT +# → beta_p1_par = (0.5 * rho_p1 * w_p1_par^2) / (B^2 / 2*mu_0) = [exact value] +``` + +**Verification:** Run `pytest tests/v2_characterization/ -v` against v1. All tests must pass. + +--- + +## Phase A-NonCore: Characterization Tests for Non-Core Modules + +**Goal:** Bring all non-core modules to characterization-test standard. Runs IN PARALLEL with Phase B. + +**Principle:** Same as Phase A-Core — tests assert on INTERFACE and OUTPUT, not implementation. +These tests run against v1 as-is. They provide: +1. Immediate regression safety for v1 +2. Documentation of behavior for 87% undocumented functions +3. Ready-made test suites if Phase C ever rewrites these modules + +**Key difference from A-Core tests:** These tests CAN assert on pandas types (`pd.Series`, `pd.DataFrame`) since we are NOT rewriting these modules. If/when Phase C rewrites a module, the tests would be adapted then, just as A-Core tests were designed implementation-agnostic for Phase B. + +### A.7: Instabilities Tests (CRITICAL — 0% coverage) + +**File:** `tests/v2_characterization/test_instabilities.py` + +**Current state:** ZERO tests. 727 LOC. Contains deprecated `iteritems()` call that crashes. + +```python +class TestVerscharen2016: + # Parametric table — the core data structure + def test_table_shape(self, verscharen): ... # correct dimensions + def test_table_columns(self, verscharen): ... # AIC, FMW, MM, OFI present + def test_growth_rates_available(self, verscharen): ... # -4, -3, -2 + + # Threshold calculations — numerical output for known inputs + def test_aic_threshold(self, verscharen): ... # exact values for known beta + def test_fmw_threshold(self, verscharen): ... + def test_mm_threshold(self, verscharen): ... + def test_ofi_threshold(self, verscharen): ... + + # Interpolation + def test_interpolation_within_range(self, verscharen): ... + def test_interpolation_extrapolation(self, verscharen): ... + + # Validation + def test_nan_safe_thresholds(self, verscharen): ... # NaN input handling + def test_sentinel_fill_values(self, verscharen): ... # invalid data detection + def test_rejects_invalid_instability(self, verscharen): ... + +class TestBetaRPlot: + def test_initialization(self, beta_r_plot): ... + def test_labels(self, beta_r_plot): ... + def test_inherits_hist2d(self, beta_r_plot): ... + +class TestIteritemsFix: + def test_no_iteritems_usage(self): ... # verify deprecated API removed +``` + +**v1 reference:** `instabilities/verscharen2016.py` (630 lines), `instabilities/beta_ani.py` (81 lines) + +### A.8: Solar Activity Tests (CRITICAL — ~30% coverage) + +**File:** `tests/v2_characterization/test_solar_activity.py` + +**Current state:** 50 tests for 92 methods. Critical gaps in LISIRD (2/14), extrema_calculator (2/18), SIDC (2/16). + +```python +class TestBase: + # Dual-layer cache — innovation + def test_cache_stores_data(self, loader): ... + def test_cache_staleness_detection(self, loader): ... + def test_cache_refresh(self, loader): ... + + # ID system + def test_id_construction(self): ... + def test_id_url_generation(self): ... + + # DataLoader + def test_data_loading(self, loader): ... + def test_data_filtering(self, loader): ... + def test_json_metadata(self, loader): ... + +class TestLISIRD: + # Currently only 2 tests for 14 methods + def test_data_retrieval(self, lisird): ... + def test_time_series_format(self, lisird): ... + def test_interpolation(self, lisird): ... + def test_missing_data_handling(self, lisird): ... + def test_multiple_datasets(self, lisird): ... + +class TestExtremaCalculator: + # Currently only 2 tests for 18 methods + def test_find_maxima(self, calc): ... + def test_find_minima(self, calc): ... + def test_cycle_boundaries(self, calc): ... + def test_interval_index_assignment(self, calc): ... # innovation + def test_rise_fall_labeling(self, calc): ... # innovation + def test_edge_cases_near_boundaries(self, calc): ... + +class TestSIDC: + # Currently only 2 tests for 16 methods + def test_sunspot_number_loading(self, sidc): ... + def test_data_format(self, sidc): ... + def test_monthly_vs_daily(self, sidc): ... + def test_smoothing(self, sidc): ... + def test_cycle_number_assignment(self, sidc): ... + +class TestICMECAT: + # Already decent (10 tests), fill gaps + def test_fallback_data_sources(self, icmecat): ... # innovation + def test_vectorized_containment(self, icmecat): ... # innovation + def test_interval_overlap(self, icmecat): ... +``` + +**v1 reference:** `solar_activity/base.py` (604 lines), `solar_activity/lisird/` (711 lines), `solar_activity/sunspot_number/` (558 lines), `solar_activity/icme/` (452 lines) + +### A.9: Plotting Tests (HIGH — ~35-45% coverage) + +**File:** `tests/v2_characterization/test_plotting.py` (split into sub-files as needed) + +**Current state:** 207 tests for ~280 methods. Major gaps in special labels (14/89), hist2d (4/13), spiral (10/30). + +```python +# --- test_plotting_labels.py --- +class TestSpeciesLabelDSL: + # Currently 14 tests for 89 methods — the biggest gap + def test_proton_label(self): ... + def test_alpha_label(self): ... + def test_combined_species_label(self): ... + def test_measurement_label(self): ... + def test_component_label(self): ... + def test_latex_rendering(self): ... + def test_units_formatting(self): ... + # Test each of the 14 specialized label classes + def test_beta_label(self): ... + def test_temperature_label(self): ... + def test_density_label(self): ... + def test_velocity_label(self): ... + def test_thermal_speed_label(self): ... + def test_pressure_label(self): ... + def test_alfven_speed_label(self): ... + +class TestDatetimeLabels: + def test_year_format(self): ... + def test_month_format(self): ... + def test_epoch_format(self): ... + def test_custom_format(self): ... + +# --- test_plotting_hist2d.py --- +class TestHist2D: + # Currently 4 tests for 1,200 LOC — weakest ratio + def test_basic_histogram(self, hist2d): ... + def test_normalization_modes(self, hist2d): ... # 7 modes — innovation + def test_bin_edge_strategies(self, hist2d): ... # innovation + def test_colorbar(self, hist2d): ... + def test_log_scale(self, hist2d): ... + def test_aggregation_pipeline(self, hist2d): ... # innovation + def test_cell_filtering(self, hist2d): ... # innovation + def test_empty_bins(self, hist2d): ... + +# --- test_plotting_spiral.py --- +class TestSpiral: + # Currently 10 tests for 1,074 LOC + def test_numba_binning(self, spiral): ... # innovation + def test_parker_spiral_overlay(self, spiral): ... + def test_carrington_projection(self, spiral): ... + def test_radial_extent(self, spiral): ... + def test_cell_filter_integration(self, spiral): ... + +# --- test_plotting_agg.py --- +class TestAggPlot: + def test_aggregation_pipeline(self, agg): ... # innovation + def test_multiple_aggregations(self, agg): ... + def test_orbit_mixin(self, orbit_plot): ... # innovation +``` + +**v1 reference:** `plotting/labels/` (2,458 lines), `plotting/hist2d.py` (1,200 lines), `plotting/spiral.py` (1,074 lines), `plotting/base.py` (311 lines), `plotting/agg_plot.py` (488 lines) + +### A.10: Fitting Tests (GAP-FILL — ~75% coverage) + +**File:** `tests/v2_characterization/test_fitting.py` (extends existing tests) + +**Current state:** 240+ tests for 188 methods. Best covered, but specific gaps remain. + +```python +# --- test_fitting_composite.py (currently 3 tests for 11 methods) --- +class TestComposite: + def test_gaussian_times_heaviside(self, composite): ... + def test_gaussian_times_heaviside_plus_heaviside(self, composite): ... + def test_parameter_count(self, composite): ... + def test_component_access(self, composite): ... + def test_tex_info_combined(self, composite): ... # innovation: TeXinfo on composites + def test_jacobian(self, composite): ... + def test_bounds(self, composite): ... + +# --- test_fitting_gaussians.py (currently 7 tests for 15 methods) --- +class TestGaussians: + def test_gaussian_fit(self, gauss): ... + def test_skew_gaussian(self, skew_gauss): ... + def test_parameter_names(self, gauss): ... + def test_tex_info_gaussian(self, gauss): ... # innovation: TeXinfo + def test_bounded_fit(self, gauss): ... + +# --- test_fitting_texinfo.py (ensure comprehensive coverage) --- +class TestTeXInfoEngine: + # TeXinfo is a CORE innovation per user + def test_parameter_annotation(self, tex_info): ... + def test_fit_result_annotation(self, tex_info): ... + def test_latex_rendering(self, tex_info): ... + def test_multi_component_annotation(self, tex_info): ... + def test_custom_format(self, tex_info): ... + +# --- test_fitting_metaclass.py --- +class TestFitFunctionMetaclass: + def test_docstring_inheritance(self): ... # innovation + def test_subclass_gets_parent_docs(self): ... + def test_method_docs_inherited(self): ... + +# --- test_fitting_observation_tracking.py --- +class TestObservationTracking: + def test_named_tuple_format(self, fit): ... # innovation + def test_observation_count(self, fit): ... + def test_observation_persistence(self, fit): ... +``` + +**v1 reference:** `fitfunctions/composite.py` (559 lines), `fitfunctions/gaussians.py` (244 lines), `fitfunctions/tex_info.py` (567 lines), `fitfunctions/core.py` (825 lines) + +### Phase A-NonCore Execution Strategy + +**Parallel agents within A-NonCore:** +- Agent 1: A.7 (instabilities) — smallest, highest priority (0% → target 95%) +- Agent 2: A.8 (solar_activity) — critical gaps to fill (~30% → target 95%) +- Agent 3: A.9 (plotting) + A.10 (fitting) — larger but partially covered + +**A-NonCore runs IN PARALLEL with Phase B.** Since A-NonCore tests v1, it has no dependency on the v2 core rewrite. This means: +- No added wall-clock time on the critical path +- v1 gets comprehensive test coverage immediately +- Phase C gets ready-made test suites whenever we decide to rewrite those modules + +**Test approach differences from A-Core:** +- A-NonCore tests CAN use pandas assertions (these modules stay pandas-based) +- A-NonCore tests CAN mock network calls (solar_activity has external data sources) +- A-NonCore tests SHOULD use `@pytest.mark.mpl_image_compare` for plotting (visual regression) +- A-NonCore tests inherit existing conftest.py fixtures where available + +**Verification:** `pytest tests/v2_characterization/noncore/ -v` passes against v1. All green. + +--- + +## Phase B: Core Rewrite with xarray + +**Goal:** Reimplement core/ using xarray, passing ALL Phase A tests. + +**Principle:** Change the import in test fixtures from v1 to v2. Run tests. Fix failures. + +### B.0: Scaffold + Species System + +**Implement:** +- `pyproject.toml` (xarray, pint, pint-xarray, scipy, numpy, matplotlib, numba, joblib) +- `core/base.py` — `_conform_species()`, `_chk_species()` (port from v1) +- `core/species.py` — `SpeciesInfo` (frozen dataclass), `SpeciesRegistry` +- `core/units.py` — pint registry + polytropic index context dict +- mypy strict config + +**Pass:** All A.0 + A.6 tests + +### B.1: Vector3D + Tensor3D + +**Implement:** +- `core/vector.py` — `Vector3D` wrapping `xr.DataArray(dims=[..., "component"])` + - `__call__("x")` for callable component access + - `project(other)` for par/per decomposition + - `rho`, `lat`, `lon` for spherical coordinates +- `core/tensor.py` — `Tensor3D` wrapping `xr.DataArray(dims=[..., "thermal_component"])` + - `__call__("par")` for callable component access + - Hard validation: exactly par/per/scalar +- `core/bfield.py` — `MagneticField(Vector3D)` + pressure property + +**Pass:** All A.1 tests + +**Parallel opportunity:** Vector3D and Tensor3D can be implemented by separate agents. + +### B.2: Ion + +**Implement:** +- `core/ion.py` — Ion backed by `xr.Dataset`, `cached_property` for Vector3D/Tensor3D + - All derived quantities as properties (T, pth, cs, rho, anisotropy, entropy, KE flux) + - Sound speed uses polytropic index context dict + +**Pass:** All A.2 tests + +### B.3: Spacecraft + +**Implement:** +- `core/spacecraft.py` — position, velocity, Carrington coords + +**Pass:** All A.4 tests + +**Parallel opportunity:** B.2 (Ion) and B.3 (Spacecraft) are independent. + +### B.4: Plasma (THE centerpiece) + +**Implement:** +- `core/plasma.py` — Full species string spec, `__getattr__`, all physics methods + - Ions stored in `dict[str, Ion]` + - All physics methods with calculation chains + - Scalar w auto-derivation at init + - `estimate_electrons()` + - lnlambda/nuc/nc validation chain + - Data quality logging + - Save/load with modifier functions + +**Pass:** All A.3 tests — this is the critical milestone + +**Parallel opportunity within B.4:** +- Agent A: Construction, `__getattr__`, ion access +- Agent B: Single-species methods (n, rho, beta, ca, cs, pth, T) +- Agent C: Multi-species methods (dv, lnlambda, nuc, nc) + estimate_electrons + +### B.5: I/O + +**Implement:** +- `io/` — Protocol-based, NetCDF default +- `compat/convert.py` — `from_mcs_dataframe()` +- `Plasma.load()` and `Plasma.save()` with modifier support + +**Pass:** A.3 I/O tests + +### B.6: Alfvenic Turbulence + +**Implement:** +- `core/turbulence.py` — Elsasser, sigma_c, sigma_r + - `,` species syntax preserved + - `raffaella_version` flag preserved + - Alfven units conversion pipeline + +**Pass:** All A.5 tests + +**Parallel opportunity:** B.5 (I/O) and B.6 (Turbulence) are independent. + +### Phase B Compatibility Bridge + +Existing fitting/plotting/activity/instabilities modules consume v2 output via: + +```python +# v2 Plasma returns xr.DataArray +result = plasma.beta("p1") + +# Existing modules can use: +result.values # → numpy array (zero-cost) +result.to_series() # → pandas Series +result.to_dataframe() # → pandas DataFrame +``` + +No changes needed to existing modules. They continue to work. + +--- + +## Full Parallel Execution Plan + +``` +Phase A-Core (A.0-A.6) ─────────────────→ Phase B (core rewrite) + Agent 1: A.0 + A.6 B.0: Scaffold + Species + Units + Agent 2: A.1 + A.2 | + Agent 3: A.3 + A.4 + A.5 ├── B.1a: Vector3D ──┐ + │ | ├── (PARALLEL) + │ └── B.1b: Tensor3D ──┘ + │ | + │ └── B.1c: BField + │ | + │ ├── B.2: Ion ─┐ + │ | ├── (PARALLEL) + │ └── B.3: Spacecraft ─┘ + │ | + │ B.4: Plasma + │ | + │ ├── B.5: I/O ─┐ + │ | ├── (PARALLEL) + │ └── B.6: Turbulence ─┘ + │ + └── Phase A-NonCore (A.7-A.10) ← RUNS IN PARALLEL with Phase B + Agent 1: A.7 (instabilities, 0%→95%) + Agent 2: A.8 (solar_activity, 30%→95%) + Agent 3: A.9 (plotting) + A.10 (fitting) +``` + +**Critical path:** A-Core → B.0 → B.1 → B.2 → B.4 → B.6 +**A-NonCore is OFF critical path** — runs alongside Phase B at zero added cost +**Wall-clock phases:** 6 (A-Core, then 5 Phase B stages with A-NonCore in parallel) + +--- + +## Phase C: Future Incremental Migration (Out of Scope — But Test-Ready) + +These modules work as-is with v2 core. **Phase A-NonCore provides characterization tests** so any future rewrite gets the same TDD guarantee as core. + +| Module | Lines | Priority | A-NonCore Tests | Reason to Migrate | +|--------|-------|----------|-----------------|-------------------| +| instabilities/ | 727 | HIGH | A.7 (0%→95%) | Fix `iteritems()` crash, modernize | +| solar_activity/ | 2,496 | MEDIUM | A.8 (30%→95%) | Could benefit from xarray | +| fitting/ | 5,563 | LOW | A.10 (75%→95%) | Accepts arrays, minimal ceremony | +| plotting/ | 7,138 | LOW | A.9 (40%→95%) | Accepts arrays, Numba stays as-is | + +Phase C can also include: +- Type annotations for non-core modules +- Docstring backfill for non-core modules +- `__init_subclass__` replacing metaclass in FitFunction +- Adapt A-NonCore tests from pandas assertions to implementation-agnostic (same as A-Core) + +--- + +## Internal Architecture: xarray Replaces M/C/S MultiIndex + +### v1 Internal Structure +``` +Plasma._data: pd.DataFrame + columns: MultiIndex(M, C, S) + ("v", "x", "p1"), ("v", "y", "p1"), ("n", "", "p1"), ("b", "x", ""), ... + index: DatetimeIndex("Epoch") +``` + +### v2 Internal Structure +``` +Plasma._ds: xr.Dataset + coords: time, component (x,y,z), species (p1,a), thermal_component (par,per,scalar) + data_vars: + velocity: (time, component, species) + number_density: (time, species) + thermal_speed: (time, thermal_component, species) + magnetic_field: (time, component) +``` + +### What Changes (pandas ceremony eliminated) + +| v1 Pattern | Count | v2 Equivalent | +|------------|-------|---------------| +| `{s: self.ions.loc[s].X for s in slist}` + `pd.concat()` | 6 | `.sel(species=list)` | +| `.T.groupby(level="S").sum().T` | 9 | `.sum(dim="species")` | +| `.multiply(coeff, axis=1, level="C")` | 6+ | `* coeff` (auto-broadcast) | +| `axis=0` / `axis=1` | 253 | Named dimensions | +| `level=` parameters | 65 | Dimension names | +| Empty string `""` hacks | multiple | Dimensions are optional | + +**plasma.py code composition changes:** +- v1: 34% pandas ceremony, 42% physics, 24% method structure +- v2: ~5% xarray ops, 60% physics, 35% method structure + docstrings + +--- + +## Key Design Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| **Test strategy** | **Characterization tests** | v1 output IS ground truth | +| **Rewrite scope** | **Core only (4,317 lines)** | Where ceremony is worst, rest works as-is | +| **Species string spec** | Preserved exactly | Tested against v1 behavior | +| **`plasma.p1` access** | Preserved (`__getattr__`) | Tested against v1 behavior | +| **`velocity("x")` callable** | Preserved (`__call__`) | Tested against v1 behavior | +| **All physics formulas** | v1-equivalent | Characterization tests guarantee it | +| Data backend | xarray Dataset | Named dims, no ceremony | +| Type annotations | mypy strict | From day 1 in v2 | +| Docstrings | NumPy + examples + LaTeX | In v2 code | +| Return types | Always `xr.DataArray` | Consistent (bridge to pandas available) | +| Ion storage | `dict[str, Ion]` | No pandas dependency | +| Object caching | `cached_property` | Fix v1's per-access creation | +| Units | pint-xarray (opt-in) | Replaces manual conversion factors | +| I/O | NetCDF default + plugins | xarray native | +| Error messages | Include available options | AI self-correction | +| Docstring inheritance | `__init_subclass__` | Modern replacement for metaclass | + +## Verification Strategy + +### Phase A-Core Verification +- **A-Core completion:** `pytest tests/v2_characterization/core/ -v` passes against v1. ALL GREEN. +- Gate: ALL A-Core tests green before Phase B starts. + +### Phase A-NonCore Verification +- **A.7 (instabilities):** `pytest tests/v2_characterization/noncore/test_instabilities.py -v` — ALL GREEN against v1. +- **A.8 (solar_activity):** `pytest tests/v2_characterization/noncore/test_solar_activity.py -v` — ALL GREEN. +- **A.9 (plotting):** `pytest tests/v2_characterization/noncore/test_plotting*.py -v` — ALL GREEN. +- **A.10 (fitting):** `pytest tests/v2_characterization/noncore/test_fitting*.py -v` — ALL GREEN. +- A-NonCore can complete while Phase B is in progress (no blocking dependency). + +### Phase B Verification +- **Phase B.0:** Species spec tests pass with v2 implementation +- **Phase B.1:** Vector/Tensor tests pass with v2 +- **Phase B.2-B.3:** Ion/Spacecraft tests pass with v2 +- **Phase B.4:** ALL plasma characterization tests pass — **critical milestone** +- **Phase B.5-B.6:** I/O + Turbulence tests pass +- **Final:** `pytest tests/v2_characterization/core/ -v` passes against v2. ALL GREEN. +- **Every sub-phase:** `mypy --strict`, `--cov-fail-under=95` +- **Numerical equivalence:** `np.testing.assert_allclose(v2_result.values, v1_expected, rtol=1e-10)` + +### Full Project Verification +- `pytest tests/v2_characterization/ -v` — ALL tests (core + noncore) pass. Full codebase characterized. + +## Critical v1 Files to Reference + +| v1 File | Lines | What to Port | +|---------|-------|-------------| +| `core/base.py:136-175` | — | `_conform_species`, `_chk_species` | +| `core/plasma.py` (full) | 1,901 | Species spec, `__getattr__`, all physics methods | +| `core/plasma.py:744-762` | — | Scalar w auto-calculation | +| `core/plasma.py:1353-1552` | — | lnlambda/nuc/nc validation chain | +| `core/plasma.py:1639-1727` | — | `estimate_electrons` | +| `core/plasma.py:1776-1822` | — | `build_alfvenic_turbulence` with `,` syntax | +| `core/plasma.py:296-397` | — | Save/load modifier functions | +| `core/ions.py` | 311 | Single-species physics, lazy properties | +| `core/vector.py` | 328 | Vector ops, `__call__`, BField | +| `core/tensor.py` | 90 | par/per/scalar contract, `__call__` | +| `core/units_constants.py` | 199 | Conversion factors, polytropic index | +| `core/spacecraft.py` | 256 | Position, Carrington coords | +| `core/alfvenic_turbulence.py` | 804 | Elsasser, `,` syntax, `raffaella_version` | From d81dcb193ab1b7eafc67397afef2356b696c9c01 Mon Sep 17 00:00:00 2001 From: blalterman Date: Fri, 1 May 2026 04:11:58 +0000 Subject: [PATCH 3/3] chore: auto-sync lockfiles from pyproject.toml - Updated requirements.txt (production dependencies) - Updated requirements-dev.lock (development dependencies) - Updated docs/requirements.txt (documentation dependencies) - Updated conda environment: solarwindpy.yml - Auto-generated via pip-compile from pyproject.toml --- docs/requirements.txt | 2 +- requirements-dev.lock | 2 +- solarwindpy.yml | 65 ------------------------------------------- 3 files changed, 2 insertions(+), 67 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 3f476075..7fd96240 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --extra=docs --output-file=docs/requirements.txt pyproject.toml diff --git a/requirements-dev.lock b/requirements-dev.lock index 4a7e9d05..3a4ff15c 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --allow-unsafe --extra=dev --output-file=requirements-dev.lock pyproject.toml diff --git a/solarwindpy.yml b/solarwindpy.yml index 16c50efe..1dd1dadb 100644 --- a/solarwindpy.yml +++ b/solarwindpy.yml @@ -22,94 +22,29 @@ name: solarwindpy channels: - conda-forge dependencies: -- alabaster - astropy - astropy-iers-data -- babel -- black -- python-blosc2 - bottleneck -- certifi -- cfgv -- charset-normalizer -- click - contourpy -- coverage[toml] - cycler -- distlib -- doc8 - docstring-inheritance -- docutils -- filelock -- flake8 -- flake8-docstrings - fonttools - h5py -- identify -- idna -- imagesize -- iniconfig -- jinja2 - kiwisolver -- latexcodec - llvmlite -- markupsafe - matplotlib -- mccabe -- msgpack-python -- mypy_extensions -- ndindex -- nodeenv - numba - numexpr - numpy -- numpydoc - packaging - pandas -- pathspec - pillow -- platformdirs -- pluggy -- pre-commit -- psutil -- py-cpuinfo -- pybtex -- pybtex-docutils -- pycodestyle -- pydocstyle -- pyenchant - pyerfa -- pyflakes -- pygments - pyparsing -- pytest -- pytest-cov - python-dateutil -- pytokens - pytz - pyyaml -- requests -- restructuredtext_lint -- roman-numerals -- roman-numerals-py - scipy - six -- snowballstemmer -- sphinx -- sphinx-rtd-theme -- sphinxcontrib-applehelp -- sphinxcontrib-bibtex -- sphinxcontrib-devhelp -- sphinxcontrib-htmlhelp -- sphinxcontrib-jquery -- sphinxcontrib-jsmath -- sphinxcontrib-qthelp -- sphinxcontrib-serializinghtml -- sphinxcontrib-spelling -- stevedore -- pytables - tabulate -- typing-extensions - tzdata -- urllib3 -- virtualenv