AryanBhati02 · AryanBhati02 · May 17, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/backend/.dockerignore b/backend/.dockerignore
@@ -0,0 +1,44 @@
+# Python caches ("**/" = any depth; bare patterns only match the context root)
+**/__pycache__/
+**/*.py[cod]
+**/*.pyo
+**/*.pyd
+
+# Virtual environments
+.venv/
+venv/
+env/
+.env/
+
+# Local session data (runtime, not source)
+sessions/
+
+# Test / benchmark outputs
+results/
+**/.pytest_cache/
+
+# Build artefacts
+**/*.egg-info/
+dist/
+build/
+
+# DB files
+**/*.db
+**/*.sqlite
+
+# Editor / OS
+.idea/
+.vscode/
+**/.DS_Store
+**/Thumbs.db
+
+# Git metadata
+.git/
+.gitignore
+
+# Jupyter
+**/*.ipynb
+**/.ipynb_checkpoints/
+
+# Docs / non-runtime assets
+docs/
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -0,0 +1,55 @@
+# Backend Dockerfile
+# Python 3.11 slim — small base, fast builds
+FROM python:3.11-slim
+
+# System packages
+#  git              – GitPython binary requirement
+#  build-essential  – compiles C extensions (tree-sitter, scipy)
+#  curl             – lightweight HTTP tool (optional local debugging)
+#  libgomp1         – OpenMP runtime required by PyTorch / numpy
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        git \
+        build-essential \
+        curl \
+        libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Python dependencies (copied before the source so this layer stays cached)
+COPY requirements.txt .
+
+# 1. Upgrade pip
+RUN pip install --no-cache-dir --upgrade pip
+
+# 1b. Pin NumPy to 1.x BEFORE any compiled extension is installed.
+#     torch 2.2.x and PyG wheels are built against NumPy 1.x ABI; pip keeps an
+#     already-installed numpy unless a later requirement explicitly excludes it.
+RUN pip install --no-cache-dir "numpy==1.26.4"
+
+# 2. CPU-only PyTorch (avoids the ~2 GB CUDA wheel)
+RUN pip install --no-cache-dir \
+        torch==2.2.2 torchvision torchaudio \
+        --index-url https://download.pytorch.org/whl/cpu
+
+# 3. PyTorch Geometric + required sparse/scatter extensions (CPU wheels)
+#    Must be installed AFTER torch and AGAINST the same torch version.
+RUN pip install --no-cache-dir torch_geometric==2.5.3
+RUN pip install --no-cache-dir \
+        torch_scatter \
+        torch_sparse \
+        torch_cluster \
+        torch_spline_conv \
+        -f https://data.pyg.org/whl/torch-2.2.2+cpu.html
+
+# 4. Everything else from requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Application source
+COPY . .
+
+EXPOSE 8000
+
+# Uvicorn: bind to 0.0.0.0 so Docker port-mapping works.
+# No --reload in production image; add a volume mount for dev hot-reload.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/backend/api/mcp_server.py b/backend/api/mcp_server.py
@@ -30,7 +30,7 @@
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-    stream=sys.stderr,  # MCP uses stdout for the protocol; log to stderr
+    stream=sys.stderr,
 )
 
 mcp = FastMCP("Atlas — Behavioral Code Intelligence")
@@ -337,7 +337,7 @@ async def get_hot_paths(top_k: int = 10) -> str:
         fan_in = graph.in_degree(node_id)
         fan_out = graph.out_degree(node_id)
         complexity = int(node_data.get("complexity", 0))
-        # Impact: heavy fan-in + high complexity = highest risk
+
         impact = fan_in * 2 + complexity + fan_out * 0.5
         scored.append(
             {

diff --git a/backend/api/routes/analyze.py b/backend/api/routes/analyze.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import logging
+import time
 import threading
 from pathlib import Path
 from typing import Protocol, cast
@@ -21,17 +22,64 @@
 
 router = APIRouter(prefix="/analyze", tags=["Analyze"])
 
+# Sentinel files written by the ingest route — same constants as tasks.py
+_INGEST_READY = ".ingest_ready"
+_INGEST_FAILED = ".ingest_failed"
+_READY_WAIT_SECS = 30
+_READY_POLL_INTERVAL = 1
+
 
 class CeleryTask(Protocol):
     def delay(self, *args: object, **kwargs: object) -> object:
         ...
 
+
+def _wait_for_ingest_sentinel(session_id: str, session_dir: Path) -> bool:
+    """
+    Block (in a thread) until .ingest_ready appears or .ingest_failed is found.
+    Returns True if ready, False if failed/timed-out; on False the error is
+    recorded through progress_store.update_sync before returning.
+    """
+    log = logging.getLogger(f"codebase-intel.thread.{session_id[:8]}")
+    ready_file = session_dir / _INGEST_READY
+    failed_file = session_dir / _INGEST_FAILED
+    error_message = (
+        f"Ingestion timed out — repository was not ready after "
+        f"{_READY_WAIT_SECS}s. Please re-ingest the repository."
+    )
+    log.info("[INGEST_WAIT] Thread fallback waiting for sentinel %s", ready_file)
+    started = time.monotonic()  # elapsed-time budget, independent of poll interval
+    while True:
+        waited = time.monotonic() - started
+        if failed_file.exists():
+            try:
+                reason = failed_file.read_text(encoding="utf-8").strip()
+            except (OSError, UnicodeError):
+                # An unreadable sentinel is still a failure; report it instead of raising.
+                reason = "<unreadable>"
+            log.error("[INGEST_FAILED] Ingestion failed before thread pipeline: %s", reason)
+            error_message = f"Ingestion failed before analysis could start: {reason}"
+            break
+        if ready_file.exists():
+            log.info("[INGEST_READY] Sentinel found after %.0fs — starting thread pipeline", waited)
+            return True
+        if waited >= _READY_WAIT_SECS:  # sentinels were just re-checked at the deadline
+            log.error("[INGEST_TIMEOUT] Repo not ready after %ss (thread)", _READY_WAIT_SECS)
+            break
+        time.sleep(min(_READY_POLL_INTERVAL, _READY_WAIT_SECS - waited))
+    progress_store.update_sync(session_id, status="error", error_message=error_message)
+    return False
+
+
 def _run_pipeline_in_thread(session_id: str, session_dir: Path) -> None:
     from core.pipeline import PipelineError, run_analysis_pipeline
 
     log = logging.getLogger(f"codebase-intel.thread.{session_id[:8]}")
     log.info("Starting pipeline in thread fallback mode")
 
+    if not _wait_for_ingest_sentinel(session_id, session_dir):
+        return
+
     try:
         asyncio.run(run_analysis_pipeline(session_id, session_dir))
 
@@ -52,7 +100,7 @@ def _run_pipeline_in_thread(session_id: str, session_dir: Path) -> None:
             status="error",
             error_message="Out of memory. Try a smaller repository.",
         )
-    except Exception as exc:                
+    except Exception as exc:
         log.error(f"Unexpected error: {exc}", exc_info=True)
         progress_store.update_sync(
             session_id,

diff --git a/backend/api/routes/debug.py b/backend/api/routes/debug.py
@@ -0,0 +1,84 @@
+"""
+GET /api/debug/session/{session_id}
+
+Returns diagnostic information about a session directory so operators can
+verify that:
+  1. The session directory exists and is on the correct volume mount.
+  2. The repo/ sub-directory was populated by git clone / ZIP extract.
+  3. The .ingest_ready sentinel was written by the ingest route.
+  4. The .ingest_failed sentinel was NOT written (i.e. no ingest error).
+
+This endpoint is intentionally read-only and has no side-effects.
+"""
+import logging
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException
+from config import SESSIONS_DIR
+
+logger = logging.getLogger("codebase-intel.routes.debug")
+
+router = APIRouter(prefix="/debug", tags=["Debug"])
+
+
+@router.get("/session/{session_id}")
+async def debug_session(session_id: str):
+    """
+    Returns a JSON snapshot of the session directory state.
+    Useful for diagnosing path / volume-mount / sentinel issues in Docker.
+
+    The snapshot reports whether the session directory, its repo/
+    sub-directory, the .ingest_ready / .ingest_failed sentinels, meta.json
+    and file_entries.json exist, together with resolved absolute paths.
+    repo_file_count is -1 when the repo tree cannot be walked, and
+    failed_reason is "<unreadable>" when the failure sentinel cannot be read.
+
+    Raises:
+        HTTPException: 400 if session_id is not a single path component.
+    """
+    # session_id comes straight from the URL: reject "..", "." and anything
+    # containing a separator so only direct children of SESSIONS_DIR are
+    # ever inspected.
+    if session_id in {"", ".", ".."} or Path(session_id).name != session_id:
+        raise HTTPException(status_code=400, detail="Invalid session_id")
+
+    session_dir: Path = SESSIONS_DIR / session_id
+    repo_dir = session_dir / "repo"
+    ready_file = session_dir / ".ingest_ready"
+    failed_file = session_dir / ".ingest_failed"
+
+    # For a missing session every check below is naturally False / 0 / None,
+    # so a single code path serves both the "missing" and the "present"
+    # case and the two responses cannot drift apart.
+    repo_exists = repo_dir.is_dir()
+    repo_file_count = 0
+    if repo_exists:
+        try:
+            repo_file_count = sum(1 for p in repo_dir.rglob("*") if p.is_file())
+        except OSError:
+            repo_file_count = -1
+
+    # Check the sentinel once so failed_exists and failed_reason agree.
+    failed_exists = failed_file.exists()
+    failed_reason: str | None = None
+    if failed_exists:
+        try:
+            failed_reason = failed_file.read_text(encoding="utf-8").strip()
+        except (OSError, UnicodeError):
+            failed_reason = "<unreadable>"
+
+    return {
+        "session_id": session_id,
+        "sessions_dir": str(SESSIONS_DIR.resolve()),
+        "session_exists": session_dir.is_dir(),
+        "repo_exists": repo_exists,
+        "repo_file_count": repo_file_count,
+        "ready_exists": ready_file.exists(),
+        "failed_exists": failed_exists,
+        "failed_reason": failed_reason,
+        "repo_absolute_path": str(repo_dir.resolve()),
+        "ready_absolute_path": str(ready_file.resolve()),
+        "failed_absolute_path": str(failed_file.resolve()),
+        "meta_exists": (session_dir / "meta.json").exists(),
+        "file_entries_exist": (session_dir / "file_entries.json").exists(),
+    }