From 55511d2eb48e35c606d6792b8f0b6215b2b91e08 Mon Sep 17 00:00:00 2001
From: Alexander Nesterov <nesterov.alexander@outlook.com>
Date: Mon, 4 May 2026 12:43:21 +0200
Subject: [PATCH 1/3] Add model performance benchmark

---
 .gitignore                      |   1 +
 benchmarks/README.md            |  52 +++
 benchmarks/model_performance.py | 591 ++++++++++++++++++++++++++++++++
 3 files changed, 644 insertions(+)
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/model_performance.py

diff --git a/.gitignore b/.gitignore
index fec5c1a55..be2e08571 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ build
 build*
 3rdparty/tensorflow
 app/AccuracyImgNet/imgs
+benchmark_results/
 docs/
 docs/input
 docs/mnist
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..fbf9db4a7
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,52 @@
+# Model Performance Benchmark
+
+`model_performance.py` benchmarks the existing `Graph_Build` executable for all
+target networks:
+
+- `alexnet_mnist`
+- `googlenet`
+- `densenet`
+- `resnet`
+- `yolo`
+
+It measures wall time and peak RSS memory for two stages:
+
+- `compile`: process start until `Graph_Build` prints `Starting inference...`
+- `inference`: `Starting inference...` until `Inference completed successfully.`
+
+The benchmark does not modify C++ code. It reads the executable output live and
+samples process memory while the command is running.
+
+Install `psutil` to measure RSS for the full process tree on every platform.
+Without it, Linux uses `/proc`, while macOS and Windows use parent-process RSS
+fallbacks.
+
+## Usage
+
+Build the project first:
+
+```bash
+cmake -S . -B build
+cmake --build build --target Graph_Build --parallel
+```
+
+Run the default benchmark over every model with available JSON/input assets:
+
+```bash
+python3 benchmarks/model_performance.py
+```
+
+Run selected models and variants:
+
+```bash
+python3 benchmarks/model_performance.py \
+  --model googlenet,resnet \
+  --variant seq \
+  --variant parallel-tbb \
+  --repeat 3 \
+  --warmup 1 \
+  --csv-out benchmark_results/model_performance.csv
+```
+
+Use `--strict-assets` to fail when a model JSON or input image directory is
+missing instead of skipping that model.
diff --git a/benchmarks/model_performance.py b/benchmarks/model_performance.py
new file mode 100644
index 000000000..c9d1fabff
--- /dev/null
+++ b/benchmarks/model_performance.py
@@ -0,0 +1,591 @@
+#!/usr/bin/env python3
+"""Benchmark ITLabAI target models through the Graph_Build executable."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import dataclasses
+import datetime as dt
+import errno
+import json
+import os
+import platform
+import signal
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+from typing import Iterable, Sequence
+
+try:
+    import psutil  # type: ignore
+except ImportError:  # pragma: no cover - optional dependency
+    psutil = None
+
+try:
+    import pty
+except ImportError:  # pragma: no cover - Windows
+    pty = None
+
+
+ROOT = Path(__file__).resolve().parents[1]
+START_MARKER = "Starting inference..."
+DONE_MARKER = "Inference completed successfully."
+
+TARGET_MODELS = {
+    "alexnet_mnist": ("docs/jsons/model_data_alexnet_1.json", "docs/input/28"),
+    "googlenet": ("docs/jsons/googlenet_onnx_model.json", "docs/input/Imagenet_test"),
+    "densenet": (
+        "docs/jsons/densenet121_Opset16_onnx_model.json",
+        "docs/input/Imagenet_test",
+    ),
+    "resnet": (
+        "docs/jsons/resnest101e_Opset16_onnx_model.json",
+        "docs/input/Imagenet_test",
+    ),
+    "yolo": ("docs/jsons/yolo11x-cls_onnx_model.json", "docs/input/Imagenet_test"),
+}
+
+VARIANT_ARGS = {
+    "seq": [],
+    "onednn": ["--onednn"],
+    "parallel-tbb": ["--parallel", "tbb"],
+    "parallel-threads": ["--parallel", "threads"],
+    "parallel-omp": ["--parallel", "omp"],
+    "parallel-kokkos": ["--parallel", "kokkos"],
+}
+
+
+@dataclasses.dataclass
+class Phase:
+    duration_s: float = 0.0
+    peak_rss_bytes: int = 0
+    markers: int = 0
+
+
+@dataclasses.dataclass
+class Sample:
+    stage: str
+    rss_bytes: int
+
+
+@dataclasses.dataclass
+class Result:
+    model: str
+    variant: str
+    repeat: int
+    command: list[str]
+    returncode: int
+    total_s: float
+    peak_rss_bytes: int
+    compile: Phase
+    inference: Phase
+    marker_split: bool
+    stdout_tail: list[str]
+    error: str = ""
+
+    def row(self) -> dict[str, object]:
+        return {
+            "model": self.model,
+            "variant": self.variant,
+            "repeat": self.repeat,
+            "returncode": self.returncode,
+            "total_s": round(self.total_s, 6),
+            "compile_s": round(self.compile.duration_s, 6),
+            "inference_s": round(self.inference.duration_s, 6),
+            "peak_rss_mib": bytes_to_mib(self.peak_rss_bytes),
+            "compile_peak_rss_mib": bytes_to_mib(self.compile.peak_rss_bytes),
+            "inference_peak_rss_mib": bytes_to_mib(self.inference.peak_rss_bytes),
+            "compile_markers": self.compile.markers,
+            "inference_markers": self.inference.markers,
+            "marker_split": self.marker_split,
+            "command": " ".join(self.command),
+            "error": self.error,
+        }
+
+
+def bytes_to_mib(value: int) -> float:
+    return round(value / (1024 * 1024), 3)
+
+
+def exe_name() -> str:
+    return "Graph_Build.exe" if os.name == "nt" else "Graph_Build"
+
+
+def default_graph_build(build_dir: Path) -> Path:
+    for candidate in (
+        build_dir / "bin" / exe_name(),
+        build_dir / "bin" / "Release" / exe_name(),
+        build_dir / "bin" / "Debug" / exe_name(),
+    ):
+        if candidate.exists():
+            return candidate
+    return build_dir / "bin" / exe_name()
+
+
+def expand_choices(values: Sequence[str], choices: dict[str, object], default: str) -> list[str]:
+    if not values:
+        return list(choices) if default == "all" else [default]
+
+    expanded: list[str] = []
+    for value in values:
+        for item in value.split(","):
+            item = item.strip()
+            if not item:
+                continue
+            if item == "all":
+                expanded.extend(choices)
+            elif item in choices:
+                expanded.append(item)
+            else:
+                raise SystemExit(f"Unknown value '{item}'. Valid: all, {', '.join(choices)}")
+    return dedupe(expanded)
+
+
+def dedupe(values: Iterable[str]) -> list[str]:
+    result: list[str] = []
+    seen: set[str] = set()
+    for value in values:
+        if value not in seen:
+            seen.add(value)
+            result.append(value)
+    return result
+
+
+def has_image(input_dir: Path) -> bool:
+    if not input_dir.is_dir():
+        return False
+    return any(
+        path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg"}
+        for path in input_dir.iterdir()
+    )
+
+
+def check_assets(model: str, strict: bool) -> bool:
+    json_rel, input_rel = TARGET_MODELS[model]
+    missing = [
+        str(path)
+        for path, ok in (
+            (ROOT / json_rel, (ROOT / json_rel).is_file()),
+            (ROOT / input_rel, has_image(ROOT / input_rel)),
+        )
+        if not ok
+    ]
+    if not missing:
+        return True
+    message = f"missing benchmark assets for {model}: {', '.join(missing)}"
+    if strict:
+        raise SystemExit(message)
+    print(f"Skipping {model}: {message}", file=sys.stderr)
+    return False
+
+
+def rss_tree(pid: int) -> int:
+    if psutil is not None:
+        try:
+            proc = psutil.Process(pid)
+            return sum(
+                child.memory_info().rss
+                for child in [proc, *proc.children(recursive=True)]
+                if child.is_running()
+            )
+        except psutil.Error:
+            return 0
+    if sys.platform.startswith("linux"):
+        return linux_rss_tree(pid)
+    if os.name == "nt":
+        return powershell_rss(pid)
+    return ps_rss(pid)
+
+
+def linux_rss_tree(pid: int) -> int:
+    children: dict[int, list[int]] = {}
+    rss: dict[int, int] = {}
+    for status in Path("/proc").glob("[0-9]*/status"):
+        current = int(status.parent.name)
+        parent = 0
+        current_rss = 0
+        try:
+            for line in status.read_text(errors="ignore").splitlines():
+                if line.startswith("PPid:"):
+                    parent = int(line.split()[1])
+                elif line.startswith("VmRSS:"):
+                    current_rss = int(line.split()[1]) * 1024
+        except (FileNotFoundError, ProcessLookupError, ValueError):
+            continue
+        children.setdefault(parent, []).append(current)
+        rss[current] = current_rss
+
+    total = 0
+    stack = [pid]
+    seen: set[int] = set()
+    while stack:
+        current = stack.pop()
+        if current in seen:
+            continue
+        seen.add(current)
+        total += rss.get(current, 0)
+        stack.extend(children.get(current, []))
+    return total
+
+
+def ps_rss(pid: int) -> int:
+    try:
+        output = subprocess.check_output(["ps", "-o", "rss=", "-p", str(pid)], text=True)
+        return int(output.strip() or "0") * 1024
+    except (OSError, subprocess.SubprocessError, ValueError):
+        return 0
+
+
+def powershell_rss(pid: int) -> int:
+    command = [
+        "powershell",
+        "-NoProfile",
+        "-Command",
+        f"(Get-Process -Id {pid} -ErrorAction SilentlyContinue).WorkingSet64",
+    ]
+    try:
+        return int(subprocess.check_output(command, text=True).strip() or "0")
+    except (OSError, subprocess.SubprocessError, ValueError):
+        return 0
+
+
+def sample_memory(
+    pid: int,
+    stage: dict[str, str],
+    samples: list[Sample],
+    stop: threading.Event,
+    interval_s: float,
+) -> None:
+    while not stop.is_set():
+        samples.append(Sample(stage["name"], rss_tree(pid)))
+        stop.wait(interval_s)
+    samples.append(Sample(stage["name"], rss_tree(pid)))
+
+
+def record_line(
+    line: str,
+    lines: list[str],
+    stage: dict[str, str],
+    phase_started_at: dict[str, float],
+    compile_phase: Phase,
+    inference_phase: Phase,
+) -> None:
+    now = time.perf_counter()  # one timestamp closes the old phase and opens the new one
+    lines.append(line.rstrip())
+    if START_MARKER in line and stage["name"] == "compile":  # a repeated start must not bill inference time to compile
+        compile_phase.duration_s += now - phase_started_at["time"]
+        compile_phase.markers += 1
+        stage["name"] = "inference"
+        phase_started_at["time"] = now
+    elif DONE_MARKER in line and stage["name"] == "inference":
+        inference_phase.duration_s += now - phase_started_at["time"]
+        inference_phase.markers += 1
+        stage["name"] = "compile"
+        phase_started_at["time"] = now
+
+
+def read_pipe(
+    proc: subprocess.Popen[str],
+    lines: list[str],
+    stage: dict[str, str],
+    phase_started_at: dict[str, float],
+    compile_phase: Phase,
+    inference_phase: Phase,
+) -> None:
+    assert proc.stdout is not None
+    for line in proc.stdout:
+        record_line(line, lines, stage, phase_started_at, compile_phase, inference_phase)
+    proc.wait()
+
+
+def read_pty(
+    proc: subprocess.Popen[bytes],
+    master_fd: int,
+    lines: list[str],
+    stage: dict[str, str],
+    phase_started_at: dict[str, float],
+    compile_phase: Phase,
+    inference_phase: Phase,
+) -> None:
+    buffer = ""
+    while True:
+        try:
+            chunk = os.read(master_fd, 4096)
+        except OSError as exc:
+            if exc.errno == errno.EIO:
+                break
+            raise
+        if not chunk:
+            break
+        buffer += chunk.decode(errors="replace")
+        while "\n" in buffer:
+            line, buffer = buffer.split("\n", 1)
+            record_line(line, lines, stage, phase_started_at, compile_phase, inference_phase)
+    if buffer:
+        record_line(buffer, lines, stage, phase_started_at, compile_phase, inference_phase)
+    proc.wait()
+
+
+def terminate(proc: subprocess.Popen) -> None:
+    if proc.poll() is not None:
+        return
+    try:
+        if os.name == "nt":
+            proc.terminate()
+        else:
+            os.kill(proc.pid, signal.SIGTERM)
+        proc.wait(timeout=5)
+    except (OSError, subprocess.TimeoutExpired):
+        try:
+            proc.kill()
+        except OSError:
+            pass
+
+
+def peak_by_stage(samples: list[Sample], compile_phase: Phase, inference_phase: Phase) -> int:
+    process_peak = 0
+    for sample in samples:
+        process_peak = max(process_peak, sample.rss_bytes)
+        if sample.stage == "compile":
+            compile_phase.peak_rss_bytes = max(compile_phase.peak_rss_bytes, sample.rss_bytes)
+        elif sample.stage == "inference":
+            inference_phase.peak_rss_bytes = max(
+                inference_phase.peak_rss_bytes,
+                sample.rss_bytes,
+            )
+    return process_peak
+
+
+def run_command(command: list[str], timeout_s: float, interval_s: float) -> Result:
+    start = time.perf_counter()
+    phase_started_at = {"time": start}
+    stage = {"name": "compile"}
+    compile_phase = Phase()
+    inference_phase = Phase()
+    samples: list[Sample] = []
+    lines: list[str] = []
+    stop = threading.Event()
+    error = ""
+    master_fd: int | None = None
+
+    use_pty = os.name != "nt" and pty is not None
+    if use_pty:
+        master_fd, slave_fd = pty.openpty()
+        proc = subprocess.Popen(
+            command,
+            cwd=ROOT,
+            stdin=subprocess.DEVNULL,
+            stdout=slave_fd,
+            stderr=slave_fd,
+            close_fds=True,
+        )
+        os.close(slave_fd)
+        reader = threading.Thread(
+            target=read_pty,
+            args=(proc, master_fd, lines, stage, phase_started_at, compile_phase, inference_phase),
+            daemon=True,
+        )
+    else:
+        proc = subprocess.Popen(
+            command,
+            cwd=ROOT,
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+        reader = threading.Thread(
+            target=read_pipe,
+            args=(proc, lines, stage, phase_started_at, compile_phase, inference_phase),
+            daemon=True,
+        )
+
+    monitor = threading.Thread(
+        target=sample_memory,
+        args=(proc.pid, stage, samples, stop, interval_s),
+        daemon=True,
+    )
+    monitor.start()
+    reader.start()
+    reader.join(timeout_s)
+    if reader.is_alive():
+        error = f"timeout after {timeout_s:.1f}s"
+        terminate(proc)
+        reader.join(5)
+    # Stop the clock before sampler shutdown: monitor.join() is not benchmark time.
+    end = time.perf_counter()
+    stop.set()
+    monitor.join(2)
+    if master_fd is not None:
+        try:
+            os.close(master_fd)
+        except OSError:
+            pass
+    if proc.returncode is None:
+        proc.wait(timeout=5)
+    if stage["name"] == "inference":
+        inference_phase.duration_s += end - phase_started_at["time"]
+    elif compile_phase.markers == 0:
+        compile_phase.duration_s = end - start
+
+    peak = peak_by_stage(samples, compile_phase, inference_phase)
+    return Result(
+        model="",
+        variant="",
+        repeat=0,
+        command=command,
+        returncode=int(proc.returncode or 0),
+        total_s=end - start,
+        peak_rss_bytes=peak,
+        compile=compile_phase,
+        inference=inference_phase,
+        marker_split=compile_phase.markers > 0 and inference_phase.markers > 0,
+        stdout_tail=lines[-80:],
+        error=error or (f"process exited with {proc.returncode}" if proc.returncode else ""),
+    )
+
+
+def run_model(
+    graph_build: Path,
+    model: str,
+    variant: str,
+    repeat: int,
+    timeout_s: float,
+    interval_s: float,
+    extra_args: Sequence[str],
+) -> Result:
+    command = [str(graph_build), "--model", model, *VARIANT_ARGS[variant], *extra_args]
+    result = run_command(command, timeout_s, interval_s)
+    result.model = model
+    result.variant = variant
+    result.repeat = repeat
+    return result
+
+
+def report_path(suffix: str) -> Path:
+    timestamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
+    return ROOT / "benchmark_results" / f"model_performance-{timestamp}.{suffix}"
+
+
+def write_json(path: Path, args: argparse.Namespace, rows: list[Result]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = {
+        "metadata": {
+            "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+            "host": platform.node(),
+            "platform": platform.platform(),
+            "python": sys.version,
+            "graph_build": str(args.graph_build),
+            "sample_interval_s": args.sample_interval,
+        },
+        "results": [{**row.row(), "stdout_tail": row.stdout_tail} for row in rows],
+    }
+    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+
+def write_csv(path: Path, rows: list[Result]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fieldnames = list(Result("", "", 0, [], 0, 0, 0, Phase(), Phase(), False, []).row())
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row.row())
+
+
+def print_summary(rows: list[Result]) -> None:
+    if not rows:
+        print("No benchmark runs were executed.")
+        return
+    print("model | variant | rep | total_s | compile_s | infer_s | peak_mib | rc")
+    print("----- | ------- | --- | ------- | --------- | ------- | -------- | --")
+    for row in rows:
+        print(
+            f"{row.model} | {row.variant} | {row.repeat} | "
+            f"{row.total_s:.3f} | {row.compile.duration_s:.3f} | "
+            f"{row.inference.duration_s:.3f} | {bytes_to_mib(row.peak_rss_bytes):.1f} | "
+            f"{row.returncode}"
+        )
+
+
+def parse_args(argv: Sequence[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Benchmark Graph_Build compile/inference time and RSS."
+    )
+    parser.add_argument("--build-dir", type=Path, default=ROOT / "build")
+    parser.add_argument("--graph-build", type=Path)
+    parser.add_argument("--model", action="append", default=[], help="Repeat or comma-list. Default: all.")
+    parser.add_argument("--variant", action="append", default=[], help="Repeat or comma-list. Default: seq.")
+    parser.add_argument("--repeat", type=int, default=1)
+    parser.add_argument("--warmup", type=int, default=0)
+    parser.add_argument("--timeout", type=float, default=1800.0)
+    parser.add_argument("--sample-interval", type=float, default=0.05)
+    parser.add_argument("--extra-arg", action="append", default=[])
+    parser.add_argument("--strict-assets", action="store_true")
+    parser.add_argument("--json-out", type=Path)
+    parser.add_argument("--csv-out", type=Path)
+    return parser.parse_args(argv)
+
+
+def main(argv: Sequence[str]) -> int:
+    args = parse_args(argv)
+    args.build_dir = args.build_dir.resolve()
+    args.graph_build = (
+        args.graph_build.resolve()
+        if args.graph_build
+        else default_graph_build(args.build_dir).resolve()
+    )
+    if not args.graph_build.exists():
+        raise SystemExit(f"Graph_Build was not found: {args.graph_build}")
+
+    models = [
+        model
+        for model in expand_choices(args.model, TARGET_MODELS, "all")
+        if check_assets(model, args.strict_assets)
+    ]
+    variants = expand_choices(args.variant, VARIANT_ARGS, "seq")
+
+    results: list[Result] = []
+    for model in models:
+        for variant in variants:
+            for index in range(args.warmup):
+                print(f"warmup model={model} variant={variant} {index + 1}/{args.warmup}", flush=True)
+                run_model(
+                    args.graph_build,
+                    model,
+                    variant,
+                    -(index + 1),
+                    args.timeout,
+                    args.sample_interval,
+                    args.extra_arg,
+                )
+            for repeat in range(args.repeat):
+                print(f"benchmark model={model} variant={variant} {repeat + 1}/{args.repeat}", flush=True)
+                results.append(
+                    run_model(
+                        args.graph_build,
+                        model,
+                        variant,
+                        repeat,
+                        args.timeout,
+                        args.sample_interval,
+                        args.extra_arg,
+                    )
+                )
+
+    print_summary(results)
+    json_out = args.json_out or report_path("json")
+    write_json(json_out, args, results)
+    print(f"JSON report: {json_out}")
+    if args.csv_out:
+        write_csv(args.csv_out, results)
+        print(f"CSV report: {args.csv_out}")
+    return 1 if any(row.returncode != 0 for row in results) else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))

From e13836ace7591863d5394987a04b9a798e3c5cc3 Mon Sep 17 00:00:00 2001
From: Alexander Nesterov <nesterov.alexander@outlook.com>
Date: Mon, 4 May 2026 14:59:09 +0200
Subject: [PATCH 2/3] Add memory timeline plots to benchmark

---
 benchmarks/README.md            |  20 +++---
 benchmarks/model_performance.py | 115 ++++++++++++++++++++++++++++++--
 2 files changed, 121 insertions(+), 14 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index fbf9db4a7..1bbb365fc 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -9,17 +9,18 @@ target networks:
 - `resnet`
 - `yolo`
 
-It measures wall time and peak RSS memory for two stages:
+It measures wall time and RSS memory timeline for two stages:
 
 - `compile`: process start until `Graph_Build` prints `Starting inference...`
 - `inference`: `Starting inference...` until `Inference completed successfully.`
 
-The benchmark does not modify C++ code. It reads the executable output live and
-samples process memory while the command is running.
+The benchmark does not modify C++ code. It reads the executable output live,
+samples process memory while the command is running, stores the full RSS sample
+series, and writes a memory plot for every measured run.
 
-Install `psutil` to measure RSS for the full process tree on every platform.
-Without it, Linux uses `/proc`, while macOS and Windows use parent-process RSS
-fallbacks.
+`matplotlib` is required: the script imports it at startup to write memory plots.
+Install `psutil` to measure RSS for the full process tree on every platform.
+Without `psutil`, Linux uses `/proc`, while macOS and Windows use parent-process RSS fallbacks.
 
 ## Usage
 
@@ -44,9 +45,12 @@ python3 benchmarks/model_performance.py \
   --variant seq \
   --variant parallel-tbb \
   --repeat 3 \
-  --warmup 1 \
-  --csv-out benchmark_results/model_performance.csv
+  --warmup 1
 ```
 
+The JSON report includes `memory_samples` for every run. PNG plots are written
+to `benchmark_results/memory_plots` by default. Use `--samples-csv-out` to export
+the memory timeline to CSV and `--plots-dir` to choose another plot directory.
+
 Use `--strict-assets` to fail when a model JSON or input image directory is
 missing instead of skipping that model.
diff --git a/benchmarks/model_performance.py b/benchmarks/model_performance.py
index c9d1fabff..78e5f610e 100644
--- a/benchmarks/model_performance.py
+++ b/benchmarks/model_performance.py
@@ -19,6 +19,8 @@
 from pathlib import Path
 from typing import Iterable, Sequence
 
+import matplotlib.pyplot as plt
+
 try:
     import psutil  # type: ignore
 except ImportError:  # pragma: no cover - optional dependency
@@ -67,6 +69,7 @@ class Phase:
 
 @dataclasses.dataclass
 class Sample:
+    timestamp_s: float
     stage: str
     rss_bytes: int
 
@@ -83,7 +86,9 @@ class Result:
     compile: Phase
     inference: Phase
     marker_split: bool
+    samples: list[Sample]
     stdout_tail: list[str]
+    plot_path: str = ""
     error: str = ""
 
     def row(self) -> dict[str, object]:
@@ -254,15 +259,16 @@ def powershell_rss(pid: int) -> int:
 
 def sample_memory(
     pid: int,
+    started_at: float,
     stage: dict[str, str],
     samples: list[Sample],
     stop: threading.Event,
     interval_s: float,
 ) -> None:
     while not stop.is_set():
-        samples.append(Sample(stage["name"], rss_tree(pid)))
+        samples.append(Sample(time.perf_counter() - started_at, stage["name"], rss_tree(pid)))
         stop.wait(interval_s)
-    samples.append(Sample(stage["name"], rss_tree(pid)))
+    samples.append(Sample(time.perf_counter() - started_at, stage["name"], rss_tree(pid)))
 
 
 def record_line(
@@ -406,7 +412,7 @@ def run_command(command: list[str], timeout_s: float, interval_s: float) -> Resu
 
     monitor = threading.Thread(
         target=sample_memory,
-        args=(proc.pid, stage, samples, stop, interval_s),
+        args=(proc.pid, start, stage, samples, stop, interval_s),
         daemon=True,
     )
     monitor.start()
@@ -444,6 +450,7 @@ def run_command(command: list[str], timeout_s: float, interval_s: float) -> Resu
         compile=compile_phase,
         inference=inference_phase,
         marker_split=compile_phase.markers > 0 and inference_phase.markers > 0,
+        samples=samples,
         stdout_tail=lines[-80:],
         error=error or (f"process exited with {proc.returncode}" if proc.returncode else ""),
     )
@@ -471,6 +478,68 @@ def report_path(suffix: str) -> Path:
     return ROOT / "benchmark_results" / f"model_performance-{timestamp}.{suffix}"
 
 
+def sample_rows(row: Result) -> list[dict[str, object]]:
+    return [
+        {
+            "timestamp_s": round(sample.timestamp_s, 6),
+            "stage": sample.stage,
+            "rss_mib": bytes_to_mib(sample.rss_bytes),
+        }
+        for sample in row.samples
+    ]
+
+
+def safe_name(value: str) -> str:
+    return "".join(ch if ch.isalnum() or ch in {"-", "_"} else "_" for ch in value)
+
+
+def plot_memory(row: Result, output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    plot_path = output_dir / (
+        f"{safe_name(row.model)}-{safe_name(row.variant)}-r{row.repeat}-memory.png"
+    )
+    times = [sample.timestamp_s for sample in row.samples]
+    rss = [bytes_to_mib(sample.rss_bytes) for sample in row.samples]
+    compile_times = [sample.timestamp_s for sample in row.samples if sample.stage == "compile"]
+    compile_rss = [bytes_to_mib(sample.rss_bytes) for sample in row.samples if sample.stage == "compile"]
+    inference_times = [sample.timestamp_s for sample in row.samples if sample.stage == "inference"]
+    inference_rss = [bytes_to_mib(sample.rss_bytes) for sample in row.samples if sample.stage == "inference"]
+
+    _, axis = plt.subplots(figsize=(10, 5))
+    axis.plot(times, rss, color="#1f2937", linewidth=1.2, label="rss")
+    if compile_times:
+        axis.scatter(compile_times, compile_rss, color="#2563eb", s=8, label="compile")
+    if inference_times:
+        axis.scatter(inference_times, inference_rss, color="#dc2626", s=8, label="inference")
+    axis.set_title(f"{row.model} / {row.variant} / repeat {row.repeat}")
+    axis.set_xlabel("time, s")
+    axis.set_ylabel("RSS, MiB")
+    axis.grid(True, alpha=0.25)
+    axis.legend()
+    plt.tight_layout()
+    plt.savefig(plot_path, dpi=140)
+    plt.close()
+    row.plot_path = str(plot_path)
+
+
+def write_samples_csv(path: Path, rows: list[Result]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        fieldnames = ["model", "variant", "repeat", "timestamp_s", "stage", "rss_mib"]
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            for sample in sample_rows(row):
+                writer.writerow(
+                    {
+                        "model": row.model,
+                        "variant": row.variant,
+                        "repeat": row.repeat,
+                        **sample,
+                    }
+                )
+
+
 def write_json(path: Path, args: argparse.Namespace, rows: list[Result]) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     payload = {
@@ -482,14 +551,38 @@ def write_json(path: Path, args: argparse.Namespace, rows: list[Result]) -> None
             "graph_build": str(args.graph_build),
             "sample_interval_s": args.sample_interval,
         },
-        "results": [{**row.row(), "stdout_tail": row.stdout_tail} for row in rows],
+        "results": [
+            {
+                **row.row(),
+                "memory_plot": row.plot_path,
+                "memory_samples": sample_rows(row),
+                "stdout_tail": row.stdout_tail,
+            }
+            for row in rows
+        ],
     }
     path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
 
 
 def write_csv(path: Path, rows: list[Result]) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
-    fieldnames = list(Result("", "", 0, [], 0, 0, 0, Phase(), Phase(), False, []).row())
+    fieldnames = [
+        "model",
+        "variant",
+        "repeat",
+        "returncode",
+        "total_s",
+        "compile_s",
+        "inference_s",
+        "peak_rss_mib",
+        "compile_peak_rss_mib",
+        "inference_peak_rss_mib",
+        "compile_markers",
+        "inference_markers",
+        "marker_split",
+        "command",
+        "error",
+    ]
     with path.open("w", newline="", encoding="utf-8") as handle:
         writer = csv.DictWriter(handle, fieldnames=fieldnames)
         writer.writeheader()
@@ -528,6 +621,8 @@ def parse_args(argv: Sequence[str]) -> argparse.Namespace:
     parser.add_argument("--strict-assets", action="store_true")
     parser.add_argument("--json-out", type=Path)
     parser.add_argument("--csv-out", type=Path)
+    parser.add_argument("--samples-csv-out", type=Path)
+    parser.add_argument("--plots-dir", type=Path)
     return parser.parse_args(argv)
 
 
@@ -577,13 +672,21 @@ def main(argv: Sequence[str]) -> int:
                     )
                 )
 
-    print_summary(results)
     json_out = args.json_out or report_path("json")
+    plots_dir = args.plots_dir or json_out.parent / "memory_plots"
+    for row in results:
+        plot_memory(row, plots_dir)
+
+    print_summary(results)
     write_json(json_out, args, results)
     print(f"JSON report: {json_out}")
     if args.csv_out:
         write_csv(args.csv_out, results)
         print(f"CSV report: {args.csv_out}")
+    if args.samples_csv_out:
+        write_samples_csv(args.samples_csv_out, results)
+        print(f"Samples CSV report: {args.samples_csv_out}")
+    print(f"Memory plots: {plots_dir}")
     return 1 if any(row.returncode != 0 for row in results) else 0
 
 

From 807b228ca60b6dbc96c47f034e3137386f93fa84 Mon Sep 17 00:00:00 2001
From: Alexander Nesterov <nesterov.alexander@outlook.com>
Date: Mon, 4 May 2026 23:07:21 +0200
Subject: [PATCH 3/3] Benchmark parallel and fusion variants

---
 app/Graph/CMakeLists.txt        |  1 +
 app/Graph/graph_build.cpp       | 53 ++++++++++++++++++++++++++++++++-
 benchmarks/README.md            |  8 +++--
 benchmarks/model_performance.py | 47 ++++++++++++++++++++++++++++-
 4 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/app/Graph/CMakeLists.txt b/app/Graph/CMakeLists.txt
index 218e511bf..e337fff00 100644
--- a/app/Graph/CMakeLists.txt
+++ b/app/Graph/CMakeLists.txt
@@ -19,6 +19,7 @@ target_include_directories(BuildGraph PUBLIC ${CMAKE_SOURCE_DIR}/3rdparty/Json/i
 
 add_executable(Graph_Build graph_build.cpp)
 target_link_libraries(Graph_Build BuildGraph)
+target_link_libraries(Graph_Build graphT_lib)
 
 add_executable(ACC acc_check.cpp)
 target_link_libraries(ACC BuildGraph)
diff --git a/app/Graph/graph_build.cpp b/app/Graph/graph_build.cpp
index 53389bc1d..e61266aa1 100644
--- a/app/Graph/graph_build.cpp
+++ b/app/Graph/graph_build.cpp
@@ -3,13 +3,55 @@
 #include <unordered_map>
 
 #include "build.hpp"
+#include "graph_transformations/graph_transformations.hpp"
+#include "layers_fused/ConvRelu.hpp"
 
 namespace fs = std::filesystem;
 using namespace it_lab_ai;
 
+namespace {
+
+enum class FusionMode { kOff, kPostops, kConvRelu };  // --fusion choices
+
+FusionMode parse_fusion_mode(const std::string& value) {  // CLI text -> mode
+  if (value == "off") {  // exact, case-sensitive match
+    return FusionMode::kOff;
+  }
+  if (value == "postops") {
+    return FusionMode::kPostops;
+  }
+  if (value == "convrelu") {
+    return FusionMode::kConvRelu;
+  }
+  throw std::invalid_argument("Unknown fusion mode: " + value);  // no fallback
+}
+
+void apply_conv_relu_fusion(Graph& graph, Tensor& output,
+                            const RuntimeOptions& options) {  // mutates graph
+  if (options.backend == Backend::kOneDnn) {  // rejected before any rewrite
+    throw std::invalid_argument(
+        "convrelu fusion is not supported with oneDNN backend");
+  }
+
+  Graph subgraph;  // conv -> relu chain handed to changed_subgraphs
+  Tensor dummy_input = make_tensor(std::vector<int>({0}));  // placeholder
+  auto conv = std::make_shared<ConvolutionalLayer>();
+  auto relu = std::make_shared<EWLayer>("relu");
+  subgraph.setInput(conv, dummy_input);
+  subgraph.makeConnection(conv, relu);
+
+  Graph fused_graph;  // presumably filled by changed_subgraphs; confirm
+  auto fused_layer = std::make_shared<ConvReluLayer>();
+  changed_subgraphs(graph, subgraph, fused_layer, fused_graph, output, options);
+  graph = std::move(fused_graph);  // overwrites the caller's graph
+}
+
+}  // namespace
+
 int main(int argc, char* argv[]) {
   std::string model_name = "alexnet_mnist";
   RuntimeOptions options;
+  FusionMode fusion_mode = FusionMode::kPostops;
 
   for (int i = 1; i < argc; ++i) {
     if (std::string(argv[i]) == "--model" && i + 1 < argc) {
@@ -47,6 +89,8 @@ int main(int argc, char* argv[]) {
       }
     } else if (std::string(argv[i]) == "--threads" && i + 1 < argc) {
       options.threads = std::stoi(argv[++i]);
+    } else if (std::string(argv[i]) == "--fusion" && i + 1 < argc) {
+      fusion_mode = parse_fusion_mode(argv[++i]);
     }
   }
 
@@ -92,7 +136,11 @@ int main(int argc, char* argv[]) {
         std::vector<float> vec(75, 3);
         it_lab_ai::Tensor output = it_lab_ai::make_tensor(vec, sh1);
         Graph graph;
-        build_graph_linear(graph, input, output, options, true);
+        build_graph_linear(graph, input, output, options, true,
+                           fusion_mode == FusionMode::kPostops);
+        if (fusion_mode == FusionMode::kConvRelu) {
+          apply_conv_relu_fusion(graph, output, options);
+        }
 
         std::cout << "Starting inference..." << '\n';
         try {
@@ -133,6 +181,9 @@ int main(int argc, char* argv[]) {
 
         Graph graph;
         build_graph(graph, input, output, json_path, options, false);
+        if (fusion_mode == FusionMode::kConvRelu) {
+          apply_conv_relu_fusion(graph, output, options);
+        }
 
         std::cout << "Starting inference..." << '\n';
         try {
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1bbb365fc..ca000ed8f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -42,8 +42,7 @@ Run selected models and variants:
 ```bash
 python3 benchmarks/model_performance.py \
   --model googlenet,resnet \
-  --variant seq \
-  --variant parallel-tbb \
+  --variant target \
   --repeat 3 \
   --warmup 1
 ```
@@ -52,5 +51,10 @@ The JSON report includes `memory_samples` for every run. PNG plots are written
 to `benchmark_results/memory_plots` by default. Use `--samples-csv-out` to export
 the memory timeline to CSV and `--plots-dir` to choose another plot directory.
 
+Use `--variant target` for the full target matrix: the sequential (naive)
+backend and every supported parallel backend, plus oneDNN, each with fusion
+off and on. Fusion-on uses the existing `Conv+Relu` fused layer for
+naive/parallel backends and the existing post-ops mode for oneDNN.
+
 Use `--strict-assets` to fail when a model JSON or input image directory is
 missing instead of skipping that model.
diff --git a/benchmarks/model_performance.py b/benchmarks/model_performance.py
index 78e5f610e..7fffd5422 100644
--- a/benchmarks/model_performance.py
+++ b/benchmarks/model_performance.py
@@ -57,6 +57,46 @@
     "parallel-threads": ["--parallel", "threads"],
     "parallel-omp": ["--parallel", "omp"],
     "parallel-kokkos": ["--parallel", "kokkos"],
+    "seq-fusion-off": ["--fusion", "off"],
+    "seq-fusion-on": ["--fusion", "convrelu"],
+    "parallel-tbb-fusion-off": ["--parallel", "tbb", "--fusion", "off"],
+    "parallel-tbb-fusion-on": ["--parallel", "tbb", "--fusion", "convrelu"],
+    "parallel-threads-fusion-off": ["--parallel", "threads", "--fusion", "off"],
+    "parallel-threads-fusion-on": [
+        "--parallel",
+        "threads",
+        "--fusion",
+        "convrelu",
+    ],
+    "parallel-omp-fusion-off": ["--parallel", "omp", "--fusion", "off"],
+    "parallel-omp-fusion-on": ["--parallel", "omp", "--fusion", "convrelu"],
+    "parallel-kokkos-fusion-off": ["--parallel", "kokkos", "--fusion", "off"],
+    "parallel-kokkos-fusion-on": [
+        "--parallel",
+        "kokkos",
+        "--fusion",
+        "convrelu",
+    ],
+    "onednn-fusion-off": ["--onednn", "--fusion", "off"],
+    "onednn-fusion-on": ["--onednn", "--fusion", "postops"],
+}
+
+VARIANT_GROUPS = {  # group name -> list of VARIANT_ARGS keys
+    "all": list(VARIANT_ARGS),  # NOTE(review): expand_choices checks "all" first
+    "target": [  # fusion off/on pair for seq, each parallel backend, oneDNN
+        "seq-fusion-off",
+        "seq-fusion-on",
+        "parallel-tbb-fusion-off",
+        "parallel-tbb-fusion-on",
+        "parallel-threads-fusion-off",
+        "parallel-threads-fusion-on",
+        "parallel-omp-fusion-off",
+        "parallel-omp-fusion-on",
+        "parallel-kokkos-fusion-off",
+        "parallel-kokkos-fusion-on",
+        "onednn-fusion-off",
+        "onednn-fusion-on",
+    ],
 }
 
 
@@ -142,10 +182,15 @@ def expand_choices(values: Sequence[str], choices: dict[str, object], default: s
                 continue
             if item == "all":
                 expanded.extend(choices)
+            elif item in VARIANT_GROUPS and choices is VARIANT_ARGS:
+                expanded.extend(VARIANT_GROUPS[item])
             elif item in choices:
                 expanded.append(item)
             else:
-                raise SystemExit(f"Unknown value '{item}'. Valid: all, {', '.join(choices)}")
+                valid = ["all", *choices]
+                if choices is VARIANT_ARGS:
+                    valid.extend(name for name in VARIANT_GROUPS if name != "all")
+                raise SystemExit(f"Unknown value '{item}'. Valid: {', '.join(valid)}")
     return dedupe(expanded)