Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ build
build*
3rdparty/tensorflow
app/AccuracyImgNet/imgs
benchmark_results/
docs/
docs/input
docs/mnist
1 change: 1 addition & 0 deletions app/Graph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ target_include_directories(BuildGraph PUBLIC ${CMAKE_SOURCE_DIR}/3rdparty/Json/i

add_executable(Graph_Build graph_build.cpp)
target_link_libraries(Graph_Build BuildGraph)
target_link_libraries(Graph_Build graphT_lib)

add_executable(ACC acc_check.cpp)
target_link_libraries(ACC BuildGraph)
Expand Down
53 changes: 52 additions & 1 deletion app/Graph/graph_build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,55 @@
#include <unordered_map>

#include "build.hpp"
#include "graph_transformations/graph_transformations.hpp"
#include "layers_fused/ConvRelu.hpp"

namespace fs = std::filesystem;
using namespace it_lab_ai;

namespace {

// Fusion strategies selectable via the --fusion command-line flag.
enum class FusionMode { kOff, kPostops, kConvRelu };

// Translates a --fusion command-line value into a FusionMode.
// Throws std::invalid_argument for any unrecognized spelling.
FusionMode parse_fusion_mode(const std::string& value) {
  static const std::unordered_map<std::string, FusionMode> kModeByName = {
      {"off", FusionMode::kOff},
      {"postops", FusionMode::kPostops},
      {"convrelu", FusionMode::kConvRelu},
  };
  const auto found = kModeByName.find(value);
  if (found == kModeByName.end()) {
    throw std::invalid_argument("Unknown fusion mode: " + value);
  }
  return found->second;
}

// Rewrites every Conv -> Relu pair in `graph` into a single fused
// ConvReluLayer, then replaces `graph` with the transformed version.
// `output` and `options` are forwarded to the subgraph-replacement pass.
// Throws std::invalid_argument for the oneDNN backend, which is not
// supported by this fusion path.
void apply_conv_relu_fusion(Graph& graph, Tensor& output,
                            const RuntimeOptions& options) {
  if (options.backend == Backend::kOneDnn) {
    throw std::invalid_argument(
        "convrelu fusion is not supported with oneDNN backend");
  }

  // Pattern to match: a convolution feeding an element-wise relu.
  // The tensor fed to the pattern graph is only a placeholder.
  auto pattern_conv = std::make_shared<ConvolutionalLayer>();
  auto pattern_relu = std::make_shared<EWLayer>("relu");
  Tensor placeholder = make_tensor(std::vector<int>({0}));
  Graph pattern;
  pattern.setInput(pattern_conv, placeholder);
  pattern.makeConnection(pattern_conv, pattern_relu);

  // Substitute each match with the fused layer and adopt the new graph.
  auto replacement = std::make_shared<ConvReluLayer>();
  Graph rewritten;
  changed_subgraphs(graph, pattern, replacement, rewritten, output, options);
  graph = std::move(rewritten);
}

} // namespace

int main(int argc, char* argv[]) {
std::string model_name = "alexnet_mnist";
RuntimeOptions options;
FusionMode fusion_mode = FusionMode::kPostops;

for (int i = 1; i < argc; ++i) {
if (std::string(argv[i]) == "--model" && i + 1 < argc) {
Expand Down Expand Up @@ -47,6 +89,8 @@ int main(int argc, char* argv[]) {
}
} else if (std::string(argv[i]) == "--threads" && i + 1 < argc) {
options.threads = std::stoi(argv[++i]);
} else if (std::string(argv[i]) == "--fusion" && i + 1 < argc) {
fusion_mode = parse_fusion_mode(argv[++i]);
}
}

Expand Down Expand Up @@ -92,7 +136,11 @@ int main(int argc, char* argv[]) {
std::vector<float> vec(75, 3);
it_lab_ai::Tensor output = it_lab_ai::make_tensor(vec, sh1);
Graph graph;
build_graph_linear(graph, input, output, options, true);
build_graph_linear(graph, input, output, options, true,
fusion_mode == FusionMode::kPostops);
if (fusion_mode == FusionMode::kConvRelu) {
apply_conv_relu_fusion(graph, output, options);
}

std::cout << "Starting inference..." << '\n';
try {
Expand Down Expand Up @@ -133,6 +181,9 @@ int main(int argc, char* argv[]) {

Graph graph;
build_graph(graph, input, output, json_path, options, false);
if (fusion_mode == FusionMode::kConvRelu) {
apply_conv_relu_fusion(graph, output, options);
}

std::cout << "Starting inference..." << '\n';
try {
Expand Down
60 changes: 60 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Model Performance Benchmark

`model_performance.py` benchmarks the existing `Graph_Build` executable for all
target networks:

- `alexnet_mnist`
- `googlenet`
- `densenet`
- `resnet`
- `yolo`

It measures wall time and records an RSS memory timeline for two stages:

- `compile`: process start until `Graph_Build` prints `Starting inference...`
- `inference`: `Starting inference...` until `Inference completed successfully.`

The benchmark does not modify C++ code. It reads the executable output live,
samples process memory while the command is running, stores the full RSS sample
series, and writes a memory plot for every measured run.

Install `matplotlib` to generate memory plots. Install `psutil` to measure RSS
for the full process tree on every platform. Without `psutil`, Linux uses
`/proc`, while macOS and Windows use parent-process RSS fallbacks.

## Usage

Build the project first:

```bash
cmake -S . -B build
cmake --build build --target Graph_Build --parallel
```

Run the default benchmark over every model with available JSON/input assets:

```bash
python3 benchmarks/model_performance.py
```

Run selected models and variants:

```bash
python3 benchmarks/model_performance.py \
--model googlenet,resnet \
--variant target \
--repeat 3 \
--warmup 1
```

The JSON report includes `memory_samples` for every run. PNG plots are written
to `benchmark_results/memory_plots` by default. Use `--samples-csv-out` to export
the memory timeline to CSV and `--plots-dir` to choose another plot directory.

Use `--variant target` for the full target matrix: every supported parallel
backend with fusion off and on, plus oneDNN with fusion off and on. When fusion
is on, the naive/parallel backends use the fused `Conv+Relu` layer, while
oneDNN uses its post-ops mode.

Use `--strict-assets` to fail when a model JSON or input image directory is
missing instead of skipping that model.
Loading
Loading