From 819d6fa495abfbef238ec37e2ced442b417474ee Mon Sep 17 00:00:00 2001 From: dance858 Date: Sun, 10 May 2026 20:37:00 +0200 Subject: [PATCH 01/31] start with permuted dense struct, without integration --- include/utils/matrix.h | 28 +++ include/utils/permuted_dense.h | 87 ++++++++ src/atoms/affine/left_matmul.c | 3 + src/utils/permuted_dense.c | 223 ++++++++++++++++++++ tests/all_tests.c | 14 ++ tests/profiling/profile_log_reg.h | 160 ++++++++++++++ tests/utils/test_permuted_dense.h | 333 ++++++++++++++++++++++++++++++ 7 files changed, 848 insertions(+) create mode 100644 include/utils/permuted_dense.h create mode 100644 src/utils/permuted_dense.c create mode 100644 tests/profiling/profile_log_reg.h create mode 100644 tests/utils/test_permuted_dense.h diff --git a/include/utils/matrix.h b/include/utils/matrix.h index c54443a..4725548 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -21,6 +21,34 @@ #include "CSC_Matrix.h" #include "CSR_Matrix.h" +/* We implement three different types of matrices. + + 1. 'sparse_matrix' represents a generic CSR matrix. + 2. 'permuted_dense' represents a matrix that only consists of a dense block + (potentially after permuting columns). + 3. 'blkdiag_dense' represents a block diagonal matrix with a constant dense + block. + + Each of these types implements its own functionality for common matrix operations + such as DA_fill_values etc. The return type of most of these operations are the + same as the type of the input. For example, DA_fill_values for permuted_dense + fills the values of a new permuted_dense object. + + 2, 'permuted_dense': + * DA_fill_values just scales the rows. It does not affect the permutation + indices. + * ATA_alloc + * ATDA_fill_values + * to_csr_sparsity + * to_csr_values + * + + 1. sparse_matrix: generic CSR matrix. + 2. 
permuted_dense: + + +*/ + /* Base matrix type with function pointers for polymorphic dispatch */ typedef struct Matrix { diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h new file mode 100644 index 0000000..41fba71 --- /dev/null +++ b/include/utils/permuted_dense.h @@ -0,0 +1,87 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef PERMUTED_DENSE_H +#define PERMUTED_DENSE_H + +#include "matrix.h" + +/* Permuted_Dense represents a matrix whose only nonzeros lie in a dense + block, after rows and columns are restricted to chosen subsets of the + global index space. For local indices (ii, jj) with 0 <= ii < dense_m + and 0 <= jj < dense_n, + + M[row_perm[ii], col_perm[jj]] = X[ii, jj]. + + All other entries of M are zero. row_perm and col_perm are stored in + strictly increasing order; the constructor asserts this. 
*/ +typedef struct Permuted_Dense +{ + Matrix base; /* base.m, base.n = global ambient dimensions */ + int dense_m; /* rows of dense block (= len(row_perm)) */ + int dense_n; /* cols of dense block (= len(col_perm)) */ + int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ + int *col_perm; /* col_perm[jj] in [0, base.n), sorted */ + double *X; /* dense_m * dense_n, row-major */ + double *Y_scratch; /* dense_m * dense_n, used by ATDA */ + int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ + /* otherwise -1; used by `x CSC` allocation. */ +} Permuted_Dense; + +/* Constructor. row_perm and col_perm must be strictly increasing in their + respective ranges. If X_data is NULL the value buffer is allocated but + left uninitialized; otherwise dense_m * dense_n entries are copied. */ +Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, + const int *row_perm, const int *col_perm, + const double *X_data); + +/* Convert to CSR. The output has dense_m * dense_n nonzeros. */ +CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self); +void permuted_dense_to_csr_fill_values(const Permuted_Dense *self, CSR_Matrix *out); + +/* Fill out = diag(d) * self, where d has length self->base.m. out must have + the same structure as self (same dimensions and same row_perm/col_perm). */ +void permuted_dense_DA_fill_values(const double *d, const Permuted_Dense *self, + Permuted_Dense *out); + +/* Allocate a new Permuted_Dense for C = self^T @ self. C is square of global + size self->base.n, with dense block self->dense_n x self->dense_n and both + permutations equal to self->col_perm. Values are uninitialized; the caller + is expected to fill them via permuted_dense_ATDA_fill_values. */ +Matrix *permuted_dense_ATA_alloc(const Permuted_Dense *self); + +/* Fill out.X = self.X^T diag(d) self.X, where d has length self->base.m. + out must have the structure produced by permuted_dense_ATA_alloc(self). 
+ Uses self->Y_scratch as workspace; const-correctness is preserved because + only the buffer pointed to by Y_scratch is mutated. */ +void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d, + Permuted_Dense *out); + +/* Allocate a new Permuted_Dense for C = self @ J. C has global shape + (self->base.m, J->n) with row_perm = self->row_perm and col_perm equal + to the sorted list of columns of J that have at least one structural + nonzero in some row in self->col_perm. Values are uninitialized. */ +Matrix *permuted_dense_times_csc_alloc(const Permuted_Dense *self, + const CSC_Matrix *J); + +/* Fill out.X[ii, jj] = sum_kk self.X[ii, kk] * J[col_perm_self[kk], + col_perm_out[jj]]. out must have the structure produced by + permuted_dense_times_csc_alloc(self, J). */ +void permuted_dense_times_csc_fill_values(const Permuted_Dense *self, + const CSC_Matrix *J, Permuted_Dense *out); + +#endif /* PERMUTED_DENSE_H */ diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index c083385..e458342 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -245,6 +245,9 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, const double *data) { + /* TODO: do a helper function for this dimension check (so we can use it in both + * dense and sparse constructors). We could include even more code in that + * functon, all the day down to the parameter support I think*/ int d1, d2, n_blocks; if (u->d1 == n) { diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c new file mode 100644 index 0000000..9a7e25a --- /dev/null +++ b/src/utils/permuted_dense.c @@ -0,0 +1,223 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utils/permuted_dense.h" +#include "utils/cblas_wrapper.h" +#include "utils/iVec.h" +#include "utils/tracked_alloc.h" +#include +#include +#include + +static void permuted_dense_free(Matrix *self) +{ + Permuted_Dense *pd = (Permuted_Dense *) self; + free(pd->row_perm); + free(pd->col_perm); + free(pd->X); + free(pd->Y_scratch); + free(pd->col_inv); + free(pd); +} + +Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, + const int *row_perm, const int *col_perm, + const double *X_data) +{ + /* Validate sorted invariants. */ + for (int ii = 1; ii < dense_m; ii++) + { + assert(row_perm[ii] > row_perm[ii - 1]); + } + for (int jj = 1; jj < dense_n; jj++) + { + assert(col_perm[jj] > col_perm[jj - 1]); + } + if (dense_m > 0) + { + assert(row_perm[0] >= 0 && row_perm[dense_m - 1] < m); + } + if (dense_n > 0) + { + assert(col_perm[0] >= 0 && col_perm[dense_n - 1] < n); + } + + Permuted_Dense *pd = (Permuted_Dense *) SP_CALLOC(1, sizeof(Permuted_Dense)); + pd->base.m = m; + pd->base.n = n; + pd->base.free_fn = permuted_dense_free; + /* Other vtable slots are wired up in later steps as the operations + they dispatch to are implemented. 
*/ + + pd->dense_m = dense_m; + pd->dense_n = dense_n; + + int sz = dense_m * dense_n; + pd->row_perm = (int *) SP_MALLOC(dense_m * sizeof(int)); + pd->col_perm = (int *) SP_MALLOC(dense_n * sizeof(int)); + pd->X = (double *) SP_MALLOC(sz * sizeof(double)); + pd->Y_scratch = (double *) SP_MALLOC(sz * sizeof(double)); + pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); + + if (dense_m > 0) + { + memcpy(pd->row_perm, row_perm, dense_m * sizeof(int)); + } + if (dense_n > 0) + { + memcpy(pd->col_perm, col_perm, dense_n * sizeof(int)); + } + + for (int j = 0; j < n; j++) + { + pd->col_inv[j] = -1; + } + for (int jj = 0; jj < dense_n; jj++) + { + pd->col_inv[col_perm[jj]] = jj; + } + + if (X_data != NULL && sz > 0) + { + memcpy(pd->X, X_data, sz * sizeof(double)); + } + + return &pd->base; +} + +CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) +{ + int dense_m = self->dense_m; + int dense_n = self->dense_n; + int m = self->base.m; + CSR_Matrix *C = new_csr_matrix(m, self->base.n, dense_m * dense_n); + + /* fill column indices (each dense row contributes a copy of col_perm) */ + for (int ii = 0; ii < dense_m; ii++) + { + memcpy(C->i + ii * dense_n, self->col_perm, dense_n * sizeof(int)); + } + + /* set row pointers via count and then cumulative sum */ + memset(C->p, 0, (m + 1) * sizeof(int)); + for (int ii = 0; ii < dense_m; ii++) + { + C->p[self->row_perm[ii] + 1] = dense_n; + } + + for (int i = 0; i < m; i++) + { + C->p[i + 1] += C->p[i]; + } + + return C; +} + +void permuted_dense_to_csr_fill_values(const Permuted_Dense *self, CSR_Matrix *out) +{ + memcpy(out->x, self->X, self->dense_m * self->dense_n * sizeof(double)); +} + +void permuted_dense_DA_fill_values(const double *d, const Permuted_Dense *self, + Permuted_Dense *out) +{ + int dense_m = self->dense_m; + int dense_n = self->dense_n; + cblas_dcopy(dense_m * dense_n, self->X, 1, out->X, 1); + for (int ii = 0; ii < dense_m; ii++) + { + cblas_dscal(dense_n, d[self->row_perm[ii]], out->X + ii * 
dense_n, 1); + } +} + +Matrix *permuted_dense_ATA_alloc(const Permuted_Dense *self) +{ + int n = self->base.n; + int dense_n = self->dense_n; + return new_permuted_dense(n, n, dense_n, dense_n, self->col_perm, self->col_perm, + NULL); +} + +void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d, + Permuted_Dense *out) +{ + int dense_m = self->dense_m; + int dense_n = self->dense_n; + + /* Y_scratch = diag(d_perm) X, where d_perm[kk] = d[row_perm[kk]]. */ + cblas_dcopy(dense_m * dense_n, self->X, 1, self->Y_scratch, 1); + for (int ii = 0; ii < dense_m; ii++) + { + cblas_dscal(dense_n, d[self->row_perm[ii]], self->Y_scratch + ii * dense_n, + 1); + } + + /* out.X = X^T Y_scratch. */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dense_n, dense_n, dense_m, + 1.0, self->X, dense_n, self->Y_scratch, dense_n, 0.0, out->X, + dense_n); +} + +Matrix *permuted_dense_times_csc_alloc(const Permuted_Dense *self, + const CSC_Matrix *J) +{ + /* Active columns: those with at least one structural nonzero in a row + of col_perm_self. col_inv[r] != -1 iff r is in col_perm_self. */ + iVec *col_perm_out = iVec_new(8); + for (int j = 0; j < J->n; j++) + { + for (int e = J->p[j]; e < J->p[j + 1]; e++) + { + if (self->col_inv[J->i[e]] != -1) + { + iVec_append(col_perm_out, j); + break; + } + } + } + + Matrix *M_out = + new_permuted_dense(self->base.m, J->n, self->dense_m, col_perm_out->len, + self->row_perm, col_perm_out->data, NULL); + iVec_free(col_perm_out); + return M_out; +} + +void permuted_dense_times_csc_fill_values(const Permuted_Dense *self, + const CSC_Matrix *J, Permuted_Dense *out) +{ + int dense_m = self->dense_m; + int dense_n_self = self->dense_n; + int dense_n_out = out->dense_n; + + /* Each entry (r, val) of J in active columns with r in col_perm_self + contributes val * self.X[:, kk] to out.X[:, jj], where kk = col_inv[r] + and jj is the position of the column in col_perm_out. Columns of X + and out.X are accessed via row-major strides. 
*/ + memset(out->X, 0, dense_m * dense_n_out * sizeof(double)); + for (int jj = 0; jj < dense_n_out; jj++) + { + int col = out->col_perm[jj]; + for (int e = J->p[col]; e < J->p[col + 1]; e++) + { + int kk = self->col_inv[J->i[e]]; + if (kk == -1) continue; + cblas_daxpy(dense_m, J->x[e], self->X + kk, dense_n_self, out->X + jj, + dense_n_out); + } + } +} diff --git a/tests/all_tests.c b/tests/all_tests.c index 76807b6..a40b2b9 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -66,6 +66,7 @@ #include "utils/test_linalg_sparse_matmuls.h" #include "utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" +#include "utils/test_permuted_dense.h" #include "wsum_hess/affine/test_broadcast.h" #include "wsum_hess/affine/test_convolve.h" #include "wsum_hess/affine/test_diag_mat.h" @@ -103,6 +104,7 @@ #ifdef PROFILE_ONLY #include "profiling/profile_left_matmul.h" +#include "profiling/profile_log_reg.h" #endif /* PROFILE_ONLY */ int main(void) @@ -356,6 +358,17 @@ int main(void) mu_run_test(test_sparse_vs_dense_mult_vec, tests_run); mu_run_test(test_dense_matrix_trans, tests_run); mu_run_test(test_sparse_vs_dense_mult_vec_blocks, tests_run); + mu_run_test(test_permuted_dense_to_csr_basic, tests_run); + mu_run_test(test_permuted_dense_to_csr_empty, tests_run); + mu_run_test(test_permuted_dense_to_csr_full, tests_run); + mu_run_test(test_permuted_dense_to_csr_single_row, tests_run); + mu_run_test(test_permuted_dense_to_csr_single_col, tests_run); + mu_run_test(test_permuted_dense_DA_fill_values, tests_run); + mu_run_test(test_permuted_dense_ATA_alloc, tests_run); + mu_run_test(test_permuted_dense_ATDA_fill_values, tests_run); + mu_run_test(test_permuted_dense_times_csc, tests_run); + mu_run_test(test_permuted_dense_times_csc_no_active, tests_run); + mu_run_test(test_permuted_dense_col_inv, tests_run); mu_run_test(test_YT_kron_I, tests_run); mu_run_test(test_YT_kron_I_larger, tests_run); mu_run_test(test_I_kron_X, tests_run); @@ -401,6 +414,7 @@ int 
main(void) #ifdef PROFILE_ONLY printf("\n--- Profiling Tests ---\n"); mu_run_test(profile_left_matmul, tests_run); + mu_run_test(profile_log_reg, tests_run); #endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h new file mode 100644 index 0000000..8ad7eea --- /dev/null +++ b/tests/profiling/profile_log_reg.h @@ -0,0 +1,160 @@ +#ifndef PROFILE_LOG_REG_H +#define PROFILE_LOG_REG_H + +#include +#include +#include +#include + +#include "atoms/affine.h" +#include "atoms/elementwise_full_dom.h" +#include "expr.h" +#include "minunit.h" +#include "utils/CSR_sum.h" +#include "utils/Timer.h" +#include "utils/permuted_dense.h" + +/* Profile and validate Jacobian + Hessian of obj = sum(logistic(A x)). + + Path A: the engine's expression DAG (CSR/CSC chain rule). + Path B: hardcoded chain rule using Permuted_Dense kernels for the dense + linear algebra (DA and ATDA), plus the engine's CSR row-sum + primitives for J_sum. + + Forward pass is excluded from timing. */ +const char *profile_log_reg(void) +{ + int m = 2000; + int n = 785; + + /* ---- Random A and initial x ---- */ + double *A_data = (double *) malloc((size_t) m * n * sizeof(double)); + double *u = (double *) malloc(n * sizeof(double)); + srand(42); + for (int i = 0; i < m * n; i++) + { + A_data[i] = (double) rand() / RAND_MAX - 0.5; + } + for (int i = 0; i < n; i++) + { + u[i] = (double) rand() / RAND_MAX - 0.5; + } + + /* ---- Build expression DAG (shared by both paths) ---- */ + expr *x = new_variable(n, 1, 0, n); + expr *Ax = new_left_matmul_dense(NULL, x, m, n, A_data); + expr *log_obj = new_logistic(Ax); + expr *obj = new_sum(log_obj, -1); + jacobian_init(obj); + wsum_hess_init(obj); + + /* Forward (untimed). 
*/ + obj->forward(obj, u); + + /* ---- Path A: time eval_jacobian + eval_wsum_hess ---- */ + Timer t_a; + double w_one = 1.0; + clock_gettime(CLOCK_MONOTONIC, &t_a.start); + obj->eval_jacobian(obj); + obj->eval_wsum_hess(obj, &w_one); + clock_gettime(CLOCK_MONOTONIC, &t_a.end); + double sec_a = GET_ELAPSED_SECONDS(t_a); + + /* ---- Path B setup (untimed) ---- */ + int *full_rows = (int *) malloc(m * sizeof(int)); + int *full_cols = (int *) malloc(n * sizeof(int)); + for (int i = 0; i < m; i++) full_rows[i] = i; + for (int j = 0; j < n; j++) full_cols[j] = j; + + Matrix *A_pd_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, A_data); + Permuted_Dense *A_pd = (Permuted_Dense *) A_pd_M; + Matrix *Jlog_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, NULL); + Permuted_Dense *Jlog_pd = (Permuted_Dense *) Jlog_M; + Matrix *H_pd_M = permuted_dense_ATA_alloc(A_pd); + Permuted_Dense *H_pd = (Permuted_Dense *) H_pd_M; + + free(full_rows); + free(full_cols); + + /* CSR scaffolding for the row-sum step. */ + CSR_Matrix *Jlog_csr = permuted_dense_to_csr_alloc(Jlog_pd); + CSR_Matrix *Jobj_csr = new_csr_matrix(1, n, n); + int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); + int *idx_map = (int *) malloc((size_t) m * n * sizeof(int)); + sum_all_rows_csr_alloc(Jlog_csr, Jobj_csr, iwork, idx_map); + + double *d2 = (double *) malloc(m * sizeof(double)); + double *w_ones = (double *) malloc(m * sizeof(double)); + for (int i = 0; i < m; i++) w_ones[i] = 1.0; + + /* ---- Path B: time the manual chain rule ---- */ + Timer t_b; + clock_gettime(CLOCK_MONOTONIC, &t_b.start); + /* dwork = sigmoid(z); used as the diagonal in DA below and (still in + dwork) as sigmas read by local_wsum_hess. 
*/ + log_obj->local_jacobian(log_obj, log_obj->work->dwork); + permuted_dense_DA_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); + permuted_dense_to_csr_fill_values(Jlog_pd, Jlog_csr); + memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); + accumulator(Jlog_csr, idx_map, Jobj_csr->x); + log_obj->local_wsum_hess(log_obj, d2, w_ones); + permuted_dense_ATDA_fill_values(A_pd, d2, H_pd); + clock_gettime(CLOCK_MONOTONIC, &t_b.end); + double sec_b = GET_ELAPSED_SECONDS(t_b); + + printf("\n"); + printf(" Path A (engine CSR/CSC): %10.6f seconds\n", sec_a); + printf(" Path B (Permuted_Dense): %10.6f seconds\n", sec_b); + printf(" Speedup (A / B): %10.2fx\n", sec_a / sec_b); + + /* ---- Compare Jacobian (1 x n, both have full sparsity) ---- */ + mu_assert("J n mismatch", obj->jacobian->n == Jobj_csr->n); + mu_assert("J nnz mismatch", obj->jacobian->nnz == Jobj_csr->nnz); + double max_J_diff = 0.0; + for (int j = 0; j < obj->jacobian->nnz; j++) + { + double diff = fabs(obj->jacobian->x[j] - Jobj_csr->x[j]); + if (diff > max_J_diff) max_J_diff = diff; + } + printf(" Jacobian max abs diff: %10.3e\n", max_J_diff); + mu_assert("Jacobian mismatch", max_J_diff < 1e-10); + + /* ---- Compare Hessian (n x n): scatter Path A's CSR into a dense + n x n array, compare to H_pd->X (already dense row-major). 
---- */ + double *H_a_dense = (double *) calloc((size_t) n * n, sizeof(double)); + for (int i = 0; i < n; i++) + { + for (int e = obj->wsum_hess->p[i]; e < obj->wsum_hess->p[i + 1]; e++) + { + int col = obj->wsum_hess->i[e]; + H_a_dense[i * n + col] = obj->wsum_hess->x[e]; + } + } + double max_H_diff = 0.0; + for (size_t k = 0; k < (size_t) n * n; k++) + { + double diff = fabs(H_a_dense[k] - H_pd->X[k]); + if (diff > max_H_diff) max_H_diff = diff; + } + printf(" Hessian max abs diff: %10.3e\n", max_H_diff); + mu_assert("Hessian mismatch", max_H_diff < 1e-10); + + /* ---- Cleanup ---- */ + free(H_a_dense); + free(d2); + free(w_ones); + free(iwork); + free(idx_map); + free_csr_matrix(Jobj_csr); + free_csr_matrix(Jlog_csr); + free_matrix(H_pd_M); + free_matrix(Jlog_M); + free_matrix(A_pd_M); + free_expr(obj); + free(A_data); + free(u); + + return 0; +} + +#endif /* PROFILE_LOG_REG_H */ diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h new file mode 100644 index 0000000..e11d6e1 --- /dev/null +++ b/tests/utils/test_permuted_dense.h @@ -0,0 +1,333 @@ +#ifndef TEST_PERMUTED_DENSE_H +#define TEST_PERMUTED_DENSE_H + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSC_Matrix.h" +#include "utils/permuted_dense.h" +#include +#include + +/* 5x6 matrix with a 3x2 dense block at rows {1, 2, 4}, cols {0, 3}: + + global view: + [0 0 0 0 0 0] + [1 0 0 2 0 0] + [3 0 0 4 0 0] + [0 0 0 0 0 0] + [5 0 0 6 0 0] */ +const char *test_permuted_dense_to_csr_basic(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, C); + + int Cp_expected[6] = {0, 0, 2, 4, 4, 6}; + int Ci_expected[6] = {0, 3, 0, 3, 0, 3}; + double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + 
mu_assert("dim m", C->m == 5); + mu_assert("dim n", C->n == 6); + mu_assert("nnz", C->nnz == 6); + mu_assert("p", cmp_int_array(C->p, Cp_expected, 6)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); + + free_csr_matrix(C); + free_matrix(M); + return 0; +} + +/* Empty dense block (dense_m = dense_n = 0): result is an m x n CSR with + no nonzeros. */ +const char *test_permuted_dense_to_csr_empty(void) +{ + Matrix *M = new_permuted_dense(4, 5, 0, 0, NULL, NULL, NULL); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, C); + + int Cp_expected[5] = {0, 0, 0, 0, 0}; + mu_assert("nnz", C->nnz == 0); + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + + free_csr_matrix(C); + free_matrix(M); + return 0; +} + +/* Full dense (row_perm = [0..m), col_perm = [0..n)): result is the dense + matrix in CSR. */ +const char *test_permuted_dense_to_csr_full(void) +{ + int row_perm[2] = {0, 1}; + int col_perm[3] = {0, 1, 2}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + Matrix *M = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, C); + + int Cp_expected[3] = {0, 3, 6}; + int Ci_expected[6] = {0, 1, 2, 0, 1, 2}; + double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 3)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); + + free_csr_matrix(C); + free_matrix(M); + return 0; +} + +/* Single dense row, two dense cols. Tests rows with no entries before + and after the active row. 
*/ +const char *test_permuted_dense_to_csr_single_row(void) +{ + int row_perm[1] = {2}; + int col_perm[2] = {1, 4}; + double X[2] = {7.0, 9.0}; + + Matrix *M = new_permuted_dense(4, 5, 1, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, C); + + int Cp_expected[5] = {0, 0, 0, 2, 2}; + int Ci_expected[2] = {1, 4}; + double Cx_expected[2] = {7.0, 9.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 2)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 2)); + + free_csr_matrix(C); + free_matrix(M); + return 0; +} + +/* Single dense col across multiple rows. */ +const char *test_permuted_dense_to_csr_single_col(void) +{ + int row_perm[3] = {0, 2, 3}; + int col_perm[1] = {2}; + double X[3] = {1.0, 2.0, 3.0}; + + Matrix *M = new_permuted_dense(4, 4, 3, 1, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, C); + + int Cp_expected[5] = {0, 1, 1, 2, 3}; + int Ci_expected[3] = {2, 2, 2}; + double Cx_expected[3] = {1.0, 2.0, 3.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 3)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 3)); + + free_csr_matrix(C); + free_matrix(M); + return 0; +} + +/* DA_fill_values: compare against CSR DA_fill_values on the equivalent CSR. + + PD is the 5x6 matrix from the basic to_csr test, with d a length-5 + global-row diagonal including a negative and zero entry. 
*/ +const char *test_permuted_dense_DA_fill_values(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; + + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Matrix *M_out = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, NULL); + Permuted_Dense *pd = (Permuted_Dense *) M; + Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + + permuted_dense_DA_fill_values(d, pd, pd_out); + + /* Ground truth: build CSR of self, run DA_fill_values, compare. */ + CSR_Matrix *csr = permuted_dense_to_csr_alloc(pd); + permuted_dense_to_csr_fill_values(pd, csr); + CSR_Matrix *csr_expected = new_csr_copy_sparsity(csr); + DA_fill_values(d, csr, csr_expected); + + CSR_Matrix *csr_out = permuted_dense_to_csr_alloc(pd_out); + permuted_dense_to_csr_fill_values(pd_out, csr_out); + + mu_assert("x", cmp_double_array(csr_out->x, csr_expected->x, csr->nnz)); + + free_csr_matrix(csr); + free_csr_matrix(csr_expected); + free_csr_matrix(csr_out); + free_matrix(M); + free_matrix(M_out); + return 0; +} + +/* ATA_alloc: structure-only check. Output is 6x6 with a 2x2 dense block at + perms {0, 3} (= self.col_perm on both sides). Values are uninitialized + here; ATDA_fill_values is the value-producing op. 
*/ +const char *test_permuted_dense_ATA_alloc(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + Matrix *M_ata = permuted_dense_ATA_alloc(pd); + Permuted_Dense *pd_ata = (Permuted_Dense *) M_ata; + + int perm_expected[2] = {0, 3}; + mu_assert("m", M_ata->m == 6); + mu_assert("n", M_ata->n == 6); + mu_assert("dense_m", pd_ata->dense_m == 2); + mu_assert("dense_n", pd_ata->dense_n == 2); + mu_assert("row_perm", cmp_int_array(pd_ata->row_perm, perm_expected, 2)); + mu_assert("col_perm", cmp_int_array(pd_ata->col_perm, perm_expected, 2)); + + free_matrix(M); + free_matrix(M_ata); + return 0; +} + +/* ATDA: same 5x6 PD, d with negative + zero entries to catch sign bugs. + Hand-computed: d_perm = [-1.5, 0, 2.5], Y = diag(d_perm) X gives + [[-1.5,-3],[0,0],[12.5,15]], and X^T Y = [[61,72],[72,84]]. */ +const char *test_permuted_dense_ATDA_fill_values(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; + + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + Matrix *M_out = permuted_dense_ATA_alloc(pd); + Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + permuted_dense_ATDA_fill_values(pd, d, pd_out); + + double X_expected[4] = {61.0, 72.0, 72.0, 84.0}; + mu_assert("X", cmp_double_array(pd_out->X, X_expected, 4)); + + free_matrix(M); + free_matrix(M_out); + return 0; +} + +/* PD x CSC: J is 6x4. col 0 empty; col 1 has rows {0,3} (vals 10, 20); + col 2 has row {2} (val 30, but row 2 not in col_perm_self = {0,3} so col 2 + is INACTIVE); col 3 has row {3} (val 40). Active cols: {1, 3}. + + Expected: dense_m=3, dense_n=2, row_perm={1,2,4}, col_perm={1,3}. 
+ Values: out.X[:,0] = 10*[1,3,5] + 20*[2,4,6] = [50,110,170], + out.X[:,1] = 40*[2,4,6] = [80,160,240]. */ +const char *test_permuted_dense_times_csc(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + CSC_Matrix *J = new_csc_matrix(6, 4, 4); + int Jp[5] = {0, 0, 2, 3, 4}; + int Ji[4] = {0, 3, 2, 3}; + double Jx[4] = {10.0, 20.0, 30.0, 40.0}; + memcpy(J->p, Jp, 5 * sizeof(int)); + memcpy(J->i, Ji, 4 * sizeof(int)); + memcpy(J->x, Jx, 4 * sizeof(double)); + + Matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + permuted_dense_times_csc_fill_values(pd, J, pd_out); + + int row_perm_expected[3] = {1, 2, 4}; + int col_perm_expected[2] = {1, 3}; + double X_expected[6] = {50.0, 80.0, 110.0, 160.0, 170.0, 240.0}; + + mu_assert("m", M_out->m == 5); + mu_assert("n", M_out->n == 4); + mu_assert("dense_m", pd_out->dense_m == 3); + mu_assert("dense_n", pd_out->dense_n == 2); + mu_assert("row_perm", cmp_int_array(pd_out->row_perm, row_perm_expected, 3)); + mu_assert("col_perm", cmp_int_array(pd_out->col_perm, col_perm_expected, 2)); + mu_assert("X", cmp_double_array(pd_out->X, X_expected, 6)); + + free_matrix(M); + free_matrix(M_out); + free_csc_matrix(J); + return 0; +} + +/* PD x CSC edge case: every column of J has its only nonzero outside + col_perm_self, so col_perm_out is empty (dense_n = 0). */ +const char *test_permuted_dense_times_csc_no_active(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + /* J: col 0 has row {1}, col 1 has row {5}. Neither in col_perm_self. 
*/ + CSC_Matrix *J = new_csc_matrix(6, 2, 2); + int Jp[3] = {0, 1, 2}; + int Ji[2] = {1, 5}; + double Jx[2] = {100.0, 200.0}; + memcpy(J->p, Jp, 3 * sizeof(int)); + memcpy(J->i, Ji, 2 * sizeof(int)); + memcpy(J->x, Jx, 2 * sizeof(double)); + + Matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + permuted_dense_times_csc_fill_values(pd, J, pd_out); + + mu_assert("m", M_out->m == 5); + mu_assert("n", M_out->n == 2); + mu_assert("dense_m", pd_out->dense_m == 3); + mu_assert("dense_n", pd_out->dense_n == 0); + + free_matrix(M); + free_matrix(M_out); + free_csc_matrix(J); + return 0; +} + +/* Sanity check: col_inv is built correctly. col_perm = {0, 3} on n = 6 + should give col_inv = {0, -1, -1, 1, -1, -1}. */ +const char *test_permuted_dense_col_inv(void) +{ + int row_perm[1] = {0}; + int col_perm[2] = {0, 3}; + double X[2] = {0.0, 0.0}; + + Matrix *M = new_permuted_dense(1, 6, 1, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + int expected[6] = {0, -1, -1, 1, -1, -1}; + mu_assert("col_inv", cmp_int_array(pd->col_inv, expected, 6)); + + free_matrix(M); + return 0; +} + +#endif /* TEST_PERMUTED_DENSE_H */ From 4e0b35508e3e6de73c88ae22dc81652cab57e276 Mon Sep 17 00:00:00 2001 From: dance858 Date: Sun, 10 May 2026 22:44:07 +0200 Subject: [PATCH 02/31] better polymorphism --- include/expr.h | 3 +- include/utils/matrix.h | 37 +++++++- include/utils/permuted_dense.h | 21 +++-- src/atoms/affine/add.c | 13 ++- src/atoms/affine/broadcast.c | 16 ++-- src/atoms/affine/convolve.c | 11 ++- src/atoms/affine/diag_vec.c | 8 +- src/atoms/affine/hstack.c | 16 ++-- src/atoms/affine/index.c | 8 +- src/atoms/affine/left_matmul.c | 13 +-- src/atoms/affine/neg.c | 7 +- src/atoms/affine/parameter.c | 2 +- src/atoms/affine/promote.c | 15 ++-- src/atoms/affine/reshape.c | 6 +- src/atoms/affine/scalar_mult.c | 8 +- src/atoms/affine/sum.c | 26 +++--- src/atoms/affine/trace.c | 19 +++-- 
src/atoms/affine/transpose.c | 11 +-- src/atoms/affine/variable.c | 11 +-- src/atoms/affine/vector_mult.c | 8 +- src/atoms/bivariate_full_dom/matmul.c | 37 ++++---- src/atoms/bivariate_full_dom/multiply.c | 17 ++-- .../bivariate_restricted_dom/quad_over_lin.c | 58 +++++++------ src/atoms/bivariate_restricted_dom/rel_entr.c | 26 +++--- .../rel_entr_scalar_vector.c | 26 +++--- .../rel_entr_vector_scalar.c | 26 +++--- src/atoms/elementwise_full_dom/common.c | 43 +++++----- src/atoms/elementwise_full_dom/logistic.c | 2 +- src/atoms/elementwise_restricted_dom/atanh.c | 3 +- src/atoms/elementwise_restricted_dom/common.c | 9 +- src/atoms/elementwise_restricted_dom/entr.c | 3 +- src/atoms/elementwise_restricted_dom/log.c | 3 +- src/atoms/elementwise_restricted_dom/tan.c | 3 +- src/atoms/other/prod.c | 18 ++-- src/atoms/other/prod_axis_one.c | 11 +-- src/atoms/other/prod_axis_zero.c | 11 +-- src/atoms/other/quad_form.c | 31 +++---- src/expr.c | 4 +- src/old-code/linear_op.c | 7 +- src/problem.c | 27 +++--- src/utils/permuted_dense.c | 34 +++++++- src/utils/sparse_matrix.c | 84 ++++++++++++++++--- tests/all_tests.c | 1 + tests/jacobian_tests/affine/test_broadcast.h | 22 ++--- tests/jacobian_tests/affine/test_convolve.h | 12 +-- tests/jacobian_tests/affine/test_diag_mat.h | 10 +-- tests/jacobian_tests/affine/test_hstack.h | 12 +-- tests/jacobian_tests/affine/test_index.h | 20 ++--- .../jacobian_tests/affine/test_left_matmul.h | 12 +-- tests/jacobian_tests/affine/test_neg.h | 12 +-- tests/jacobian_tests/affine/test_promote.h | 12 +-- .../jacobian_tests/affine/test_right_matmul.h | 12 +-- .../jacobian_tests/affine/test_scalar_mult.h | 12 +-- tests/jacobian_tests/affine/test_sum.h | 30 +++---- tests/jacobian_tests/affine/test_trace.h | 14 ++-- tests/jacobian_tests/affine/test_transpose.h | 6 +- tests/jacobian_tests/affine/test_upper_tri.h | 10 +-- .../jacobian_tests/affine/test_vector_mult.h | 12 +-- tests/jacobian_tests/affine/test_vstack.h | 12 +-- .../test_elementwise_mult.h 
| 24 +++--- .../bivariate_full_dom/test_matmul.h | 12 +-- .../test_quad_over_lin.h | 30 +++---- .../bivariate_restricted_dom/test_rel_entr.h | 18 ++-- .../test_rel_entr_scalar_vector.h | 6 +- .../test_rel_entr_vector_scalar.h | 6 +- .../composite/test_composite_exp.h | 6 +- .../elementwise_restricted_dom/test_log.h | 12 +-- tests/jacobian_tests/other/test_prod.h | 18 ++-- .../jacobian_tests/other/test_prod_axis_one.h | 12 +-- .../other/test_prod_axis_zero.h | 6 +- tests/jacobian_tests/other/test_quad_form.h | 12 +-- tests/numerical_diff.c | 6 +- tests/profiling/profile_log_reg.h | 45 ++++++---- tests/utils/test_matrix.h | 2 - tests/utils/test_permuted_dense.h | 29 +++++++ 75 files changed, 714 insertions(+), 493 deletions(-) diff --git a/include/expr.h b/include/expr.h index 1c54b2e..61d8347 100644 --- a/include/expr.h +++ b/include/expr.h @@ -20,6 +20,7 @@ #include "utils/CSC_Matrix.h" #include "utils/CSR_Matrix.h" +#include "utils/matrix.h" #include #include #include @@ -70,7 +71,7 @@ typedef struct expr // oracle related quantities // ------------------------------------------------------------------------ double *value; - CSR_Matrix *jacobian; + Matrix *jacobian; CSR_Matrix *wsum_hess; forward_fn forward; jacobian_init_fn jacobian_init_impl; diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 4725548..bb83a4e 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -53,6 +53,8 @@ typedef struct Matrix { int m, n; + + /* Operators for the left-multiply matrix in left_matmul. */ void (*block_left_mult_vec)(const struct Matrix *self, const double *x, double *y, int p); CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self, @@ -60,18 +62,47 @@ typedef struct Matrix void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J, CSC_Matrix *C); void (*update_values)(struct Matrix *self, const double *new_values); + + /* Chain-rule operations used by transformer atoms (elementwise, etc.). 
+ copy_sparsity returns a matrix of same shape and type as self; + DA_fill_values writes diag(d) * self into out (which has same structure as + self); ATA_alloc_csr allocates a CSR with sparsity of self^T * self; + ATDA_fill_csr fills csr_out with self^T * diag(d) * self; + to_csr returns a CSR view of self (constant-time for Sparse_Matrix, lazily + built/refreshed for other types). */ + struct Matrix *(*copy_sparsity)(const struct Matrix *self); + void (*DA_fill_values)(const double *d, const struct Matrix *self, + struct Matrix *out); + CSR_Matrix *(*ATA_alloc_csr)(struct Matrix *self); + void (*ATDA_fill_csr)(const struct Matrix *self, const double *d, + CSR_Matrix *csr_out); + CSR_Matrix *(*to_csr)(struct Matrix *self); + + /* Refresh any internal caches (e.g. a CSC mirror) so subsequent ATA / ATDA + calls reflect the current values. Atoms whose child Jacobian is affine + can skip this on iterations after the first; non-affine children must + call it before every chain-rule call. No-op for types that don't have + a cache (e.g. permuted_dense). */ + void (*refresh_csc_values)(struct Matrix *self); + + /* Lifecycle. */ void (*free_fn)(struct Matrix *self); } Matrix; -/* Sparse matrix wrapping CSR */ +/* Sparse matrix wrapping CSR. csc_cache is a lazily-built CSC mirror used by + the chain-rule ATA / ATDA paths; it's allocated on first need and refilled + by refresh_csc_values. csc_iwork is the workspace for csr_to_csc. */ typedef struct Sparse_Matrix { Matrix base; CSR_Matrix *csr; + CSC_Matrix *csc_cache; + int *csc_iwork; } Sparse_Matrix; -/* Constructors */ -Matrix *new_sparse_matrix(const CSR_Matrix *A); +/* Constructor. Takes ownership of A; the caller must not free A separately + (free_matrix on the returned Matrix frees A). 
*/ +Matrix *new_sparse_matrix(CSR_Matrix *A); /* Transpose helper */ Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork); diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 41fba71..90f0caa 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -31,15 +31,18 @@ strictly increasing order; the constructor asserts this. */ typedef struct Permuted_Dense { - Matrix base; /* base.m, base.n = global ambient dimensions */ - int dense_m; /* rows of dense block (= len(row_perm)) */ - int dense_n; /* cols of dense block (= len(col_perm)) */ - int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ - int *col_perm; /* col_perm[jj] in [0, base.n), sorted */ - double *X; /* dense_m * dense_n, row-major */ - double *Y_scratch; /* dense_m * dense_n, used by ATDA */ - int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ - /* otherwise -1; used by `x CSC` allocation. */ + Matrix base; /* base.m, base.n = global ambient dimensions */ + int dense_m; /* rows of dense block (= len(row_perm)) */ + int dense_n; /* cols of dense block (= len(col_perm)) */ + int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ + int *col_perm; /* col_perm[jj] in [0, base.n), sorted */ + double *X; /* dense_m * dense_n, row-major */ + double *Y_scratch; /* dense_m * dense_n, used by ATDA */ + int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ + /* otherwise -1; used by `x CSC` allocation. */ + CSR_Matrix *csr_cache; /* lazy CSR view built by to_csr; structure */ + /* allocated on first call, values refilled */ + /* on every call. NULL until first call. */ } Permuted_Dense; /* Constructor. 
row_perm and col_perm must be strictly increasing in their diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index 631ffb6..77fa694 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -41,12 +41,16 @@ static void jacobian_init_impl(expr *node) jacobian_init(node->left); jacobian_init(node->right); + CSR_Matrix *Jl = node->left->jacobian->to_csr(node->left->jacobian); + CSR_Matrix *Jr = node->right->jacobian->to_csr(node->right->jacobian); + /* we never have to store more than the sum of children's nnz */ - int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max); + int nnz_max = Jl->nnz + Jr->nnz; + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz_max); /* fill sparsity pattern */ - sum_csr_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); + sum_csr_alloc(Jl, Jr, jac); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -56,7 +60,8 @@ static void eval_jacobian(expr *node) node->right->eval_jacobian(node->right); /* sum children's jacobians */ - sum_csr_fill_values(node->left->jacobian, node->right->jacobian, node->jacobian); + sum_csr_fill_values(node->left->jacobian->to_csr(node->left->jacobian), node->right->jacobian->to_csr(node->right->jacobian), + node->jacobian->to_csr(node->jacobian)); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index 94e32e1..d38aae7 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -72,6 +72,7 @@ static void jacobian_init_impl(expr *node) expr *x = node->left; jacobian_init(x); broadcast_expr *bcast = (broadcast_expr *) node; + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); int total_nnz; // -------------------------------------------------------------------- @@ -80,26 +81,25 @@ static void jacobian_init_impl(expr *node) if (bcast->type == BROADCAST_ROW) { /* 
Row broadcast: (1, n) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d1; + total_nnz = Jx->nnz * node->d1; } else if (bcast->type == BROADCAST_COL) { /* Column broadcast: (m, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d2; + total_nnz = Jx->nnz * node->d2; } else { /* Scalar broadcast: (1, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->size; + total_nnz = Jx->nnz * node->size; } - node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz); + CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, total_nnz); + node->jacobian = new_sparse_matrix(J); // --------------------------------------------------------------------- // fill sparsity pattern // --------------------------------------------------------------------- - CSR_Matrix *Jx = x->jacobian; - CSR_Matrix *J = node->jacobian; if (bcast->type == BROADCAST_ROW) { @@ -163,8 +163,8 @@ static void eval_jacobian(expr *node) node->left->eval_jacobian(node->left); broadcast_expr *bcast = (broadcast_expr *) node; - CSR_Matrix *Jx = node->left->jacobian; - CSR_Matrix *J = node->jacobian; + CSR_Matrix *Jx = node->left->jacobian->to_csr(node->left->jacobian); + CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); if (bcast->type == BROADCAST_ROW) { diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 55e5857..1406a39 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -81,8 +81,9 @@ static void jacobian_init_impl(expr *node) conv_matrix_fill_values(cnode->T, a); /* J = T @ J_child */ - cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian, node->work->iwork); - node->jacobian = csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC); + cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian), node->work->iwork); + node->jacobian = + new_sparse_matrix(csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC)); } static void eval_jacobian(expr *node) @@ -93,8 +94,10 @@ static void eval_jacobian(expr *node) 
child->eval_jacobian(child); /* J = T @ J_child */ - csr_to_csc_fill_values(child->jacobian, cnode->Jchild_CSC, node->work->iwork); - csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, node->jacobian); + csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian), cnode->Jchild_CSC, + node->work->iwork); + csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, + node->jacobian->to_csr(node->jacobian)); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index bd94c05..b05360f 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -51,7 +51,7 @@ static void jacobian_init_impl(expr *node) int n = x->size; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian; + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); /* Output has n^2 rows but only n diagonal positions are non-empty. @@ -72,7 +72,7 @@ static void jacobian_init_impl(expr *node) } J->p[node->size] = nnz; - node->jacobian = J; + node->jacobian = new_sparse_matrix(J); } static void eval_jacobian(expr *node) @@ -81,8 +81,8 @@ static void eval_jacobian(expr *node) int n = x->size; x->eval_jacobian(x); - CSR_Matrix *J = node->jacobian; - CSR_Matrix *Jx = x->jacobian; + CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); /* Copy values from child row i to output diagonal row i*(n+1) */ for (int i = 0; i < n; i++) diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index da670d2..ab4b55e 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -53,20 +53,19 @@ static void jacobian_init_impl(expr *node) { assert(hnode->args[i] != NULL); jacobian_init(hnode->args[i]); - nnz += hnode->args[i]->jacobian->nnz; + nnz += hnode->args[i]->jacobian->to_csr(hnode->args[i]->jacobian)->nnz; } - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_Matrix *A = 
new_csr_matrix(node->size, node->n_vars, nnz); /* precompute sparsity pattern of this node's jacobian */ int row_offset = 0; - CSR_Matrix *A = node->jacobian; A->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; - CSR_Matrix *B = child->jacobian; + CSR_Matrix *B = child->jacobian->to_csr(child->jacobian); /* copy columns */ memcpy(A->i + A->nnz, B->i, B->nnz * sizeof(int)); @@ -81,23 +80,24 @@ static void jacobian_init_impl(expr *node) row_offset += child->size; } A->p[node->size] = A->nnz; + node->jacobian = new_sparse_matrix(A); } static void eval_jacobian(expr *node) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *A = node->jacobian; + CSR_Matrix *A = node->jacobian->to_csr(node->jacobian); A->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; child->eval_jacobian(child); + CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); /* copy values */ - memcpy(A->x + A->nnz, child->jacobian->x, - child->jacobian->nnz * sizeof(double)); - A->nnz += child->jacobian->nnz; + memcpy(A->x + A->nnz, Jc->x, Jc->nnz * sizeof(double)); + A->nnz += Jc->nnz; } } diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 34c8310..27654b1 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -64,7 +64,7 @@ static void jacobian_init_impl(expr *node) index_expr *idx = (index_expr *) node; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian; + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); /* set sparsity pattern */ @@ -78,7 +78,7 @@ static void jacobian_init_impl(expr *node) } J->nnz = J->p[idx->n_idxs]; - node->jacobian = J; + node->jacobian = new_sparse_matrix(J); } static void eval_jacobian(expr *node) @@ -87,8 +87,8 @@ static void eval_jacobian(expr *node) index_expr *idx = (index_expr *) node; x->eval_jacobian(x); - CSR_Matrix *J = node->jacobian; - CSR_Matrix *Jx = x->jacobian; + CSR_Matrix *J = 
node->jacobian->to_csr(node->jacobian); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); for (int i = 0; i < idx->n_idxs; i++) { diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index e458342..755a23b 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -115,12 +115,13 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian and precompute sparsity of its CSC */ jacobian_init(x); - lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian, node->work->iwork); + lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); /* precompute sparsity of this node's jacobian in CSC and CSR */ lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC, lnode->n_blocks); - node->jacobian = csc_to_csr_alloc(lnode->J_CSC, lnode->csc_to_csr_work); + node->jacobian = + new_sparse_matrix(csc_to_csr_alloc(lnode->J_CSC, lnode->csc_to_csr_work)); } static void eval_jacobian(expr *node) @@ -133,11 +134,11 @@ static void eval_jacobian(expr *node) /* evaluate child's jacobian and convert to CSC */ x->eval_jacobian(x); - csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->work->iwork); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, node->work->iwork); /* compute this node's jacobian: */ lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC); - csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_work); + csc_to_csr_fill_values(J_CSC, node->jacobian->to_csr(node->jacobian), lnode->csc_to_csr_work); } static void wsum_hess_init_impl(expr *node) @@ -225,8 +226,8 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int)); lnode->n_blocks = n_blocks; - /* store A and AT */ - lnode->A = new_sparse_matrix(A); + /* store A and AT. new_sparse_matrix takes ownership, so clone first. 
*/ + lnode->A = new_sparse_matrix(new_csr(A)); lnode->AT = sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->work->iwork); diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c index f453876..303e1e2 100644 --- a/src/atoms/affine/neg.c +++ b/src/atoms/affine/neg.c @@ -40,7 +40,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity pattern as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); } static void eval_jacobian(expr *node) @@ -49,10 +49,11 @@ static void eval_jacobian(expr *node) node->left->eval_jacobian(node->left); /* negate values only (sparsity pattern set in jacobian_init_impl) */ - CSR_Matrix *child_jac = node->left->jacobian; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_Matrix *child_jac = node->left->jacobian->to_csr(node->left->jacobian); for (int k = 0; k < child_jac->nnz; k++) { - node->jacobian->x[k] = -child_jac->x[k]; + jac->x[k] = -child_jac->x[k]; } } diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index 57458b0..2dc5be1 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -32,7 +32,7 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { /* Zero jacobian: size x n_vars with 0 nonzeros. 
*/ - node->jacobian = new_csr_matrix(node->size, node->n_vars, 0); + node->jacobian = new_sparse_matrix(new_csr_matrix(node->size, node->n_vars, 0)); } static void eval_jacobian(expr *node) diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index ffa21a6..6af9bbf 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -39,30 +39,31 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); /* each output row copies the single row from child's jacobian */ - int nnz = node->size * x->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); + int nnz = node->size * Jx->nnz; + CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, nnz); /* fill sparsity pattern */ - CSR_Matrix *J = node->jacobian; J->nnz = 0; for (int row = 0; row < node->size; row++) { J->p[row] = J->nnz; - memcpy(J->i + J->nnz, x->jacobian->i, x->jacobian->nnz * sizeof(int)); - J->nnz += x->jacobian->nnz; + memcpy(J->i + J->nnz, Jx->i, Jx->nnz * sizeof(int)); + J->nnz += Jx->nnz; } assert(J->nnz == nnz); J->p[node->size] = J->nnz; + node->jacobian = new_sparse_matrix(J); } static void eval_jacobian(expr *node) { node->left->eval_jacobian(node->left); - CSR_Matrix *child_jac = node->left->jacobian; - CSR_Matrix *jac = node->jacobian; + CSR_Matrix *child_jac = node->left->jacobian->to_csr(node->left->jacobian); + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); int child_nnz = child_jac->p[1] - child_jac->p[0]; /* Copy child's row values to each output row */ diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index d654b17..8e23124 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -36,14 +36,16 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = 
new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); } static void eval_jacobian(expr *node) { expr *x = node->left; x->eval_jacobian(x); - memcpy(node->jacobian->x, x->jacobian->x, x->jacobian->nnz * sizeof(double)); + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); + memcpy(jac->x, Jx->x, Jx->nnz * sizeof(double)); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c index e54142a..210fd9f 100644 --- a/src/atoms/affine/scalar_mult.c +++ b/src/atoms/affine/scalar_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); } static void eval_jacobian(expr *node) @@ -71,9 +71,11 @@ static void eval_jacobian(expr *node) child->eval_jacobian(child); /* scale child's jacobian */ - for (int j = 0; j < child->jacobian->nnz; j++) + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_Matrix *child_jac = child->jacobian->to_csr(child->jacobian); + for (int j = 0; j < child_jac->nnz; j++) { - node->jacobian->x[j] = a * child->jacobian->x[j]; + jac->x[j] = a * child_jac->x[j]; } } diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index c43d2ab..3a89dff 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -86,12 +86,12 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian */ jacobian_init(x); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); /* we never have to store more than the child's nnz */ - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->jacobian->nnz); - node->work->iwork = - SP_MALLOC(MAX(node->jacobian->n, x->jacobian->nnz) * sizeof(int)); - snode->idx_map = SP_MALLOC(x->jacobian->nnz * sizeof(int)); + CSR_Matrix *jac = 
new_csr_matrix(node->size, node->n_vars, Jx->nnz); + node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int)); + snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int)); /* the idx_map array maps each nonzero entry j in x->jacobian to the corresponding index in the output row matrix C. Specifically, for @@ -100,19 +100,20 @@ static void jacobian_init_impl(expr *node) if (axis == -1) { - sum_all_rows_csr_alloc(x->jacobian, node->jacobian, node->work->iwork, - snode->idx_map); + sum_all_rows_csr_alloc(Jx, jac, node->work->iwork, snode->idx_map); } else if (axis == 0) { - sum_block_of_rows_csr_alloc(x->jacobian, node->jacobian, x->d1, - node->work->iwork, snode->idx_map); + sum_block_of_rows_csr_alloc(Jx, jac, x->d1, node->work->iwork, + snode->idx_map); } else if (axis == 1) { - sum_evenly_spaced_rows_csr_alloc(x->jacobian, node->jacobian, node->size, - node->work->iwork, snode->idx_map); + sum_evenly_spaced_rows_csr_alloc(Jx, jac, node->size, node->work->iwork, + snode->idx_map); } + + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -124,8 +125,9 @@ static void eval_jacobian(expr *node) /* we have precomputed an idx map between the nonzeros of the child's jacobian and this node's jacobian, so we just accumulate accordingly */ - memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator(x->jacobian, ((sum_expr *) node)->idx_map, node->jacobian->x); + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + memset(jac->x, 0, jac->nnz * sizeof(double)); + accumulator(x->jacobian->to_csr(x->jacobian), ((sum_expr *) node)->idx_map, jac->x); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index c421b3f..5d6e7df 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -55,7 +55,7 @@ static void jacobian_init_impl(expr *node) // --------------------------------------------------------------- // count total nnz and allocate matrix 
with sufficient space // --------------------------------------------------------------- - const CSR_Matrix *A = x->jacobian; + const CSR_Matrix *A = x->jacobian->to_csr(x->jacobian); int total_nnz = 0; int row_spacing = x->d1 + 1; @@ -64,22 +64,23 @@ static void jacobian_init_impl(expr *node) total_nnz += A->p[row + 1] - A->p[row]; } - node->jacobian = new_csr_matrix(1, node->n_vars, total_nnz); + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, total_nnz); // --------------------------------------------------------------- // fill sparsity pattern and idx_map // --------------------------------------------------------------- trace_expr *tnode = (trace_expr *) node; - node->work->iwork = SP_MALLOC(MAX(node->jacobian->n, total_nnz) * sizeof(int)); + node->work->iwork = SP_MALLOC(MAX(jac->n, total_nnz) * sizeof(int)); /* the idx_map array maps each nonzero entry j in the original matrix A (from the selected, evenly spaced rows) to the corresponding index in the output row matrix C. Specifically, for each nonzero entry j in A (from the selected rows), idx_map[j] gives the position in C->x where the value from A->x[j] should be accumulated. 
*/ - tnode->idx_map = SP_MALLOC(x->jacobian->nnz * sizeof(int)); - sum_spaced_rows_into_row_csr_alloc(A, node->jacobian, row_spacing, - node->work->iwork, tnode->idx_map); + tnode->idx_map = SP_MALLOC(A->nnz * sizeof(int)); + sum_spaced_rows_into_row_csr_alloc(A, jac, row_spacing, node->work->iwork, + tnode->idx_map); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -91,9 +92,9 @@ static void eval_jacobian(expr *node) x->eval_jacobian(x); /* local jacobian */ - memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator_with_spacing(x->jacobian, tnode->idx_map, node->jacobian->x, - x->d1 + 1); + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + memset(jac->x, 0, jac->nnz * sizeof(double)); + accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, jac->x, x->d1 + 1); } /* Placeholders for Hessian-related functions */ diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index d4a7998..e3e7230 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -45,11 +45,10 @@ static void jacobian_init_impl(expr *node) { expr *child = node->left; jacobian_init(child); - CSR_Matrix *Jc = child->jacobian; - node->jacobian = new_csr_matrix(node->size, node->n_vars, Jc->nnz); + CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); + CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jc->nnz); /* fill sparsity */ - CSR_Matrix *J = node->jacobian; int d1 = node->d1; int d2 = node->d2; int nnz = 0; @@ -65,14 +64,16 @@ static void jacobian_init_impl(expr *node) nnz += len; J->p[row + 1] = nnz; } + + node->jacobian = new_sparse_matrix(J); } static void eval_jacobian(expr *node) { expr *child = node->left; child->eval_jacobian(child); - CSR_Matrix *Jc = child->jacobian; - CSR_Matrix *J = node->jacobian; + CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); + CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); int d1 = node->d1; int d2 = 
node->d2; diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 0dc12d2..9a0f182 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -27,14 +27,15 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - node->jacobian->i[j] = j + node->var_id; - node->jacobian->x[j] = 1.0; + jac->p[j] = j; + jac->i[j] = j + node->var_id; + jac->x[j] = 1.0; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index 3647f8b..d077c97 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); } static void eval_jacobian(expr *node) @@ -71,11 +71,13 @@ static void eval_jacobian(expr *node) x->eval_jacobian(x); /* row-wise scale child's jacobian */ + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); for (int i = 0; i < node->size; i++) { - for (int j = x->jacobian->p[i]; j < x->jacobian->p[i + 1]; j++) + for (int j = Jx->p[i]; j < Jx->p[i + 1]; j++) { - node->jacobian->x[j] = a[i] * x->jacobian->x[j]; + jac->x[j] = a[i] * Jx->x[j]; } } } diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 5fb8af6..6d36194 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -139,7 +139,7 @@ 
static void jacobian_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int nnz = m * n * 2 * k; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz); int nnz_idx = 0; for (int i = 0; i < node->size; i++) @@ -147,33 +147,34 @@ static void jacobian_init_no_chain_rule(expr *node) int row = i % m; int col = i / m; - node->jacobian->p[i] = nnz_idx; + jac->p[i] = nnz_idx; if (x->var_id < y->var_id) { for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = x->var_id + row + j * m; + jac->i[nnz_idx++] = x->var_id + row + j * m; } for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = y->var_id + col * k + j; + jac->i[nnz_idx++] = y->var_id + col * k + j; } } else { for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = y->var_id + col * k + j; + jac->i[nnz_idx++] = y->var_id + col * k + j; } for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = x->var_id + row + j * m; + jac->i[nnz_idx++] = x->var_id + row + j * m; } } } - node->jacobian->p[node->size] = nnz_idx; + jac->p[node->size] = nnz_idx; assert(nnz_idx == nnz); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_no_chain_rule(expr *node) @@ -182,13 +183,14 @@ static void eval_jacobian_no_chain_rule(expr *node) expr *y = node->right; int m = x->d1; int k = x->d2; - double *Jx = node->jacobian->x; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + double *Jx = jac->x; for (int i = 0; i < node->size; i++) { int row = i % m; int col = i / m; - int pos = node->jacobian->p[i]; + int pos = jac->p[i]; if (x->var_id < y->var_id) { @@ -234,8 +236,9 @@ static void jacobian_init_chain_rule(expr *node) mnode->term1_CSR = YT_kron_I_alloc(m, k, n, f->work->jacobian_csc); mnode->term2_CSR = I_kron_X_alloc(m, k, n, g->work->jacobian_csc); int max_nnz = mnode->term1_CSR->nnz + mnode->term2_CSR->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, max_nnz); - 
sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, node->jacobian); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, max_nnz); + sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, jac); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_chain_rule(expr *node) @@ -250,14 +253,16 @@ static void eval_jacobian_chain_rule(expr *node) /* evaluate Jacobians of children */ f->eval_jacobian(f); g->eval_jacobian(g); - csr_to_csc_fill_values(f->jacobian, f->work->jacobian_csc, f->work->csc_work); - csr_to_csc_fill_values(g->jacobian, g->work->jacobian_csc, g->work->csc_work); + csr_to_csc_fill_values(f->jacobian->to_csr(f->jacobian), f->work->jacobian_csc, + f->work->csc_work); + csr_to_csc_fill_values(g->jacobian->to_csr(g->jacobian), g->work->jacobian_csc, + g->work->csc_work); /* evaluate term1, term2, and their sum */ YT_kron_I_fill_values(m, k, n, g->value, f->work->jacobian_csc, mnode->term1_CSR); I_kron_X_fill_values(m, k, n, f->value, g->work->jacobian_csc, mnode->term2_CSR); - sum_csr_fill_values(mnode->term1_CSR, mnode->term2_CSR, node->jacobian); + sum_csr_fill_values(mnode->term1_CSR, mnode->term2_CSR, node->jacobian->to_csr(node->jacobian)); } // ------------------------------------------------------------------------------------ @@ -466,7 +471,7 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) /* refresh child Jacobian CSC values (cache if affine) */ if (!f->work->jacobian_csc_filled) { - csr_to_csc_fill_values(f->jacobian, Jf, f->work->csc_work); + csr_to_csc_fill_values(f->jacobian->to_csr(f->jacobian), Jf, f->work->csc_work); if (is_f_affine) { f->work->jacobian_csc_filled = true; @@ -476,7 +481,7 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) /* refresh child Jacobian CSC values (cache if affine) */ if (!g->work->jacobian_csc_filled) { - csr_to_csc_fill_values(g->jacobian, Jg, g->work->csc_work); + csr_to_csc_fill_values(g->jacobian->to_csr(g->jacobian), Jg, g->work->csc_work); if 
(is_g_affine) { g->work->jacobian_csc_filled = true; diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 5f606f8..236f67c 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -49,11 +49,14 @@ static void jacobian_init_impl(expr *node) { jacobian_init(node->left); jacobian_init(node->right); - int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max); + CSR_Matrix *Jl = node->left->jacobian->to_csr(node->left->jacobian); + CSR_Matrix *Jr = node->right->jacobian->to_csr(node->right->jacobian); + int nnz_max = Jl->nnz + Jr->nnz; + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz_max); /* fill sparsity pattern */ - sum_csr_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); + sum_csr_alloc(Jl, Jr, jac); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -66,8 +69,8 @@ static void eval_jacobian(expr *node) /* chain rule: the jacobian of h(x) = f(g1(x), g2(x))) is Jh = J_{f, 1} J_{g1} + * J_{f, 2} J_{g2} */ - sum_scaled_csr_matrices_fill_values(x->jacobian, y->jacobian, node->jacobian, - y->value, x->value); + sum_scaled_csr_matrices_fill_values(x->jacobian->to_csr(x->jacobian), y->jacobian->to_csr(y->jacobian), + node->jacobian->to_csr(node->jacobian), y->value, x->value); } static void wsum_hess_init_impl(expr *node) @@ -200,7 +203,7 @@ static void eval_wsum_hess(expr *node, const double *w) // ---------------------------------------------------------------------- if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc, x->work->csc_work); if (is_x_affine) @@ -211,7 +214,7 @@ static void eval_wsum_hess(expr *node, const double *w) if (!y->work->jacobian_csc_filled) { - csr_to_csc_fill_values(y->jacobian, 
y->work->jacobian_csc, + csr_to_csc_fill_values(y->jacobian->to_csr(y->jacobian), y->work->jacobian_csc, y->work->csc_work); if (is_y_affine) diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index ecf2741..518917e 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -58,27 +58,28 @@ static void jacobian_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size + 1); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size + 1; + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size + 1); + jac->p[0] = 0; + jac->p[1] = x->size + 1; /* if x has lower idx than y*/ if (x->var_id < y->var_id) { for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } - node->jacobian->i[x->size] = y->var_id; + jac->i[x->size] = y->var_id; } else /* y has lower idx than x */ { - node->jacobian->i[0] = y->var_id; + jac->i[0] = y->var_id; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j + 1] = x->var_id + j; + jac->i[j + 1] = x->var_id + j; } } + node->jacobian = new_sparse_matrix(jac); } else /* left node is not a variable (guaranteed to be a linear operator) */ { @@ -87,46 +88,48 @@ static void jacobian_init_impl(expr *node) /* compute required allocation and allocate jacobian */ bool *col_nz = (bool *) SP_CALLOC( node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ - int nonzero_cols = count_nonzero_cols(x->jacobian, col_nz); - node->jacobian = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); + CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); + int nonzero_cols = count_nonzero_cols(Jx, col_nz); + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); /* precompute column indices */ - node->jacobian->nnz = 0; + jac->nnz = 0; for (int j = 0; j < node->n_vars; j++) 
{ if (col_nz[j]) { - node->jacobian->i[node->jacobian->nnz] = j; - node->jacobian->nnz++; + jac->i[jac->nnz] = j; + jac->nnz++; } } - assert(nonzero_cols == node->jacobian->nnz); + assert(nonzero_cols == jac->nnz); free(col_nz); /* insert y variable index at correct position */ - insert_idx(y->var_id, node->jacobian->i, node->jacobian->nnz); - node->jacobian->nnz += 1; - node->jacobian->p[0] = 0; - node->jacobian->p[1] = node->jacobian->nnz; + insert_idx(y->var_id, jac->i, jac->nnz); + jac->nnz += 1; + jac->p[0] = 0; + jac->p[1] = jac->nnz; /* find position where y should be inserted */ node->work->iwork = (int *) SP_MALLOC(sizeof(int)); - for (int j = 0; j < node->jacobian->nnz; j++) + for (int j = 0; j < jac->nnz; j++) { - if (node->jacobian->i[j] == y->var_id) + if (jac->i[j] == y->var_id) { node->work->iwork[0] = j; break; } } + node->jacobian = new_sparse_matrix(jac); + /* prepare CSC form of child jacobian for chain rule. * For a linear operator the values are constant, so fill * them once here. 
*/ jacobian_csc_init(x); - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, - x->work->csc_work); + csr_to_csc_fill_values(Jx, x->work->jacobian_csc, x->work->csc_work); } } @@ -134,6 +137,7 @@ static void eval_jacobian(expr *node) { expr *x = node->left; expr *y = node->right; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) @@ -143,16 +147,16 @@ static void eval_jacobian(expr *node) { for (int j = 0; j < x->size; j++) { - node->jacobian->x[j] = (2.0 * x->value[j]) / y->value[0]; + jac->x[j] = (2.0 * x->value[j]) / y->value[0]; } - node->jacobian->x[x->size] = -node->value[0] / y->value[0]; + jac->x[x->size] = -node->value[0] / y->value[0]; } else /* y has lower idx than x */ { - node->jacobian->x[0] = -node->value[0] / y->value[0]; + jac->x[0] = -node->value[0] / y->value[0]; for (int j = 0; j < x->size; j++) { - node->jacobian->x[j + 1] = (2.0 * x->value[j]) / y->value[0]; + jac->x[j + 1] = (2.0 * x->value[j]) / y->value[0]; } } } @@ -165,12 +169,12 @@ static void eval_jacobian(expr *node) } /* chain rule (no derivative wrt y) using CSC format */ - yTA_fill_values(x->work->jacobian_csc, node->work->dwork, node->jacobian); + yTA_fill_values(x->work->jacobian_csc, node->work->dwork, jac); /* insert derivative wrt y at right place (for correctness this assumes that y does not appear in the numerator, but this will always be the case since y is a new variable for the denominator */ - node->jacobian->x[node->work->iwork[0]] = -node->value[0] / y->value[0]; + jac->x[node->work->iwork[0]] = -node->value[0] / y->value[0]; } } diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index d2ab721..8db2727 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -45,7 +45,7 @@ static void forward_vector_args(expr *node, const double *u) static void jacobian_init_vectors_args(expr *node) { 
- node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); expr *x = node->left; expr *y = node->right; @@ -57,44 +57,46 @@ static void jacobian_init_vectors_args(expr *node) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = j + x->var_id; - node->jacobian->i[2 * j + 1] = j + y->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = j + x->var_id; + jac->i[2 * j + 1] = j + y->var_id; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = j + y->var_id; - node->jacobian->i[2 * j + 1] = j + x->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = j + y->var_id; + jac->i[2 * j + 1] = j + x->var_id; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_vector_args(expr *node) { expr *x = node->left; expr *y = node->right; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); /* if x has lower variable idx than y */ if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = log(x->value[i] / y->value[i]) + 1; - node->jacobian->x[2 * i + 1] = -x->value[i] / y->value[i]; + jac->x[2 * i] = log(x->value[i] / y->value[i]) + 1; + jac->x[2 * i + 1] = -x->value[i] / y->value[i]; } } else { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = -x->value[i] / y->value[i]; - node->jacobian->x[2 * i + 1] = log(x->value[i] / y->value[i]) + 1; + jac->x[2 * i] = -x->value[i] / y->value[i]; + jac->x[2 * i + 1] = log(x->value[i] / y->value[i]) + 1; } } } diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index 5f09cc9..b4b5130 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ 
b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -50,49 +50,51 @@ static void jacobian_init_scalar_vector(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = x->var_id; - node->jacobian->i[2 * j + 1] = y->var_id + j; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = x->var_id; + jac->i[2 * j + 1] = y->var_id + j; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = y->var_id + j; - node->jacobian->i[2 * j + 1] = x->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = y->var_id + j; + jac->i[2 * j + 1] = x->var_id; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_scalar_vector(expr *node) { expr *x = node->left; expr *y = node->right; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = log(x->value[0] / y->value[i]) + 1; - node->jacobian->x[2 * i + 1] = -x->value[0] / y->value[i]; + jac->x[2 * i] = log(x->value[0] / y->value[i]) + 1; + jac->x[2 * i + 1] = -x->value[0] / y->value[i]; } } else { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = -x->value[0] / y->value[i]; - node->jacobian->x[2 * i + 1] = log(x->value[0] / y->value[i]) + 1; + jac->x[2 * i] = -x->value[0] / y->value[i]; + jac->x[2 * i + 1] = log(x->value[0] / y->value[i]) + 1; } } } diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index f425357..c4eba8b 100644 --- 
a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -50,49 +50,51 @@ static void jacobian_init_vector_scalar(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = x->var_id + j; - node->jacobian->i[2 * j + 1] = y->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = x->var_id + j; + jac->i[2 * j + 1] = y->var_id; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = y->var_id; - node->jacobian->i[2 * j + 1] = x->var_id + j; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = y->var_id; + jac->i[2 * j + 1] = x->var_id + j; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_vector_scalar(expr *node) { expr *x = node->left; expr *y = node->right; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = log(x->value[i] / y->value[0]) + 1; - node->jacobian->x[2 * i + 1] = -x->value[i] / y->value[0]; + jac->x[2 * i] = log(x->value[i] / y->value[0]) + 1; + jac->x[2 * i + 1] = -x->value[i] / y->value[0]; } } else { for (int i = 0; i < node->size; i++) { - node->jacobian->x[2 * i] = -x->value[i] / y->value[0]; - node->jacobian->x[2 * i + 1] = log(x->value[i] / y->value[0]) + 1; + jac->x[2 * i] = -x->value[i] / y->value[0]; + jac->x[2 * i + 1] = log(x->value[i] / y->value[0]) + 1; } } } diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index b9735a4..3a51d7c 100644 --- 
a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -32,20 +32,20 @@ void jacobian_init_elementwise(expr *node) /* if the variable is a child */ if (child->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - node->jacobian->i[j] = j + child->var_id; + jac->p[j] = j; + jac->i[j] = j + child->var_id; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } else { /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */ jacobian_init(child); - CSR_Matrix *Jg = child->jacobian; - node->jacobian = new_csr_copy_sparsity(Jg); + node->jacobian = child->jacobian->copy_sparsity(child->jacobian); node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); node->work->local_jac_diag = (double *) SP_MALLOC(node->size * sizeof(double)); @@ -58,17 +58,17 @@ void eval_jacobian_elementwise(expr *node) if (child->var_id != NOT_A_VARIABLE) { - node->local_jacobian(node, node->jacobian->x); + node->local_jacobian(node, node->jacobian->to_csr(node->jacobian)->x); } else { /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */ child->eval_jacobian(child); - CSR_Matrix *Jg = child->jacobian; node->local_jacobian(node, node->work->local_jac_diag); memcpy(node->work->dwork, node->work->local_jac_diag, node->size * sizeof(double)); - DA_fill_values(node->work->dwork, Jg, node->jacobian); + child->jacobian->DA_fill_values(node->work->dwork, child->jacobian, + node->jacobian); } } @@ -101,17 +101,14 @@ void wsum_hess_init_elementwise(expr *node) term2 = sum_i (J_f^T w)_i^T Hg_i. For elementwise functions, D is diagonal. 
*/ - jacobian_csc_init(child); - CSC_Matrix *Jg = child->work->jacobian_csc; - if (child->is_affine(child)) { - node->wsum_hess = ATA_alloc(Jg); + node->wsum_hess = child->jacobian->ATA_alloc_csr(child->jacobian); } else { /* term1: Jg^T @ D @ Jg */ - node->work->hess_term1 = ATA_alloc(Jg); + node->work->hess_term1 = child->jacobian->ATA_alloc_csr(child->jacobian); /* term2: child's Hessian */ wsum_hess_init(child); @@ -139,27 +136,27 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) { if (child->is_affine(child)) { + /* Refresh the child Jacobian's CSC mirror once; subsequent calls + skip since the affine child's values don't change. */ if (!child->work->jacobian_csc_filled) { - csr_to_csc_fill_values(child->jacobian, child->work->jacobian_csc, - child->work->csc_work); + child->jacobian->refresh_csc_values(child->jacobian); child->work->jacobian_csc_filled = true; } node->local_wsum_hess(node, node->work->dwork, w); - ATDA_fill_values(child->work->jacobian_csc, node->work->dwork, - node->wsum_hess); + child->jacobian->ATDA_fill_csr(child->jacobian, node->work->dwork, + node->wsum_hess); } else { - /* refresh CSC jacobian values */ - csr_to_csc_fill_values(child->jacobian, child->work->jacobian_csc, - child->work->csc_work); + /* Non-affine child: values change every iteration, must refresh. 
*/ + child->jacobian->refresh_csc_values(child->jacobian); /* term1: Jg^T @ D @ Jg */ node->local_wsum_hess(node, node->work->dwork, w); - ATDA_fill_values(child->work->jacobian_csc, node->work->dwork, - node->work->hess_term1); + child->jacobian->ATDA_fill_csr(child->jacobian, node->work->dwork, + node->work->hess_term1); /* term2: child Hessian with weight Jf^T w */ memcpy(node->work->dwork, node->work->local_jac_diag, diff --git a/src/atoms/elementwise_full_dom/logistic.c b/src/atoms/elementwise_full_dom/logistic.c index 303306e..08e1bb6 100644 --- a/src/atoms/elementwise_full_dom/logistic.c +++ b/src/atoms/elementwise_full_dom/logistic.c @@ -63,7 +63,7 @@ static void local_wsum_hess(expr *node, double *out, const double *w) if (node->left->var_id != NOT_A_VARIABLE) { - sigmas = node->jacobian->x; + sigmas = node->jacobian->to_csr(node->jacobian)->x; } else { diff --git a/src/atoms/elementwise_restricted_dom/atanh.c b/src/atoms/elementwise_restricted_dom/atanh.c index 79a08f2..2938dd7 100644 --- a/src/atoms/elementwise_restricted_dom/atanh.c +++ b/src/atoms/elementwise_restricted_dom/atanh.c @@ -30,9 +30,10 @@ static void atanh_forward(expr *node, const double *u) static void atanh_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->to_csr(node->jacobian)->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = 1.0 / (1.0 - x[j] * x[j]); + jx[j] = 1.0 / (1.0 - x[j] * x[j]); } } diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index 9a2f561..40a45e3 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -23,13 +23,14 @@ void jacobian_init_restricted(expr *node) { expr *child = node->left; - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - 
node->jacobian->i[j] = j + child->var_id; + jac->p[j] = j; + jac->i[j] = j + child->var_id; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } void wsum_hess_init_restricted(expr *node) diff --git a/src/atoms/elementwise_restricted_dom/entr.c b/src/atoms/elementwise_restricted_dom/entr.c index 53ac275..08ff549 100644 --- a/src/atoms/elementwise_restricted_dom/entr.c +++ b/src/atoms/elementwise_restricted_dom/entr.c @@ -32,9 +32,10 @@ static void entr_forward(expr *node, const double *u) static void entr_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->to_csr(node->jacobian)->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = -log(x[j]) - 1.0; + jx[j] = -log(x[j]) - 1.0; } } diff --git a/src/atoms/elementwise_restricted_dom/log.c b/src/atoms/elementwise_restricted_dom/log.c index ce96425..31a6241 100644 --- a/src/atoms/elementwise_restricted_dom/log.c +++ b/src/atoms/elementwise_restricted_dom/log.c @@ -31,9 +31,10 @@ static void log_forward(expr *node, const double *u) static void log_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->to_csr(node->jacobian)->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = 1.0 / x[j]; + jx[j] = 1.0 / x[j]; } } diff --git a/src/atoms/elementwise_restricted_dom/tan.c b/src/atoms/elementwise_restricted_dom/tan.c index ccf945e..7fbc4cf 100644 --- a/src/atoms/elementwise_restricted_dom/tan.c +++ b/src/atoms/elementwise_restricted_dom/tan.c @@ -30,10 +30,11 @@ static void tan_forward(expr *node, const double *u) static void tan_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->to_csr(node->jacobian)->x; for (int j = 0; j < node->size; j++) { double c = cos(x[j]); - node->jacobian->x[j] = 1.0 / (c * c); + jx[j] = 1.0 / (c * c); } } diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 
083e4c1..5e27820 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -71,13 +71,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size; + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + jac->p[0] = 0; + jac->p[1] = x->size; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } + node->jacobian = new_sparse_matrix(jac); } else { @@ -96,21 +97,22 @@ static void eval_jacobian(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { + double *jx = node->jacobian->to_csr(node->jacobian)->x; if (num_of_zeros == 0) { for (int j = 0; j < x->size; j++) { - node->jacobian->x[j] = node->value[0] / x->value[j]; + jx[j] = node->value[0] / x->value[j]; } } else if (num_of_zeros == 1) { - memset(node->jacobian->x, 0, sizeof(double) * x->size); - node->jacobian->x[pnode->zero_index] = pnode->prod_nonzero; + memset(jx, 0, sizeof(double) * x->size); + jx[pnode->zero_index] = pnode->prod_nonzero; } else { - memset(node->jacobian->x, 0, sizeof(double) * x->size); + memset(jx, 0, sizeof(double) * x->size); } } else diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index ac2583e..864039c 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -81,14 +81,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d2 nnzs) */ for (int row = 0; row < x->d1; row++) { - node->jacobian->p[row] = row * x->d2; + jac->p[row] = row * x->d2; } - node->jacobian->p[x->d1] = x->size; + jac->p[x->d1] = x->size; /* set column indices */ for (int row = 0; 
row < x->d1; row++) @@ -96,9 +96,10 @@ static void jacobian_init_impl(expr *node) int start = row * x->d2; for (int col = 0; col < x->d2; col++) { - node->jacobian->i[start + col] = x->var_id + col * x->d1 + row; + jac->i[start + col] = x->var_id + col * x->d1 + row; } } + node->jacobian = new_sparse_matrix(jac); } else { @@ -113,7 +114,7 @@ static void eval_jacobian(expr *node) expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *J_vals = node->jacobian->x; + double *J_vals = node->jacobian->to_csr(node->jacobian)->x; /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index 20fac1b..f5df03a 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -76,14 +76,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d1 nnzs) */ for (int row = 0; row < x->d2; row++) { - node->jacobian->p[row] = row * x->d1; + jac->p[row] = row * x->d1; } - node->jacobian->p[x->d2] = x->size; + jac->p[x->d2] = x->size; /* set column indices */ for (int col = 0; col < x->d2; col++) @@ -91,9 +91,10 @@ static void jacobian_init_impl(expr *node) int start = col * x->d1; for (int i = 0; i < x->d1; i++) { - node->jacobian->i[start + i] = x->var_id + start + i; + jac->i[start + i] = x->var_id + start + i; } } + node->jacobian = new_sparse_matrix(jac); } else { @@ -108,7 +109,7 @@ static void eval_jacobian(expr *node) expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *J_vals = node->jacobian->x; + double *J_vals = node->jacobian->to_csr(node->jacobian)->x; /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 72474df..fc2ec4f 100644 --- 
a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -51,14 +51,15 @@ static void jacobian_init_impl(expr *node) if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size; + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + jac->p[0] = 0; + jac->p[1] = x->size; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } + node->jacobian = new_sparse_matrix(jac); } else { @@ -69,9 +70,9 @@ static void jacobian_init_impl(expr *node) /* allocate the right number of nnz */ int nnz = count_nonzero_cols_csc(J_csc); - node->jacobian = new_csr_matrix(1, node->n_vars, nnz); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = nnz; + CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, nnz); + jac->p[0] = 0; + jac->p[1] = nnz; /* fill sparsity pattern */ int idx = 0; @@ -79,9 +80,10 @@ static void jacobian_init_impl(expr *node) { if (J_csc->p[j + 1] > J_csc->p[j]) { - node->jacobian->i[idx++] = j; + jac->i[idx++] = j; } } + node->jacobian = new_sparse_matrix(jac); } } @@ -89,12 +91,13 @@ static void eval_jacobian(expr *node) { expr *x = node->left; CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); if (x->var_id != NOT_A_VARIABLE) { /* jacobian = 2 * (Q @ x)^T */ - Ax_csr(Q, x->value, node->jacobian->x, 0); - cblas_dscal(x->size, 2.0, node->jacobian->x, 1); + Ax_csr(Q, x->value, jac->x, 0); + cblas_dscal(x->size, 2.0, jac->x, 1); } else { @@ -103,7 +106,7 @@ static void eval_jacobian(expr *node) if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc, x->work->csc_work); if (x->is_affine(x)) @@ -114,9 +117,9 @@ static void eval_jacobian(expr *node) /* The jacobian has same values as the gradient, which is J_f^T (Q @ f(x)). 
Here, dwork stores Q @ f(x) from forward */ - yTA_fill_values(x->work->jacobian_csc, node->work->dwork, node->jacobian); + yTA_fill_values(x->work->jacobian_csc, node->work->dwork, jac); - cblas_dscal(node->jacobian->nnz, 2.0, node->jacobian->x, 1); + cblas_dscal(jac->nnz, 2.0, jac->x, 1); } } @@ -195,7 +198,7 @@ static void eval_wsum_hess(expr *node, const double *w) CSC_Matrix *Jf = x->work->jacobian_csc; if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, Jf, x->work->csc_work); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, x->work->csc_work); if (x->is_affine(x)) { diff --git a/src/expr.c b/src/expr.c index 64f0410..f4400cb 100644 --- a/src/expr.c +++ b/src/expr.c @@ -52,7 +52,7 @@ void jacobian_csc_init(expr *node) } node->work->csc_work = (int *) SP_MALLOC(node->n_vars * sizeof(int)); node->work->jacobian_csc = - csr_to_csc_alloc(node->jacobian, node->work->csc_work); + csr_to_csc_alloc(node->jacobian->to_csr(node->jacobian), node->work->csc_work); } void free_expr(expr *node) @@ -76,7 +76,7 @@ void free_expr(expr *node) /* free value array and derivative matrices */ free(node->value); - free_csr_matrix(node->jacobian); + free_matrix(node->jacobian); free_csr_matrix(node->wsum_hess); /* free workspace */ diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c index a63bc4d..ca0174b 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -32,7 +32,7 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A * x (A is stored as node->jacobian) */ - Ax_csr(node->jacobian, x->value, node->value, x->var_id); + Ax_csr(node->jacobian->to_csr(node->jacobian), x->value, node->value, x->var_id); /* y += b (if offset exists) */ if (lin_node->b != NULL) @@ -97,8 +97,9 @@ expr *new_linear(expr *u, const CSR_Matrix *A, const double *b) expr_retain(u); /* Store A directly as the jacobian (linear op jacobian is constant) */ - node->jacobian = new_csr_matrix(A->m, A->n, 
A->nnz); - copy_csr_matrix(A, node->jacobian); + CSR_Matrix *jac = new_csr_matrix(A->m, A->n, A->nnz); + copy_csr_matrix(A, jac); + node->jacobian = new_sparse_matrix(jac); /* Initialize offset (copy b if provided, otherwise NULL) */ if (b != NULL) diff --git a/src/problem.c b/src/problem.c index 691921a..154fae4 100644 --- a/src/problem.c +++ b/src/problem.c @@ -176,15 +176,16 @@ void problem_init_jacobian(problem *prob) { expr *c = prob->constraints[i]; jacobian_init(c); - nnz += c->jacobian->nnz; + CSR_Matrix *Jc = c->jacobian->to_csr(c->jacobian); + nnz += Jc->nnz; if (c->is_affine(c)) { - prob->stats.nnz_affine += c->jacobian->nnz; + prob->stats.nnz_affine += Jc->nnz; } else { - prob->stats.nnz_nonlinear += c->jacobian->nnz; + prob->stats.nnz_nonlinear += Jc->nnz; } } @@ -198,15 +199,16 @@ void problem_init_jacobian(problem *prob) for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; + CSR_Matrix *Jc = c->jacobian->to_csr(c->jacobian); - for (int r = 1; r <= c->jacobian->m; r++) + for (int r = 1; r <= Jc->m; r++) { - H->p[row_offset + r] = nnz_offset + c->jacobian->p[r]; + H->p[row_offset + r] = nnz_offset + Jc->p[r]; } - memcpy(H->i + nnz_offset, c->jacobian->i, c->jacobian->nnz * sizeof(int)); - row_offset += c->jacobian->m; - nnz_offset += c->jacobian->nnz; + memcpy(H->i + nnz_offset, Jc->i, Jc->nnz * sizeof(int)); + row_offset += Jc->m; + nnz_offset += Jc->nnz; } assert(nnz_offset == nnz); @@ -466,7 +468,7 @@ void problem_gradient(problem *prob) /* copy sparse jacobian to dense gradient */ memset(prob->gradient_values, 0, prob->n_vars * sizeof(double)); - CSR_Matrix *jac = prob->objective->jacobian; + CSR_Matrix *jac = prob->objective->jacobian->to_csr(prob->objective->jacobian); for (int k = jac->p[0]; k < jac->p[1]; k++) { prob->gradient_values[jac->i[k]] = jac->x[k]; @@ -488,17 +490,18 @@ void problem_jacobian(problem *prob) for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; + CSR_Matrix *Jc = 
c->jacobian->to_csr(c->jacobian); if (!first_call && c->is_affine(c)) { /* skip evaluation for affine constraints after first call */ - nnz_offset += c->jacobian->nnz; + nnz_offset += Jc->nnz; continue; } c->eval_jacobian(c); - memcpy(J->x + nnz_offset, c->jacobian->x, c->jacobian->nnz * sizeof(double)); - nnz_offset += c->jacobian->nnz; + memcpy(J->x + nnz_offset, Jc->x, Jc->nnz * sizeof(double)); + nnz_offset += Jc->nnz; } /* update actual nnz (may be less than allocated) */ diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 9a7e25a..2624814 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -31,9 +31,36 @@ static void permuted_dense_free(Matrix *self) free(pd->X); free(pd->Y_scratch); free(pd->col_inv); + free_csr_matrix(pd->csr_cache); free(pd); } +/* Permuted_Dense has no CSC mirror; chain-rule kernels operate on X directly. */ +static void permuted_dense_refresh_csc_values(Matrix *self) +{ + (void) self; +} + +/* Lazy CSR view: allocate structure on first call, refill values on every call. + This means the returned CSR's values always reflect the current X. + + Future optimization: pd->X and csr_cache->x have bit-identical memory layout + (row-major dense block, same offsets), so we could alias csr_cache->x = pd->X + and skip the value fill entirely. That requires a non-owning x flag on + CSR_Matrix so free_csr_matrix doesn't double-free pd->X. The current + memcpy-on-every-call is cheap (O(dense_m * dense_n) bandwidth), and revisiting + this can wait until a profile shows it matters. 
*/ +static CSR_Matrix *permuted_dense_to_csr(Matrix *self) +{ + Permuted_Dense *pd = (Permuted_Dense *) self; + if (pd->csr_cache == NULL) + { + pd->csr_cache = permuted_dense_to_csr_alloc(pd); + } + permuted_dense_to_csr_fill_values(pd, pd->csr_cache); + return pd->csr_cache; +} + Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, const int *row_perm, const int *col_perm, const double *X_data) @@ -59,9 +86,12 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, Permuted_Dense *pd = (Permuted_Dense *) SP_CALLOC(1, sizeof(Permuted_Dense)); pd->base.m = m; pd->base.n = n; + pd->base.to_csr = permuted_dense_to_csr; + pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; pd->base.free_fn = permuted_dense_free; - /* Other vtable slots are wired up in later steps as the operations - they dispatch to are implemented. */ + /* Other vtable slots (copy_sparsity, DA_fill_values, ATA_alloc_csr, + ATDA_fill_csr) are wired up in a later step when permuted_dense actually + starts appearing as a node->jacobian. */ pd->dense_m = dense_m; pd->dense_n = dense_n; diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 0237346..b1991e2 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "utils/CSC_Matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/matrix.h" #include "utils/tracked_alloc.h" @@ -52,20 +53,87 @@ static void sparse_free(Matrix *self) { Sparse_Matrix *sm = (Sparse_Matrix *) self; free_csr_matrix(sm->csr); + free_csc_matrix(sm->csc_cache); + free(sm->csc_iwork); free(sm); } -Matrix *new_sparse_matrix(const CSR_Matrix *A) +/* Forward decl: ctor is referenced by copy_sparsity below. */ +Matrix *new_sparse_matrix(CSR_Matrix *A); + +/* Build the CSC cache structure if absent. 
Values are NOT filled here; caller + must call refresh_csc_values before consuming. ATA_alloc_csr only needs + structure, so it's safe to call after build_csc_structure alone. */ +static void build_csc_structure_if_absent(Sparse_Matrix *sm) +{ + if (sm->csc_cache != NULL) return; + sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + sm->csc_cache = csr_to_csc_alloc(sm->csr, sm->csc_iwork); +} + +static Matrix *sparse_copy_sparsity(const Matrix *self) +{ + const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + return new_sparse_matrix(new_csr_copy_sparsity(sm->csr)); +} + +static void sparse_DA_fill_values(const double *d, const Matrix *self, Matrix *out) +{ + const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + Sparse_Matrix *sm_out = (Sparse_Matrix *) out; + DA_fill_values(d, sm->csr, sm_out->csr); +} + +static CSR_Matrix *sparse_ATA_alloc_csr(Matrix *self) +{ + Sparse_Matrix *sm = (Sparse_Matrix *) self; + build_csc_structure_if_absent(sm); + return ATA_alloc(sm->csc_cache); +} + +/* Caller must have called refresh_csc_values since the last change to csr->x. */ +static void sparse_ATDA_fill_csr(const Matrix *self, const double *d, + CSR_Matrix *csr_out) +{ + const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + ATDA_fill_values(sm->csc_cache, d, csr_out); +} + +static CSR_Matrix *sparse_to_csr(Matrix *self) +{ + return ((Sparse_Matrix *) self)->csr; +} + +/* Build CSC structure on first call; refill values from csr->x on every call. 
*/ +static void sparse_refresh_csc_values(Matrix *self) +{ + Sparse_Matrix *sm = (Sparse_Matrix *) self; + build_csc_structure_if_absent(sm); + csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); +} + +static void wire_vtable(Sparse_Matrix *sm) { - Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); - sm->base.m = A->m; - sm->base.n = A->n; sm->base.block_left_mult_vec = sparse_block_left_mult_vec; sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; sm->base.block_left_mult_values = sparse_block_left_mult_values; sm->base.update_values = sparse_update_values; + sm->base.copy_sparsity = sparse_copy_sparsity; + sm->base.DA_fill_values = sparse_DA_fill_values; + sm->base.ATA_alloc_csr = sparse_ATA_alloc_csr; + sm->base.ATDA_fill_csr = sparse_ATDA_fill_csr; + sm->base.to_csr = sparse_to_csr; + sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; - sm->csr = new_csr(A); +} + +Matrix *new_sparse_matrix(CSR_Matrix *A) +{ + Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); + sm->base.m = A->m; + sm->base.n = A->n; + wire_vtable(sm); + sm->csr = A; return &sm->base; } @@ -75,11 +143,7 @@ Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork) Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); sm->base.m = AT->m; sm->base.n = AT->n; - sm->base.block_left_mult_vec = sparse_block_left_mult_vec; - sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; - sm->base.block_left_mult_values = sparse_block_left_mult_values; - sm->base.update_values = sparse_update_values; - sm->base.free_fn = sparse_free; + wire_vtable(sm); sm->csr = AT; return &sm->base; } diff --git a/tests/all_tests.c b/tests/all_tests.c index a40b2b9..1d38759 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -368,6 +368,7 @@ int main(void) mu_run_test(test_permuted_dense_ATDA_fill_values, tests_run); mu_run_test(test_permuted_dense_times_csc, 
tests_run); mu_run_test(test_permuted_dense_times_csc_no_active, tests_run); + mu_run_test(test_permuted_dense_to_csr_lazy, tests_run); mu_run_test(test_permuted_dense_col_inv, tests_run); mu_run_test(test_YT_kron_I, tests_run); mu_run_test(test_YT_kron_I_larger, tests_run); diff --git a/tests/jacobian_tests/affine/test_broadcast.h b/tests/jacobian_tests/affine/test_broadcast.h index f41082d..4ed1b40 100644 --- a/tests/jacobian_tests/affine/test_broadcast.h +++ b/tests/jacobian_tests/affine/test_broadcast.h @@ -38,11 +38,11 @@ const char *test_broadcast_row_jacobian(void) int expected_i[6] = {0, 0, 1, 1, 2, 2}; mu_assert("broadcast row jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); mu_assert("broadcast row jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 4)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 4)); mu_assert("broadcast row jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); free_expr(bcast); return 0; @@ -83,11 +83,11 @@ const char *test_broadcast_col_jacobian(void) int expected_i[6] = {0, 1, 2, 0, 1, 2}; mu_assert("broadcast col jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); mu_assert("broadcast col jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 7)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); mu_assert("broadcast col jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); free_expr(bcast); return 0; @@ -124,11 +124,11 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) int expected_i[6] = {0, 0, 0, 0, 0, 0}; mu_assert("broadcast 
scalar jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); mu_assert("broadcast scalar jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 7)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); mu_assert("broadcast scalar jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); free_expr(bcast); return 0; @@ -156,11 +156,11 @@ const char *test_double_broadcast(void) // int expected_i[6] = {0, 0, 0, 0, 0, 0}; // // mu_assert("broadcast scalar jacobian vals fail", - // cmp_double_array(sum->jacobian->x, expected_x, 6)); + // cmp_double_array(sum->jacobian->to_csr(sum->jacobian)->x, expected_x, 6)); // mu_assert("broadcast scalar jacobian rows fail", // cmp_int_array(sum ->jacobian->p, expected_p, 7)); // mu_assert("broadcast scalar jacobian cols fail", - // cmp_int_array(bcast->jacobian->i, expected_i, 6)); + // cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); free_expr(sum); return 0; diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h index c3aae98..ddb7f7a 100644 --- a/tests/jacobian_tests/affine/test_convolve.h +++ b/tests/jacobian_tests/affine/test_convolve.h @@ -29,20 +29,20 @@ const char *test_jacobian_convolve(void) jacobian_init(y); y->eval_jacobian(y); - mu_assert("Jacobian should have 5 rows", y->jacobian->m == 5); - mu_assert("Jacobian should have 3 columns", y->jacobian->n == 3); - mu_assert("Jacobian should have 9 nonzeros", y->jacobian->nnz == 9); + mu_assert("Jacobian should have 5 rows", y->jacobian->to_csr(y->jacobian)->m == 5); + mu_assert("Jacobian should have 3 columns", y->jacobian->to_csr(y->jacobian)->n == 3); + mu_assert("Jacobian should have 9 nonzeros", y->jacobian->to_csr(y->jacobian)->nnz == 9); int expected_p[6] = {0, 
1, 3, 6, 8, 9}; int expected_i[9] = {0, 0, 1, 0, 1, 2, 1, 2, 2}; double expected_x[9] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0}; mu_assert("Convolve Jacobian row pointers incorrect", - cmp_int_array(y->jacobian->p, expected_p, 6)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 6)); mu_assert("Convolve Jacobian column indices incorrect", - cmp_int_array(y->jacobian->i, expected_i, 9)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 9)); mu_assert("Convolve Jacobian values incorrect", - cmp_double_array(y->jacobian->x, expected_x, 9)); + cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 9)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_diag_mat.h b/tests/jacobian_tests/affine/test_diag_mat.h index b1031ac..2aec818 100644 --- a/tests/jacobian_tests/affine/test_diag_mat.h +++ b/tests/jacobian_tests/affine/test_diag_mat.h @@ -23,9 +23,9 @@ const char *test_diag_mat_jacobian_variable(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->x, expected_x, 2)); - mu_assert("diag_mat jac p", cmp_int_array(dm->jacobian->p, expected_p, 3)); - mu_assert("diag_mat jac i", cmp_int_array(dm->jacobian->i, expected_i, 2)); + mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->to_csr(dm->jacobian)->x, expected_x, 2)); + mu_assert("diag_mat jac p", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->p, expected_p, 3)); + mu_assert("diag_mat jac i", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); free_expr(dm); return 0; @@ -52,9 +52,9 @@ const char *test_diag_mat_jacobian_of_log(void) int expected_i[2] = {0, 3}; mu_assert("diag_mat log jac vals", - cmp_double_array(dm->jacobian->x, expected_x, 2)); + cmp_double_array(dm->jacobian->to_csr(dm->jacobian)->x, expected_x, 2)); mu_assert("diag_mat log jac cols", - cmp_int_array(dm->jacobian->i, expected_i, 2)); + 
cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); free_expr(dm); return 0; diff --git a/tests/jacobian_tests/affine/test_hstack.h b/tests/jacobian_tests/affine/test_hstack.h index a5b163c..ac7becf 100644 --- a/tests/jacobian_tests/affine/test_hstack.h +++ b/tests/jacobian_tests/affine/test_hstack.h @@ -42,9 +42,9 @@ const char *test_jacobian_hstack_vectors(void) int expected_Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; int expected_Ap[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 9)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->i, expected_Ai, 9)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->p, expected_Ap, 10)); + mu_assert("vals fail", cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_Ax, 9)); + mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 9)); + mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 10)); free_expr(stack); return 0; @@ -86,9 +86,9 @@ const char *test_jacobian_hstack_matrix(void) int expected_Ap[19] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 18)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->i, expected_Ai, 18)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->p, expected_Ap, 19)); + mu_assert("vals fail", cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_Ax, 18)); + mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 18)); + mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 19)); free_expr(stack); return 0; diff --git a/tests/jacobian_tests/affine/test_index.h b/tests/jacobian_tests/affine/test_index.h index 8cdd9f7..16474a4 100644 --- a/tests/jacobian_tests/affine/test_index.h +++ 
b/tests/jacobian_tests/affine/test_index.h @@ -56,9 +56,9 @@ const char *test_index_jacobian_of_variable(void) int expected_p[3] = {0, 1, 2}; /* CSR row ptrs */ int expected_i[2] = {0, 2}; /* column indices */ - mu_assert("index jac vals", cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index jac p", cmp_int_array(idx->jacobian->p, expected_p, 3)); - mu_assert("index jac i", cmp_int_array(idx->jacobian->i, expected_i, 2)); + mu_assert("index jac vals", cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); + mu_assert("index jac p", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); + mu_assert("index jac i", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); free_expr(idx); return 0; @@ -83,9 +83,9 @@ const char *test_index_jacobian_of_log(void) int expected_i[2] = {0, 2}; mu_assert("index of log jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); + cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); mu_assert("index of log jac cols", - cmp_int_array(idx->jacobian->i, expected_i, 2)); + cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); free_expr(idx); return 0; @@ -108,11 +108,11 @@ const char *test_index_jacobian_repeated(void) int expected_i[2] = {0, 0}; /* Both reference col 0 */ mu_assert("index repeated jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); + cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); mu_assert("index repeated row ptr", - cmp_int_array(idx->jacobian->p, expected_p, 3)); + cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); mu_assert("index repeated jac i", - cmp_int_array(idx->jacobian->i, expected_i, 2)); + cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); free_expr(idx); return 0; @@ -137,8 +137,8 @@ const char *test_sum_of_index(void) double expected_x[2] = {1.0, 1.0}; int expected_i[2] = {0, 2}; - mu_assert("sum of index 
vals", cmp_double_array(s->jacobian->x, expected_x, 2)); - mu_assert("sum of index cols", cmp_int_array(s->jacobian->i, expected_i, 2)); + mu_assert("sum of index vals", cmp_double_array(s->jacobian->to_csr(s->jacobian)->x, expected_x, 2)); + mu_assert("sum of index cols", cmp_int_array(s->jacobian->to_csr(s->jacobian)->i, expected_i, 2)); free_expr(s); return 0; diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index a0c1385..bb4e31f 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -61,9 +61,9 @@ const char *test_jacobian_left_matmul_log(void) int expected_Ai[7] = {0, 2, 0, 2, 0, 2, 0}; int expected_Ap[5] = {0, 2, 4, 6, 7}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 7)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 7)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 5)); + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->x, expected_Ax, 7)); + mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, expected_Ai, 7)); + mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 5)); free_csr_matrix(A); free_expr(A_log_x); @@ -100,9 +100,9 @@ const char *test_jacobian_left_matmul_log_matrix(void) int expected_Ai[14] = {0, 2, 0, 2, 0, 2, 0, 3, 5, 3, 5, 3, 5, 3}; int expected_Ap[9] = {0, 2, 4, 6, 7, 9, 11, 13, 14}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 14)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 14)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 9)); + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->x, expected_Ax, 14)); + mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, 
expected_Ai, 14)); + mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 9)); free_csr_matrix(A); free_expr(A_log_x); diff --git a/tests/jacobian_tests/affine/test_neg.h b/tests/jacobian_tests/affine/test_neg.h index ca4e595..a928d8d 100644 --- a/tests/jacobian_tests/affine/test_neg.h +++ b/tests/jacobian_tests/affine/test_neg.h @@ -20,11 +20,11 @@ const char *test_neg_jacobian(void) int expected_i[3] = {0, 1, 2}; mu_assert("neg jacobian vals fail", - cmp_double_array(neg_node->jacobian->x, expected_x, 3)); + cmp_double_array(neg_node->jacobian->to_csr(neg_node->jacobian)->x, expected_x, 3)); mu_assert("neg jacobian rows fail", - cmp_int_array(neg_node->jacobian->p, expected_p, 4)); + cmp_int_array(neg_node->jacobian->to_csr(neg_node->jacobian)->p, expected_p, 4)); mu_assert("neg jacobian cols fail", - cmp_int_array(neg_node->jacobian->i, expected_i, 3)); + cmp_int_array(neg_node->jacobian->to_csr(neg_node->jacobian)->i, expected_i, 3)); free_expr(neg_node); return 0; @@ -51,11 +51,11 @@ const char *test_neg_chain(void) int expected_i[3] = {0, 1, 2}; mu_assert("neg chain jacobian vals fail", - cmp_double_array(neg2->jacobian->x, expected_x, 3)); + cmp_double_array(neg2->jacobian->to_csr(neg2->jacobian)->x, expected_x, 3)); mu_assert("neg chain jacobian rows fail", - cmp_int_array(neg2->jacobian->p, expected_p, 4)); + cmp_int_array(neg2->jacobian->to_csr(neg2->jacobian)->p, expected_p, 4)); mu_assert("neg chain jacobian cols fail", - cmp_int_array(neg2->jacobian->i, expected_i, 3)); + cmp_int_array(neg2->jacobian->to_csr(neg2->jacobian)->i, expected_i, 3)); free_expr(neg2); return 0; diff --git a/tests/jacobian_tests/affine/test_promote.h b/tests/jacobian_tests/affine/test_promote.h index ad2ab49..b6c304b 100644 --- a/tests/jacobian_tests/affine/test_promote.h +++ b/tests/jacobian_tests/affine/test_promote.h @@ -23,11 +23,11 @@ const char *test_promote_scalar_jacobian(void) int expected_i[3] = {0, 0, 0}; 
mu_assert("promote jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 3)); + cmp_double_array(promote_node->jacobian->to_csr(promote_node->jacobian)->x, expected_x, 3)); mu_assert("promote jacobian rows fail", - cmp_int_array(promote_node->jacobian->p, expected_p, 4)); + cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 4)); mu_assert("promote jacobian cols fail", - cmp_int_array(promote_node->jacobian->i, expected_i, 3)); + cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->i, expected_i, 3)); free_expr(promote_node); return 0; @@ -55,11 +55,11 @@ const char *test_promote_scalar_to_matrix_jacobian(void) int expected_i[6] = {0, 0, 0, 0, 0, 0}; mu_assert("promote matrix jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 6)); + cmp_double_array(promote_node->jacobian->to_csr(promote_node->jacobian)->x, expected_x, 6)); mu_assert("promote matrix jacobian rows fail", - cmp_int_array(promote_node->jacobian->p, expected_p, 7)); + cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 7)); mu_assert("promote matrix jacobian cols fail", - cmp_int_array(promote_node->jacobian->i, expected_i, 6)); + cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->i, expected_i, 6)); free_expr(promote_node); return 0; diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index 8c2fa04..056e844 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -47,9 +47,9 @@ const char *test_jacobian_right_matmul_log(void) int expected_Ai[8] = {0, 2, 1, 3, 0, 2, 1, 3}; int expected_Ap[7] = {0, 2, 4, 4, 4, 6, 8}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 8)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 8)); - mu_assert("rows fail", 
cmp_int_array(log_x_A->jacobian->p, expected_Ap, 7)); + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->x, expected_Ax, 8)); + mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 8)); + mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 7)); free_csr_matrix(A); free_expr(log_x_A); @@ -92,9 +92,9 @@ const char *test_jacobian_right_matmul_log_vector(void) int expected_Ai[4] = {0, 1, 1, 2}; int expected_Ap[3] = {0, 2, 4}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 4)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 4)); - mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->p, expected_Ap, 3)); + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->x, expected_Ax, 4)); + mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 4)); + mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 3)); free_csr_matrix(A); free_expr(log_x_A); diff --git a/tests/jacobian_tests/affine/test_scalar_mult.h b/tests/jacobian_tests/affine/test_scalar_mult.h index a16a6fa..66bdaaf 100644 --- a/tests/jacobian_tests/affine/test_scalar_mult.h +++ b/tests/jacobian_tests/affine/test_scalar_mult.h @@ -37,11 +37,11 @@ const char *test_jacobian_scalar_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("scalar mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); + cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 3)); mu_assert("scalar mult log: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 4)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); mu_assert("scalar mult log: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 3)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, 
expected_i, 3)); free_expr(y); return 0; @@ -74,11 +74,11 @@ const char *test_jacobian_scalar_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("scalar mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); + cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 4)); mu_assert("scalar mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 5)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); mu_assert("scalar mult log matrix: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 4)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h index f34ee91..b00695b 100644 --- a/tests/jacobian_tests/affine/test_sum.h +++ b/tests/jacobian_tests/affine/test_sum.h @@ -25,9 +25,9 @@ const char *test_jacobian_sum_log(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 3)); + mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 3)); free_expr(sum_node); return 0; @@ -59,9 +59,9 @@ const char *test_jacobian_sum_mult(void) int expected_Ap[2] = {0, 6}; /* 1x10 matrix: row 0 spans all 6 nonzeros */ int expected_Ai[6] = {2, 3, 4, 6, 7, 8}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 2)); - 
mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); free_expr(sum_node); return 0; @@ -99,9 +99,9 @@ const char *test_jacobian_sum_log_axis_0(void) int expected_Ap[3] = {0, 3, 6}; int expected_Ai[6] = {2, 3, 4, 5, 6, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); + mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); free_expr(sum_node); return 0; @@ -146,9 +146,9 @@ const char *test_jacobian_sum_add_log_axis_0(void) int expected_Ai[12] = {2, 3, 4, 8, 9, 10, /* row 0 columns */ 5, 6, 7, 11, 12, 13}; /* row 1 columns */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 12)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 12)); + mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); + mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 12)); free_expr(sum_node); return 0; @@ -188,9 +188,9 @@ 
const char *test_jacobian_sum_log_axis_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 5, 3, 6, 4, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); free_expr(sum_node); return 0; diff --git a/tests/jacobian_tests/affine/test_trace.h b/tests/jacobian_tests/affine/test_trace.h index 61701bd..39f9fbf 100644 --- a/tests/jacobian_tests/affine/test_trace.h +++ b/tests/jacobian_tests/affine/test_trace.h @@ -40,9 +40,9 @@ const char *test_jacobian_trace_variable(void) int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(trace_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, expected_Ai, 3)); + cmp_double_array(trace_node->jacobian->to_csr(trace_node->jacobian)->x, expected_Ax, 3)); + mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); free_expr(trace_node); return 0; @@ -93,11 +93,11 @@ const char *test_jacobian_trace_composite(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ - mu_assert("vals match count", trace_node->jacobian->nnz == 3); - mu_assert("rows fail", 
cmp_int_array(trace_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals match count", trace_node->jacobian->to_csr(trace_node->jacobian)->nnz == 3); + mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); + cmp_double_array(trace_node->jacobian->to_csr(trace_node->jacobian)->x, expected_Ax, 3)); free_expr(trace_node); return 0; diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index a02e22d..c85f50a 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -34,11 +34,11 @@ const char *test_jacobian_transpose(void) int expected_i[8] = {0, 1, 2, 3, 0, 1, 2, 3}; mu_assert("jacobian values fail", - cmp_double_array(transpose_AX->jacobian->x, expected_x, 8)); + cmp_double_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->x, expected_x, 8)); mu_assert("jacobian row ptr fail", - cmp_int_array(transpose_AX->jacobian->p, expected_p, 5)); + cmp_int_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->p, expected_p, 5)); mu_assert("jacobian col idx fail", - cmp_int_array(transpose_AX->jacobian->i, expected_i, 8)); + cmp_int_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->i, expected_i, 8)); free_expr(transpose_AX); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/affine/test_upper_tri.h b/tests/jacobian_tests/affine/test_upper_tri.h index f0bffa1..125780f 100644 --- a/tests/jacobian_tests/affine/test_upper_tri.h +++ b/tests/jacobian_tests/affine/test_upper_tri.h @@ -28,9 +28,9 @@ const char *test_upper_tri_jacobian_variable(void) int expected_i[6] = {4, 8, 12, 9, 13, 14}; 
mu_assert("upper_tri jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); - mu_assert("upper_tri jac p", cmp_int_array(ut->jacobian->p, expected_p, 7)); - mu_assert("upper_tri jac i", cmp_int_array(ut->jacobian->i, expected_i, 6)); + cmp_double_array(ut->jacobian->to_csr(ut->jacobian)->x, expected_x, 6)); + mu_assert("upper_tri jac p", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->p, expected_p, 7)); + mu_assert("upper_tri jac i", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); free_expr(ut); return 0; @@ -60,9 +60,9 @@ const char *test_upper_tri_jacobian_of_log(void) int expected_i[6] = {4, 8, 12, 9, 13, 14}; mu_assert("upper_tri log jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); + cmp_double_array(ut->jacobian->to_csr(ut->jacobian)->x, expected_x, 6)); mu_assert("upper_tri log jac cols", - cmp_int_array(ut->jacobian->i, expected_i, 6)); + cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); free_expr(ut); return 0; diff --git a/tests/jacobian_tests/affine/test_vector_mult.h b/tests/jacobian_tests/affine/test_vector_mult.h index cd6fa1a..8304e91 100644 --- a/tests/jacobian_tests/affine/test_vector_mult.h +++ b/tests/jacobian_tests/affine/test_vector_mult.h @@ -36,11 +36,11 @@ const char *test_jacobian_vector_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("vector mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); + cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 3)); mu_assert("vector mult log: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 4)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); mu_assert("vector mult log: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 3)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 3)); free_expr(y); return 0; @@ -72,11 +72,11 @@ const char *test_jacobian_vector_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; 
mu_assert("vector mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); + cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 4)); mu_assert("vector mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 5)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); mu_assert("vector mult log matrix: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 4)); + cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_vstack.h b/tests/jacobian_tests/affine/test_vstack.h index fc4229f..b4115b6 100644 --- a/tests/jacobian_tests/affine/test_vstack.h +++ b/tests/jacobian_tests/affine/test_vstack.h @@ -39,11 +39,11 @@ const char *test_jacobian_vstack_vectors(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; mu_assert("vstack jac vectors: vals", - cmp_double_array(stack->jacobian->x, expected_x, 6)); + cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_x, 6)); mu_assert("vstack jac vectors: cols", - cmp_int_array(stack->jacobian->i, expected_i, 6)); + cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 6)); mu_assert("vstack jac vectors: rows", - cmp_int_array(stack->jacobian->p, expected_p, 7)); + cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_p, 7)); free_expr(stack); return 0; @@ -89,11 +89,11 @@ const char *test_jacobian_vstack_matrix(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; mu_assert("vstack jac matrix: vals", - cmp_double_array(stack->jacobian->x, expected_x, 9)); + cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_x, 9)); mu_assert("vstack jac matrix: cols", - cmp_int_array(stack->jacobian->i, expected_i, 9)); + cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 9)); mu_assert("vstack jac matrix: rows", - cmp_int_array(stack->jacobian->p, expected_p, 10)); + 
cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_p, 10)); free_expr(stack); return 0; diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index 9e7e6e5..bd3bd3c 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -24,9 +24,9 @@ const char *test_jacobian_elementwise_mult_1(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); free_expr(node); return 0; } @@ -49,9 +49,9 @@ const char *test_jacobian_elementwise_mult_2(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); free_expr(node); return 0; } @@ -112,9 +112,9 @@ const char *test_jacobian_elementwise_mult_3(void) int rows[4] = {0, 4, 10, 16}; int cols[16] = {2, 3, 7, 8, 2, 3, 4, 7, 8, 9, 2, 3, 4, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 16)); - mu_assert("rows fail", 
cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 16)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 16)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 16)); free_expr(node); free_csr_matrix(A); free_csr_matrix(B); @@ -157,9 +157,9 @@ const char *test_jacobian_elementwise_mult_4(void) int rows[4] = {0, 2, 5, 8}; int cols[8] = {2, 3, 2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 8)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 8)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 8)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 8)); free_expr(node); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h index 386efa1..45745b8 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h @@ -49,14 +49,14 @@ const char *test_jacobian_matmul(void) Z->eval_jacobian(Z); /* Verify sparsity pattern */ - mu_assert("Jacobian should have 8 rows", Z->jacobian->m == z_size); - mu_assert("Jacobian should have 18 columns", Z->jacobian->n == n_vars); - mu_assert("Jacobian should have 48 nonzeros", Z->jacobian->nnz == 48); + mu_assert("Jacobian should have 8 rows", Z->jacobian->to_csr(Z->jacobian)->m == z_size); + mu_assert("Jacobian should have 18 columns", Z->jacobian->to_csr(Z->jacobian)->n == n_vars); + mu_assert("Jacobian should have 48 nonzeros", 
Z->jacobian->to_csr(Z->jacobian)->nnz == 48); /* Check row pointers: each row should have 6 entries */ int expected_p[9] = {0, 6, 12, 18, 24, 30, 36, 42, 48}; mu_assert("Row pointers incorrect", - cmp_int_array(Z->jacobian->p, expected_p, 9)); + cmp_int_array(Z->jacobian->to_csr(Z->jacobian)->p, expected_p, 9)); int expected_i[48] = {0, 2, 4, 6, 7, 8, /* row 0 */ 1, 3, 5, 6, 7, 8, /* row 1 */ @@ -67,7 +67,7 @@ const char *test_jacobian_matmul(void) 0, 2, 4, 15, 16, 17, /* row 6 */ 1, 3, 5, 15, 16, 17}; /* row 7 */ mu_assert("Column indices incorrect", - cmp_int_array(Z->jacobian->i, expected_i, 48)); + cmp_int_array(Z->jacobian->to_csr(Z->jacobian)->i, expected_i, 48)); /* Verify Jacobian values row-wise: for each row, values are [Y^T row for the column, X row values] since X has lower var_id */ @@ -82,7 +82,7 @@ const char *test_jacobian_matmul(void) /* row 7 (col 3) */ 16.0, 17.0, 18.0, 2.0, 4.0, 6.0}; mu_assert("Jacobian values incorrect", - cmp_double_array(Z->jacobian->x, expected_x, 48)); + cmp_double_array(Z->jacobian->to_csr(Z->jacobian)->x, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index e1007f8..451827b 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -25,9 +25,9 @@ const char *test_quad_over_lin1(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + 
mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_expr(node); return 0; } @@ -49,9 +49,9 @@ const char *test_quad_over_lin2(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_expr(node); return 0; } @@ -86,9 +86,9 @@ const char *test_quad_over_lin3(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 4)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_csr_matrix(A); free_expr(node); @@ -126,9 +126,9 @@ const char *test_quad_over_lin4(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 4)); + mu_assert("rows fail", 
cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_csr_matrix(A); free_expr(node); @@ -166,9 +166,9 @@ const char *test_quad_over_lin5(void) int expected_Ap[2] = {0, 7}; int expected_Ai[7] = {0, 1, 2, 3, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 7)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 7)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 7)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 7)); free_csr_matrix(A); free_expr(node); diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h index 883e233..596c75a 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_jacobian_rel_entr_vector_args_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); return 0; } @@ -61,9 +61,9 @@ const char 
*test_jacobian_rel_entr_vector_args_2(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); return 0; } @@ -103,9 +103,9 @@ const char *test_jacobian_rel_entr_matrix_args(void) int expected_Ap[7] = {0, 2, 4, 6, 8, 10, 12}; int expected_Ai[12] = {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 7)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 12)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 12)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 7)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 12)); free_expr(node); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index 20cd6fa..d716046 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_scalar_vector(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 1, 0, 2, 0, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, 
expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 9a454dd..f4c77eb 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_vector_scalar(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 3, 1, 3, 2, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index 32671d1..2862550 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -33,9 +33,9 @@ const char *test_jacobian_composite_exp(void) double vals[6] = {3 * e10, 2 * e10, 1 * e10, 2 * e7, 1 * e7, 1 * e7}; int rows[3] = {0, 3, 6}; 
int cols[6] = {2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(exp_node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(exp_node->jacobian->p, rows, 3)); - mu_assert("cols fail", cmp_int_array(exp_node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_double_array(exp_node->jacobian->to_csr(exp_node->jacobian)->x, vals, 6)); + mu_assert("rows fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->p, rows, 3)); + mu_assert("cols fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->i, cols, 6)); free_expr(exp_node); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h index ff99d64..b6a07c2 100644 --- a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h +++ b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h @@ -18,9 +18,9 @@ const char *test_jacobian_log(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(log_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_double_array(log_node->jacobian->to_csr(log_node->jacobian)->x, expected_Ax, 3)); + mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 3)); free_expr(log_node); return 0; } @@ -36,9 +36,9 @@ const char *test_jacobian_log_matrix(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->p, expected_Ap, 5)); - mu_assert("cols 
fail", cmp_int_array(log_node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(log_node->jacobian->to_csr(log_node->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 5)); + mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 4)); free_expr(log_node); return 0; } diff --git a/tests/jacobian_tests/other/test_prod.h b/tests/jacobian_tests/other/test_prod.h index ef59250..ea6aef3 100644 --- a/tests/jacobian_tests/other/test_prod.h +++ b/tests/jacobian_tests/other/test_prod.h @@ -25,9 +25,9 @@ const char *test_jacobian_prod_no_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); free_expr(p); return 0; @@ -50,9 +50,9 @@ const char *test_jacobian_prod_one_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); free_expr(p); return 0; @@ -73,9 +73,9 @@ const char 
*test_jacobian_prod_two_zeros(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_one.h b/tests/jacobian_tests/other/test_prod_axis_one.h index b4757ee..1fb1c53 100644 --- a/tests/jacobian_tests/other/test_prod_axis_one.h +++ b/tests/jacobian_tests/other/test_prod_axis_one.h @@ -42,9 +42,9 @@ const char *test_jacobian_prod_axis_one(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 9)); + mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 9)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); free_expr(p); return 0; @@ -85,9 +85,9 @@ const char *test_jacobian_prod_axis_one_one_zero(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 9)); + mu_assert("vals fail", 
cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 9)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_zero.h b/tests/jacobian_tests/other/test_prod_axis_zero.h index 42b4ad1..7c7957d 100644 --- a/tests/jacobian_tests/other/test_prod_axis_zero.h +++ b/tests/jacobian_tests/other/test_prod_axis_zero.h @@ -36,9 +36,9 @@ const char *test_jacobian_prod_axis_zero(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {1, 2, 3, 4, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 6)); + mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); + mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 6)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index d832a83..8864b41 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -31,9 +31,9 @@ const char *test_quad_form(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 3)); + mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); + 
mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 3)); free_expr(node); free_csr_matrix(Q); return 0; @@ -77,9 +77,9 @@ double expected_Ax[5] = {422, 2222, 3244, 3786, 120}; int expected_Ap[2] = {0, 5}; int expected_Ai[5] = {0, 2, 3, 4, 5}; -mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 5)); -mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); -mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 5)); +mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 5)); +mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); +mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 5)); free_expr(node); free_expr(Au); free_csr_matrix(Q); diff --git a/tests/numerical_diff.c b/tests/numerical_diff.c index 9c9812a..dad03f8 100644 --- a/tests/numerical_diff.c +++ b/tests/numerical_diff.c @@ -76,7 +76,7 @@ int check_jacobian_num(expr *node, const double *u, double h) node->forward(node, u); double *J_analytical = calloc((size_t) m * n, sizeof(double)); - csr_to_dense(node->jacobian, J_analytical); + csr_to_dense(node->jacobian->to_csr(node->jacobian), J_analytical); int result = 1; for (int i = 0; i < m * n; i++) @@ -134,14 +134,14 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double node->forward(node, u_work); node->eval_jacobian(node); memset(g_plus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian, w, g_plus); + csr_transpose_mult_vec(node->jacobian->to_csr(node->jacobian), w, g_plus); /* g(u - h*e_j) */ u_work[j] = u[j] - h; node->forward(node, u_work); node->eval_jacobian(node); memset(g_minus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian, w, g_minus); + csr_transpose_mult_vec(node->jacobian->to_csr(node->jacobian), w, g_minus); u_work[j] = u[j]; diff --git 
a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 8ad7eea..7fdc0fa 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -51,14 +51,17 @@ const char *profile_log_reg(void) /* Forward (untimed). */ obj->forward(obj, u); - /* ---- Path A: time eval_jacobian + eval_wsum_hess ---- */ - Timer t_a; + /* ---- Path A: time eval_jacobian and eval_wsum_hess separately ---- */ + Timer t_a_jac, t_a_hess; double w_one = 1.0; - clock_gettime(CLOCK_MONOTONIC, &t_a.start); + clock_gettime(CLOCK_MONOTONIC, &t_a_jac.start); obj->eval_jacobian(obj); + clock_gettime(CLOCK_MONOTONIC, &t_a_jac.end); + clock_gettime(CLOCK_MONOTONIC, &t_a_hess.start); obj->eval_wsum_hess(obj, &w_one); - clock_gettime(CLOCK_MONOTONIC, &t_a.end); - double sec_a = GET_ELAPSED_SECONDS(t_a); + clock_gettime(CLOCK_MONOTONIC, &t_a_hess.end); + double sec_a_jac = GET_ELAPSED_SECONDS(t_a_jac); + double sec_a_hess = GET_ELAPSED_SECONDS(t_a_hess); /* ---- Path B setup (untimed) ---- */ int *full_rows = (int *) malloc(m * sizeof(int)); @@ -87,33 +90,41 @@ const char *profile_log_reg(void) double *w_ones = (double *) malloc(m * sizeof(double)); for (int i = 0; i < m; i++) w_ones[i] = 1.0; - /* ---- Path B: time the manual chain rule ---- */ - Timer t_b; - clock_gettime(CLOCK_MONOTONIC, &t_b.start); + /* ---- Path B: time the manual chain rule, Jacobian and Hessian separately ---- */ + Timer t_b_jac, t_b_hess; /* dwork = sigmoid(z); used as the diagonal in DA below and (still in dwork) as sigmas read by local_wsum_hess. 
*/ + clock_gettime(CLOCK_MONOTONIC, &t_b_jac.start); log_obj->local_jacobian(log_obj, log_obj->work->dwork); permuted_dense_DA_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); permuted_dense_to_csr_fill_values(Jlog_pd, Jlog_csr); memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); accumulator(Jlog_csr, idx_map, Jobj_csr->x); + clock_gettime(CLOCK_MONOTONIC, &t_b_jac.end); + clock_gettime(CLOCK_MONOTONIC, &t_b_hess.start); log_obj->local_wsum_hess(log_obj, d2, w_ones); permuted_dense_ATDA_fill_values(A_pd, d2, H_pd); - clock_gettime(CLOCK_MONOTONIC, &t_b.end); - double sec_b = GET_ELAPSED_SECONDS(t_b); + clock_gettime(CLOCK_MONOTONIC, &t_b_hess.end); + double sec_b_jac = GET_ELAPSED_SECONDS(t_b_jac); + double sec_b_hess = GET_ELAPSED_SECONDS(t_b_hess); printf("\n"); - printf(" Path A (engine CSR/CSC): %10.6f seconds\n", sec_a); - printf(" Path B (Permuted_Dense): %10.6f seconds\n", sec_b); - printf(" Speedup (A / B): %10.2fx\n", sec_a / sec_b); + printf(" Jacobian Hessian Total\n"); + printf(" Path A (engine CSR/CSC): %10.6fs %10.6fs %10.6fs\n", sec_a_jac, + sec_a_hess, sec_a_jac + sec_a_hess); + printf(" Path B (Permuted_Dense): %10.6fs %10.6fs %10.6fs\n", sec_b_jac, + sec_b_hess, sec_b_jac + sec_b_hess); + printf(" Speedup (A / B): %10.2fx %10.2fx %10.2fx\n", + sec_a_jac / sec_b_jac, sec_a_hess / sec_b_hess, + (sec_a_jac + sec_a_hess) / (sec_b_jac + sec_b_hess)); /* ---- Compare Jacobian (1 x n, both have full sparsity) ---- */ - mu_assert("J n mismatch", obj->jacobian->n == Jobj_csr->n); - mu_assert("J nnz mismatch", obj->jacobian->nnz == Jobj_csr->nnz); + mu_assert("J n mismatch", obj->jacobian->to_csr(obj->jacobian)->n == Jobj_csr->n); + mu_assert("J nnz mismatch", obj->jacobian->to_csr(obj->jacobian)->nnz == Jobj_csr->nnz); double max_J_diff = 0.0; - for (int j = 0; j < obj->jacobian->nnz; j++) + for (int j = 0; j < obj->jacobian->to_csr(obj->jacobian)->nnz; j++) { - double diff = fabs(obj->jacobian->x[j] - Jobj_csr->x[j]); + double diff = 
fabs(obj->jacobian->to_csr(obj->jacobian)->x[j] - Jobj_csr->x[j]); if (diff > max_J_diff) max_J_diff = diff; } printf(" Jacobian max abs diff: %10.3e\n", max_J_diff); diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index c329a16..087ef43 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -76,7 +76,6 @@ const char *test_sparse_vs_dense_mult_vec(void) free_matrix(sparse); free_matrix(dense); - free_csr_matrix(csr); return 0; } @@ -129,7 +128,6 @@ const char *test_sparse_vs_dense_mult_vec_blocks(void) free_matrix(sparse); free_matrix(dense); - free_csr_matrix(csr); return 0; } diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index e11d6e1..2e385a0 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -312,6 +312,35 @@ const char *test_permuted_dense_times_csc_no_active(void) return 0; } +/* to_csr vtable method: lazy CSR view. First call allocates pd->csr_cache; + subsequent calls refresh values to reflect the current pd->X. */ +const char *test_permuted_dense_to_csr_lazy(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + Permuted_Dense *pd = (Permuted_Dense *) M; + + mu_assert("csr_cache initially NULL", pd->csr_cache == NULL); + + CSR_Matrix *csr = M->to_csr(M); + mu_assert("csr_cache populated", pd->csr_cache != NULL); + mu_assert("returns the cache", csr == pd->csr_cache); + + double expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + mu_assert("values match X", cmp_double_array(csr->x, expected, 6)); + + /* Mutate X and re-call to_csr: values must reflect the change. */ + pd->X[0] = 99.0; + csr = M->to_csr(M); + mu_assert("refresh picks up new value", csr->x[0] == 99.0); + + free_matrix(M); + return 0; +} + /* Sanity check: col_inv is built correctly. 
col_perm = {0, 3} on n = 6 should give col_inv = {0, -1, -1, 1, -1, -1}. */ const char *test_permuted_dense_col_inv(void) From fa02469f734ae22a379abc1160f74f8a6f6eb0a3 Mon Sep 17 00:00:00 2001 From: dance858 Date: Sun, 10 May 2026 23:05:51 +0200 Subject: [PATCH 03/31] make hessian matrix --- include/expr.h | 6 +- include/utils/matrix.h | 18 +++--- src/atoms/affine/add.c | 15 +++-- src/atoms/affine/broadcast.c | 4 +- src/atoms/affine/convolve.c | 6 +- src/atoms/affine/diag_vec.c | 5 +- src/atoms/affine/hstack.c | 16 ++--- src/atoms/affine/index.c | 5 +- src/atoms/affine/left_matmul.c | 6 +- src/atoms/affine/neg.c | 11 ++-- src/atoms/affine/parameter.c | 3 +- src/atoms/affine/promote.c | 11 ++-- src/atoms/affine/reshape.c | 4 +- src/atoms/affine/scalar_mult.c | 6 +- src/atoms/affine/sum.c | 4 +- src/atoms/affine/trace.c | 4 +- src/atoms/affine/transpose.c | 6 +- src/atoms/affine/variable.c | 3 +- src/atoms/affine/vector_mult.c | 4 +- src/atoms/bivariate_full_dom/matmul.c | 26 ++++---- src/atoms/bivariate_full_dom/multiply.c | 35 ++++++----- .../bivariate_restricted_dom/quad_over_lin.c | 6 +- src/atoms/bivariate_restricted_dom/rel_entr.c | 19 +++--- .../rel_entr_scalar_vector.c | 7 ++- .../rel_entr_vector_scalar.c | 7 ++- src/atoms/elementwise_full_dom/common.c | 60 +++++++++++-------- src/atoms/elementwise_restricted_dom/atanh.c | 2 +- src/atoms/elementwise_restricted_dom/common.c | 9 +-- src/atoms/elementwise_restricted_dom/entr.c | 2 +- src/atoms/elementwise_restricted_dom/log.c | 2 +- src/atoms/elementwise_restricted_dom/tan.c | 2 +- src/atoms/other/prod.c | 23 +++---- src/atoms/other/prod_axis_one.c | 12 ++-- src/atoms/other/prod_axis_zero.c | 12 ++-- src/atoms/other/quad_form.c | 30 ++++++---- src/expr.c | 6 +- src/old-code/linear_op.c | 3 +- src/problem.c | 24 +++++--- src/utils/permuted_dense.c | 6 +- src/utils/sparse_matrix.c | 18 +++--- tests/numerical_diff.c | 2 +- tests/profiling/profile_log_reg.h | 6 +- tests/wsum_hess/affine/test_broadcast.h | 18 +++--- 
tests/wsum_hess/affine/test_convolve.h | 6 +- tests/wsum_hess/affine/test_diag_mat.h | 6 +- tests/wsum_hess/affine/test_hstack.h | 12 ++-- tests/wsum_hess/affine/test_index.h | 18 +++--- tests/wsum_hess/affine/test_left_matmul.h | 12 ++-- tests/wsum_hess/affine/test_right_matmul.h | 12 ++-- tests/wsum_hess/affine/test_scalar_mult.h | 12 ++-- tests/wsum_hess/affine/test_sum.h | 12 ++-- tests/wsum_hess/affine/test_trace.h | 14 ++--- tests/wsum_hess/affine/test_transpose.h | 6 +- tests/wsum_hess/affine/test_upper_tri.h | 6 +- tests/wsum_hess/affine/test_vector_mult.h | 12 ++-- tests/wsum_hess/affine/test_vstack.h | 12 ++-- .../bivariate_full_dom/test_matmul.h | 24 ++++---- .../bivariate_full_dom/test_multiply.h | 24 ++++---- .../test_quad_over_lin.h | 12 ++-- .../bivariate_restricted_dom/test_rel_entr.h | 18 +++--- .../test_rel_entr_scalar_vector.h | 6 +- .../test_rel_entr_vector_scalar.h | 6 +- .../wsum_hess/elementwise_full_dom/test_exp.h | 6 +- .../elementwise_full_dom/test_hyperbolic.h | 24 ++++---- .../elementwise_full_dom/test_logistic.h | 6 +- .../elementwise_full_dom/test_power.h | 6 +- .../elementwise_full_dom/test_trig.h | 18 +++--- .../elementwise_full_dom/test_xexp.h | 6 +- .../elementwise_restricted_dom/test_entr.h | 6 +- .../elementwise_restricted_dom/test_log.h | 6 +- tests/wsum_hess/other/test_prod.h | 24 ++++---- tests/wsum_hess/other/test_prod_axis_one.h | 24 ++++---- tests/wsum_hess/other/test_prod_axis_zero.h | 18 +++--- tests/wsum_hess/other/test_quad_form.h | 6 +- 74 files changed, 448 insertions(+), 406 deletions(-) diff --git a/include/expr.h b/include/expr.h index 61d8347..4facb95 100644 --- a/include/expr.h +++ b/include/expr.h @@ -53,8 +53,8 @@ typedef struct supporting common subexpressions on the Python side. 
*/ bool jacobian_csc_filled; double *local_jac_diag; /* cached f'(g(x)) diagonal */ - CSR_Matrix *hess_term1; /* Jg^T D Jg workspace */ - CSR_Matrix *hess_term2; /* child wsum_hess workspace */ + Matrix *hess_term1; /* Jg^T D Jg workspace */ + Matrix *hess_term2; /* child wsum_hess workspace */ } Expr_Work; /* Base expression node structure */ @@ -72,7 +72,7 @@ typedef struct expr // ------------------------------------------------------------------------ double *value; Matrix *jacobian; - CSR_Matrix *wsum_hess; + Matrix *wsum_hess; forward_fn forward; jacobian_init_fn jacobian_init_impl; wsum_hess_init_fn wsum_hess_init_impl; diff --git a/include/utils/matrix.h b/include/utils/matrix.h index bb83a4e..b72dc4c 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -64,18 +64,18 @@ typedef struct Matrix void (*update_values)(struct Matrix *self, const double *new_values); /* Chain-rule operations used by transformer atoms (elementwise, etc.). - copy_sparsity returns a matrix of same shape and type as self; - DA_fill_values writes diag(d) * self into out (which has same structure as - self); ATA_alloc_csr allocates a CSR with sparsity of self^T * self; - ATDA_fill_csr fills csr_out with self^T * diag(d) * self; - to_csr returns a CSR view of self (constant-time for Sparse_Matrix, lazily - built/refreshed for other types). */ + All chain-rule outputs are the same concrete type as self (uniform + polymorphism). copy_sparsity returns a matrix of same shape and type as + self; DA_fill_values writes diag(d) * self into out; ATA_alloc allocates + a matrix with sparsity of self^T * self; ATDA_fill_values fills out with + self^T * diag(d) * self; to_csr returns a CSR view of self (constant-time + for Sparse_Matrix, lazily built/refreshed for other types). 
*/ struct Matrix *(*copy_sparsity)(const struct Matrix *self); void (*DA_fill_values)(const double *d, const struct Matrix *self, struct Matrix *out); - CSR_Matrix *(*ATA_alloc_csr)(struct Matrix *self); - void (*ATDA_fill_csr)(const struct Matrix *self, const double *d, - CSR_Matrix *csr_out); + struct Matrix *(*ATA_alloc)(struct Matrix *self); + void (*ATDA_fill_values)(const struct Matrix *self, const double *d, + struct Matrix *out); CSR_Matrix *(*to_csr)(struct Matrix *self); /* Refresh any internal caches (e.g. a CSC mirror) so subsequent ATA / ATDA diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index 77fa694..1895a85 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -70,12 +70,16 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(node->left); wsum_hess_init(node->right); + CSR_Matrix *Hl = node->left->wsum_hess->to_csr(node->left->wsum_hess); + CSR_Matrix *Hr = node->right->wsum_hess->to_csr(node->right->wsum_hess); + /* we never have to store more than the sum of children's nnz */ - int nnz_max = node->left->wsum_hess->nnz + node->right->wsum_hess->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz_max); + int nnz_max = Hl->nnz + Hr->nnz; + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz_max); /* fill sparsity pattern of hessian */ - sum_csr_alloc(node->left->wsum_hess, node->right->wsum_hess, node->wsum_hess); + sum_csr_alloc(Hl, Hr, hess); + node->wsum_hess = new_sparse_matrix(hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -85,8 +89,9 @@ static void eval_wsum_hess(expr *node, const double *w) node->right->eval_wsum_hess(node->right, w); /* sum children's wsum_hess */ - sum_csr_fill_values(node->left->wsum_hess, node->right->wsum_hess, - node->wsum_hess); + sum_csr_fill_values(node->left->wsum_hess->to_csr(node->left->wsum_hess), + node->right->wsum_hess->to_csr(node->right->wsum_hess), + node->wsum_hess->to_csr(node->wsum_hess)); } static bool 
is_affine(const expr *node) diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index d38aae7..0f73dcd 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -192,7 +192,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* Same sparsity as child - weights get summed */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* allocate space for weight vector */ node->work->dwork = SP_MALLOC(node->size * sizeof(double)); @@ -239,7 +239,7 @@ static void eval_wsum_hess(expr *node, const double *w) } x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 1406a39..f8c296d 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -106,7 +106,7 @@ static void wsum_hess_init_impl(expr *node) convolve_expr *cnode = (convolve_expr *) node; wsum_hess_init(child); - node->wsum_hess = new_csr_copy_sparsity(child->wsum_hess); + node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); node->work->dwork = (double *) SP_MALLOC(cnode->n * sizeof(double)); } @@ -129,8 +129,8 @@ static void eval_wsum_hess(expr *node, const double *w) } child->eval_wsum_hess(child, w_prime); - memcpy(node->wsum_hess->x, child->wsum_hess->x, - child->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, child->wsum_hess->to_csr(child->wsum_hess)->x, + child->wsum_hess->to_csr(child->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index b05360f..07008af 100644 --- 
a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -105,8 +105,7 @@ static void wsum_hess_init_impl(expr *node) /* Copy child's Hessian structure (diag_vec is linear, so its own Hessian is * zero) */ - CSR_Matrix *Hx = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(Hx); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -122,7 +121,7 @@ static void eval_wsum_hess(expr *node, const double *w) /* Evaluate child's Hessian with extracted weights */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index ab4b55e..31e75e4 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -109,30 +109,30 @@ static void wsum_hess_init_impl(expr *node) for (int i = 0; i < hnode->n_args; i++) { wsum_hess_init(hnode->args[i]); - nnz += hnode->args[i]->wsum_hess->nnz; + Matrix *child_hess = hnode->args[i]->wsum_hess; + nnz += child_hess->to_csr(child_hess)->nnz; } /* worst-case scenario the nnz of node->wsum_hess is the sum of children's nnz */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); hnode->CSR_work = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ - CSR_Matrix *H = node->wsum_hess; H->nnz = 0; - for (int i = 0; i < hnode->n_args; i++) { - expr *child = hnode->args[i]; + Matrix *child_hess = hnode->args[i]->wsum_hess; copy_csr_matrix(H, hnode->CSR_work); - sum_csr_alloc(hnode->CSR_work, child->wsum_hess, H); + sum_csr_alloc(hnode->CSR_work, child_hess->to_csr(child_hess), H); } + node->wsum_hess = new_sparse_matrix(H); } 
static void wsum_hess_eval(expr *node, const double *w) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); int row_offset = 0; memset(H->x, 0, H->nnz * sizeof(double)); @@ -141,7 +141,7 @@ static void wsum_hess_eval(expr *node, const double *w) expr *child = hnode->args[i]; child->eval_wsum_hess(child, w + row_offset); copy_csr_matrix(H, hnode->CSR_work); - sum_csr_fill_values(hnode->CSR_work, child->wsum_hess, H); + sum_csr_fill_values(hnode->CSR_work, child->wsum_hess->to_csr(child->wsum_hess), H); row_offset += child->size; } } diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 27654b1..507213e 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -113,8 +113,7 @@ static void wsum_hess_init_impl(expr *node) many numerical zeros in child->wsum_hess that are actually structural zeros, but we do not try to exploit that sparsity right now. */ - CSR_Matrix *Hx = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(Hx); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -142,7 +141,7 @@ static void eval_wsum_hess(expr *node, const double *w) /* evalute hessian of child */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 755a23b..1d5c376 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -148,7 +148,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* allocate this node's hessian with the same sparsity as child's */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + 
node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* work for computing A^T w*/ int n_blocks = ((left_matmul_expr *) node)->n_blocks; @@ -166,8 +166,8 @@ static void eval_wsum_hess(expr *node, const double *w) AT->block_left_mult_vec(AT, w, node->work->dwork, n_blocks); node->left->eval_wsum_hess(node->left, node->work->dwork); - memcpy(node->wsum_hess->x, node->left->wsum_hess->x, - node->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x, + node->wsum_hess->to_csr(node->wsum_hess)->nnz * sizeof(double)); } static void refresh_dense_left(left_matmul_expr *lnode) diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c index 303e1e2..3251c50 100644 --- a/src/atoms/affine/neg.c +++ b/src/atoms/affine/neg.c @@ -64,9 +64,8 @@ static void wsum_hess_init_impl(expr *node) /* initialize child's wsum_hess */ wsum_hess_init(x); - /* same sparsity pattern as child */ - CSR_Matrix *child_hess = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(child_hess); + /* same sparsity pattern as child (polymorphic copy) */ + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -75,10 +74,12 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, w); /* negate values (sparsity pattern set in wsum_hess_init_impl) */ - CSR_Matrix *child_hess = node->left->wsum_hess; + CSR_Matrix *child_hess = + node->left->wsum_hess->to_csr(node->left->wsum_hess); + CSR_Matrix *jac = node->wsum_hess->to_csr(node->wsum_hess); for (int k = 0; k < child_hess->nnz; k++) { - node->wsum_hess->x[k] = -child_hess->x[k]; + jac->x[k] = -child_hess->x[k]; } } diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index 2dc5be1..db6f14d 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -43,7 +43,8 @@ static void eval_jacobian(expr *node) static void 
wsum_hess_init_impl(expr *node) { /* Zero Hessian: n_vars x n_vars with 0 nonzeros. */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = + new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index 6af9bbf..008e56e 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -77,10 +77,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { wsum_hess_init(node->left); - - /* same sparsity as child since we're summing weights */ - CSR_Matrix *child_hess = node->left->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(child_hess); + node->wsum_hess = node->left->wsum_hess->copy_sparsity(node->left->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -96,8 +93,10 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, &sum_w); /* copy values */ - CSR_Matrix *child_hess = node->left->wsum_hess; - memcpy(node->wsum_hess->x, child_hess->x, child_hess->nnz * sizeof(double)); + CSR_Matrix *child_hess = + node->left->wsum_hess->to_csr(node->left->wsum_hess); + CSR_Matrix *jac = node->wsum_hess->to_csr(node->wsum_hess); + memcpy(jac->x, child_hess->x, child_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index 8e23124..c0c207b 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -52,14 +52,14 @@ static void wsum_hess_init_impl(expr *node) { expr *x = node->left; wsum_hess_init(x); - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) { expr *x = node->left; x->eval_wsum_hess(x, w); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * 
sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c index 210fd9f..93ffa93 100644 --- a/src/atoms/affine/scalar_mult.c +++ b/src/atoms/affine/scalar_mult.c @@ -87,7 +87,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -96,9 +96,9 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, w); double a = ((scalar_mult_expr *) node)->param_source->value[0]; - for (int j = 0; j < x->wsum_hess->nnz; j++) + for (int j = 0; j < x->wsum_hess->to_csr(x->wsum_hess)->nnz; j++) { - node->wsum_hess->x[j] = a * x->wsum_hess->x[j]; + node->wsum_hess->to_csr(node->wsum_hess)->x[j] = a * x->wsum_hess->to_csr(x->wsum_hess)->x[j]; } } diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 3a89dff..87a6d23 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -137,7 +137,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* we never have to store more than the child's nnz */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); node->work->dwork = SP_MALLOC(x->size * sizeof(double)); } @@ -163,7 +163,7 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); /* copy values */ - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git 
a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index 5d6e7df..a1931fc 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -111,7 +111,7 @@ static void wsum_hess_init_impl(expr *node) contribution to wsum_hess of entries of the child that will always have zero weight in eval_wsum_hess. We do this for simplicity. But the Hessian can for sure be made more sophisticated. */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -126,7 +126,7 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, sizeof(double) * x->wsum_hess->nnz); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, sizeof(double) * x->wsum_hess->to_csr(x->wsum_hess)->nnz); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index e3e7230..022aaff 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -94,7 +94,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity pattern as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* for computing Kw where K is the commutation matrix */ node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); @@ -117,8 +117,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, node->work->dwork); /* copy to this node's hessian */ - memcpy(node->wsum_hess->x, node->left->wsum_hess->x, - node->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x, + node->wsum_hess->to_csr(node->wsum_hess)->nnz * sizeof(double)); } static bool is_affine(const expr *node) 
diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 9a0f182..82933aa 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -47,7 +47,8 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Variables have zero Hessian */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = + new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); } static void wsum_hess_eval(expr *node, const double *w) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index d077c97..6f15b66 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -90,7 +90,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } @@ -109,7 +109,7 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); /* copy values from child to this node */ - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); } static void free_type_data(expr *node) diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 6d36194..485df69 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -277,10 +277,10 @@ static void wsum_hess_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int total_nnz = 2 * m * k * n; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); int nnz = 0; - int *Hi = node->wsum_hess->i; - int *Hp = 
node->wsum_hess->p; + int *Hi = hess->i; + int *Hp = hess->p; int start, i; if (x->var_id < y->var_id) @@ -343,6 +343,7 @@ static void wsum_hess_init_no_chain_rule(expr *node) } Hp[node->n_vars] = nnz; assert(nnz == total_nnz); + node->wsum_hess = new_sparse_matrix(hess); } static void eval_wsum_hess_no_chain_rule(expr *node, const double *w) @@ -353,7 +354,7 @@ static void eval_wsum_hess_no_chain_rule(expr *node, const double *w) int k = x->d2; int n = y->d2; int offset = 0; - double *Hx = node->wsum_hess->x; + double *Hx = node->wsum_hess->to_csr(node->wsum_hess)->x; const double *w_temp; if (x->var_id < y->var_id) @@ -440,8 +441,10 @@ static void wsum_hess_init_chain_rule(expr *node) /* sum the four terms and fill idx maps */ int *maps[4]; - node->wsum_hess = - sum_4_csr_alloc(mnode->C, mnode->CT, f->wsum_hess, g->wsum_hess, maps); + CSR_Matrix *hess = + sum_4_csr_alloc(mnode->C, mnode->CT, f->wsum_hess->to_csr(f->wsum_hess), + g->wsum_hess->to_csr(g->wsum_hess), maps); + node->wsum_hess = new_sparse_matrix(hess); mnode->idx_map_C = maps[0]; mnode->idx_map_CT = maps[1]; mnode->idx_map_Hf = maps[2]; @@ -512,11 +515,12 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) } /* accumulate H = C + C^T + H_f + H_g */ - memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(mnode->C, mnode->idx_map_C, node->wsum_hess->x); - accumulator(mnode->CT, mnode->idx_map_CT, node->wsum_hess->x); - accumulator(f->wsum_hess, mnode->idx_map_Hf, node->wsum_hess->x); - accumulator(g->wsum_hess, mnode->idx_map_Hg, node->wsum_hess->x); + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + memset(H->x, 0, H->nnz * sizeof(double)); + accumulator(mnode->C, mnode->idx_map_C, H->x); + accumulator(mnode->CT, mnode->idx_map_CT, H->x); + accumulator(f->wsum_hess->to_csr(f->wsum_hess), mnode->idx_map_Hf, H->x); + accumulator(g->wsum_hess->to_csr(g->wsum_hess), mnode->idx_map_Hg, H->x); } expr *new_matmul(expr *x, expr *y) diff --git 
a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 236f67c..01e1cd7 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -83,7 +83,8 @@ static void wsum_hess_init_impl(expr *node) x->var_id != y->var_id) { assert(y->var_id != NOT_A_VARIABLE); - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 2 * node->size); + CSR_Matrix *hess = + new_csr_matrix(node->n_vars, node->n_vars, 2 * node->size); int i, var1_id, var2_id; @@ -101,8 +102,8 @@ static void wsum_hess_init_impl(expr *node) /* var1 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var1_id + i] = i; - node->wsum_hess->i[i] = var2_id + i; + hess->p[var1_id + i] = i; + hess->i[i] = var2_id + i; } int nnz = node->size; @@ -110,22 +111,23 @@ static void wsum_hess_init_impl(expr *node) /* rows between var1 and var2 */ for (i = var1_id + node->size; i < var2_id; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } /* var2 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var2_id + i] = nnz + i; - node->wsum_hess->i[nnz + i] = var1_id + i; + hess->p[var2_id + i] = nnz + i; + hess->i[nnz + i] = var1_id + i; } /* remaining rows */ nnz += node->size; for (i = var2_id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } + node->wsum_hess = new_sparse_matrix(hess); } else { @@ -171,7 +173,9 @@ static void wsum_hess_init_impl(expr *node) fill index maps telling us where to accumulate each element of each matrix in the sum) */ int *maps[4]; - node->wsum_hess = sum_4_csr_alloc(C, CT, x->wsum_hess, y->wsum_hess, maps); + CSR_Matrix *hess = sum_4_csr_alloc(C, CT, x->wsum_hess->to_csr(x->wsum_hess), + y->wsum_hess->to_csr(y->wsum_hess), maps); + node->wsum_hess = new_sparse_matrix(hess); mul_node->idx_map_C = maps[0]; mul_node->idx_map_CT = maps[1]; mul_node->idx_map_Hx = maps[2]; @@ -188,8 +192,8 @@ static void eval_wsum_hess(expr 
*node, const double *w) if (x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE && x->var_id != y->var_id) { - memcpy(node->wsum_hess->x, w, node->size * sizeof(double)); - memcpy(node->wsum_hess->x + node->size, w, node->size * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, w, node->size * sizeof(double)); + memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x + node->size, w, node->size * sizeof(double)); } else { @@ -259,11 +263,12 @@ static void eval_wsum_hess(expr *node, const double *w) // --------------------------------------------------------------- // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- - memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(C, mul_node->idx_map_C, node->wsum_hess->x); - accumulator(CT, mul_node->idx_map_CT, node->wsum_hess->x); - accumulator(x->wsum_hess, mul_node->idx_map_Hx, node->wsum_hess->x); - accumulator(y->wsum_hess, mul_node->idx_map_Hy, node->wsum_hess->x); + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + memset(H->x, 0, H->nnz * sizeof(double)); + accumulator(C, mul_node->idx_map_C, H->x); + accumulator(CT, mul_node->idx_map_CT, H->x); + accumulator(x->wsum_hess->to_csr(x->wsum_hess), mul_node->idx_map_Hx, H->x); + accumulator(y->wsum_hess->to_csr(y->wsum_hess), mul_node->idx_map_Hy, H->x); } } diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index 518917e..6baf4c2 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -188,9 +188,9 @@ static void wsum_hess_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->wsum_hess = + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); - CSR_Matrix *H = node->wsum_hess; + node->wsum_hess = new_sparse_matrix(H); /* if x has lower idx than y*/ if 
(var_id_x < var_id_y) @@ -269,7 +269,7 @@ static void eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; double y = node->right->value[0]; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; int x_size = node->left->size; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index 8db2727..8a3f5a2 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -103,7 +103,7 @@ static void eval_jacobian_vector_args(expr *node) static void wsum_hess_init_vector_args(expr *node) { - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); expr *x = node->left; expr *y = node->right; @@ -123,9 +123,9 @@ static void wsum_hess_init_vector_args(expr *node) /* var1 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var1_id + i] = 2 * i; - node->wsum_hess->i[2 * i] = var1_id + i; - node->wsum_hess->i[2 * i + 1] = var2_id + i; + H->p[var1_id + i] = 2 * i; + H->i[2 * i] = var1_id + i; + H->i[2 * i + 1] = var2_id + i; } int nnz = 2 * node->size; @@ -133,28 +133,29 @@ static void wsum_hess_init_vector_args(expr *node) /* rows between var1 and var2 */ for (i = var1_id + node->size; i < var2_id; i++) { - node->wsum_hess->p[i] = nnz; + H->p[i] = nnz; } /* var2 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var2_id + i] = nnz + 2 * i; + H->p[var2_id + i] = nnz + 2 * i; } - memcpy(node->wsum_hess->i + nnz, node->wsum_hess->i, nnz * sizeof(int)); + memcpy(H->i + nnz, H->i, nnz * sizeof(int)); /* remaining rows */ for (i = var2_id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = 4 * node->size; + H->p[i] = 4 * node->size; } + node->wsum_hess = new_sparse_matrix(H); } static void 
eval_wsum_hess_vector_args(expr *node, const double *w) { double *x = node->left->value; double *y = node->right->value; - double *hess = node->wsum_hess->x; + double *hess = node->wsum_hess->to_csr(node->wsum_hess)->x; if (node->left->var_id < node->right->var_id) { diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index b4b5130..25cbe7f 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -106,8 +106,8 @@ static void wsum_hess_init_scalar_vector(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = + new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { @@ -163,13 +163,14 @@ static void wsum_hess_init_scalar_vector(expr *node) H->p[i] = 3 * node->size + 1; } } + node->wsum_hess = new_sparse_matrix(H); } static void eval_wsum_hess_scalar_vector(expr *node, const double *w) { double x = node->left->value[0]; double *y = node->right->value; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index c4eba8b..804ac43 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -106,8 +106,8 @@ static void wsum_hess_init_vector_scalar(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = + new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); 
if (var_id_x < var_id_y) { @@ -163,13 +163,14 @@ static void wsum_hess_init_vector_scalar(expr *node) H->p[i] = 3 * node->size + 1; } } + node->wsum_hess = new_sparse_matrix(H); } static void eval_wsum_hess_vector_scalar(expr *node, const double *w) { double *x = node->left->value; double y = node->right->value[0]; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 3a51d7c..0e6e399 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -81,18 +81,19 @@ void wsum_hess_init_elementwise(expr *node) /* if the variable is a child */ if (id != NOT_A_VARIABLE) { - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { - node->wsum_hess->p[id + i] = i; - node->wsum_hess->i[i] = id + i; + hess->p[id + i] = i; + hess->i[i] = id + i; } for (i = id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = node->size; + hess->p[i] = node->size; } + node->wsum_hess = new_sparse_matrix(hess); } else { @@ -103,23 +104,28 @@ void wsum_hess_init_elementwise(expr *node) For elementwise functions, D is diagonal. 
*/ if (child->is_affine(child)) { - node->wsum_hess = child->jacobian->ATA_alloc_csr(child->jacobian); + node->wsum_hess = child->jacobian->ATA_alloc(child->jacobian); } else { /* term1: Jg^T @ D @ Jg */ - node->work->hess_term1 = child->jacobian->ATA_alloc_csr(child->jacobian); + node->work->hess_term1 = child->jacobian->ATA_alloc(child->jacobian); - /* term2: child's Hessian */ + /* term2: child's Hessian (mirror its sparsity polymorphically) */ wsum_hess_init(child); - CSR_Matrix *Hg = child->wsum_hess; - node->work->hess_term2 = new_csr_copy_sparsity(Hg); - - /* wsum_hess = term1 + term2 */ - int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + node->work->hess_term2 = + child->wsum_hess->copy_sparsity(child->wsum_hess); + + /* wsum_hess = term1 + term2 (sum_csr_alloc operates on CSR; extract + via to_csr on the operands). The result is stored as CSR. 
*/ + CSR_Matrix *t1 = + node->work->hess_term1->to_csr(node->work->hess_term1); + CSR_Matrix *t2 = + node->work->hess_term2->to_csr(node->work->hess_term2); + int max_nnz = t1->nnz + t2->nnz; + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); + sum_csr_alloc(t1, t2, hess); + node->wsum_hess = new_sparse_matrix(hess); } } } @@ -130,7 +136,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) if (child->var_id != NOT_A_VARIABLE) { - node->local_wsum_hess(node, node->wsum_hess->x, w); + node->local_wsum_hess(node, node->wsum_hess->to_csr(node->wsum_hess)->x, + w); } else { @@ -145,8 +152,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) } node->local_wsum_hess(node, node->work->dwork, w); - child->jacobian->ATDA_fill_csr(child->jacobian, node->work->dwork, - node->wsum_hess); + child->jacobian->ATDA_fill_values(child->jacobian, node->work->dwork, + node->wsum_hess); } else { @@ -155,8 +162,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) /* term1: Jg^T @ D @ Jg */ node->local_wsum_hess(node, node->work->dwork, w); - child->jacobian->ATDA_fill_csr(child->jacobian, node->work->dwork, - node->work->hess_term1); + child->jacobian->ATDA_fill_values(child->jacobian, node->work->dwork, + node->work->hess_term1); /* term2: child Hessian with weight Jf^T w */ memcpy(node->work->dwork, node->work->local_jac_diag, @@ -167,12 +174,17 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) } child->eval_wsum_hess(child, node->work->dwork); - memcpy(node->work->hess_term2->x, child->wsum_hess->x, - child->wsum_hess->nnz * sizeof(double)); + CSR_Matrix *t2_csr = + node->work->hess_term2->to_csr(node->work->hess_term2); + CSR_Matrix *child_hess_csr = child->wsum_hess->to_csr(child->wsum_hess); + memcpy(t2_csr->x, child_hess_csr->x, + child_hess_csr->nnz * sizeof(double)); /* wsum_hess = term1 + term2 */ - sum_csr_fill_values(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + CSR_Matrix 
*t1_csr = + node->work->hess_term1->to_csr(node->work->hess_term1); + sum_csr_fill_values(t1_csr, t2_csr, + node->wsum_hess->to_csr(node->wsum_hess)); } } } diff --git a/src/atoms/elementwise_restricted_dom/atanh.c b/src/atoms/elementwise_restricted_dom/atanh.c index 2938dd7..69b425b 100644 --- a/src/atoms/elementwise_restricted_dom/atanh.c +++ b/src/atoms/elementwise_restricted_dom/atanh.c @@ -43,7 +43,7 @@ static void atanh_eval_wsum_hess(expr *node, const double *w) for (int j = 0; j < node->size; j++) { double c = 1.0 - x[j] * x[j]; - node->wsum_hess->x[j] = w[j] * (2.0 * x[j]) / (c * c); + node->wsum_hess->to_csr(node->wsum_hess)->x[j] = w[j] * (2.0 * x[j]) / (c * c); } } diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index 40a45e3..ddb4ddb 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -39,18 +39,19 @@ void wsum_hess_init_restricted(expr *node) int id = child->var_id; int i; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { - node->wsum_hess->p[id + i] = i; - node->wsum_hess->i[i] = id + i; + hess->p[id + i] = i; + hess->i[i] = id + i; } for (i = id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = node->size; + hess->p[i] = node->size; } + node->wsum_hess = new_sparse_matrix(hess); } bool is_affine_restricted(const expr *node) diff --git a/src/atoms/elementwise_restricted_dom/entr.c b/src/atoms/elementwise_restricted_dom/entr.c index 08ff549..3a23978 100644 --- a/src/atoms/elementwise_restricted_dom/entr.c +++ b/src/atoms/elementwise_restricted_dom/entr.c @@ -44,7 +44,7 @@ static void entr_eval_wsum_hess(expr *node, const double *w) double *x = node->left->value; for (int j = 0; j < node->size; j++) { - node->wsum_hess->x[j] = -w[j] / x[j]; + 
node->wsum_hess->to_csr(node->wsum_hess)->x[j] = -w[j] / x[j]; } } diff --git a/src/atoms/elementwise_restricted_dom/log.c b/src/atoms/elementwise_restricted_dom/log.c index 31a6241..9d1a010 100644 --- a/src/atoms/elementwise_restricted_dom/log.c +++ b/src/atoms/elementwise_restricted_dom/log.c @@ -43,7 +43,7 @@ static void log_eval_wsum_hess(expr *node, const double *w) double *x = node->left->value; for (int j = 0; j < node->size; j++) { - node->wsum_hess->x[j] = -w[j] / (x[j] * x[j]); + node->wsum_hess->to_csr(node->wsum_hess)->x[j] = -w[j] / (x[j] * x[j]); } } diff --git a/src/atoms/elementwise_restricted_dom/tan.c b/src/atoms/elementwise_restricted_dom/tan.c index 7fbc4cf..087f750 100644 --- a/src/atoms/elementwise_restricted_dom/tan.c +++ b/src/atoms/elementwise_restricted_dom/tan.c @@ -44,7 +44,7 @@ static void tan_eval_wsum_hess(expr *node, const double *w) for (int j = 0; j < node->size; j++) { double c = cos(x[j]); - node->wsum_hess->x[j] = 2.0 * w[j] * node->value[j] / (c * c); + node->wsum_hess->to_csr(node->wsum_hess)->x[j] = 2.0 * w[j] * node->value[j] / (c * c); } } diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 5e27820..33a1b60 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -133,18 +133,18 @@ static void wsum_hess_init_impl(expr *node) /* allocate n_vars x n_vars CSR matrix with dense block */ int block_size = x->size; int nnz = block_size * block_size; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the dense block */ for (int i = 0; i < block_size; i++) { - node->wsum_hess->p[x->var_id + i] = i * block_size; + hess->p[x->var_id + i] = i * block_size; } /* fill row pointers for rows after the block */ for (int i = x->var_id + block_size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } /* fill column indices for the dense block */ @@ -152,9 +152,10 @@ static void 
wsum_hess_init_impl(expr *node) { for (int j = 0; j < block_size; j++) { - node->wsum_hess->i[i * block_size + j] = x->var_id + j; + hess->i[i * block_size + j] = x->var_id + j; } } + node->wsum_hess = new_sparse_matrix(hess); } else { @@ -239,11 +240,11 @@ static inline void wsum_hess_no_zeros(expr *node, const double *w) { if (i == j) { - node->wsum_hess->x[i * n + j] = 0.0; + node->wsum_hess->to_csr(node->wsum_hess)->x[i * n + j] = 0.0; } else { - node->wsum_hess->x[i * n + j] = wf / (x[i] * x[j]); + node->wsum_hess->to_csr(node->wsum_hess)->x[i * n + j] = wf / (x[i] * x[j]); } } } @@ -252,7 +253,7 @@ static inline void wsum_hess_no_zeros(expr *node, const double *w) static inline void wsum_hess_one_zero(expr *node, const double *w) { expr *x = node->left; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; memset(H, 0, sizeof(double) * (x->size * x->size)); int p = ((prod_expr *) node)->zero_index; double prod_nonzero = ((prod_expr *) node)->prod_nonzero; @@ -273,7 +274,7 @@ static inline void wsum_hess_two_zeros(expr *node, const double *w) { expr *x = node->left; int n = x->size; - memset(node->wsum_hess->x, 0, sizeof(double) * (n * n)); + memset(node->wsum_hess->to_csr(node->wsum_hess)->x, 0, sizeof(double) * (n * n)); /* find indices p and q where x[p] = x[q] = 0 */ int p = -1, q = -1; @@ -295,13 +296,13 @@ static inline void wsum_hess_two_zeros(expr *node, const double *w) assert(p != -1 && q != -1); double hess_val = w[0] * ((prod_expr *) node)->prod_nonzero; - node->wsum_hess->x[p * n + q] = hess_val; - node->wsum_hess->x[q * n + p] = hess_val; + node->wsum_hess->to_csr(node->wsum_hess)->x[p * n + q] = hess_val; + node->wsum_hess->to_csr(node->wsum_hess)->x[q * n + p] = hess_val; } static inline void wsum_hess_many_zeros(expr *node, const double *w) { expr *x = node->left; - memset(node->wsum_hess->x, 0, sizeof(double) * (x->size * x->size)); + memset(node->wsum_hess->to_csr(node->wsum_hess)->x, 0, sizeof(double) 
* (x->size * x->size)); (void) w; } diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index 864039c..b7b4dbd 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -162,8 +162,7 @@ static void wsum_hess_init_impl(expr *node) /* each row i has d2-1 non-zero entries, with column indices corresponding to the columns in that row (except the diagonal element). */ int nnz = x->d1 * x->d2 * (x->d2 - 1); - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ int nnz_per_row = x->d2 - 1; @@ -193,6 +192,7 @@ static void wsum_hess_init_impl(expr *node) { H->p[i] = nnz; } + node->wsum_hess = new_sparse_matrix(H); } else { @@ -206,7 +206,7 @@ static inline void wsum_hess_row_no_zeros(expr *node, const double *w, int row, int d2) { expr *x = node->left; - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double scale = w[row] * node->value[row]; /* for each variable xk in this row, fill in Hessian entries @@ -233,7 +233,7 @@ static inline void wsum_hess_row_one_zero(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; int p = pnode->zero_index[row]; /* zero column index */ double w_prod = w[row] * pnode->prod_nonzero[row]; @@ -278,7 +278,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; /* find indices p and q where row has zeros */ @@ -331,7 +331,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, static inline void 
wsum_hess_row_many_zeros(expr *node, int row, int d2) { - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; expr *x = node->left; diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index f5df03a..9c2ab10 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -155,8 +155,7 @@ static void wsum_hess_init_impl(expr *node) { /* Hessian has block diagonal structure: d2 blocks of size d1 x d1 */ int nnz = x->d2 * x->d1 * x->d1; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); - CSR_Matrix *H = node->wsum_hess; + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the variable's rows (block diagonal) */ for (int i = 0; i < x->size; i++) @@ -185,6 +184,7 @@ static void wsum_hess_init_impl(expr *node) } } } + node->wsum_hess = new_sparse_matrix(H); } else { @@ -199,7 +199,7 @@ static inline void wsum_hess_column_no_zeros(expr *node, const double *w, int co int d1) { expr *x = node->left; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int col_start = col * d1; int block_start = col * d1 * d1; double scale = w[col] * node->value[col]; @@ -228,7 +228,7 @@ static inline void wsum_hess_column_one_zero(expr *node, const double *w, int co { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int col_start = col * d1; int block_start = col * d1 * d1; @@ -256,7 +256,7 @@ static inline void wsum_hess_column_two_zeros(expr *node, const double *w, int c { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int col_start = col * d1; int block_start = col * d1 * d1; @@ -290,7 +290,7 @@ static inline void wsum_hess_column_two_zeros(expr *node, const double *w, 
int c static inline void wsum_hess_column_many_zeros(expr *node, const double *w, int col, int d1) { - double *H = node->wsum_hess->x; + double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; int block_start = col * d1 * d1; /* clear this column's block */ diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index fc2ec4f..122ec25 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -145,7 +145,7 @@ static void wsum_hess_init_impl(expr *node) H->i[i] = Q->i[i] + x->var_id; } - node->wsum_hess = H; + node->wsum_hess = new_sparse_matrix(H); } else { @@ -165,17 +165,19 @@ static void wsum_hess_init_impl(expr *node) /* term1 = Jf^T W Jf = Jf^T B*/ CSC_Matrix *B = symBA_alloc(Q, Jf); qnode->QJf = B; - node->work->hess_term1 = BTA_alloc(Jf, B); + node->work->hess_term1 = new_sparse_matrix(BTA_alloc(Jf, B)); /* term2 = sum_i (Qf(x))_i nabla^2 f_i */ wsum_hess_init(x); - node->work->hess_term2 = new_csr_copy_sparsity(x->wsum_hess); + node->work->hess_term2 = x->wsum_hess->copy_sparsity(x->wsum_hess); /* hess = term1 + term2 */ - int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + CSR_Matrix *t1 = node->work->hess_term1->to_csr(node->work->hess_term1); + CSR_Matrix *t2 = node->work->hess_term2->to_csr(node->work->hess_term2); + int max_nnz = t1->nnz + t2->nnz; + CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); + sum_csr_alloc(t1, t2, hess); + node->wsum_hess = new_sparse_matrix(hess); } } @@ -189,8 +191,9 @@ static void eval_wsum_hess(expr *node, const double *w) { /* TODO: do we want to compute this hessian only once (up to a scaling)? * Maybe unnecessary optimization. 
*/ - memcpy(node->wsum_hess->x, Q->x, Q->nnz * sizeof(double)); - cblas_dscal(Q->nnz, two_w, node->wsum_hess->x, 1); + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + memcpy(H->x, Q->x, Q->nnz * sizeof(double)); + cblas_dscal(Q->nnz, two_w, H->x, 1); } else { @@ -207,23 +210,24 @@ static void eval_wsum_hess(expr *node, const double *w) } CSC_Matrix *QJf = ((quad_form_expr *) node)->QJf; - CSR_Matrix *term1 = node->work->hess_term1; - CSR_Matrix *term2 = node->work->hess_term2; + CSR_Matrix *term1 = node->work->hess_term1->to_csr(node->work->hess_term1); + CSR_Matrix *term2 = node->work->hess_term2->to_csr(node->work->hess_term2); /* term1 = J_f^T Q J_f = J_f^T B */ BA_fill_values(Q, Jf, QJf); BTDA_fill_values(Jf, QJf, NULL, term1); /* term2 */ + CSR_Matrix *x_hess = x->wsum_hess->to_csr(x->wsum_hess); x->eval_wsum_hess(x, node->work->dwork); - memcpy(term2->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(term2->x, x_hess->x, x_hess->nnz * sizeof(double)); /* scale both terms by 2w */ cblas_dscal(term1->nnz, two_w, term1->x, 1); cblas_dscal(term2->nnz, two_w, term2->x, 1); /* sum the two terms */ - sum_csr_fill_values(term1, term2, node->wsum_hess); + sum_csr_fill_values(term1, term2, node->wsum_hess->to_csr(node->wsum_hess)); } } diff --git a/src/expr.c b/src/expr.c index f4400cb..9b300d4 100644 --- a/src/expr.c +++ b/src/expr.c @@ -77,7 +77,7 @@ void free_expr(expr *node) /* free value array and derivative matrices */ free(node->value); free_matrix(node->jacobian); - free_csr_matrix(node->wsum_hess); + free_matrix(node->wsum_hess); /* free workspace */ if (node->work) @@ -87,8 +87,8 @@ void free_expr(expr *node) free_csc_matrix(node->work->jacobian_csc); free(node->work->csc_work); free(node->work->local_jac_diag); - free_csr_matrix(node->work->hess_term1); - free_csr_matrix(node->work->hess_term2); + free_matrix(node->work->hess_term1); + free_matrix(node->work->hess_term2); free(node->work); } diff --git a/src/old-code/linear_op.c 
b/src/old-code/linear_op.c index ca0174b..33bfed5 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -74,7 +74,8 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Linear operator Hessian is always zero */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = + new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/src/problem.c b/src/problem.c index 154fae4..0378884 100644 --- a/src/problem.c +++ b/src/problem.c @@ -83,7 +83,8 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) int *cols = iwork; int *col_to_pos = iwork; /* reused after qsort */ int nnz = 0; - CSR_Matrix *H_obj = prob->objective->wsum_hess; + CSR_Matrix *H_obj = + prob->objective->wsum_hess->to_csr(prob->objective->wsum_hess); CSR_Matrix *H_c; CSR_Matrix *H = prob->lagrange_hessian; H->p[0] = 0; @@ -100,7 +101,7 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) /* gather columns from constraint hessians */ for (int c_idx = 0; c_idx < prob->n_constraints; c_idx++) { - H_c = constrs[c_idx]->wsum_hess; + H_c = constrs[c_idx]->wsum_hess->to_csr(constrs[c_idx]->wsum_hess); int c_len = H_c->p[row + 1] - H_c->p[row]; memcpy(cols + count, H_c->i + H_c->p[row], c_len * sizeof(int)); count += c_len; @@ -146,7 +147,7 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) /* map constraint hessian entries */ for (int c_idx = 0; c_idx < prob->n_constraints; c_idx++) { - H_c = constrs[c_idx]->wsum_hess; + H_c = constrs[c_idx]->wsum_hess->to_csr(constrs[c_idx]->wsum_hess); for (int row = 0; row < H->m; row++) { for (int idx = H->p[row]; idx < H->p[row + 1]; idx++) @@ -225,12 +226,14 @@ void problem_init_hessian(problem *prob) // Lagrange Hessian structure // ------------------------------------------------------------------------------- 
wsum_hess_init(prob->objective); - int nnz = prob->objective->wsum_hess->nnz; + int nnz = + prob->objective->wsum_hess->to_csr(prob->objective->wsum_hess)->nnz; for (int i = 0; i < prob->n_constraints; i++) { wsum_hess_init(prob->constraints[i]); - nnz += prob->constraints[i]->wsum_hess->nnz; + Matrix *c_hess = prob->constraints[i]->wsum_hess; + nnz += c_hess->to_csr(c_hess)->nnz; } prob->lagrange_hessian = new_csr_matrix(prob->n_vars, prob->n_vars, nnz); @@ -547,14 +550,17 @@ void problem_hessian(problem *prob, double obj_w, const double *w) memset(H->x, 0, H->nnz * sizeof(double)); /* accumulate objective function */ - accumulator(obj->wsum_hess, idx_map, H->x); - offset = obj->wsum_hess->nnz; + CSR_Matrix *obj_hess_csr = obj->wsum_hess->to_csr(obj->wsum_hess); + accumulator(obj_hess_csr, idx_map, H->x); + offset = obj_hess_csr->nnz; /* accumulate constraint functions */ for (int i = 0; i < prob->n_constraints; i++) { - accumulator(constrs[i]->wsum_hess, idx_map + offset, H->x); - offset += constrs[i]->wsum_hess->nnz; + CSR_Matrix *c_hess_csr = + constrs[i]->wsum_hess->to_csr(constrs[i]->wsum_hess); + accumulator(c_hess_csr, idx_map + offset, H->x); + offset += c_hess_csr->nnz; } clock_gettime(CLOCK_MONOTONIC, &timer.end); diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 2624814..54b22c8 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -89,9 +89,9 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->base.to_csr = permuted_dense_to_csr; pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; pd->base.free_fn = permuted_dense_free; - /* Other vtable slots (copy_sparsity, DA_fill_values, ATA_alloc_csr, - ATDA_fill_csr) are wired up in a later step when permuted_dense actually - starts appearing as a node->jacobian. 
*/ + /* Other vtable slots (copy_sparsity, DA_fill_values, ATA_alloc, + ATDA_fill_values) are wired up in a later step when permuted_dense + actually starts appearing as a node->jacobian. */ pd->dense_m = dense_m; pd->dense_n = dense_n; diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index b1991e2..6654a3f 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -62,8 +62,8 @@ static void sparse_free(Matrix *self) Matrix *new_sparse_matrix(CSR_Matrix *A); /* Build the CSC cache structure if absent. Values are NOT filled here; caller - must call refresh_csc_values before consuming. ATA_alloc_csr only needs - structure, so it's safe to call after build_csc_structure alone. */ + must call refresh_csc_values before consuming. ATA_alloc only needs structure, + so it's safe to call after build_csc_structure alone. */ static void build_csc_structure_if_absent(Sparse_Matrix *sm) { if (sm->csc_cache != NULL) return; @@ -84,19 +84,19 @@ static void sparse_DA_fill_values(const double *d, const Matrix *self, Matrix *o DA_fill_values(d, sm->csr, sm_out->csr); } -static CSR_Matrix *sparse_ATA_alloc_csr(Matrix *self) +static Matrix *sparse_ATA_alloc(Matrix *self) { Sparse_Matrix *sm = (Sparse_Matrix *) self; build_csc_structure_if_absent(sm); - return ATA_alloc(sm->csc_cache); + return new_sparse_matrix(ATA_alloc(sm->csc_cache)); } /* Caller must have called refresh_csc_values since the last change to csr->x. 
*/ -static void sparse_ATDA_fill_csr(const Matrix *self, const double *d, - CSR_Matrix *csr_out) +static void sparse_ATDA_fill_values(const Matrix *self, const double *d, Matrix *out) { const Sparse_Matrix *sm = (const Sparse_Matrix *) self; - ATDA_fill_values(sm->csc_cache, d, csr_out); + Sparse_Matrix *sm_out = (Sparse_Matrix *) out; + ATDA_fill_values(sm->csc_cache, d, sm_out->csr); } static CSR_Matrix *sparse_to_csr(Matrix *self) @@ -120,8 +120,8 @@ static void wire_vtable(Sparse_Matrix *sm) sm->base.update_values = sparse_update_values; sm->base.copy_sparsity = sparse_copy_sparsity; sm->base.DA_fill_values = sparse_DA_fill_values; - sm->base.ATA_alloc_csr = sparse_ATA_alloc_csr; - sm->base.ATDA_fill_csr = sparse_ATDA_fill_csr; + sm->base.ATA_alloc = sparse_ATA_alloc; + sm->base.ATDA_fill_values = sparse_ATDA_fill_values; sm->base.to_csr = sparse_to_csr; sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; diff --git a/tests/numerical_diff.c b/tests/numerical_diff.c index dad03f8..291ae7d 100644 --- a/tests/numerical_diff.c +++ b/tests/numerical_diff.c @@ -171,7 +171,7 @@ int check_wsum_hess(expr *node, const double *u, const double *w, double h) node->eval_wsum_hess(node, w); double *H_ana = calloc((size_t) n * n, sizeof(double)); - csr_to_dense(node->wsum_hess, H_ana); + csr_to_dense(node->wsum_hess->to_csr(node->wsum_hess), H_ana); int result = 1; for (int i = 0; i < n * n; i++) diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 7fdc0fa..5d66ada 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -135,10 +135,10 @@ const char *profile_log_reg(void) double *H_a_dense = (double *) calloc((size_t) n * n, sizeof(double)); for (int i = 0; i < n; i++) { - for (int e = obj->wsum_hess->p[i]; e < obj->wsum_hess->p[i + 1]; e++) + for (int e = obj->wsum_hess->to_csr(obj->wsum_hess)->p[i]; e < obj->wsum_hess->to_csr(obj->wsum_hess)->p[i + 1]; e++) { - int col 
= obj->wsum_hess->i[e]; - H_a_dense[i * n + col] = obj->wsum_hess->x[e]; + int col = obj->wsum_hess->to_csr(obj->wsum_hess)->i[e]; + H_a_dense[i * n + col] = obj->wsum_hess->to_csr(obj->wsum_hess)->x[e]; } } double max_H_diff = 0.0; diff --git a/tests/wsum_hess/affine/test_broadcast.h b/tests/wsum_hess/affine/test_broadcast.h index 3d36cb4..97d3208 100644 --- a/tests/wsum_hess/affine/test_broadcast.h +++ b/tests/wsum_hess/affine/test_broadcast.h @@ -57,11 +57,11 @@ const char *test_wsum_hess_broadcast_row(void) int expected_i[3] = {0, 1, 2}; mu_assert("broadcast row wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); + cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 3)); mu_assert("broadcast row wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 4)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); mu_assert("broadcast row wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 3)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 3)); free_expr(bcast); return 0; @@ -112,11 +112,11 @@ const char *test_wsum_hess_broadcast_col(void) int expected_i[3] = {0, 1, 2}; mu_assert("broadcast col wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); + cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 3)); mu_assert("broadcast col wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 4)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); mu_assert("broadcast col wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 3)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 3)); free_expr(bcast); return 0; @@ -160,11 +160,11 @@ const char *test_wsum_hess_broadcast_scalar_to_matrix(void) int expected_i[1] = {0}; mu_assert("broadcast scalar wsum_hess: x 
values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 1)); + cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 1)); mu_assert("broadcast scalar wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 2)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 2)); mu_assert("broadcast scalar wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 1)); + cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 1)); free_expr(bcast); return 0; diff --git a/tests/wsum_hess/affine/test_convolve.h b/tests/wsum_hess/affine/test_convolve.h index 98dd7ab..f44aaba 100644 --- a/tests/wsum_hess/affine/test_convolve.h +++ b/tests/wsum_hess/affine/test_convolve.h @@ -26,10 +26,10 @@ const char *test_wsum_hess_convolve(void) wsum_hess_init(y); y->eval_wsum_hess(y, w); - mu_assert("Convolve wsum_hess should be 3x3", y->wsum_hess->m == 3); - mu_assert("Convolve wsum_hess should be square", y->wsum_hess->n == 3); + mu_assert("Convolve wsum_hess should be 3x3", y->wsum_hess->to_csr(y->wsum_hess)->m == 3); + mu_assert("Convolve wsum_hess should be square", y->wsum_hess->to_csr(y->wsum_hess)->n == 3); mu_assert("Convolve wsum_hess should have zero nonzeros", - y->wsum_hess->nnz == 0); + y->wsum_hess->to_csr(y->wsum_hess)->nnz == 0); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_diag_mat.h b/tests/wsum_hess/affine/test_diag_mat.h index aec571a..6a57355 100644 --- a/tests/wsum_hess/affine/test_diag_mat.h +++ b/tests/wsum_hess/affine/test_diag_mat.h @@ -37,9 +37,9 @@ const char *test_wsum_hess_diag_mat_log(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("diag_mat log hess vals", - cmp_double_array(dm->wsum_hess->x, expected_x, 4)); - mu_assert("diag_mat log hess p", cmp_int_array(dm->wsum_hess->p, expected_p, 5)); - mu_assert("diag_mat log hess i", cmp_int_array(dm->wsum_hess->i, expected_i, 4)); + 
cmp_double_array(dm->wsum_hess->to_csr(dm->wsum_hess)->x, expected_x, 4)); + mu_assert("diag_mat log hess p", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->p, expected_p, 5)); + mu_assert("diag_mat log hess i", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->i, expected_i, 4)); free_expr(dm); return 0; diff --git a/tests/wsum_hess/affine/test_hstack.h b/tests/wsum_hess/affine/test_hstack.h index 78c8160..7f7d8ab 100644 --- a/tests/wsum_hess/affine/test_hstack.h +++ b/tests/wsum_hess/affine/test_hstack.h @@ -91,11 +91,11 @@ const char *test_wsum_hess_hstack(void) int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 9)); + cmp_double_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->x, expected_x, 9)); mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->p, expected_p, 10)); + cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 10)); mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->i, expected_i, 9)); + cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->i, expected_i, 9)); free_expr(hstack_node); return 0; @@ -201,11 +201,11 @@ const char *test_wsum_hess_hstack_matrix(void) 9, 10, 11, 12, 13, 14, 15, 16, 17}; mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 18)); + cmp_double_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->x, expected_x, 18)); mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->p, expected_p, 19)); + cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 19)); mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->i, expected_i, 18)); + cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->i, expected_i, 18)); free_expr(hstack_node); return 0; diff --git a/tests/wsum_hess/affine/test_index.h b/tests/wsum_hess/affine/test_index.h index 
3aa2030..b0a7bdc 100644 --- a/tests/wsum_hess/affine/test_index.h +++ b/tests/wsum_hess/affine/test_index.h @@ -38,9 +38,9 @@ const char *test_wsum_hess_index_log(void) int expected_i[3] = {0, 1, 2}; mu_assert("index log hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); - mu_assert("index log hess p", cmp_int_array(idx->wsum_hess->p, expected_p, 4)); - mu_assert("index log hess i", cmp_int_array(idx->wsum_hess->i, expected_i, 3)); + cmp_double_array(idx->wsum_hess->to_csr(idx->wsum_hess)->x, expected_x, 3)); + mu_assert("index log hess p", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); + mu_assert("index log hess i", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->i, expected_i, 3)); free_expr(idx); return 0; @@ -71,11 +71,11 @@ const char *test_wsum_hess_index_repeated(void) int expected_i[3] = {0, 1, 2}; mu_assert("index repeated hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); + cmp_double_array(idx->wsum_hess->to_csr(idx->wsum_hess)->x, expected_x, 3)); mu_assert("index repeated hess p", - cmp_int_array(idx->wsum_hess->p, expected_p, 4)); + cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); mu_assert("index repeated hess i", - cmp_int_array(idx->wsum_hess->i, expected_i, 3)); + cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->i, expected_i, 3)); free_expr(idx); return 0; @@ -110,11 +110,11 @@ const char *test_wsum_hess_sum_index_log(void) int expected_i[3] = {0, 1, 2}; mu_assert("sum index log hess vals", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 3)); mu_assert("sum index log hess p", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 4)); mu_assert("sum index log hess i", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 3)); + 
cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 3)); free_expr(sum_node); return 0; diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index 313624a..e95dc33 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -81,9 +81,9 @@ const char *test_wsum_hess_left_matmul(void) int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 4)); + cmp_double_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->x, expected_x, 3)); + mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 3)); + mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 4)); free_csr_matrix(A); free_expr(A_log_x); @@ -190,9 +190,9 @@ const char *test_wsum_hess_left_matmul_matrix(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 6)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 7)); + cmp_double_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->x, expected_x, 6)); + mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 6)); + mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 7)); free_csr_matrix(A); free_expr(A_log_x); diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index 2f51b9b..b1adced 100644 --- 
a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -51,9 +51,9 @@ const char *test_wsum_hess_right_matmul(void) int expected_p[5] = {0, 1, 2, 3, 4}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 4)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 4)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 5)); + cmp_double_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->x, expected_x, 4)); + mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 4)); + mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 5)); free_csr_matrix(A); free_expr(log_x_A); @@ -100,9 +100,9 @@ const char *test_wsum_hess_right_matmul_vector(void) int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 4)); + cmp_double_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->x, expected_x, 3)); + mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 3)); + mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 4)); free_csr_matrix(A); free_expr(log_x_A); diff --git a/tests/wsum_hess/affine/test_scalar_mult.h b/tests/wsum_hess/affine/test_scalar_mult.h index ba6bae9..32dec5d 100644 --- a/tests/wsum_hess/affine/test_scalar_mult.h +++ b/tests/wsum_hess/affine/test_scalar_mult.h @@ -39,11 +39,11 @@ const char *test_wsum_hess_scalar_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("scalar mult log hess: x values fail", - 
cmp_double_array(y->wsum_hess->x, expected_x, 3)); + cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 3)); mu_assert("scalar mult log hess: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 4)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); mu_assert("scalar mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 3)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 3)); free_expr(y); return 0; @@ -77,11 +77,11 @@ const char *test_wsum_hess_scalar_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("scalar mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); + cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 4)); mu_assert("scalar mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 5)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); mu_assert("scalar mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 4)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index 8ade78b..c2bd984 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -64,11 +64,11 @@ const char *test_wsum_hess_sum_log_axis0(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); + cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 6)); mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 7)); + cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 6)); + cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 
6)); free_expr(sum_node); @@ -102,11 +102,11 @@ const char *test_wsum_hess_sum_log_axis1(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); + cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 6)); mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 7)); + cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 6)); + cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 6)); free_expr(sum_node); diff --git a/tests/wsum_hess/affine/test_trace.h b/tests/wsum_hess/affine/test_trace.h index bb3f98c..9e209b6 100644 --- a/tests/wsum_hess/affine/test_trace.h +++ b/tests/wsum_hess/affine/test_trace.h @@ -35,10 +35,10 @@ const char *test_wsum_hess_trace_variable(void) trace_node->eval_wsum_hess(trace_node, &w); /* For a linear operation (variable), Hessian is zero */ - mu_assert("wsum_hess should be empty", trace_node->wsum_hess->nnz == 0); + mu_assert("wsum_hess should be empty", trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->nnz == 0); mu_assert("dims correct", - trace_node->wsum_hess->m == 13 && trace_node->wsum_hess->n == 13); + trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->m == 13 && trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->n == 13); free_expr(trace_node); return 0; @@ -70,7 +70,7 @@ const char *test_wsum_hess_trace_log_variable(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - CSR_Matrix *H = trace_node->wsum_hess; + CSR_Matrix *H = trace_node->wsum_hess->to_csr(trace_node->wsum_hess); mu_assert("nnz wrong", H->nnz == 9); mu_assert("vals match", cmp_double_array(H->x, expected_Ax, 9)); mu_assert("cols match", cmp_int_array(H->i, expected_Ai, 9)); @@ -135,11 +135,11 @@ const char 
*test_wsum_hess_trace_composite(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("nnz wrong", trace_node->wsum_hess->nnz == 9); - mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->p, expected_Ap, 14)); + mu_assert("nnz wrong", trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->nnz == 9); + mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->p, expected_Ap, 14)); mu_assert("vals match", - cmp_double_array(trace_node->wsum_hess->x, expected_Ax, 9)); - mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->i, expected_Ai, 9)); + cmp_double_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->x, expected_Ax, 9)); + mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->i, expected_Ai, 9)); free_expr(trace_node); return 0; } diff --git a/tests/wsum_hess/affine/test_transpose.h b/tests/wsum_hess/affine/test_transpose.h index be91541..7d61b41 100644 --- a/tests/wsum_hess/affine/test_transpose.h +++ b/tests/wsum_hess/affine/test_transpose.h @@ -28,11 +28,11 @@ const char *test_wsum_hess_transpose(void) int expected_i[16] = {4, 6, 4, 6, 5, 7, 5, 7, 0, 1, 2, 3, 0, 1, 2, 3}; mu_assert("hess values fail", - cmp_double_array(XYT->wsum_hess->x, expected_x, 8)); + cmp_double_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->x, expected_x, 8)); mu_assert("jacobian row ptr fail", - cmp_int_array(XYT->wsum_hess->p, expected_p, 5)); + cmp_int_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->p, expected_p, 5)); mu_assert("jacobian col idx fail", - cmp_int_array(XYT->wsum_hess->i, expected_i, 8)); + cmp_int_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->i, expected_i, 8)); free_expr(XYT); return 0; diff --git a/tests/wsum_hess/affine/test_upper_tri.h b/tests/wsum_hess/affine/test_upper_tri.h index 418b809..7cfe951 100644 --- a/tests/wsum_hess/affine/test_upper_tri.h +++ 
b/tests/wsum_hess/affine/test_upper_tri.h @@ -50,11 +50,11 @@ const char *test_wsum_hess_upper_tri_log(void) int expected_i[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; mu_assert("upper_tri log hess vals", - cmp_double_array(ut->wsum_hess->x, expected_x, 16)); + cmp_double_array(ut->wsum_hess->to_csr(ut->wsum_hess)->x, expected_x, 16)); mu_assert("upper_tri log hess p", - cmp_int_array(ut->wsum_hess->p, expected_p, 17)); + cmp_int_array(ut->wsum_hess->to_csr(ut->wsum_hess)->p, expected_p, 17)); mu_assert("upper_tri log hess i", - cmp_int_array(ut->wsum_hess->i, expected_i, 16)); + cmp_int_array(ut->wsum_hess->to_csr(ut->wsum_hess)->i, expected_i, 16)); free_expr(ut); return 0; diff --git a/tests/wsum_hess/affine/test_vector_mult.h b/tests/wsum_hess/affine/test_vector_mult.h index 1d75ad3..d8f384e 100644 --- a/tests/wsum_hess/affine/test_vector_mult.h +++ b/tests/wsum_hess/affine/test_vector_mult.h @@ -39,11 +39,11 @@ const char *test_wsum_hess_vector_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("vector mult log hess: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 3)); + cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 3)); mu_assert("vector mult log hess: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 4)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); mu_assert("vector mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 3)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 3)); free_expr(y); return 0; @@ -77,11 +77,11 @@ const char *test_wsum_hess_vector_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("vector mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); + cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 4)); mu_assert("vector mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 5)); + 
cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); mu_assert("vector mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 4)); + cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_vstack.h b/tests/wsum_hess/affine/test_vstack.h index f003431..97faa17 100644 --- a/tests/wsum_hess/affine/test_vstack.h +++ b/tests/wsum_hess/affine/test_vstack.h @@ -40,11 +40,11 @@ const char *test_wsum_hess_vstack_vectors(void) int expected_i[3] = {0, 1, 2}; mu_assert("vstack hess vectors: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 3)); + cmp_double_array(stack->wsum_hess->to_csr(stack->wsum_hess)->x, expected_x, 3)); mu_assert("vstack hess vectors: rows", - cmp_int_array(stack->wsum_hess->p, expected_p, 4)); + cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 4)); mu_assert("vstack hess vectors: cols", - cmp_int_array(stack->wsum_hess->i, expected_i, 3)); + cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->i, expected_i, 3)); free_expr(stack); return 0; @@ -103,11 +103,11 @@ const char *test_wsum_hess_vstack_matrix(void) int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; mu_assert("vstack hess matrix: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 9)); + cmp_double_array(stack->wsum_hess->to_csr(stack->wsum_hess)->x, expected_x, 9)); mu_assert("vstack hess matrix: rows", - cmp_int_array(stack->wsum_hess->p, expected_p, 10)); + cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 10)); mu_assert("vstack hess matrix: cols", - cmp_int_array(stack->wsum_hess->i, expected_i, 9)); + cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->i, expected_i, 9)); free_expr(stack); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_matmul.h b/tests/wsum_hess/bivariate_full_dom/test_matmul.h index 61a2d2d..512823a 100644 --- 
a/tests/wsum_hess/bivariate_full_dom/test_matmul.h +++ b/tests/wsum_hess/bivariate_full_dom/test_matmul.h @@ -49,15 +49,15 @@ const char *test_wsum_hess_matmul(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->nnz == 48); + mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->m == n_vars); + mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->n == n_vars); + mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->to_csr(Z->wsum_hess)->nnz == 48); int expected_p[19] = {0, 4, 8, 12, 16, 20, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48}; mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->p, expected_p, 19)); + cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->p, expected_p, 19)); int expected_i[48] = {6, 9, 12, 15, /* row 0 */ 6, 9, 12, 15, /* row 1 */ @@ -79,7 +79,7 @@ const char *test_wsum_hess_matmul(void) 4, 5}; mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->i, expected_i, 48)); + cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->i, expected_i, 48)); double expected_x[48] = {1.0, 3.0, 5.0, 7.0, /* row 0 */ 2.0, 4.0, 6.0, 8.0, /* row 1 */ @@ -101,7 +101,7 @@ const char *test_wsum_hess_matmul(void) 7.0, 8.0}; /* row 17 */ mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + cmp_double_array(Z->wsum_hess->to_csr(Z->wsum_hess)->x, expected_x, 48)); free_expr(Z); return 0; @@ -150,9 +150,9 @@ const char *test_wsum_hess_matmul_yx(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->nnz == 48); + mu_assert("Hessian 
should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->m == n_vars); + mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->n == n_vars); + mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->to_csr(Z->wsum_hess)->nnz == 48); /* Row pointers when Y < X: * Rows 0-11 (Y variables): each couples with m=2 X variables @@ -162,7 +162,7 @@ const char *test_wsum_hess_matmul_yx(void) 20, 22, 24, 28, 32, 36, 40, 44, 48}; mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->p, expected_p, 19)); + cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->p, expected_p, 19)); /* Column indices when Y < X: * Y[k_idx, col] couples with X[row, k_idx] for all row @@ -190,7 +190,7 @@ const char *test_wsum_hess_matmul_yx(void) 2, 5, 8, 11}; /* row 17: X[1,2] */ mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->i, expected_i, 48)); + cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->i, expected_i, 48)); double expected_x[48] = {1.0, 2.0, /* row 0 */ 1.0, 2.0, /* row 1 */ @@ -212,7 +212,7 @@ const char *test_wsum_hess_matmul_yx(void) 2.0, 4.0, 6.0, 8.0}; /* row 17 */ mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + cmp_double_array(Z->wsum_hess->to_csr(Z->wsum_hess)->x, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index e85c7a1..4b9e428 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_multiply_1(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("p array fails", 
cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 6)); free_expr(node); return 0; @@ -97,11 +97,11 @@ const char *test_wsum_hess_multiply_sparse_random(void) -0.09679721, -0.18801593, 0.01371497, -0.01130641, 0.06305481}; mu_assert("p array fails", - cmp_int_array(mult_node->wsum_hess->p, expected_p, 11)); + cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->p, expected_p, 11)); mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->i, expected_i, 29)); + cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 29)); mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 29)); + cmp_double_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->x, expected_x, 29)); /* Cleanup */ free_expr(mult_node); @@ -180,11 +180,11 @@ const char *test_wsum_hess_multiply_linear_ops(void) double expected_x[7] = {74.0, 75.0, 216.0, 18.0, 75.0, 18.0, 76.0}; mu_assert("p array fails", - cmp_int_array(mult_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->p, expected_p, 4)); mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->i, expected_i, 7)); + cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 7)); mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 7)); + cmp_double_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->x, expected_x, 7)); /* Cleanup */ free_expr(mult_node); @@ -219,9 +219,9 @@ const char *test_wsum_hess_multiply_2(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 13)); - mu_assert("i array fails", 
cmp_int_array(node->wsum_hess->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 6)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h index f1053e2..eab59a3 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h @@ -26,9 +26,9 @@ const char *test_wsum_hess_quad_over_lin_xy(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); free_expr(node); return 0; @@ -56,9 +56,9 @@ const char *test_wsum_hess_quad_over_lin_yx(void) double expected_x[10] = {0.875, -0.25, -0.5, -0.75, -0.25, 1.0, -0.5, 1.0, -0.75, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("p array fails", 
cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h index ce2137a..ae5dbd5 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_rel_entr_1(void) double expected_x[12] = {1.0, -0.25, 1.0, -0.4, 1.0, -0.5, -0.25, 0.0625, -0.4, 0.16, -0.5, 0.25}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 12)); free_expr(node); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_rel_entr_2(void) double expected_x[12] = {0.0625, -0.25, 0.16, -0.4, 0.25, -0.5, -0.25, 1.0, -0.4, 1.0, -0.5, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); + mu_assert("x array 
fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 12)); free_expr(node); return 0; @@ -98,9 +98,9 @@ const char *test_wsum_hess_rel_entr_matrix(void) -1.0 / 6.0, 1.0 / 36.0, -0.4, 0.16, -0.75, 0.5625, -4.0 / 3.0, 1.7777777777777777, -2.5, 6.25, -6.0, 36.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 24)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 24)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 24)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 24)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index a431b96..493830f 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_scalar_vector(void) double expected_x[10] = {15.0, -2.0, -5.0 / 3.0, -1.5, -2.0, 1.0, -5.0 / 3.0, 5.0 / 9.0, -1.5, 0.375}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); free_expr(node); return 0; diff 
--git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 60b9436..06ba77e 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_vector_scalar(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/elementwise_full_dom/test_exp.h b/tests/wsum_hess/elementwise_full_dom/test_exp.h index be00553..84ff64c 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_exp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_exp.h @@ -28,11 +28,11 @@ const char *test_wsum_hess_exp(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(exp_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(exp_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(exp_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->i, expected_i, 3)); free_expr(exp_node); diff --git 
a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h index 88a655f..d78070e 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h @@ -35,11 +35,11 @@ const char *test_wsum_hess_sinh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(sinh_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(sinh_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(sinh_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->i, expected_i, 3)); free_expr(sinh_node); @@ -73,11 +73,11 @@ const char *test_wsum_hess_tanh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(tanh_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(tanh_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(tanh_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->i, expected_i, 3)); free_expr(tanh_node); @@ -111,11 +111,11 @@ const char *test_wsum_hess_asinh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(asinh_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(asinh_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->p, expected_p, 4)); 
mu_assert("cols incorrect", - cmp_int_array(asinh_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->i, expected_i, 3)); free_expr(asinh_node); @@ -150,11 +150,11 @@ const char *test_wsum_hess_atanh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(atanh_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(atanh_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(atanh_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->i, expected_i, 3)); free_expr(atanh_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_logistic.h b/tests/wsum_hess/elementwise_full_dom/test_logistic.h index 2de6070..f9d25cd 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_logistic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_logistic.h @@ -43,11 +43,11 @@ const char *test_wsum_hess_logistic(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(logistic_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(logistic_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(logistic_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->i, expected_i, 3)); free_expr(logistic_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_power.h b/tests/wsum_hess/elementwise_full_dom/test_power.h index c21021e..12004f2 100644 --- 
a/tests/wsum_hess/elementwise_full_dom/test_power.h +++ b/tests/wsum_hess/elementwise_full_dom/test_power.h @@ -28,11 +28,11 @@ const char *test_wsum_hess_power(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(power_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(power_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(power_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->i, expected_i, 3)); free_expr(power_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_trig.h b/tests/wsum_hess/elementwise_full_dom/test_trig.h index 51b1422..f506f48 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_trig.h +++ b/tests/wsum_hess/elementwise_full_dom/test_trig.h @@ -28,11 +28,11 @@ const char *test_wsum_hess_sin(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(sin_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(sin_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(sin_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->i, expected_i, 3)); free_expr(sin_node); @@ -57,11 +57,11 @@ const char *test_wsum_hess_cos(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(cos_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(cos_node->wsum_hess->p, expected_p, 4)); + 
cmp_int_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(cos_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->i, expected_i, 3)); free_expr(cos_node); @@ -88,11 +88,11 @@ const char *test_wsum_hess_tan(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(tan_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(tan_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(tan_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->i, expected_i, 3)); free_expr(tan_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_xexp.h b/tests/wsum_hess/elementwise_full_dom/test_xexp.h index c4c6f94..3ec1e58 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_xexp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_xexp.h @@ -29,11 +29,11 @@ const char *test_wsum_hess_xexp(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(xexp_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(xexp_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(xexp_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->i, expected_i, 3)); free_expr(xexp_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h index 1f4fb14..f1f3941 100644 --- 
a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h @@ -28,11 +28,11 @@ const char *test_wsum_hess_entr(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(entr_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(entr_node->wsum_hess->p, expected_p, 4)); + cmp_int_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", - cmp_int_array(entr_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->i, expected_i, 3)); free_expr(entr_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index 024ea35..bbac1b8 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -40,11 +40,11 @@ const char *test_wsum_hess_log(void) int expected_i[3] = {2, 3, 4}; mu_assert("vals incorrect", - cmp_double_array(log_node->wsum_hess->x, expected_x, 3)); + cmp_double_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->x, expected_x, 3)); mu_assert("rows incorrect", - cmp_int_array(log_node->wsum_hess->p, expected_p, 8)); + cmp_int_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->p, expected_p, 8)); mu_assert("cols incorrect", - cmp_int_array(log_node->wsum_hess->i, expected_i, 3)); + cmp_int_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->i, expected_i, 3)); free_expr(log_node); diff --git a/tests/wsum_hess/other/test_prod.h b/tests/wsum_hess/other/test_prod.h index 753abcf..5c36c5e 100644 --- a/tests/wsum_hess/other/test_prod.h +++ b/tests/wsum_hess/other/test_prod.h @@ -28,9 +28,9 @@ const char *test_wsum_hess_prod_no_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 
5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); free_expr(p); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_prod_one_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); free_expr(p); return 0; @@ -91,9 +91,9 @@ const char *test_wsum_hess_prod_two_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, 
expected_i, 16)); free_expr(p); return 0; @@ -118,9 +118,9 @@ const char *test_wsum_hess_prod_many_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_one.h b/tests/wsum_hess/other/test_prod_axis_one.h index e6915a0..02c8b32 100644 --- a/tests/wsum_hess/other/test_prod_axis_one.h +++ b/tests/wsum_hess/other/test_prod_axis_one.h @@ -65,9 +65,9 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): cols 2,4 (excludes 6) */ 2, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 12)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 12)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); free_expr(p); return 0; @@ -165,9 +165,9 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) /* Var 9 (row 2, col 2): cols 3,6 (excludes 9) */ 3, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 18)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 11)); - mu_assert("cols fail", 
cmp_int_array(p->wsum_hess->i, expected_i, 18)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 18)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 11)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 18)); free_expr(p); return 0; @@ -317,9 +317,9 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) } } - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 30)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 17)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 30)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 30)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 30)); free_expr(p); return 0; @@ -373,9 +373,9 @@ const char *test_wsum_hess_prod_axis_one_2x2(void) 0, /* Var 2 (row 0, col 1): only col 0 */ 1}; /* Var 3 (row 1, col 1): only col 0 */ - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 4)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 5)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 4)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 4)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 5)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 4)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_zero.h b/tests/wsum_hess/other/test_prod_axis_zero.h index 6a54e0d..650bcf5 100644 --- a/tests/wsum_hess/other/test_prod_axis_zero.h +++ b/tests/wsum_hess/other/test_prod_axis_zero.h @@ -63,9 +63,9 @@ const char *test_wsum_hess_prod_axis_zero_no_zeros(void) */ int 
expected_i[12] = {1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 12)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 12)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); free_expr(p); return 0; @@ -183,9 +183,9 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) } } - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 17)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 75)); - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 75)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 75)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 75)); free_expr(p); return 0; @@ -245,9 +245,9 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) */ int expected_i[8] = {1, 2, 1, 2, 3, 4, 3, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 8)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 6)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 8)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 8)); + mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 6)); + mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 8)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index 73f8a1d..162d26b 100644 
--- a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -38,9 +38,9 @@ const char *test_wsum_hess_quad_form(void) int expected_i[10] = {3, 4, 3, 4, 5, 4, 5, 6, 5, 6}; double expected_x[10] = {4.0, 8.0, 8.0, 20.0, 12.0, 12.0, 16.0, 4.0, 4.0, 24.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); + mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); free_expr(node); free_csr_matrix(Q); From 680d3ee43578f06788d64a39358f50a8d73d037e Mon Sep 17 00:00:00 2001 From: dance858 Date: Sun, 10 May 2026 23:28:05 +0200 Subject: [PATCH 04/31] dispatch for elementwise chainr rule hessian --- include/subexpr.h | 7 ++++ src/atoms/affine/broadcast.c | 2 +- src/atoms/affine/convolve.c | 3 +- src/atoms/affine/diag_vec.c | 2 +- src/atoms/affine/index.c | 2 +- src/atoms/affine/left_matmul.c | 40 +++++++++++++++++++--- src/atoms/affine/reshape.c | 2 +- src/atoms/affine/sum.c | 6 ++-- src/atoms/affine/trace.c | 2 +- src/atoms/affine/transpose.c | 3 +- src/atoms/affine/vector_mult.c | 2 +- src/atoms/bivariate_full_dom/multiply.c | 8 +++-- src/utils/permuted_dense.c | 45 +++++++++++++++++++++++-- tests/profiling/profile_log_reg.h | 21 +++++++----- 14 files changed, 116 insertions(+), 29 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index f97feef..62aa6eb 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -131,6 +131,13 @@ typedef struct left_matmul_expr int *csc_to_csr_work; expr *param_source; void (*refresh_param_values)(struct left_matmul_expr *); + + /* When true, 
jacobian_init_impl produces a Permuted_Dense node->jacobian + directly (skipping the CSC mirror machinery). Set by new_left_matmul_dense + when A is a constant dense matrix, child is a leaf variable, and + n_blocks == 1 — in that case the Jacobian is exactly A placed in the + variable's column slot, which is naturally a full-dense Permuted_Dense. */ + bool produce_pd_jacobian; } left_matmul_expr; /* Scalar multiplication: y = a * child where a comes from param_source */ diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index 0f73dcd..aedf11e 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -239,7 +239,7 @@ static void eval_wsum_hess(expr *node, const double *w) } x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index f8c296d..c6d2367 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -129,8 +129,7 @@ static void eval_wsum_hess(expr *node, const double *w) } child->eval_wsum_hess(child, w_prime); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, child->wsum_hess->to_csr(child->wsum_hess)->x, - child->wsum_hess->to_csr(child->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, child->wsum_hess->to_csr(child->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index 07008af..1454e9b 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -121,7 +121,7 @@ static void eval_wsum_hess(expr *node, const double *w) /* Evaluate child's Hessian with extracted weights */ x->eval_wsum_hess(x, node->work->dwork); - 
memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 507213e..2e57cd5 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -141,7 +141,7 @@ static void eval_wsum_hess(expr *node, const double *w) /* evalute hessian of child */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 1d5c376..8fabe63 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -18,6 +18,7 @@ #include "atoms/affine.h" #include "subexpr.h" #include "utils/dense_matrix.h" +#include "utils/permuted_dense.h" #include #include #include @@ -113,9 +114,30 @@ static void jacobian_init_impl(expr *node) expr *x = node->left; left_matmul_expr *lnode = (left_matmul_expr *) node; - /* initialize child's jacobian and precompute sparsity of its CSC */ + /* initialize child's jacobian */ jacobian_init(x); - lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); + + /* Fast path: A is a constant Dense_Matrix, child is a leaf variable, and + there are no Kronecker blocks. The Jacobian is A placed at the variable's + column slot — a full-dense Permuted_Dense. Skip the CSC mirror entirely. 
*/ + if (lnode->produce_pd_jacobian) + { + int m_loc = lnode->A->m; + int *row_perm = (int *) SP_MALLOC(m_loc * sizeof(int)); + int *col_perm = (int *) SP_MALLOC(lnode->A->n * sizeof(int)); + for (int i = 0; i < m_loc; i++) row_perm[i] = i; + for (int j = 0; j < lnode->A->n; j++) col_perm[j] = x->var_id + j; + Dense_Matrix *dm = (Dense_Matrix *) lnode->A; + node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, + lnode->A->n, row_perm, col_perm, dm->x); + free(row_perm); + free(col_perm); + return; + } + + /* General path via CSC mirror. */ + lnode->Jchild_CSC = + csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); /* precompute sparsity of this node's jacobian in CSC and CSR */ lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC, @@ -129,6 +151,9 @@ static void eval_jacobian(expr *node) left_matmul_expr *lnode = (left_matmul_expr *) node; expr *x = node->left; + /* Fast path: PD Jacobian backed by constant A. Values never change. */ + if (lnode->produce_pd_jacobian) return; + CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC; CSC_Matrix *J_CSC = lnode->J_CSC; @@ -166,8 +191,7 @@ static void eval_wsum_hess(expr *node, const double *w) AT->block_left_mult_vec(AT, w, node->work->dwork, n_blocks); node->left->eval_wsum_hess(node->left, node->work->dwork); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x, - node->wsum_hess->to_csr(node->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x); } static void refresh_dense_left(left_matmul_expr *lnode) @@ -309,6 +333,14 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, lnode->A = new_dense_matrix(m, n, data); lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A); + + /* If the child is a leaf variable and there are no blocks, the Jacobian + is exactly A placed in the variable's column slot — a 
full-dense + Permuted_Dense. Enable the fast path. */ + if (u->var_id != NOT_A_VARIABLE && n_blocks == 1) + { + lnode->produce_pd_jacobian = true; + } } return node; diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index c0c207b..5c62064 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -59,7 +59,7 @@ static void eval_wsum_hess(expr *node, const double *w) { expr *x = node->left; x->eval_wsum_hess(x, w); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 87a6d23..d031d56 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -162,8 +162,10 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - /* copy values */ - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + /* copy values via polymorphic update_values so PD-backed wsum_hess writes + reach the underlying X rather than just the lazy CSR cache. 
*/ + node->wsum_hess->update_values(node->wsum_hess, + x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index a1931fc..e5a7489 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -126,7 +126,7 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, sizeof(double) * x->wsum_hess->to_csr(x->wsum_hess)->nnz); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 022aaff..70958b8 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -117,8 +117,7 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, node->work->dwork); /* copy to this node's hessian */ - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x, - node->wsum_hess->to_csr(node->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index 6f15b66..ca60a84 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -109,7 +109,7 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); /* copy values from child to this node */ - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->x, x->wsum_hess->to_csr(x->wsum_hess)->nnz * sizeof(double)); + node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); } static void free_type_data(expr *node) diff --git 
a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 01e1cd7..17fa63f 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -192,8 +192,12 @@ static void eval_wsum_hess(expr *node, const double *w) if (x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE && x->var_id != y->var_id) { - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x, w, node->size * sizeof(double)); - memcpy(node->wsum_hess->to_csr(node->wsum_hess)->x + node->size, w, node->size * sizeof(double)); + /* node->wsum_hess is Sparse_Matrix (built explicitly above), so the CSR + view aliases its storage. Two contiguous halves [w; w] of length + node->size each, matching the (var1, var2) and (var2, var1) blocks. */ + CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + memcpy(H->x, w, node->size * sizeof(double)); + memcpy(H->x + node->size, w, node->size * sizeof(double)); } else { diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 54b22c8..b522f7f 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -41,6 +41,43 @@ static void permuted_dense_refresh_csc_values(Matrix *self) (void) self; } +/* Replace pd->X with new_values (dense_m * dense_n doubles, row-major). Same + layout as the CSR view's value array (see permuted_dense_to_csr_alloc), so + callers that have a CSR view's x can pass it here. */ +static void permuted_dense_vtable_update_values(Matrix *self, + const double *new_values) +{ + Permuted_Dense *pd = (Permuted_Dense *) self; + memcpy(pd->X, new_values, pd->dense_m * pd->dense_n * sizeof(double)); +} + +/* Vtable adapters — each delegates to the existing permuted_dense_* kernel. 
*/ +static Matrix *permuted_dense_vtable_copy_sparsity(const Matrix *self) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + return new_permuted_dense(pd->base.m, pd->base.n, pd->dense_m, pd->dense_n, + pd->row_perm, pd->col_perm, NULL); +} + +static void permuted_dense_vtable_DA_fill_values(const double *d, + const Matrix *self, Matrix *out) +{ + permuted_dense_DA_fill_values(d, (const Permuted_Dense *) self, + (Permuted_Dense *) out); +} + +static Matrix *permuted_dense_vtable_ATA_alloc(Matrix *self) +{ + return permuted_dense_ATA_alloc((const Permuted_Dense *) self); +} + +static void permuted_dense_vtable_ATDA_fill_values(const Matrix *self, + const double *d, Matrix *out) +{ + permuted_dense_ATDA_fill_values((const Permuted_Dense *) self, d, + (Permuted_Dense *) out); +} + /* Lazy CSR view: allocate structure on first call, refill values on every call. This means the returned CSR's values always reflect the current X. @@ -86,12 +123,14 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, Permuted_Dense *pd = (Permuted_Dense *) SP_CALLOC(1, sizeof(Permuted_Dense)); pd->base.m = m; pd->base.n = n; + pd->base.update_values = permuted_dense_vtable_update_values; + pd->base.copy_sparsity = permuted_dense_vtable_copy_sparsity; + pd->base.DA_fill_values = permuted_dense_vtable_DA_fill_values; + pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; + pd->base.ATDA_fill_values = permuted_dense_vtable_ATDA_fill_values; pd->base.to_csr = permuted_dense_to_csr; pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; pd->base.free_fn = permuted_dense_free; - /* Other vtable slots (copy_sparsity, DA_fill_values, ATA_alloc, - ATDA_fill_values) are wired up in a later step when permuted_dense - actually starts appearing as a node->jacobian. 
*/ pd->dense_m = dense_m; pd->dense_n = dense_n; diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 5d66ada..2af45a9 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -119,26 +119,31 @@ const char *profile_log_reg(void) (sec_a_jac + sec_a_hess) / (sec_b_jac + sec_b_hess)); /* ---- Compare Jacobian (1 x n, both have full sparsity) ---- */ - mu_assert("J n mismatch", obj->jacobian->to_csr(obj->jacobian)->n == Jobj_csr->n); - mu_assert("J nnz mismatch", obj->jacobian->to_csr(obj->jacobian)->nnz == Jobj_csr->nnz); + CSR_Matrix *J_a = obj->jacobian->to_csr(obj->jacobian); + mu_assert("J n mismatch", J_a->n == Jobj_csr->n); + mu_assert("J nnz mismatch", J_a->nnz == Jobj_csr->nnz); double max_J_diff = 0.0; - for (int j = 0; j < obj->jacobian->to_csr(obj->jacobian)->nnz; j++) + for (int j = 0; j < J_a->nnz; j++) { - double diff = fabs(obj->jacobian->to_csr(obj->jacobian)->x[j] - Jobj_csr->x[j]); + double diff = fabs(J_a->x[j] - Jobj_csr->x[j]); if (diff > max_J_diff) max_J_diff = diff; } printf(" Jacobian max abs diff: %10.3e\n", max_J_diff); mu_assert("Jacobian mismatch", max_J_diff < 1e-10); /* ---- Compare Hessian (n x n): scatter Path A's CSR into a dense - n x n array, compare to H_pd->X (already dense row-major). ---- */ + n x n array, compare to H_pd->X (already dense row-major). + Extract the CSR view ONCE: PD's to_csr does an O(dense_m * dense_n) + memcpy refresh per call, so calling it inside the inner loop is + quadratically expensive. 
---- */ + CSR_Matrix *H_a = obj->wsum_hess->to_csr(obj->wsum_hess); double *H_a_dense = (double *) calloc((size_t) n * n, sizeof(double)); for (int i = 0; i < n; i++) { - for (int e = obj->wsum_hess->to_csr(obj->wsum_hess)->p[i]; e < obj->wsum_hess->to_csr(obj->wsum_hess)->p[i + 1]; e++) + for (int e = H_a->p[i]; e < H_a->p[i + 1]; e++) { - int col = obj->wsum_hess->to_csr(obj->wsum_hess)->i[e]; - H_a_dense[i * n + col] = obj->wsum_hess->to_csr(obj->wsum_hess)->x[e]; + int col = H_a->i[e]; + H_a_dense[i * n + col] = H_a->x[e]; } } double max_H_diff = 0.0; From 10bf7ce3dc6c78b1611858d620063892791b0a78 Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 08:56:06 +0200 Subject: [PATCH 05/31] better polymorphism for copying sparsity pattern --- include/utils/matrix.h | 19 ++++++++++++++++--- src/atoms/affine/broadcast.c | 15 +++++++-------- src/atoms/affine/convolve.c | 3 ++- src/atoms/affine/diag_vec.c | 3 ++- src/atoms/affine/hstack.c | 14 +++++--------- src/atoms/affine/index.c | 3 ++- src/atoms/affine/left_matmul.c | 3 ++- src/atoms/affine/neg.c | 15 +++++---------- src/atoms/affine/promote.c | 8 ++------ src/atoms/affine/reshape.c | 9 ++++----- src/atoms/affine/scalar_mult.c | 12 +++++------- src/atoms/affine/sum.c | 11 ++++------- src/atoms/affine/trace.c | 8 ++++---- src/atoms/affine/transpose.c | 3 ++- src/atoms/affine/vector_mult.c | 8 ++++---- src/atoms/bivariate_full_dom/matmul.c | 13 ++++++------- src/atoms/bivariate_full_dom/multiply.c | 11 +++++------ .../bivariate_restricted_dom/quad_over_lin.c | 2 +- src/atoms/bivariate_restricted_dom/rel_entr.c | 12 +++++------- .../rel_entr_scalar_vector.c | 12 +++++------- .../rel_entr_vector_scalar.c | 12 +++++------- src/atoms/elementwise_full_dom/common.c | 16 ++++++---------- src/atoms/elementwise_full_dom/logistic.c | 2 +- src/atoms/elementwise_restricted_dom/atanh.c | 5 +++-- src/atoms/elementwise_restricted_dom/entr.c | 5 +++-- src/atoms/elementwise_restricted_dom/log.c | 5 +++-- 
src/atoms/elementwise_restricted_dom/tan.c | 5 +++-- src/atoms/other/prod.c | 16 ++++++++-------- src/atoms/other/prod_axis_one.c | 2 +- src/atoms/other/prod_axis_zero.c | 10 +++++----- src/atoms/other/quad_form.c | 8 +++----- src/problem.c | 14 +++++--------- src/utils/dense_matrix.c | 9 ++------- src/utils/permuted_dense.c | 17 ++++------------- src/utils/sparse_matrix.c | 11 ++++------- tests/jacobian_tests/affine/test_broadcast.h | 8 ++++---- tests/jacobian_tests/affine/test_convolve.h | 2 +- tests/jacobian_tests/affine/test_diag_mat.h | 4 ++-- tests/jacobian_tests/affine/test_hstack.h | 4 ++-- tests/jacobian_tests/affine/test_index.h | 8 ++++---- .../jacobian_tests/affine/test_left_matmul.h | 4 ++-- tests/jacobian_tests/affine/test_neg.h | 4 ++-- tests/jacobian_tests/affine/test_promote.h | 4 ++-- .../jacobian_tests/affine/test_right_matmul.h | 4 ++-- .../jacobian_tests/affine/test_scalar_mult.h | 4 ++-- tests/jacobian_tests/affine/test_sum.h | 10 +++++----- tests/jacobian_tests/affine/test_trace.h | 4 ++-- tests/jacobian_tests/affine/test_transpose.h | 2 +- tests/jacobian_tests/affine/test_upper_tri.h | 4 ++-- .../jacobian_tests/affine/test_vector_mult.h | 4 ++-- tests/jacobian_tests/affine/test_vstack.h | 4 ++-- .../test_elementwise_mult.h | 8 ++++---- .../bivariate_full_dom/test_matmul.h | 2 +- .../test_quad_over_lin.h | 10 +++++----- .../bivariate_restricted_dom/test_rel_entr.h | 6 +++--- .../test_rel_entr_scalar_vector.h | 2 +- .../test_rel_entr_vector_scalar.h | 2 +- .../composite/test_composite_exp.h | 2 +- .../elementwise_restricted_dom/test_log.h | 4 ++-- tests/jacobian_tests/other/test_prod.h | 6 +++--- .../jacobian_tests/other/test_prod_axis_one.h | 4 ++-- .../other/test_prod_axis_zero.h | 2 +- tests/jacobian_tests/other/test_quad_form.h | 4 ++-- tests/wsum_hess/affine/test_broadcast.h | 6 +++--- tests/wsum_hess/affine/test_diag_mat.h | 2 +- tests/wsum_hess/affine/test_hstack.h | 4 ++-- tests/wsum_hess/affine/test_index.h | 6 +++--- 
tests/wsum_hess/affine/test_left_matmul.h | 4 ++-- tests/wsum_hess/affine/test_right_matmul.h | 4 ++-- tests/wsum_hess/affine/test_scalar_mult.h | 4 ++-- tests/wsum_hess/affine/test_sum.h | 4 ++-- tests/wsum_hess/affine/test_trace.h | 2 +- tests/wsum_hess/affine/test_transpose.h | 2 +- tests/wsum_hess/affine/test_upper_tri.h | 2 +- tests/wsum_hess/affine/test_vector_mult.h | 4 ++-- tests/wsum_hess/affine/test_vstack.h | 4 ++-- .../bivariate_full_dom/test_matmul.h | 4 ++-- .../bivariate_full_dom/test_multiply.h | 8 ++++---- .../test_quad_over_lin.h | 4 ++-- .../bivariate_restricted_dom/test_rel_entr.h | 6 +++--- .../test_rel_entr_scalar_vector.h | 2 +- .../test_rel_entr_vector_scalar.h | 2 +- .../wsum_hess/elementwise_full_dom/test_exp.h | 2 +- .../elementwise_full_dom/test_hyperbolic.h | 8 ++++---- .../elementwise_full_dom/test_logistic.h | 2 +- .../elementwise_full_dom/test_power.h | 2 +- .../elementwise_full_dom/test_trig.h | 6 +++--- .../elementwise_full_dom/test_xexp.h | 2 +- .../elementwise_restricted_dom/test_entr.h | 2 +- .../elementwise_restricted_dom/test_log.h | 2 +- tests/wsum_hess/other/test_prod.h | 8 ++++---- tests/wsum_hess/other/test_prod_axis_one.h | 8 ++++---- tests/wsum_hess/other/test_prod_axis_zero.h | 6 +++--- tests/wsum_hess/other/test_quad_form.h | 2 +- 94 files changed, 269 insertions(+), 302 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index b72dc4c..be2d551 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -49,10 +49,24 @@ */ -/* Base matrix type with function pointers for polymorphic dispatch */ +/* Base matrix type with function pointers for polymorphic dispatch. There are + two types of matrices: 'sparse_matrix' and 'permuted_dense'. Each type + implements the same set of operations, but with different algorithms. + The following operations are implemented: TODO +*/ typedef struct Matrix { - int m, n; + /* Dimensions and number of explicitly stored entries. 
For Sparse_Matrix + nnz is the CSR nnz; for Dense_Matrix it is m * n; for Permuted_Dense it + is dense_m * dense_n (the size of the stored dense block). */ + int m, n, nnz; + + /* Non-owning pointer to the value buffer. Sparse_Matrix: csr->x. + Permuted_Dense: pd->X. Dense_Matrix: dm->x. Sparse and Permuted_Dense + share row-major layout for equal sparsity patterns (see + permuted_dense_to_csr_fill_values), so memcpy via M->x is valid between + same-shape Sparse/PD pairs. */ + double *x; /* Operators for the left-multiply matrix in left_matmul. */ void (*block_left_mult_vec)(const struct Matrix *self, const double *x, @@ -61,7 +75,6 @@ typedef struct Matrix const CSC_Matrix *J, int p); void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J, CSC_Matrix *C); - void (*update_values)(struct Matrix *self, const double *new_values); /* Chain-rule operations used by transformer atoms (elementwise, etc.). All chain-rule outputs are the same concrete type as self (uniform diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index aedf11e..73af7ca 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -164,25 +164,23 @@ static void eval_jacobian(expr *node) broadcast_expr *bcast = (broadcast_expr *) node; CSR_Matrix *Jx = node->left->jacobian->to_csr(node->left->jacobian); - CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); - if (bcast->type == BROADCAST_ROW) { - J->nnz = 0; + node->jacobian->nnz = 0; for (int i = 0; i < node->d2; i++) { int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; - tile_double(J->x + J->nnz, Jx->x + Jx->p[i], nnz_in_row, node->d1); - J->nnz += nnz_in_row * node->d1; + tile_double(node->jacobian->x + node->jacobian->nnz, Jx->x + Jx->p[i], nnz_in_row, node->d1); + node->jacobian->nnz += nnz_in_row * node->d1; } } else if (bcast->type == BROADCAST_COL) { - tile_double(J->x, Jx->x, Jx->nnz, node->d2); + tile_double(node->jacobian->x, Jx->x, Jx->nnz, node->d2); } else { - tile_double(J->x, 
Jx->x, Jx->nnz, node->size); + tile_double(node->jacobian->x, Jx->x, Jx->nnz, node->size); } } @@ -239,7 +237,8 @@ static void eval_wsum_hess(expr *node, const double *w) } x->eval_wsum_hess(x, node->work->dwork); - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index c6d2367..1496f95 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -129,7 +129,8 @@ static void eval_wsum_hess(expr *node, const double *w) } child->eval_wsum_hess(child, w_prime); - node->wsum_hess->update_values(node->wsum_hess, child->wsum_hess->to_csr(child->wsum_hess)->x); + memcpy(node->wsum_hess->x, child->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index 1454e9b..3238f32 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -121,7 +121,8 @@ static void eval_wsum_hess(expr *node, const double *w) /* Evaluate child's Hessian with extracted weights */ x->eval_wsum_hess(x, node->work->dwork); - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index 31e75e4..c4e589b 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -53,7 +53,7 @@ static void jacobian_init_impl(expr *node) { assert(hnode->args[i] != NULL); jacobian_init(hnode->args[i]); - nnz += hnode->args[i]->jacobian->to_csr(hnode->args[i]->jacobian)->nnz; + nnz += hnode->args[i]->jacobian->nnz; } CSR_Matrix *A = new_csr_matrix(node->size, node->n_vars, nnz); @@ -86,18 +86,15 @@ 
static void jacobian_init_impl(expr *node) static void eval_jacobian(expr *node) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *A = node->jacobian->to_csr(node->jacobian); - A->nnz = 0; + node->jacobian->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; child->eval_jacobian(child); - CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); - /* copy values */ - memcpy(A->x + A->nnz, Jc->x, Jc->nnz * sizeof(double)); - A->nnz += Jc->nnz; + memcpy(node->jacobian->x + node->jacobian->nnz, child->jacobian->x, child->jacobian->nnz * sizeof(double)); + node->jacobian->nnz += child->jacobian->nnz; } } @@ -109,8 +106,7 @@ static void wsum_hess_init_impl(expr *node) for (int i = 0; i < hnode->n_args; i++) { wsum_hess_init(hnode->args[i]); - Matrix *child_hess = hnode->args[i]->wsum_hess; - nnz += child_hess->to_csr(child_hess)->nnz; + nnz += hnode->args[i]->wsum_hess->nnz; } /* worst-case scenario the nnz of node->wsum_hess is the sum of children's diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 2e57cd5..77028fb 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -141,7 +141,8 @@ static void eval_wsum_hess(expr *node, const double *w) /* evalute hessian of child */ x->eval_wsum_hess(x, node->work->dwork); - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 8fabe63..44e462c 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -191,7 +191,8 @@ static void eval_wsum_hess(expr *node, const double *w) AT->block_left_mult_vec(AT, w, node->work->dwork, n_blocks); node->left->eval_wsum_hess(node->left, node->work->dwork); - node->wsum_hess->update_values(node->wsum_hess, 
node->left->wsum_hess->to_csr(node->left->wsum_hess)->x); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static void refresh_dense_left(left_matmul_expr *lnode) diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c index 3251c50..01bbf5e 100644 --- a/src/atoms/affine/neg.c +++ b/src/atoms/affine/neg.c @@ -40,7 +40,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity pattern as child */ - node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -49,11 +49,9 @@ static void eval_jacobian(expr *node) node->left->eval_jacobian(node->left); /* negate values only (sparsity pattern set in jacobian_init_impl) */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - CSR_Matrix *child_jac = node->left->jacobian->to_csr(node->left->jacobian); - for (int k = 0; k < child_jac->nnz; k++) + for (int k = 0; k < node->left->jacobian->nnz; k++) { - jac->x[k] = -child_jac->x[k]; + node->jacobian->x[k] = -node->left->jacobian->x[k]; } } @@ -74,12 +72,9 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, w); /* negate values (sparsity pattern set in wsum_hess_init_impl) */ - CSR_Matrix *child_hess = - node->left->wsum_hess->to_csr(node->left->wsum_hess); - CSR_Matrix *jac = node->wsum_hess->to_csr(node->wsum_hess); - for (int k = 0; k < child_hess->nnz; k++) + for (int k = 0; k < node->left->wsum_hess->nnz; k++) { - jac->x[k] = -child_hess->x[k]; + node->wsum_hess->x[k] = -node->left->wsum_hess->x[k]; } } diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index 008e56e..dbe231f 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -63,13 +63,12 @@ static void eval_jacobian(expr *node) node->left->eval_jacobian(node->left); CSR_Matrix *child_jac = 
node->left->jacobian->to_csr(node->left->jacobian); - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); int child_nnz = child_jac->p[1] - child_jac->p[0]; /* Copy child's row values to each output row */ for (int row = 0; row < node->size; row++) { - memcpy(jac->x + row * child_nnz, child_jac->x + child_jac->p[0], + memcpy(node->jacobian->x + row * child_nnz, child_jac->x + child_jac->p[0], child_nnz * sizeof(double)); } } @@ -93,10 +92,7 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, &sum_w); /* copy values */ - CSR_Matrix *child_hess = - node->left->wsum_hess->to_csr(node->left->wsum_hess); - CSR_Matrix *jac = node->wsum_hess->to_csr(node->wsum_hess); - memcpy(jac->x, child_hess->x, child_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, node->left->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index 5c62064..e59a7ec 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -36,16 +36,14 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) { expr *x = node->left; x->eval_jacobian(x); - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - memcpy(jac->x, Jx->x, Jx->nnz * sizeof(double)); + memcpy(node->jacobian->x, x->jacobian->x, x->jacobian->nnz * sizeof(double)); } static void wsum_hess_init_impl(expr *node) @@ -59,7 +57,8 @@ static void eval_wsum_hess(expr *node, const double *w) { expr *x = node->left; x->eval_wsum_hess(x, w); - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + 
node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c index 93ffa93..433f3e1 100644 --- a/src/atoms/affine/scalar_mult.c +++ b/src/atoms/affine/scalar_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -71,11 +71,9 @@ static void eval_jacobian(expr *node) child->eval_jacobian(child); /* scale child's jacobian */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - CSR_Matrix *child_jac = child->jacobian->to_csr(child->jacobian); - for (int j = 0; j < child_jac->nnz; j++) + for (int j = 0; j < child->jacobian->nnz; j++) { - jac->x[j] = a * child_jac->x[j]; + node->jacobian->x[j] = a * child->jacobian->x[j]; } } @@ -96,9 +94,9 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, w); double a = ((scalar_mult_expr *) node)->param_source->value[0]; - for (int j = 0; j < x->wsum_hess->to_csr(x->wsum_hess)->nnz; j++) + for (int j = 0; j < x->wsum_hess->nnz; j++) { - node->wsum_hess->to_csr(node->wsum_hess)->x[j] = a * x->wsum_hess->to_csr(x->wsum_hess)->x[j]; + node->wsum_hess->x[j] = a * x->wsum_hess->x[j]; } } diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index d031d56..3d1d5cd 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -125,9 +125,8 @@ static void eval_jacobian(expr *node) /* we have precomputed an idx map between the nonzeros of the child's jacobian and this node's jacobian, so we just accumulate accordingly */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - memset(jac->x, 0, jac->nnz * sizeof(double)); - accumulator(x->jacobian->to_csr(x->jacobian), ((sum_expr *) node)->idx_map, jac->x); + memset(node->jacobian->x, 0, 
node->jacobian->nnz * sizeof(double)); + accumulator(x->jacobian->to_csr(x->jacobian), ((sum_expr *) node)->idx_map, node->jacobian->x); } static void wsum_hess_init_impl(expr *node) @@ -162,10 +161,8 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - /* copy values via polymorphic update_values so PD-backed wsum_hess writes - reach the underlying X rather than just the lazy CSR cache. */ - node->wsum_hess->update_values(node->wsum_hess, - x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index e5a7489..7c1900a 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -92,9 +92,8 @@ static void eval_jacobian(expr *node) x->eval_jacobian(x); /* local jacobian */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - memset(jac->x, 0, jac->nnz * sizeof(double)); - accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, jac->x, x->d1 + 1); + memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); + accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, node->jacobian->x, x->d1 + 1); } /* Placeholders for Hessian-related functions */ @@ -126,7 +125,8 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 70958b8..ed67e61 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -117,7 +117,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, node->work->dwork); 
/* copy to this node's hessian */ - node->wsum_hess->update_values(node->wsum_hess, node->left->wsum_hess->to_csr(node->left->wsum_hess)->x); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index ca60a84..3fbb8d9 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_sparse_matrix(new_csr_copy_sparsity(x->jacobian->to_csr(x->jacobian))); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -71,13 +71,12 @@ static void eval_jacobian(expr *node) x->eval_jacobian(x); /* row-wise scale child's jacobian */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); for (int i = 0; i < node->size; i++) { for (int j = Jx->p[i]; j < Jx->p[i + 1]; j++) { - jac->x[j] = a[i] * Jx->x[j]; + node->jacobian->x[j] = a[i] * Jx->x[j]; } } } @@ -109,7 +108,8 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); /* copy values from child to this node */ - node->wsum_hess->update_values(node->wsum_hess, x->wsum_hess->to_csr(x->wsum_hess)->x); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static void free_type_data(expr *node) diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 485df69..2f8445f 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -354,7 +354,7 @@ static void eval_wsum_hess_no_chain_rule(expr *node, const double *w) int k = x->d2; int n = y->d2; int offset = 0; - double *Hx = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *Hx = node->wsum_hess->x; const double *w_temp; 
if (x->var_id < y->var_id) @@ -515,12 +515,11 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) } /* accumulate H = C + C^T + H_f + H_g */ - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); - memset(H->x, 0, H->nnz * sizeof(double)); - accumulator(mnode->C, mnode->idx_map_C, H->x); - accumulator(mnode->CT, mnode->idx_map_CT, H->x); - accumulator(f->wsum_hess->to_csr(f->wsum_hess), mnode->idx_map_Hf, H->x); - accumulator(g->wsum_hess->to_csr(g->wsum_hess), mnode->idx_map_Hg, H->x); + memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); + accumulator(mnode->C, mnode->idx_map_C, node->wsum_hess->x); + accumulator(mnode->CT, mnode->idx_map_CT, node->wsum_hess->x); + accumulator(f->wsum_hess->to_csr(f->wsum_hess), mnode->idx_map_Hf, node->wsum_hess->x); + accumulator(g->wsum_hess->to_csr(g->wsum_hess), mnode->idx_map_Hg, node->wsum_hess->x); } expr *new_matmul(expr *x, expr *y) diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 17fa63f..d204e98 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -267,12 +267,11 @@ static void eval_wsum_hess(expr *node, const double *w) // --------------------------------------------------------------- // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); - memset(H->x, 0, H->nnz * sizeof(double)); - accumulator(C, mul_node->idx_map_C, H->x); - accumulator(CT, mul_node->idx_map_CT, H->x); - accumulator(x->wsum_hess->to_csr(x->wsum_hess), mul_node->idx_map_Hx, H->x); - accumulator(y->wsum_hess->to_csr(y->wsum_hess), mul_node->idx_map_Hy, H->x); + memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); + accumulator(C, mul_node->idx_map_C, node->wsum_hess->x); + accumulator(CT, mul_node->idx_map_CT, node->wsum_hess->x); + accumulator(x->wsum_hess->to_csr(x->wsum_hess), 
mul_node->idx_map_Hx, node->wsum_hess->x); + accumulator(y->wsum_hess->to_csr(y->wsum_hess), mul_node->idx_map_Hy, node->wsum_hess->x); } } diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index 6baf4c2..4cc2531 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -269,7 +269,7 @@ static void eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; double y = node->right->value[0]; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; int x_size = node->left->size; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index 8a3f5a2..ad7a1f7 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -80,23 +80,21 @@ static void eval_jacobian_vector_args(expr *node) { expr *x = node->left; expr *y = node->right; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - /* if x has lower variable idx than y */ if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) { - jac->x[2 * i] = log(x->value[i] / y->value[i]) + 1; - jac->x[2 * i + 1] = -x->value[i] / y->value[i]; + node->jacobian->x[2 * i] = log(x->value[i] / y->value[i]) + 1; + node->jacobian->x[2 * i + 1] = -x->value[i] / y->value[i]; } } else { for (int i = 0; i < node->size; i++) { - jac->x[2 * i] = -x->value[i] / y->value[i]; - jac->x[2 * i + 1] = log(x->value[i] / y->value[i]) + 1; + node->jacobian->x[2 * i] = -x->value[i] / y->value[i]; + node->jacobian->x[2 * i + 1] = log(x->value[i] / y->value[i]) + 1; } } } @@ -155,7 +153,7 @@ static void eval_wsum_hess_vector_args(expr *node, const double *w) { double *x = node->left->value; double *y = node->right->value; - double *hess = node->wsum_hess->to_csr(node->wsum_hess)->x; 
+ double *hess = node->wsum_hess->x; if (node->left->var_id < node->right->var_id) { diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index 25cbe7f..a120718 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -79,22 +79,20 @@ static void eval_jacobian_scalar_vector(expr *node) { expr *x = node->left; expr *y = node->right; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) { - jac->x[2 * i] = log(x->value[0] / y->value[i]) + 1; - jac->x[2 * i + 1] = -x->value[0] / y->value[i]; + node->jacobian->x[2 * i] = log(x->value[0] / y->value[i]) + 1; + node->jacobian->x[2 * i + 1] = -x->value[0] / y->value[i]; } } else { for (int i = 0; i < node->size; i++) { - jac->x[2 * i] = -x->value[0] / y->value[i]; - jac->x[2 * i + 1] = log(x->value[0] / y->value[i]) + 1; + node->jacobian->x[2 * i] = -x->value[0] / y->value[i]; + node->jacobian->x[2 * i + 1] = log(x->value[0] / y->value[i]) + 1; } } } @@ -170,7 +168,7 @@ static void eval_wsum_hess_scalar_vector(expr *node, const double *w) { double x = node->left->value[0]; double *y = node->right->value; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index 804ac43..1fabd35 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -79,22 +79,20 @@ static void eval_jacobian_vector_scalar(expr *node) { expr *x = node->left; expr *y = node->right; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); - if (x->var_id < y->var_id) { for (int i = 0; i < 
node->size; i++) { - jac->x[2 * i] = log(x->value[i] / y->value[0]) + 1; - jac->x[2 * i + 1] = -x->value[i] / y->value[0]; + node->jacobian->x[2 * i] = log(x->value[i] / y->value[0]) + 1; + node->jacobian->x[2 * i + 1] = -x->value[i] / y->value[0]; } } else { for (int i = 0; i < node->size; i++) { - jac->x[2 * i] = -x->value[i] / y->value[0]; - jac->x[2 * i + 1] = log(x->value[i] / y->value[0]) + 1; + node->jacobian->x[2 * i] = -x->value[i] / y->value[0]; + node->jacobian->x[2 * i + 1] = log(x->value[i] / y->value[0]) + 1; } } } @@ -170,7 +168,7 @@ static void eval_wsum_hess_vector_scalar(expr *node, const double *w) { double *x = node->left->value; double y = node->right->value[0]; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int var_id_x = node->left->var_id; int var_id_y = node->right->var_id; diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 0e6e399..8a6946e 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -58,7 +58,7 @@ void eval_jacobian_elementwise(expr *node) if (child->var_id != NOT_A_VARIABLE) { - node->local_jacobian(node, node->jacobian->to_csr(node->jacobian)->x); + node->local_jacobian(node, node->jacobian->x); } else { @@ -118,10 +118,8 @@ void wsum_hess_init_elementwise(expr *node) /* wsum_hess = term1 + term2 (sum_csr_alloc operates on CSR; extract via to_csr on the operands). The result is stored as CSR. 
*/ - CSR_Matrix *t1 = - node->work->hess_term1->to_csr(node->work->hess_term1); - CSR_Matrix *t2 = - node->work->hess_term2->to_csr(node->work->hess_term2); + CSR_Matrix *t1 = node->work->hess_term1->to_csr(node->work->hess_term1); + CSR_Matrix *t2 = node->work->hess_term2->to_csr(node->work->hess_term2); int max_nnz = t1->nnz + t2->nnz; CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); sum_csr_alloc(t1, t2, hess); @@ -136,8 +134,7 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) if (child->var_id != NOT_A_VARIABLE) { - node->local_wsum_hess(node, node->wsum_hess->to_csr(node->wsum_hess)->x, - w); + node->local_wsum_hess(node, node->wsum_hess->x, w); } else { @@ -176,9 +173,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) child->eval_wsum_hess(child, node->work->dwork); CSR_Matrix *t2_csr = node->work->hess_term2->to_csr(node->work->hess_term2); - CSR_Matrix *child_hess_csr = child->wsum_hess->to_csr(child->wsum_hess); - memcpy(t2_csr->x, child_hess_csr->x, - child_hess_csr->nnz * sizeof(double)); + memcpy(t2_csr->x, child->wsum_hess->x, + child->wsum_hess->nnz * sizeof(double)); /* wsum_hess = term1 + term2 */ CSR_Matrix *t1_csr = diff --git a/src/atoms/elementwise_full_dom/logistic.c b/src/atoms/elementwise_full_dom/logistic.c index 08e1bb6..303306e 100644 --- a/src/atoms/elementwise_full_dom/logistic.c +++ b/src/atoms/elementwise_full_dom/logistic.c @@ -63,7 +63,7 @@ static void local_wsum_hess(expr *node, double *out, const double *w) if (node->left->var_id != NOT_A_VARIABLE) { - sigmas = node->jacobian->to_csr(node->jacobian)->x; + sigmas = node->jacobian->x; } else { diff --git a/src/atoms/elementwise_restricted_dom/atanh.c b/src/atoms/elementwise_restricted_dom/atanh.c index 69b425b..c2c71c2 100644 --- a/src/atoms/elementwise_restricted_dom/atanh.c +++ b/src/atoms/elementwise_restricted_dom/atanh.c @@ -30,7 +30,7 @@ static void atanh_forward(expr *node, const double *u) static void 
atanh_eval_jacobian(expr *node) { double *x = node->left->value; - double *jx = node->jacobian->to_csr(node->jacobian)->x; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { jx[j] = 1.0 / (1.0 - x[j] * x[j]); @@ -40,10 +40,11 @@ static void atanh_eval_jacobian(expr *node) static void atanh_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { double c = 1.0 - x[j] * x[j]; - node->wsum_hess->to_csr(node->wsum_hess)->x[j] = w[j] * (2.0 * x[j]) / (c * c); + hx[j] = w[j] * (2.0 * x[j]) / (c * c); } } diff --git a/src/atoms/elementwise_restricted_dom/entr.c b/src/atoms/elementwise_restricted_dom/entr.c index 3a23978..c3d29cb 100644 --- a/src/atoms/elementwise_restricted_dom/entr.c +++ b/src/atoms/elementwise_restricted_dom/entr.c @@ -32,7 +32,7 @@ static void entr_forward(expr *node, const double *u) static void entr_eval_jacobian(expr *node) { double *x = node->left->value; - double *jx = node->jacobian->to_csr(node->jacobian)->x; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { jx[j] = -log(x[j]) - 1.0; @@ -42,9 +42,10 @@ static void entr_eval_jacobian(expr *node) static void entr_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { - node->wsum_hess->to_csr(node->wsum_hess)->x[j] = -w[j] / x[j]; + hx[j] = -w[j] / x[j]; } } diff --git a/src/atoms/elementwise_restricted_dom/log.c b/src/atoms/elementwise_restricted_dom/log.c index 9d1a010..d9753e1 100644 --- a/src/atoms/elementwise_restricted_dom/log.c +++ b/src/atoms/elementwise_restricted_dom/log.c @@ -31,7 +31,7 @@ static void log_forward(expr *node, const double *u) static void log_eval_jacobian(expr *node) { double *x = node->left->value; - double *jx = node->jacobian->to_csr(node->jacobian)->x; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { jx[j] = 1.0 
/ x[j]; @@ -41,9 +41,10 @@ static void log_eval_jacobian(expr *node) static void log_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { - node->wsum_hess->to_csr(node->wsum_hess)->x[j] = -w[j] / (x[j] * x[j]); + hx[j] = -w[j] / (x[j] * x[j]); } } diff --git a/src/atoms/elementwise_restricted_dom/tan.c b/src/atoms/elementwise_restricted_dom/tan.c index 087f750..3a4e4dc 100644 --- a/src/atoms/elementwise_restricted_dom/tan.c +++ b/src/atoms/elementwise_restricted_dom/tan.c @@ -30,7 +30,7 @@ static void tan_forward(expr *node, const double *u) static void tan_eval_jacobian(expr *node) { double *x = node->left->value; - double *jx = node->jacobian->to_csr(node->jacobian)->x; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { double c = cos(x[j]); @@ -41,10 +41,11 @@ static void tan_eval_jacobian(expr *node) static void tan_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { double c = cos(x[j]); - node->wsum_hess->to_csr(node->wsum_hess)->x[j] = 2.0 * w[j] * node->value[j] / (c * c); + hx[j] = 2.0 * w[j] * node->value[j] / (c * c); } } diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 33a1b60..8e1477e 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -97,7 +97,7 @@ static void eval_jacobian(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - double *jx = node->jacobian->to_csr(node->jacobian)->x; + double *jx = node->jacobian->x; if (num_of_zeros == 0) { for (int j = 0; j < x->size; j++) @@ -240,11 +240,11 @@ static inline void wsum_hess_no_zeros(expr *node, const double *w) { if (i == j) { - node->wsum_hess->to_csr(node->wsum_hess)->x[i * n + j] = 0.0; + node->wsum_hess->x[i * n + j] = 0.0; } else { - node->wsum_hess->to_csr(node->wsum_hess)->x[i * n + j] = wf / (x[i] * x[j]); + 
node->wsum_hess->x[i * n + j] = wf / (x[i] * x[j]); } } } @@ -253,7 +253,7 @@ static inline void wsum_hess_no_zeros(expr *node, const double *w) static inline void wsum_hess_one_zero(expr *node, const double *w) { expr *x = node->left; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; memset(H, 0, sizeof(double) * (x->size * x->size)); int p = ((prod_expr *) node)->zero_index; double prod_nonzero = ((prod_expr *) node)->prod_nonzero; @@ -274,7 +274,7 @@ static inline void wsum_hess_two_zeros(expr *node, const double *w) { expr *x = node->left; int n = x->size; - memset(node->wsum_hess->to_csr(node->wsum_hess)->x, 0, sizeof(double) * (n * n)); + memset(node->wsum_hess->x, 0, sizeof(double) * (n * n)); /* find indices p and q where x[p] = x[q] = 0 */ int p = -1, q = -1; @@ -296,13 +296,13 @@ static inline void wsum_hess_two_zeros(expr *node, const double *w) assert(p != -1 && q != -1); double hess_val = w[0] * ((prod_expr *) node)->prod_nonzero; - node->wsum_hess->to_csr(node->wsum_hess)->x[p * n + q] = hess_val; - node->wsum_hess->to_csr(node->wsum_hess)->x[q * n + p] = hess_val; + node->wsum_hess->x[p * n + q] = hess_val; + node->wsum_hess->x[q * n + p] = hess_val; } static inline void wsum_hess_many_zeros(expr *node, const double *w) { expr *x = node->left; - memset(node->wsum_hess->to_csr(node->wsum_hess)->x, 0, sizeof(double) * (x->size * x->size)); + memset(node->wsum_hess->x, 0, sizeof(double) * (x->size * x->size)); (void) w; } diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index b7b4dbd..b310d95 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -114,7 +114,7 @@ static void eval_jacobian(expr *node) expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *J_vals = node->jacobian->to_csr(node->jacobian)->x; + double *J_vals = node->jacobian->x; /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) diff --git 
a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index 9c2ab10..61352fb 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -109,7 +109,7 @@ static void eval_jacobian(expr *node) expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *J_vals = node->jacobian->to_csr(node->jacobian)->x; + double *J_vals = node->jacobian->x; /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) @@ -199,7 +199,7 @@ static inline void wsum_hess_column_no_zeros(expr *node, const double *w, int co int d1) { expr *x = node->left; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int col_start = col * d1; int block_start = col * d1 * d1; double scale = w[col] * node->value[col]; @@ -228,7 +228,7 @@ static inline void wsum_hess_column_one_zero(expr *node, const double *w, int co { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int col_start = col * d1; int block_start = col * d1 * d1; @@ -256,7 +256,7 @@ static inline void wsum_hess_column_two_zeros(expr *node, const double *w, int c { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int col_start = col * d1; int block_start = col * d1 * d1; @@ -290,7 +290,7 @@ static inline void wsum_hess_column_two_zeros(expr *node, const double *w, int c static inline void wsum_hess_column_many_zeros(expr *node, const double *w, int col, int d1) { - double *H = node->wsum_hess->to_csr(node->wsum_hess)->x; + double *H = node->wsum_hess->x; int block_start = col * d1 * d1; /* clear this column's block */ diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 122ec25..703d3a8 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -191,9 +191,8 @@ static void eval_wsum_hess(expr *node, 
const double *w) { /* TODO: do we want to compute this hessian only once (up to a scaling)? * Maybe unnecessary optimization. */ - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); - memcpy(H->x, Q->x, Q->nnz * sizeof(double)); - cblas_dscal(Q->nnz, two_w, H->x, 1); + memcpy(node->wsum_hess->x, Q->x, Q->nnz * sizeof(double)); + cblas_dscal(Q->nnz, two_w, node->wsum_hess->x, 1); } else { @@ -218,9 +217,8 @@ static void eval_wsum_hess(expr *node, const double *w) BTDA_fill_values(Jf, QJf, NULL, term1); /* term2 */ - CSR_Matrix *x_hess = x->wsum_hess->to_csr(x->wsum_hess); x->eval_wsum_hess(x, node->work->dwork); - memcpy(term2->x, x_hess->x, x_hess->nnz * sizeof(double)); + memcpy(term2->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); /* scale both terms by 2w */ cblas_dscal(term1->nnz, two_w, term1->x, 1); diff --git a/src/problem.c b/src/problem.c index 0378884..3abd929 100644 --- a/src/problem.c +++ b/src/problem.c @@ -226,14 +226,12 @@ void problem_init_hessian(problem *prob) // Lagrange Hessian structure // ------------------------------------------------------------------------------- wsum_hess_init(prob->objective); - int nnz = - prob->objective->wsum_hess->to_csr(prob->objective->wsum_hess)->nnz; + int nnz = prob->objective->wsum_hess->nnz; for (int i = 0; i < prob->n_constraints; i++) { wsum_hess_init(prob->constraints[i]); - Matrix *c_hess = prob->constraints[i]->wsum_hess; - nnz += c_hess->to_csr(c_hess)->nnz; + nnz += prob->constraints[i]->wsum_hess->nnz; } prob->lagrange_hessian = new_csr_matrix(prob->n_vars, prob->n_vars, nnz); @@ -493,18 +491,16 @@ void problem_jacobian(problem *prob) for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; - CSR_Matrix *Jc = c->jacobian->to_csr(c->jacobian); - if (!first_call && c->is_affine(c)) { /* skip evaluation for affine constraints after first call */ - nnz_offset += Jc->nnz; + nnz_offset += c->jacobian->nnz; continue; } c->eval_jacobian(c); - memcpy(J->x + nnz_offset, 
Jc->x, Jc->nnz * sizeof(double)); - nnz_offset += Jc->nnz; + memcpy(J->x + nnz_offset, c->jacobian->x, c->jacobian->nnz * sizeof(double)); + nnz_offset += c->jacobian->nnz; } /* update actual nnz (may be less than allocated) */ diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c index 5c628c2..5060359 100644 --- a/src/utils/dense_matrix.c +++ b/src/utils/dense_matrix.c @@ -42,12 +42,6 @@ static void dense_block_left_mult_vec(const Matrix *A, const double *x, double * n, 0.0, y, m); } -static void dense_update_values(Matrix *self, const double *new_values) -{ - Dense_Matrix *dm = (Dense_Matrix *) self; - memcpy(dm->x, new_values, dm->base.m * dm->base.n * sizeof(double)); -} - static void dense_free(Matrix *A) { Dense_Matrix *dm = (Dense_Matrix *) A; @@ -61,16 +55,17 @@ Matrix *new_dense_matrix(int m, int n, const double *data) Dense_Matrix *dm = (Dense_Matrix *) SP_CALLOC(1, sizeof(Dense_Matrix)); dm->base.m = m; dm->base.n = n; + dm->base.nnz = m * n; dm->base.block_left_mult_vec = dense_block_left_mult_vec; dm->base.block_left_mult_sparsity = I_kron_A_alloc; dm->base.block_left_mult_values = I_kron_A_fill_values; - dm->base.update_values = dense_update_values; dm->base.free_fn = dense_free; dm->x = (double *) SP_MALLOC(m * n * sizeof(double)); if (data != NULL) { memcpy(dm->x, data, m * n * sizeof(double)); } + dm->base.x = dm->x; dm->work = (double *) SP_MALLOC(n * sizeof(double)); return &dm->base; } diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index b522f7f..ab86c0f 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -41,16 +41,6 @@ static void permuted_dense_refresh_csc_values(Matrix *self) (void) self; } -/* Replace pd->X with new_values (dense_m * dense_n doubles, row-major). Same - layout as the CSR view's value array (see permuted_dense_to_csr_alloc), so - callers that have a CSR view's x can pass it here. 
*/ -static void permuted_dense_vtable_update_values(Matrix *self, - const double *new_values) -{ - Permuted_Dense *pd = (Permuted_Dense *) self; - memcpy(pd->X, new_values, pd->dense_m * pd->dense_n * sizeof(double)); -} - /* Vtable adapters — each delegates to the existing permuted_dense_* kernel. */ static Matrix *permuted_dense_vtable_copy_sparsity(const Matrix *self) { @@ -59,8 +49,8 @@ static Matrix *permuted_dense_vtable_copy_sparsity(const Matrix *self) pd->row_perm, pd->col_perm, NULL); } -static void permuted_dense_vtable_DA_fill_values(const double *d, - const Matrix *self, Matrix *out) +static void permuted_dense_vtable_DA_fill_values(const double *d, const Matrix *self, + Matrix *out) { permuted_dense_DA_fill_values(d, (const Permuted_Dense *) self, (Permuted_Dense *) out); @@ -123,7 +113,7 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, Permuted_Dense *pd = (Permuted_Dense *) SP_CALLOC(1, sizeof(Permuted_Dense)); pd->base.m = m; pd->base.n = n; - pd->base.update_values = permuted_dense_vtable_update_values; + pd->base.nnz = dense_m * dense_n; pd->base.copy_sparsity = permuted_dense_vtable_copy_sparsity; pd->base.DA_fill_values = permuted_dense_vtable_DA_fill_values; pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; @@ -139,6 +129,7 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->row_perm = (int *) SP_MALLOC(dense_m * sizeof(int)); pd->col_perm = (int *) SP_MALLOC(dense_n * sizeof(int)); pd->X = (double *) SP_MALLOC(sz * sizeof(double)); + pd->base.x = pd->X; pd->Y_scratch = (double *) SP_MALLOC(sz * sizeof(double)); pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 6654a3f..53b8e38 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -43,12 +43,6 @@ static void sparse_block_left_mult_values(const Matrix *self, const CSC_Matrix * block_left_multiply_fill_values(sm->csr, J, C); } -static void 
sparse_update_values(Matrix *self, const double *new_values) -{ - Sparse_Matrix *sm = (Sparse_Matrix *) self; - memcpy(sm->csr->x, new_values, sm->csr->nnz * sizeof(double)); -} - static void sparse_free(Matrix *self) { Sparse_Matrix *sm = (Sparse_Matrix *) self; @@ -117,7 +111,6 @@ static void wire_vtable(Sparse_Matrix *sm) sm->base.block_left_mult_vec = sparse_block_left_mult_vec; sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; sm->base.block_left_mult_values = sparse_block_left_mult_values; - sm->base.update_values = sparse_update_values; sm->base.copy_sparsity = sparse_copy_sparsity; sm->base.DA_fill_values = sparse_DA_fill_values; sm->base.ATA_alloc = sparse_ATA_alloc; @@ -132,6 +125,8 @@ Matrix *new_sparse_matrix(CSR_Matrix *A) Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); sm->base.m = A->m; sm->base.n = A->n; + sm->base.nnz = A->nnz; + sm->base.x = A->x; wire_vtable(sm); sm->csr = A; return &sm->base; @@ -143,6 +138,8 @@ Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork) Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); sm->base.m = AT->m; sm->base.n = AT->n; + sm->base.nnz = AT->nnz; + sm->base.x = AT->x; wire_vtable(sm); sm->csr = AT; return &sm->base; diff --git a/tests/jacobian_tests/affine/test_broadcast.h b/tests/jacobian_tests/affine/test_broadcast.h index 4ed1b40..2a5d491 100644 --- a/tests/jacobian_tests/affine/test_broadcast.h +++ b/tests/jacobian_tests/affine/test_broadcast.h @@ -38,7 +38,7 @@ const char *test_broadcast_row_jacobian(void) int expected_i[6] = {0, 0, 1, 1, 2, 2}; mu_assert("broadcast row jacobian vals fail", - cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->x, expected_x, 6)); mu_assert("broadcast row jacobian rows fail", cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 4)); mu_assert("broadcast row jacobian cols fail", @@ -83,7 +83,7 @@ const char 
*test_broadcast_col_jacobian(void) int expected_i[6] = {0, 1, 2, 0, 1, 2}; mu_assert("broadcast col jacobian vals fail", - cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->x, expected_x, 6)); mu_assert("broadcast col jacobian rows fail", cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); mu_assert("broadcast col jacobian cols fail", @@ -124,7 +124,7 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) int expected_i[6] = {0, 0, 0, 0, 0, 0}; mu_assert("broadcast scalar jacobian vals fail", - cmp_double_array(bcast->jacobian->to_csr(bcast->jacobian)->x, expected_x, 6)); + cmp_double_array(bcast->jacobian->x, expected_x, 6)); mu_assert("broadcast scalar jacobian rows fail", cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); mu_assert("broadcast scalar jacobian cols fail", @@ -156,7 +156,7 @@ const char *test_double_broadcast(void) // int expected_i[6] = {0, 0, 0, 0, 0, 0}; // // mu_assert("broadcast scalar jacobian vals fail", - // cmp_double_array(sum->jacobian->to_csr(sum->jacobian)->x, expected_x, 6)); + // cmp_double_array(sum->jacobian->x, expected_x, 6)); // mu_assert("broadcast scalar jacobian rows fail", // cmp_int_array(sum ->jacobian->p, expected_p, 7)); // mu_assert("broadcast scalar jacobian cols fail", diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h index ddb7f7a..a7a4ef8 100644 --- a/tests/jacobian_tests/affine/test_convolve.h +++ b/tests/jacobian_tests/affine/test_convolve.h @@ -42,7 +42,7 @@ const char *test_jacobian_convolve(void) mu_assert("Convolve Jacobian column indices incorrect", cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 9)); mu_assert("Convolve Jacobian values incorrect", - cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 9)); + cmp_double_array(y->jacobian->x, expected_x, 9)); free_expr(y); return 0; diff --git 
a/tests/jacobian_tests/affine/test_diag_mat.h b/tests/jacobian_tests/affine/test_diag_mat.h index 2aec818..49a8de9 100644 --- a/tests/jacobian_tests/affine/test_diag_mat.h +++ b/tests/jacobian_tests/affine/test_diag_mat.h @@ -23,7 +23,7 @@ const char *test_diag_mat_jacobian_variable(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->to_csr(dm->jacobian)->x, expected_x, 2)); + mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->x, expected_x, 2)); mu_assert("diag_mat jac p", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->p, expected_p, 3)); mu_assert("diag_mat jac i", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); @@ -52,7 +52,7 @@ const char *test_diag_mat_jacobian_of_log(void) int expected_i[2] = {0, 3}; mu_assert("diag_mat log jac vals", - cmp_double_array(dm->jacobian->to_csr(dm->jacobian)->x, expected_x, 2)); + cmp_double_array(dm->jacobian->x, expected_x, 2)); mu_assert("diag_mat log jac cols", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); diff --git a/tests/jacobian_tests/affine/test_hstack.h b/tests/jacobian_tests/affine/test_hstack.h index ac7becf..1cc0b21 100644 --- a/tests/jacobian_tests/affine/test_hstack.h +++ b/tests/jacobian_tests/affine/test_hstack.h @@ -42,7 +42,7 @@ const char *test_jacobian_hstack_vectors(void) int expected_Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; int expected_Ap[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_Ax, 9)); + mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 9)); mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 9)); mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 10)); @@ -86,7 +86,7 @@ const char *test_jacobian_hstack_matrix(void) int expected_Ap[19] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15, 16, 17, 18}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_Ax, 18)); + mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 18)); mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 18)); mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 19)); diff --git a/tests/jacobian_tests/affine/test_index.h b/tests/jacobian_tests/affine/test_index.h index 16474a4..c783448 100644 --- a/tests/jacobian_tests/affine/test_index.h +++ b/tests/jacobian_tests/affine/test_index.h @@ -56,7 +56,7 @@ const char *test_index_jacobian_of_variable(void) int expected_p[3] = {0, 1, 2}; /* CSR row ptrs */ int expected_i[2] = {0, 2}; /* column indices */ - mu_assert("index jac vals", cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); + mu_assert("index jac vals", cmp_double_array(idx->jacobian->x, expected_x, 2)); mu_assert("index jac p", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); mu_assert("index jac i", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); @@ -83,7 +83,7 @@ const char *test_index_jacobian_of_log(void) int expected_i[2] = {0, 2}; mu_assert("index of log jac vals", - cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); + cmp_double_array(idx->jacobian->x, expected_x, 2)); mu_assert("index of log jac cols", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); @@ -108,7 +108,7 @@ const char *test_index_jacobian_repeated(void) int expected_i[2] = {0, 0}; /* Both reference col 0 */ mu_assert("index repeated jac vals", - cmp_double_array(idx->jacobian->to_csr(idx->jacobian)->x, expected_x, 2)); + cmp_double_array(idx->jacobian->x, expected_x, 2)); mu_assert("index repeated row ptr", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); mu_assert("index repeated jac i", @@ -137,7 
+137,7 @@ const char *test_sum_of_index(void) double expected_x[2] = {1.0, 1.0}; int expected_i[2] = {0, 2}; - mu_assert("sum of index vals", cmp_double_array(s->jacobian->to_csr(s->jacobian)->x, expected_x, 2)); + mu_assert("sum of index vals", cmp_double_array(s->jacobian->x, expected_x, 2)); mu_assert("sum of index cols", cmp_int_array(s->jacobian->to_csr(s->jacobian)->i, expected_i, 2)); free_expr(s); diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index bb4e31f..1fb7bff 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -61,7 +61,7 @@ const char *test_jacobian_left_matmul_log(void) int expected_Ai[7] = {0, 2, 0, 2, 0, 2, 0}; int expected_Ap[5] = {0, 2, 4, 6, 7}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->x, expected_Ax, 7)); + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 7)); mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, expected_Ai, 7)); mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 5)); @@ -100,7 +100,7 @@ const char *test_jacobian_left_matmul_log_matrix(void) int expected_Ai[14] = {0, 2, 0, 2, 0, 2, 0, 3, 5, 3, 5, 3, 5, 3}; int expected_Ap[9] = {0, 2, 4, 6, 7, 9, 11, 13, 14}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->x, expected_Ax, 14)); + mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 14)); mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, expected_Ai, 14)); mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 9)); diff --git a/tests/jacobian_tests/affine/test_neg.h b/tests/jacobian_tests/affine/test_neg.h index a928d8d..3190bec 100644 --- a/tests/jacobian_tests/affine/test_neg.h +++ 
b/tests/jacobian_tests/affine/test_neg.h @@ -20,7 +20,7 @@ const char *test_neg_jacobian(void) int expected_i[3] = {0, 1, 2}; mu_assert("neg jacobian vals fail", - cmp_double_array(neg_node->jacobian->to_csr(neg_node->jacobian)->x, expected_x, 3)); + cmp_double_array(neg_node->jacobian->x, expected_x, 3)); mu_assert("neg jacobian rows fail", cmp_int_array(neg_node->jacobian->to_csr(neg_node->jacobian)->p, expected_p, 4)); mu_assert("neg jacobian cols fail", @@ -51,7 +51,7 @@ const char *test_neg_chain(void) int expected_i[3] = {0, 1, 2}; mu_assert("neg chain jacobian vals fail", - cmp_double_array(neg2->jacobian->to_csr(neg2->jacobian)->x, expected_x, 3)); + cmp_double_array(neg2->jacobian->x, expected_x, 3)); mu_assert("neg chain jacobian rows fail", cmp_int_array(neg2->jacobian->to_csr(neg2->jacobian)->p, expected_p, 4)); mu_assert("neg chain jacobian cols fail", diff --git a/tests/jacobian_tests/affine/test_promote.h b/tests/jacobian_tests/affine/test_promote.h index b6c304b..75f68fd 100644 --- a/tests/jacobian_tests/affine/test_promote.h +++ b/tests/jacobian_tests/affine/test_promote.h @@ -23,7 +23,7 @@ const char *test_promote_scalar_jacobian(void) int expected_i[3] = {0, 0, 0}; mu_assert("promote jacobian vals fail", - cmp_double_array(promote_node->jacobian->to_csr(promote_node->jacobian)->x, expected_x, 3)); + cmp_double_array(promote_node->jacobian->x, expected_x, 3)); mu_assert("promote jacobian rows fail", cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 4)); mu_assert("promote jacobian cols fail", @@ -55,7 +55,7 @@ const char *test_promote_scalar_to_matrix_jacobian(void) int expected_i[6] = {0, 0, 0, 0, 0, 0}; mu_assert("promote matrix jacobian vals fail", - cmp_double_array(promote_node->jacobian->to_csr(promote_node->jacobian)->x, expected_x, 6)); + cmp_double_array(promote_node->jacobian->x, expected_x, 6)); mu_assert("promote matrix jacobian rows fail", 
cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 7)); mu_assert("promote matrix jacobian cols fail", diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index 056e844..ad74d60 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -47,7 +47,7 @@ const char *test_jacobian_right_matmul_log(void) int expected_Ai[8] = {0, 2, 1, 3, 0, 2, 1, 3}; int expected_Ap[7] = {0, 2, 4, 4, 4, 6, 8}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->x, expected_Ax, 8)); + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 8)); mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 8)); mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 7)); @@ -92,7 +92,7 @@ const char *test_jacobian_right_matmul_log_vector(void) int expected_Ai[4] = {0, 1, 1, 2}; int expected_Ap[3] = {0, 2, 4}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 4)); mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 4)); mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 3)); diff --git a/tests/jacobian_tests/affine/test_scalar_mult.h b/tests/jacobian_tests/affine/test_scalar_mult.h index 66bdaaf..3eda315 100644 --- a/tests/jacobian_tests/affine/test_scalar_mult.h +++ b/tests/jacobian_tests/affine/test_scalar_mult.h @@ -37,7 +37,7 @@ const char *test_jacobian_scalar_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("scalar mult log: x values fail", - cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 3)); + cmp_double_array(y->jacobian->x, expected_x, 
3)); mu_assert("scalar mult log: row pointers fail", cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); mu_assert("scalar mult log: column indices fail", @@ -74,7 +74,7 @@ const char *test_jacobian_scalar_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("scalar mult log matrix: x values fail", - cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 4)); + cmp_double_array(y->jacobian->x, expected_x, 4)); mu_assert("scalar mult log matrix: row pointers fail", cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); mu_assert("scalar mult log matrix: column indices fail", diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h index b00695b..026d3bf 100644 --- a/tests/jacobian_tests/affine/test_sum.h +++ b/tests/jacobian_tests/affine/test_sum.h @@ -25,7 +25,7 @@ const char *test_jacobian_sum_log(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 3)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 3)); mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 3)); @@ -59,7 +59,7 @@ const char *test_jacobian_sum_mult(void) int expected_Ap[2] = {0, 6}; /* 1x10 matrix: row 0 spans all 6 nonzeros */ int expected_Ai[6] = {2, 3, 4, 6, 7, 8}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); @@ -99,7 +99,7 @@ const 
char *test_jacobian_sum_log_axis_0(void) int expected_Ap[3] = {0, 3, 6}; int expected_Ai[6] = {2, 3, 4, 5, 6, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); @@ -146,7 +146,7 @@ const char *test_jacobian_sum_add_log_axis_0(void) int expected_Ai[12] = {2, 3, 4, 8, 9, 10, /* row 0 columns */ 5, 6, 7, 11, 12, 13}; /* row 1 columns */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 12)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 12)); mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 12)); @@ -188,7 +188,7 @@ const char *test_jacobian_sum_log_axis_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 5, 3, 6, 4, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->to_csr(sum_node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); diff --git a/tests/jacobian_tests/affine/test_trace.h b/tests/jacobian_tests/affine/test_trace.h index 39f9fbf..0caa122 100644 --- a/tests/jacobian_tests/affine/test_trace.h +++ b/tests/jacobian_tests/affine/test_trace.h @@ -40,7 +40,7 @@ const char *test_jacobian_trace_variable(void) int expected_Ai[3] 
= {1, 5, 9}; /* column indices (global variable indices) */ mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->to_csr(trace_node->jacobian)->x, expected_Ax, 3)); + cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); @@ -97,7 +97,7 @@ const char *test_jacobian_trace_composite(void) mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->to_csr(trace_node->jacobian)->x, expected_Ax, 3)); + cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); free_expr(trace_node); return 0; diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index c85f50a..d2201da 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -34,7 +34,7 @@ const char *test_jacobian_transpose(void) int expected_i[8] = {0, 1, 2, 3, 0, 1, 2, 3}; mu_assert("jacobian values fail", - cmp_double_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->x, expected_x, 8)); + cmp_double_array(transpose_AX->jacobian->x, expected_x, 8)); mu_assert("jacobian row ptr fail", cmp_int_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->p, expected_p, 5)); mu_assert("jacobian col idx fail", diff --git a/tests/jacobian_tests/affine/test_upper_tri.h b/tests/jacobian_tests/affine/test_upper_tri.h index 125780f..0a01a30 100644 --- a/tests/jacobian_tests/affine/test_upper_tri.h +++ b/tests/jacobian_tests/affine/test_upper_tri.h @@ -28,7 +28,7 @@ const char *test_upper_tri_jacobian_variable(void) int expected_i[6] = {4, 8, 12, 9, 13, 14}; 
mu_assert("upper_tri jac vals", - cmp_double_array(ut->jacobian->to_csr(ut->jacobian)->x, expected_x, 6)); + cmp_double_array(ut->jacobian->x, expected_x, 6)); mu_assert("upper_tri jac p", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->p, expected_p, 7)); mu_assert("upper_tri jac i", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); @@ -60,7 +60,7 @@ const char *test_upper_tri_jacobian_of_log(void) int expected_i[6] = {4, 8, 12, 9, 13, 14}; mu_assert("upper_tri log jac vals", - cmp_double_array(ut->jacobian->to_csr(ut->jacobian)->x, expected_x, 6)); + cmp_double_array(ut->jacobian->x, expected_x, 6)); mu_assert("upper_tri log jac cols", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); diff --git a/tests/jacobian_tests/affine/test_vector_mult.h b/tests/jacobian_tests/affine/test_vector_mult.h index 8304e91..a686c54 100644 --- a/tests/jacobian_tests/affine/test_vector_mult.h +++ b/tests/jacobian_tests/affine/test_vector_mult.h @@ -36,7 +36,7 @@ const char *test_jacobian_vector_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("vector mult log: x values fail", - cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 3)); + cmp_double_array(y->jacobian->x, expected_x, 3)); mu_assert("vector mult log: row pointers fail", cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); mu_assert("vector mult log: column indices fail", @@ -72,7 +72,7 @@ const char *test_jacobian_vector_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("vector mult log matrix: x values fail", - cmp_double_array(y->jacobian->to_csr(y->jacobian)->x, expected_x, 4)); + cmp_double_array(y->jacobian->x, expected_x, 4)); mu_assert("vector mult log matrix: row pointers fail", cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); mu_assert("vector mult log matrix: column indices fail", diff --git a/tests/jacobian_tests/affine/test_vstack.h b/tests/jacobian_tests/affine/test_vstack.h index 
b4115b6..2f6b2e3 100644 --- a/tests/jacobian_tests/affine/test_vstack.h +++ b/tests/jacobian_tests/affine/test_vstack.h @@ -39,7 +39,7 @@ const char *test_jacobian_vstack_vectors(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; mu_assert("vstack jac vectors: vals", - cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_x, 6)); + cmp_double_array(stack->jacobian->x, expected_x, 6)); mu_assert("vstack jac vectors: cols", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 6)); mu_assert("vstack jac vectors: rows", @@ -89,7 +89,7 @@ const char *test_jacobian_vstack_matrix(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; mu_assert("vstack jac matrix: vals", - cmp_double_array(stack->jacobian->to_csr(stack->jacobian)->x, expected_x, 9)); + cmp_double_array(stack->jacobian->x, expected_x, 9)); mu_assert("vstack jac matrix: cols", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 9)); mu_assert("vstack jac matrix: rows", diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index bd3bd3c..bdf3bd9 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -24,7 +24,7 @@ const char *test_jacobian_elementwise_mult_1(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); free_expr(node); @@ -49,7 +49,7 @@ const char *test_jacobian_elementwise_mult_2(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", 
cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); free_expr(node); @@ -112,7 +112,7 @@ const char *test_jacobian_elementwise_mult_3(void) int rows[4] = {0, 4, 10, 16}; int cols[16] = {2, 3, 7, 8, 2, 3, 4, 7, 8, 9, 2, 3, 4, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 16)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 16)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 16)); free_expr(node); @@ -157,7 +157,7 @@ const char *test_jacobian_elementwise_mult_4(void) int rows[4] = {0, 2, 5, 8}; int cols[8] = {2, 3, 2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, vals, 8)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 8)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 8)); free_expr(node); diff --git a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h index 45745b8..e825918 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h @@ -82,7 +82,7 @@ const char *test_jacobian_matmul(void) /* row 7 (col 3) */ 16.0, 17.0, 18.0, 2.0, 4.0, 6.0}; mu_assert("Jacobian values incorrect", - cmp_double_array(Z->jacobian->to_csr(Z->jacobian)->x, expected_x, 48)); + cmp_double_array(Z->jacobian->x, expected_x, 48)); free_expr(Z); return 0; diff --git 
a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index 451827b..05eb5b2 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -25,7 +25,7 @@ const char *test_quad_over_lin1(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_expr(node); @@ -49,7 +49,7 @@ const char *test_quad_over_lin2(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); free_expr(node); @@ -86,7 +86,7 @@ const char *test_quad_over_lin3(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); @@ -126,7 +126,7 @@ const char *test_quad_over_lin4(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", 
cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 4)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); @@ -166,7 +166,7 @@ const char *test_quad_over_lin5(void) int expected_Ap[2] = {0, 7}; int expected_Ai[7] = {0, 1, 2, 3, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_vals, 7)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 7)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 7)); diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h index 596c75a..e432d86 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h @@ -30,7 +30,7 @@ const char *test_jacobian_rel_entr_vector_args_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); @@ -61,7 +61,7 @@ const char *test_jacobian_rel_entr_vector_args_2(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", 
cmp_double_array(node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); @@ -103,7 +103,7 @@ const char *test_jacobian_rel_entr_matrix_args(void) int expected_Ap[7] = {0, 2, 4, 6, 8, 10, 12}; int expected_Ai[12] = {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 12)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 12)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 7)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 12)); diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index d716046..2d54b3d 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -28,7 +28,7 @@ const char *test_jacobian_rel_entr_scalar_vector(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 1, 0, 2, 0, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index f4c77eb..4e88988 100644 --- 
a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -28,7 +28,7 @@ const char *test_jacobian_rel_entr_vector_scalar(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 3, 1, 3, 2, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); free_expr(node); diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index 2862550..ad38123 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -33,7 +33,7 @@ const char *test_jacobian_composite_exp(void) double vals[6] = {3 * e10, 2 * e10, 1 * e10, 2 * e7, 1 * e7, 1 * e7}; int rows[3] = {0, 3, 6}; int cols[6] = {2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(exp_node->jacobian->to_csr(exp_node->jacobian)->x, vals, 6)); + mu_assert("vals fail", cmp_double_array(exp_node->jacobian->x, vals, 6)); mu_assert("rows fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->p, rows, 3)); mu_assert("cols fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->i, cols, 6)); free_expr(exp_node); diff --git a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h index b6a07c2..d5b55c2 100644 --- a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h +++ b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h @@ -18,7 +18,7 @@ const char *test_jacobian_log(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); 
log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->to_csr(log_node->jacobian)->x, expected_Ax, 3)); + mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 3)); mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 3)); free_expr(log_node); @@ -36,7 +36,7 @@ const char *test_jacobian_log_matrix(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->to_csr(log_node->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 5)); mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 4)); free_expr(log_node); diff --git a/tests/jacobian_tests/other/test_prod.h b/tests/jacobian_tests/other/test_prod.h index ea6aef3..0603ec4 100644 --- a/tests/jacobian_tests/other/test_prod.h +++ b/tests/jacobian_tests/other/test_prod.h @@ -25,7 +25,7 @@ const char *test_jacobian_prod_no_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); @@ -50,7 +50,7 @@ const char *test_jacobian_prod_one_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", 
cmp_double_array(p->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); @@ -73,7 +73,7 @@ const char *test_jacobian_prod_two_zeros(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 4)); + mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); diff --git a/tests/jacobian_tests/other/test_prod_axis_one.h b/tests/jacobian_tests/other/test_prod_axis_one.h index 1fb1c53..8814cac 100644 --- a/tests/jacobian_tests/other/test_prod_axis_one.h +++ b/tests/jacobian_tests/other/test_prod_axis_one.h @@ -42,7 +42,7 @@ const char *test_jacobian_prod_axis_one(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 9)); + mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); @@ -85,7 +85,7 @@ const char *test_jacobian_prod_axis_one_one_zero(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 9)); + mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); diff 
--git a/tests/jacobian_tests/other/test_prod_axis_zero.h b/tests/jacobian_tests/other/test_prod_axis_zero.h index 7c7957d..5d80fb8 100644 --- a/tests/jacobian_tests/other/test_prod_axis_zero.h +++ b/tests/jacobian_tests/other/test_prod_axis_zero.h @@ -36,7 +36,7 @@ const char *test_jacobian_prod_axis_zero(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {1, 2, 3, 4, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->jacobian->to_csr(p->jacobian)->x, expected_Ax, 6)); + mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 6)); mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 6)); diff --git a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index 8864b41..c989987 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -31,7 +31,7 @@ const char *test_quad_form(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 3)); + mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 3)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 3)); free_expr(node); @@ -77,7 +77,7 @@ double expected_Ax[5] = {422, 2222, 3244, 3786, 120}; int expected_Ap[2] = {0, 5}; int expected_Ai[5] = {0, 2, 3, 4, 5}; -mu_assert("vals fail", cmp_double_array(node->jacobian->to_csr(node->jacobian)->x, expected_Ax, 5)); +mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 5)); mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 
5)); free_expr(node); diff --git a/tests/wsum_hess/affine/test_broadcast.h b/tests/wsum_hess/affine/test_broadcast.h index 97d3208..ba4de3b 100644 --- a/tests/wsum_hess/affine/test_broadcast.h +++ b/tests/wsum_hess/affine/test_broadcast.h @@ -57,7 +57,7 @@ const char *test_wsum_hess_broadcast_row(void) int expected_i[3] = {0, 1, 2}; mu_assert("broadcast row wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 3)); + cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); mu_assert("broadcast row wsum_hess: row pointers fail", cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); mu_assert("broadcast row wsum_hess: column indices fail", @@ -112,7 +112,7 @@ const char *test_wsum_hess_broadcast_col(void) int expected_i[3] = {0, 1, 2}; mu_assert("broadcast col wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 3)); + cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); mu_assert("broadcast col wsum_hess: row pointers fail", cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); mu_assert("broadcast col wsum_hess: column indices fail", @@ -160,7 +160,7 @@ const char *test_wsum_hess_broadcast_scalar_to_matrix(void) int expected_i[1] = {0}; mu_assert("broadcast scalar wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->x, expected_x, 1)); + cmp_double_array(bcast->wsum_hess->x, expected_x, 1)); mu_assert("broadcast scalar wsum_hess: row pointers fail", cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 2)); mu_assert("broadcast scalar wsum_hess: column indices fail", diff --git a/tests/wsum_hess/affine/test_diag_mat.h b/tests/wsum_hess/affine/test_diag_mat.h index 6a57355..736f129 100644 --- a/tests/wsum_hess/affine/test_diag_mat.h +++ b/tests/wsum_hess/affine/test_diag_mat.h @@ -37,7 +37,7 @@ const char *test_wsum_hess_diag_mat_log(void) int expected_i[4] = {0, 
1, 2, 3}; mu_assert("diag_mat log hess vals", - cmp_double_array(dm->wsum_hess->to_csr(dm->wsum_hess)->x, expected_x, 4)); + cmp_double_array(dm->wsum_hess->x, expected_x, 4)); mu_assert("diag_mat log hess p", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->p, expected_p, 5)); mu_assert("diag_mat log hess i", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->i, expected_i, 4)); diff --git a/tests/wsum_hess/affine/test_hstack.h b/tests/wsum_hess/affine/test_hstack.h index 7f7d8ab..8cb1f73 100644 --- a/tests/wsum_hess/affine/test_hstack.h +++ b/tests/wsum_hess/affine/test_hstack.h @@ -91,7 +91,7 @@ const char *test_wsum_hess_hstack(void) int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->x, expected_x, 9)); + cmp_double_array(hstack_node->wsum_hess->x, expected_x, 9)); mu_assert("rows incorrect", cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 10)); mu_assert("cols incorrect", @@ -201,7 +201,7 @@ const char *test_wsum_hess_hstack_matrix(void) 9, 10, 11, 12, 13, 14, 15, 16, 17}; mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->x, expected_x, 18)); + cmp_double_array(hstack_node->wsum_hess->x, expected_x, 18)); mu_assert("rows incorrect", cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 19)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/affine/test_index.h b/tests/wsum_hess/affine/test_index.h index b0a7bdc..3bf390d 100644 --- a/tests/wsum_hess/affine/test_index.h +++ b/tests/wsum_hess/affine/test_index.h @@ -38,7 +38,7 @@ const char *test_wsum_hess_index_log(void) int expected_i[3] = {0, 1, 2}; mu_assert("index log hess vals", - cmp_double_array(idx->wsum_hess->to_csr(idx->wsum_hess)->x, expected_x, 3)); + cmp_double_array(idx->wsum_hess->x, expected_x, 3)); mu_assert("index log hess p", 
cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); mu_assert("index log hess i", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->i, expected_i, 3)); @@ -71,7 +71,7 @@ const char *test_wsum_hess_index_repeated(void) int expected_i[3] = {0, 1, 2}; mu_assert("index repeated hess vals", - cmp_double_array(idx->wsum_hess->to_csr(idx->wsum_hess)->x, expected_x, 3)); + cmp_double_array(idx->wsum_hess->x, expected_x, 3)); mu_assert("index repeated hess p", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); mu_assert("index repeated hess i", @@ -110,7 +110,7 @@ const char *test_wsum_hess_sum_index_log(void) int expected_i[3] = {0, 1, 2}; mu_assert("sum index log hess vals", - cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(sum_node->wsum_hess->x, expected_x, 3)); mu_assert("sum index log hess p", cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 4)); mu_assert("sum index log hess i", diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index e95dc33..3ccd99c 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -81,7 +81,7 @@ const char *test_wsum_hess_left_matmul(void) int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->x, expected_x, 3)); + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 3)); mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 4)); @@ -190,7 +190,7 @@ const char *test_wsum_hess_left_matmul_matrix(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - 
cmp_double_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->x, expected_x, 6)); + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 6)); mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 7)); diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index b1adced..cc89881 100644 --- a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -51,7 +51,7 @@ const char *test_wsum_hess_right_matmul(void) int expected_p[5] = {0, 1, 2, 3, 4}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->x, expected_x, 4)); + cmp_double_array(log_x_A->wsum_hess->x, expected_x, 4)); mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 4)); mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 5)); @@ -100,7 +100,7 @@ const char *test_wsum_hess_right_matmul_vector(void) int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->x, expected_x, 3)); + cmp_double_array(log_x_A->wsum_hess->x, expected_x, 3)); mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 3)); mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 4)); diff --git a/tests/wsum_hess/affine/test_scalar_mult.h b/tests/wsum_hess/affine/test_scalar_mult.h index 32dec5d..88c4709 100644 --- a/tests/wsum_hess/affine/test_scalar_mult.h +++ b/tests/wsum_hess/affine/test_scalar_mult.h @@ -39,7 +39,7 @@ const char *test_wsum_hess_scalar_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; 
mu_assert("scalar mult log hess: x values fail", - cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 3)); + cmp_double_array(y->wsum_hess->x, expected_x, 3)); mu_assert("scalar mult log hess: row pointers fail", cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); mu_assert("scalar mult log hess: column indices fail", @@ -77,7 +77,7 @@ const char *test_wsum_hess_scalar_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("scalar mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 4)); + cmp_double_array(y->wsum_hess->x, expected_x, 4)); mu_assert("scalar mult log hess matrix: row pointers fail", cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); mu_assert("scalar mult log hess matrix: column indices fail", diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index c2bd984..9767c86 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -64,7 +64,7 @@ const char *test_wsum_hess_sum_log_axis0(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 6)); + cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); mu_assert("rows incorrect", cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); mu_assert("cols incorrect", @@ -102,7 +102,7 @@ const char *test_wsum_hess_sum_log_axis1(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->x, expected_x, 6)); + cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); mu_assert("rows incorrect", cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/affine/test_trace.h b/tests/wsum_hess/affine/test_trace.h index 9e209b6..a37d558 100644 --- 
a/tests/wsum_hess/affine/test_trace.h +++ b/tests/wsum_hess/affine/test_trace.h @@ -138,7 +138,7 @@ const char *test_wsum_hess_trace_composite(void) mu_assert("nnz wrong", trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->nnz == 9); mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->p, expected_Ap, 14)); mu_assert("vals match", - cmp_double_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->x, expected_Ax, 9)); + cmp_double_array(trace_node->wsum_hess->x, expected_Ax, 9)); mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->i, expected_Ai, 9)); free_expr(trace_node); return 0; diff --git a/tests/wsum_hess/affine/test_transpose.h b/tests/wsum_hess/affine/test_transpose.h index 7d61b41..0911112 100644 --- a/tests/wsum_hess/affine/test_transpose.h +++ b/tests/wsum_hess/affine/test_transpose.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_transpose(void) int expected_i[16] = {4, 6, 4, 6, 5, 7, 5, 7, 0, 1, 2, 3, 0, 1, 2, 3}; mu_assert("hess values fail", - cmp_double_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->x, expected_x, 8)); + cmp_double_array(XYT->wsum_hess->x, expected_x, 8)); mu_assert("jacobian row ptr fail", cmp_int_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->p, expected_p, 5)); mu_assert("jacobian col idx fail", diff --git a/tests/wsum_hess/affine/test_upper_tri.h b/tests/wsum_hess/affine/test_upper_tri.h index 7cfe951..3c874ba 100644 --- a/tests/wsum_hess/affine/test_upper_tri.h +++ b/tests/wsum_hess/affine/test_upper_tri.h @@ -50,7 +50,7 @@ const char *test_wsum_hess_upper_tri_log(void) int expected_i[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; mu_assert("upper_tri log hess vals", - cmp_double_array(ut->wsum_hess->to_csr(ut->wsum_hess)->x, expected_x, 16)); + cmp_double_array(ut->wsum_hess->x, expected_x, 16)); mu_assert("upper_tri log hess p", cmp_int_array(ut->wsum_hess->to_csr(ut->wsum_hess)->p, expected_p, 17)); mu_assert("upper_tri log hess 
i", diff --git a/tests/wsum_hess/affine/test_vector_mult.h b/tests/wsum_hess/affine/test_vector_mult.h index d8f384e..e81371f 100644 --- a/tests/wsum_hess/affine/test_vector_mult.h +++ b/tests/wsum_hess/affine/test_vector_mult.h @@ -39,7 +39,7 @@ const char *test_wsum_hess_vector_mult_log_vector(void) int expected_i[3] = {0, 1, 2}; mu_assert("vector mult log hess: x values fail", - cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 3)); + cmp_double_array(y->wsum_hess->x, expected_x, 3)); mu_assert("vector mult log hess: row pointers fail", cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); mu_assert("vector mult log hess: column indices fail", @@ -77,7 +77,7 @@ const char *test_wsum_hess_vector_mult_log_matrix(void) int expected_i[4] = {0, 1, 2, 3}; mu_assert("vector mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->to_csr(y->wsum_hess)->x, expected_x, 4)); + cmp_double_array(y->wsum_hess->x, expected_x, 4)); mu_assert("vector mult log hess matrix: row pointers fail", cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); mu_assert("vector mult log hess matrix: column indices fail", diff --git a/tests/wsum_hess/affine/test_vstack.h b/tests/wsum_hess/affine/test_vstack.h index 97faa17..d98323e 100644 --- a/tests/wsum_hess/affine/test_vstack.h +++ b/tests/wsum_hess/affine/test_vstack.h @@ -40,7 +40,7 @@ const char *test_wsum_hess_vstack_vectors(void) int expected_i[3] = {0, 1, 2}; mu_assert("vstack hess vectors: vals", - cmp_double_array(stack->wsum_hess->to_csr(stack->wsum_hess)->x, expected_x, 3)); + cmp_double_array(stack->wsum_hess->x, expected_x, 3)); mu_assert("vstack hess vectors: rows", cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 4)); mu_assert("vstack hess vectors: cols", @@ -103,7 +103,7 @@ const char *test_wsum_hess_vstack_matrix(void) int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; mu_assert("vstack hess matrix: vals", - 
cmp_double_array(stack->wsum_hess->to_csr(stack->wsum_hess)->x, expected_x, 9)); + cmp_double_array(stack->wsum_hess->x, expected_x, 9)); mu_assert("vstack hess matrix: rows", cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 10)); mu_assert("vstack hess matrix: cols", diff --git a/tests/wsum_hess/bivariate_full_dom/test_matmul.h b/tests/wsum_hess/bivariate_full_dom/test_matmul.h index 512823a..357551b 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_matmul.h +++ b/tests/wsum_hess/bivariate_full_dom/test_matmul.h @@ -101,7 +101,7 @@ const char *test_wsum_hess_matmul(void) 7.0, 8.0}; /* row 17 */ mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->to_csr(Z->wsum_hess)->x, expected_x, 48)); + cmp_double_array(Z->wsum_hess->x, expected_x, 48)); free_expr(Z); return 0; @@ -212,7 +212,7 @@ const char *test_wsum_hess_matmul_yx(void) 2.0, 4.0, 6.0, 8.0}; /* row 17 */ mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->to_csr(Z->wsum_hess)->x, expected_x, 48)); + cmp_double_array(Z->wsum_hess->x, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index 4b9e428..414755e 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -32,7 +32,7 @@ const char *test_wsum_hess_multiply_1(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 6)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); free_expr(node); return 0; @@ -101,7 +101,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) mu_assert("i array fails", 
cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 29)); mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->x, expected_x, 29)); + cmp_double_array(mult_node->wsum_hess->x, expected_x, 29)); /* Cleanup */ free_expr(mult_node); @@ -184,7 +184,7 @@ const char *test_wsum_hess_multiply_linear_ops(void) mu_assert("i array fails", cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 7)); mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->x, expected_x, 7)); + cmp_double_array(mult_node->wsum_hess->x, expected_x, 7)); /* Cleanup */ free_expr(mult_node); @@ -221,7 +221,7 @@ const char *test_wsum_hess_multiply_2(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 6)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h index eab59a3..37b7e55 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_quad_over_lin_xy(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); free_expr(node); return 0; @@ 
-58,7 +58,7 @@ const char *test_wsum_hess_quad_over_lin_yx(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h index ae5dbd5..b1a907c 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h @@ -32,7 +32,7 @@ const char *test_wsum_hess_rel_entr_1(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 12)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); free_expr(node); return 0; @@ -64,7 +64,7 @@ const char *test_wsum_hess_rel_entr_2(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 12)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); free_expr(node); return 0; @@ -100,7 +100,7 @@ const char *test_wsum_hess_rel_entr_matrix(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); mu_assert("i array fails", 
cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 24)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 24)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 24)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index 493830f..8a07632 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -27,7 +27,7 @@ const char *test_wsum_hess_rel_entr_scalar_vector(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 06ba77e..ced2093 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -27,7 +27,7 @@ const char *test_wsum_hess_rel_entr_vector_scalar(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); free_expr(node); return 0; diff --git 
a/tests/wsum_hess/elementwise_full_dom/test_exp.h b/tests/wsum_hess/elementwise_full_dom/test_exp.h index 84ff64c..c67e921 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_exp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_exp.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_exp(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(exp_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h index d78070e..b0d0101 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h @@ -35,7 +35,7 @@ const char *test_wsum_hess_sinh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(sinh_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", @@ -73,7 +73,7 @@ const char *test_wsum_hess_tanh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(tanh_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", @@ -111,7 +111,7 @@ const char *test_wsum_hess_asinh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(asinh_node->wsum_hess->x, 
expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", @@ -150,7 +150,7 @@ const char *test_wsum_hess_atanh(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(atanh_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_full_dom/test_logistic.h b/tests/wsum_hess/elementwise_full_dom/test_logistic.h index f9d25cd..3433dd5 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_logistic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_logistic.h @@ -43,7 +43,7 @@ const char *test_wsum_hess_logistic(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(logistic_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_full_dom/test_power.h b/tests/wsum_hess/elementwise_full_dom/test_power.h index 12004f2..1d77ab3 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_power.h +++ b/tests/wsum_hess/elementwise_full_dom/test_power.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_power(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(power_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git 
a/tests/wsum_hess/elementwise_full_dom/test_trig.h b/tests/wsum_hess/elementwise_full_dom/test_trig.h index f506f48..7b501bb 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_trig.h +++ b/tests/wsum_hess/elementwise_full_dom/test_trig.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_sin(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(sin_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", @@ -57,7 +57,7 @@ const char *test_wsum_hess_cos(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(cos_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", @@ -88,7 +88,7 @@ const char *test_wsum_hess_tan(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(tan_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_full_dom/test_xexp.h b/tests/wsum_hess/elementwise_full_dom/test_xexp.h index 3ec1e58..bb06dc7 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_xexp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_xexp.h @@ -29,7 +29,7 @@ const char *test_wsum_hess_xexp(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(xexp_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", 
cmp_int_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h index f1f3941..cf1eef8 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_entr(void) int expected_i[3] = {0, 1, 2}; mu_assert("vals incorrect", - cmp_double_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(entr_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->p, expected_p, 4)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index bbac1b8..1d54a1f 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -40,7 +40,7 @@ const char *test_wsum_hess_log(void) int expected_i[3] = {2, 3, 4}; mu_assert("vals incorrect", - cmp_double_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->x, expected_x, 3)); + cmp_double_array(log_node->wsum_hess->x, expected_x, 3)); mu_assert("rows incorrect", cmp_int_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->p, expected_p, 8)); mu_assert("cols incorrect", diff --git a/tests/wsum_hess/other/test_prod.h b/tests/wsum_hess/other/test_prod.h index 5c36c5e..928a5ab 100644 --- a/tests/wsum_hess/other/test_prod.h +++ b/tests/wsum_hess/other/test_prod.h @@ -28,7 +28,7 @@ const char *test_wsum_hess_prod_no_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("vals fail", 
cmp_double_array(p->wsum_hess->x, expected_x, 16)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); @@ -62,7 +62,7 @@ const char *test_wsum_hess_prod_one_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); @@ -91,7 +91,7 @@ const char *test_wsum_hess_prod_two_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); @@ -118,7 +118,7 @@ const char *test_wsum_hess_prod_many_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 16)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); diff --git a/tests/wsum_hess/other/test_prod_axis_one.h 
b/tests/wsum_hess/other/test_prod_axis_one.h index 02c8b32..930c3fe 100644 --- a/tests/wsum_hess/other/test_prod_axis_one.h +++ b/tests/wsum_hess/other/test_prod_axis_one.h @@ -65,7 +65,7 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): cols 2,4 (excludes 6) */ 2, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 12)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); @@ -165,7 +165,7 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) /* Var 9 (row 2, col 2): cols 3,6 (excludes 9) */ 3, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 18)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 18)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 11)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 18)); @@ -317,7 +317,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) } } - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 30)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 30)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 30)); @@ -373,7 +373,7 @@ const char *test_wsum_hess_prod_axis_one_2x2(void) 0, /* Var 2 (row 0, col 1): only col 0 */ 1}; /* Var 3 (row 1, col 1): only col 0 */ - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 4)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 4)); mu_assert("rows fail", 
cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 5)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 4)); diff --git a/tests/wsum_hess/other/test_prod_axis_zero.h b/tests/wsum_hess/other/test_prod_axis_zero.h index 650bcf5..0d0cd5b 100644 --- a/tests/wsum_hess/other/test_prod_axis_zero.h +++ b/tests/wsum_hess/other/test_prod_axis_zero.h @@ -63,7 +63,7 @@ const char *test_wsum_hess_prod_axis_zero_no_zeros(void) */ int expected_i[12] = {1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 12)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); @@ -185,7 +185,7 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 75)); - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 75)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 75)); free_expr(p); return 0; @@ -245,7 +245,7 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) */ int expected_i[8] = {1, 2, 1, 2, 3, 4, 3, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->to_csr(p->wsum_hess)->x, expected_x, 8)); + mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 8)); mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 6)); mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 8)); diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index 162d26b..859fe33 100644 --- 
a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -40,7 +40,7 @@ const char *test_wsum_hess_quad_form(void) mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->to_csr(node->wsum_hess)->x, expected_x, 10)); + mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); free_expr(node); free_csr_matrix(Q); From b734cd4c0b686c84e937535a6a9dfe712e65fc18 Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 11:44:09 +0200 Subject: [PATCH 06/31] remove many to_csr --- include/expr.h | 4 +- include/subexpr.h | 8 - include/utils/CSR_sum.h | 5 +- include/utils/matrix.h | 47 ++++ include/utils/matrix_sum.h | 36 +++ include/utils/permuted_dense.h | 2 + src/atoms/affine/add.c | 34 +-- src/atoms/affine/broadcast.c | 111 +------- src/atoms/affine/diag_vec.c | 41 +-- src/atoms/affine/index.c | 30 +-- src/atoms/affine/parameter.c | 5 +- src/atoms/affine/promote.c | 33 +-- src/atoms/affine/sum.c | 27 +- src/atoms/affine/transpose.c | 5 +- src/atoms/affine/variable.c | 3 +- src/atoms/affine/vector_mult.c | 14 +- src/atoms/bivariate_full_dom/matmul.c | 10 +- src/atoms/bivariate_full_dom/multiply.c | 20 +- .../bivariate_restricted_dom/quad_over_lin.c | 3 +- src/atoms/elementwise_full_dom/common.c | 27 +- src/atoms/other/quad_form.c | 25 +- src/problem.c | 12 +- src/utils/CSR_sum.c | 6 +- src/utils/matrix_sum.c | 31 +++ src/utils/permuted_dense.c | 238 ++++++++++++++++++ src/utils/sparse_matrix.c | 196 +++++++++++++++ tests/all_tests.c | 6 + tests/jacobian_tests/affine/test_broadcast.h | 40 ++- tests/jacobian_tests/affine/test_convolve.h | 15 +- tests/jacobian_tests/affine/test_diag_mat.h | 14 +- tests/jacobian_tests/affine/test_hstack.h | 12 +- tests/jacobian_tests/affine/test_index.h | 34 +-- 
.../jacobian_tests/affine/test_left_matmul.h | 12 +- tests/jacobian_tests/affine/test_neg.h | 18 +- tests/jacobian_tests/affine/test_promote.h | 18 +- .../jacobian_tests/affine/test_right_matmul.h | 12 +- .../jacobian_tests/affine/test_scalar_mult.h | 18 +- tests/jacobian_tests/affine/test_sum.h | 30 +-- tests/jacobian_tests/affine/test_trace.h | 16 +- tests/jacobian_tests/affine/test_transpose.h | 9 +- tests/jacobian_tests/affine/test_upper_tri.h | 15 +- .../jacobian_tests/affine/test_vector_mult.h | 18 +- tests/jacobian_tests/affine/test_vstack.h | 18 +- .../test_elementwise_mult.h | 20 +- .../bivariate_full_dom/test_matmul.h | 16 +- .../test_quad_over_lin.h | 30 +-- .../bivariate_restricted_dom/test_rel_entr.h | 18 +- .../test_rel_entr_scalar_vector.h | 6 +- .../test_rel_entr_vector_scalar.h | 6 +- .../composite/test_composite_exp.h | 5 +- .../elementwise_restricted_dom/test_log.h | 12 +- tests/jacobian_tests/other/test_prod.h | 18 +- .../jacobian_tests/other/test_prod_axis_one.h | 12 +- .../other/test_prod_axis_zero.h | 6 +- tests/jacobian_tests/other/test_quad_form.h | 12 +- tests/numerical_diff.c | 10 +- tests/profiling/profile_log_reg.h | 5 +- tests/test_helpers.c | 27 ++ tests/test_helpers.h | 10 + tests/utils/test_permuted_dense.h | 209 +++++++++++++++ tests/wsum_hess/affine/test_broadcast.h | 27 +- tests/wsum_hess/affine/test_convolve.h | 6 +- tests/wsum_hess/affine/test_diag_mat.h | 7 +- tests/wsum_hess/affine/test_hstack.h | 18 +- tests/wsum_hess/affine/test_index.h | 25 +- tests/wsum_hess/affine/test_left_matmul.h | 14 +- tests/wsum_hess/affine/test_right_matmul.h | 14 +- tests/wsum_hess/affine/test_scalar_mult.h | 18 +- tests/wsum_hess/affine/test_sum.h | 18 +- tests/wsum_hess/affine/test_trace.h | 20 +- tests/wsum_hess/affine/test_transpose.h | 9 +- tests/wsum_hess/affine/test_upper_tri.h | 9 +- tests/wsum_hess/affine/test_vector_mult.h | 18 +- tests/wsum_hess/affine/test_vstack.h | 18 +- .../bivariate_full_dom/test_matmul.h | 29 +-- 
.../bivariate_full_dom/test_multiply.h | 30 +-- .../test_quad_over_lin.h | 12 +- .../bivariate_restricted_dom/test_rel_entr.h | 18 +- .../test_rel_entr_scalar_vector.h | 6 +- .../test_rel_entr_vector_scalar.h | 6 +- .../wsum_hess/elementwise_full_dom/test_exp.h | 9 +- .../elementwise_full_dom/test_hyperbolic.h | 36 +-- .../elementwise_full_dom/test_logistic.h | 9 +- .../elementwise_full_dom/test_power.h | 9 +- .../elementwise_full_dom/test_trig.h | 27 +- .../elementwise_full_dom/test_xexp.h | 9 +- .../elementwise_restricted_dom/test_entr.h | 9 +- .../elementwise_restricted_dom/test_log.h | 9 +- tests/wsum_hess/other/test_prod.h | 24 +- tests/wsum_hess/other/test_prod_axis_one.h | 24 +- tests/wsum_hess/other/test_prod_axis_zero.h | 18 +- tests/wsum_hess/other/test_quad_form.h | 6 +- 92 files changed, 1326 insertions(+), 885 deletions(-) create mode 100644 include/utils/matrix_sum.h create mode 100644 src/utils/matrix_sum.c diff --git a/include/expr.h b/include/expr.h index 4facb95..19c6e12 100644 --- a/include/expr.h +++ b/include/expr.h @@ -53,8 +53,8 @@ typedef struct supporting common subexpressions on the Python side. 
*/ bool jacobian_csc_filled; double *local_jac_diag; /* cached f'(g(x)) diagonal */ - Matrix *hess_term1; /* Jg^T D Jg workspace */ - Matrix *hess_term2; /* child wsum_hess workspace */ + Matrix *hess_term1; /* Jg^T D Jg workspace */ + Matrix *hess_term2; /* child wsum_hess workspace */ } Expr_Work; /* Base expression node structure */ diff --git a/include/subexpr.h b/include/subexpr.h index 62aa6eb..a1ac764 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -201,14 +201,6 @@ typedef struct index_expr bool has_duplicates; /* True if indices have duplicates (affects Hessian path) */ } index_expr; -/* Broadcast types */ -typedef enum -{ - BROADCAST_ROW, /* (1, n) -> (m, n) */ - BROADCAST_COL, /* (m, 1) -> (m, n) */ - BROADCAST_SCALAR /* (1, 1) -> (m, n) */ -} broadcast_type; - typedef struct broadcast_expr { expr base; diff --git a/include/utils/CSR_sum.h b/include/utils/CSR_sum.h index f3ba700..5350b59 100644 --- a/include/utils/CSR_sum.h +++ b/include/utils/CSR_sum.h @@ -66,8 +66,9 @@ CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, int *idx_maps[4]); // ------------------------------------------------------------------------------------ -/* Accumulates values from A according to map. Must memset to zero before calling. */ -void accumulator(const CSR_Matrix *A, const int *idx_map, double *out); +/* Accumulates `nnz` values from `vals` into `out` at the positions given by + `idx_map` (length `nnz`). Caller must zero `out` before calling. */ +void accumulator(const double *vals, int nnz, const int *idx_map, double *out); /* Accumulates values from A according to map with spacing. Must memset to zero * before calling. */ diff --git a/include/utils/matrix.h b/include/utils/matrix.h index be2d551..268e977 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -21,6 +21,14 @@ #include "CSC_Matrix.h" #include "CSR_Matrix.h" +/* Broadcast shape used by the broadcast atom and its vtable methods. 
*/ +typedef enum +{ + BROADCAST_ROW, /* (1, n) -> (m, n) */ + BROADCAST_COL, /* (m, 1) -> (m, n) */ + BROADCAST_SCALAR /* (1, 1) -> (m, n) */ +} broadcast_type; + /* We implement three different types of matrices. 1. 'sparse_matrix' represents a generic CSR matrix. @@ -91,6 +99,40 @@ typedef struct Matrix struct Matrix *out); CSR_Matrix *(*to_csr)(struct Matrix *self); + /* Row-selection / indexing: returns a new Matrix that selects rows + indices[0..n_idxs) of self. Output shape is (n_idxs, self->n). The + returned type matches self's concrete type. index_alloc sets up + sparsity (values uninitialized); index_fill_values fills values into + out, which must have been produced by a prior index_alloc with the + same indices/n_idxs. */ + struct Matrix *(*index_alloc)(struct Matrix *self, const int *indices, + int n_idxs); + void (*index_fill_values)(struct Matrix *self, const int *indices, int n_idxs, + struct Matrix *out); + + /* Row-tiling for the promote atom: self must be a 1-row matrix; returns + a new Matrix of shape (size, self->n) where every row is a copy of + self's single row. Output type matches self's concrete type. + promote_alloc sets sparsity; promote_fill_values fills values. */ + struct Matrix *(*promote_alloc)(struct Matrix *self, int size); + void (*promote_fill_values)(struct Matrix *self, struct Matrix *out); + + /* Broadcast: lift the child Jacobian of a broadcast atom into the output + Jacobian. `type` is the broadcast variant; (d1, d2) is the output shape. + Output type matches self's concrete type. broadcast_alloc sets sparsity; + broadcast_fill_values fills values into out. 
*/ + struct Matrix *(*broadcast_alloc)(struct Matrix *self, broadcast_type type, + int d1, int d2); + void (*broadcast_fill_values)(struct Matrix *self, broadcast_type type, int d1, + int d2, struct Matrix *out); + + /* diag_vec: child is an (n, self->n) Jacobian for a length-n vector; + output is (n*n, self->n) where child row i lands at output row + i*(n+1) (column-major diagonal positions). Other output rows are + structurally zero. Output type matches self's concrete type. */ + struct Matrix *(*diag_vec_alloc)(struct Matrix *self); + void (*diag_vec_fill_values)(struct Matrix *self, struct Matrix *out); + /* Refresh any internal caches (e.g. a CSC mirror) so subsequent ATA / ATDA calls reflect the current values. Atoms whose child Jacobian is affine can skip this on iterations after the first; non-affine children must @@ -117,6 +159,11 @@ typedef struct Sparse_Matrix (free_matrix on the returned Matrix frees A). */ Matrix *new_sparse_matrix(CSR_Matrix *A); +/* Convenience: allocate a Sparse_Matrix of shape (m, n) with capacity for + nnz entries. Equivalent to new_sparse_matrix(new_csr_matrix(m, n, nnz)). + Sparsity pattern and values are uninitialized. */ +Matrix *new_sparse_matrix_alloc(int m, int n, int nnz); + /* Transpose helper */ Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork); diff --git a/include/utils/matrix_sum.h b/include/utils/matrix_sum.h new file mode 100644 index 0000000..f9f803f --- /dev/null +++ b/include/utils/matrix_sum.h @@ -0,0 +1,36 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MATRIX_SUM_H +#define MATRIX_SUM_H + +#include "matrix.h" + +/* Polymorphic wrappers over CSR_sum. A, B, and C must all be Sparse_Matrix- + backed for now; the union sparsity of A+B is general sparse, so a + Permuted_Dense output is not supported. + + sum_matrices_alloc fills C's sparsity pattern and re-syncs C's base.nnz + from the underlying CSR (sum_csr_alloc may shrink nnz below the + over-allocated max). */ +void sum_matrices_alloc(Matrix *A, Matrix *B, Matrix *C); + +/* Fills C's values; assumes C already has the union sparsity pattern of + A and B (typically produced by sum_matrices_alloc). */ +void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C); + +#endif /* MATRIX_SUM_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 90f0caa..3f3714d 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -40,6 +40,8 @@ typedef struct Permuted_Dense double *Y_scratch; /* dense_m * dense_n, used by ATDA */ int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ /* otherwise -1; used by `x CSC` allocation. */ + int *row_inv; /* length base.m: row_inv[row_perm[ii]] = ii, */ + /* otherwise -1; used by index_alloc. */ CSR_Matrix *csr_cache; /* lazy CSR view built by to_csr; structure */ /* allocated on first call, values refilled */ /* on every call. NULL until first call. 
*/ diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index 1895a85..d49c879 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -16,7 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" -#include "utils/CSR_sum.h" +#include "utils/matrix_sum.h" #include "utils/tracked_alloc.h" #include #include @@ -41,16 +41,12 @@ static void jacobian_init_impl(expr *node) jacobian_init(node->left); jacobian_init(node->right); - CSR_Matrix *Jl = node->left->jacobian->to_csr(node->left->jacobian); - CSR_Matrix *Jr = node->right->jacobian->to_csr(node->right->jacobian); - /* we never have to store more than the sum of children's nnz */ - int nnz_max = Jl->nnz + Jr->nnz; - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz_max); + int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, nnz_max); - /* fill sparsity pattern */ - sum_csr_alloc(Jl, Jr, jac); - node->jacobian = new_sparse_matrix(jac); + /* fill sparsity pattern */ + sum_matrices_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); } static void eval_jacobian(expr *node) @@ -60,8 +56,8 @@ static void eval_jacobian(expr *node) node->right->eval_jacobian(node->right); /* sum children's jacobians */ - sum_csr_fill_values(node->left->jacobian->to_csr(node->left->jacobian), node->right->jacobian->to_csr(node->right->jacobian), - node->jacobian->to_csr(node->jacobian)); + sum_matrices_fill_values(node->left->jacobian, node->right->jacobian, + node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -70,16 +66,13 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(node->left); wsum_hess_init(node->right); - CSR_Matrix *Hl = node->left->wsum_hess->to_csr(node->left->wsum_hess); - CSR_Matrix *Hr = node->right->wsum_hess->to_csr(node->right->wsum_hess); - /* we never have to store more than the sum of children's nnz */ - int nnz_max = Hl->nnz + Hr->nnz; - 
CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz_max); + int nnz_max = node->left->wsum_hess->nnz + node->right->wsum_hess->nnz; + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, nnz_max); /* fill sparsity pattern of hessian */ - sum_csr_alloc(Hl, Hr, hess); - node->wsum_hess = new_sparse_matrix(hess); + sum_matrices_alloc(node->left->wsum_hess, node->right->wsum_hess, + node->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -89,9 +82,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->right->eval_wsum_hess(node->right, w); /* sum children's wsum_hess */ - sum_csr_fill_values(node->left->wsum_hess->to_csr(node->left->wsum_hess), - node->right->wsum_hess->to_csr(node->right->wsum_hess), - node->wsum_hess->to_csr(node->wsum_hess)); + sum_matrices_fill_values(node->left->wsum_hess, node->right->wsum_hess, + node->wsum_hess); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index 73af7ca..213461b 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -17,7 +17,6 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include @@ -71,117 +70,21 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - broadcast_expr *bcast = (broadcast_expr *) node; - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - int total_nnz; - - // -------------------------------------------------------------------- - // count number of nonzeros - // -------------------------------------------------------------------- - if (bcast->type == BROADCAST_ROW) - { - /* Row broadcast: (1, n) -> (m, n) */ - total_nnz = Jx->nnz * node->d1; - } - else if (bcast->type == BROADCAST_COL) - { - /* Column broadcast: (m, 1) -> (m, n) */ - total_nnz = Jx->nnz * node->d2; - } - else - { - /* Scalar broadcast: (1, 1) -> (m, n) */ - total_nnz = 
Jx->nnz * node->size; - } - - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, total_nnz); - node->jacobian = new_sparse_matrix(J); - - // --------------------------------------------------------------------- - // fill sparsity pattern - // --------------------------------------------------------------------- - - if (bcast->type == BROADCAST_ROW) - { - J->nnz = 0; - for (int i = 0; i < node->d2; i++) - { - int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; - - /* copy columns indices */ - tile_int(J->i + J->nnz, Jx->i + Jx->p[i], nnz_in_row, node->d1); - - /* set row pointers */ - for (int rep = 0; rep < node->d1; rep++) - { - J->p[i * node->d1 + rep] = J->nnz; - J->nnz += nnz_in_row; - } - } - assert(J->nnz == total_nnz); - J->p[node->size] = total_nnz; - } - else if (bcast->type == BROADCAST_COL) - { - /* copy column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->d2); - - /* set row pointers */ - int offset = 0; - for (int i = 0; i < node->d2; i++) - { - for (int j = 0; j < node->d1; j++) - { - int nnz_in_row = Jx->p[j + 1] - Jx->p[j]; - J->p[i * node->d1 + j] = offset; - offset += nnz_in_row; - } - } - assert(offset == total_nnz); - J->p[node->size] = total_nnz; - } - else - { - /* copy column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->size); - /* set row pointers */ - int offset = 0; - int nnz = Jx->p[1] - Jx->p[0]; - for (int i = 0; i < node->size; i++) - { - J->p[i] = offset; - offset += nnz; - } - assert(offset == total_nnz); - J->p[node->size] = total_nnz; - } + /* allocate sparsity for the broadcast output; output type matches child's. */ + broadcast_expr *bcast = (broadcast_expr *) node; + node->jacobian = + x->jacobian->broadcast_alloc(x->jacobian, bcast->type, node->d1, node->d2); } static void eval_jacobian(expr *node) { node->left->eval_jacobian(node->left); + /* fill values into the preallocated output. 
*/ broadcast_expr *bcast = (broadcast_expr *) node; - CSR_Matrix *Jx = node->left->jacobian->to_csr(node->left->jacobian); - if (bcast->type == BROADCAST_ROW) - { - node->jacobian->nnz = 0; - for (int i = 0; i < node->d2; i++) - { - int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; - tile_double(node->jacobian->x + node->jacobian->nnz, Jx->x + Jx->p[i], nnz_in_row, node->d1); - node->jacobian->nnz += nnz_in_row * node->d1; - } - } - else if (bcast->type == BROADCAST_COL) - { - tile_double(node->jacobian->x, Jx->x, Jx->nnz, node->d2); - } - else - { - tile_double(node->jacobian->x, Jx->x, Jx->nnz, node->size); - } + node->left->jacobian->broadcast_fill_values(node->left->jacobian, bcast->type, + node->d1, node->d2, node->jacobian); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index 3238f32..03fbd45 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -48,49 +48,18 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { expr *x = node->left; - int n = x->size; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); - - /* Output has n^2 rows but only n diagonal positions are non-empty. - * Diagonal position i is at row i*(n+1) in Fortran order. */ - int nnz = 0; - int next_diag = 0; - for (int row = 0; row < node->size; row++) - { - J->p[row] = nnz; - if (row == next_diag) - { - int child_row = row / (n + 1); - int len = Jx->p[child_row + 1] - Jx->p[child_row]; - memcpy(J->i + nnz, Jx->i + Jx->p[child_row], len * sizeof(int)); - nnz += len; - next_diag += n + 1; - } - } - J->p[node->size] = nnz; - - node->jacobian = new_sparse_matrix(J); + /* output type matches child's; rows i*(n+1) hold child row i, others zero. 
*/ + node->jacobian = x->jacobian->diag_vec_alloc(x->jacobian); } static void eval_jacobian(expr *node) { - expr *x = node->left; - int n = x->size; - x->eval_jacobian(x); - - CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); + node->left->eval_jacobian(node->left); - /* Copy values from child row i to output diagonal row i*(n+1) */ - for (int i = 0; i < n; i++) - { - int out_row = i * (n + 1); - int len = J->p[out_row + 1] - J->p[out_row]; - memcpy(J->x + J->p[out_row], Jx->x + Jx->p[i], len * sizeof(double)); - } + /* fill the diagonal rows of the preallocated output. */ + node->left->jacobian->diag_vec_fill_values(node->left->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 77028fb..3fd070e 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -64,21 +64,10 @@ static void jacobian_init_impl(expr *node) index_expr *idx = (index_expr *) node; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); - - /* set sparsity pattern */ - J->p[0] = 0; - for (int i = 0; i < idx->n_idxs; i++) - { - int row = idx->indices[i]; - int len = Jx->p[row + 1] - Jx->p[row]; - memcpy(J->i + J->p[i], Jx->i + Jx->p[row], len * sizeof(int)); - J->p[i + 1] = J->p[i] + len; - } - - J->nnz = J->p[idx->n_idxs]; - node->jacobian = new_sparse_matrix(J); + /* allocate sparsity pattern for the matrix consisting of rows + 'idx->indices' of the child's Jacobian */ + node->jacobian = + x->jacobian->index_alloc(x->jacobian, idx->indices, idx->n_idxs); } static void eval_jacobian(expr *node) @@ -87,14 +76,9 @@ static void eval_jacobian(expr *node) index_expr *idx = (index_expr *) node; x->eval_jacobian(x); - CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - - for (int i = 0; i < 
idx->n_idxs; i++) - { - int len = J->p[i + 1] - J->p[i]; - memcpy(J->x + J->p[i], Jx->x + Jx->p[idx->indices[i]], len * sizeof(double)); - } + /* copy values of the selected rows into the preallocated output */ + x->jacobian->index_fill_values(x->jacobian, idx->indices, idx->n_idxs, + node->jacobian); } static void wsum_hess_init_impl(expr *node) diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index db6f14d..7d9c2d5 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -32,7 +32,7 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { /* Zero jacobian: size x n_vars with 0 nonzeros. */ - node->jacobian = new_sparse_matrix(new_csr_matrix(node->size, node->n_vars, 0)); + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, 0); } static void eval_jacobian(expr *node) @@ -43,8 +43,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Zero Hessian: n_vars x n_vars with 0 nonzeros. 
*/ - node->wsum_hess = - new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, 0); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index dbe231f..1a031d8 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -39,38 +39,18 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - /* each output row copies the single row from child's jacobian */ - int nnz = node->size * Jx->nnz; - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, nnz); - - /* fill sparsity pattern */ - J->nnz = 0; - for (int row = 0; row < node->size; row++) - { - J->p[row] = J->nnz; - memcpy(J->i + J->nnz, Jx->i, Jx->nnz * sizeof(int)); - J->nnz += Jx->nnz; - } - assert(J->nnz == nnz); - J->p[node->size] = J->nnz; - node->jacobian = new_sparse_matrix(J); + /* allocate sparsity for an (node->size, n_vars) matrix whose rows are all + copies of the child's single row; output type matches child's type. */ + node->jacobian = x->jacobian->promote_alloc(x->jacobian, node->size); } static void eval_jacobian(expr *node) { node->left->eval_jacobian(node->left); - CSR_Matrix *child_jac = node->left->jacobian->to_csr(node->left->jacobian); - int child_nnz = child_jac->p[1] - child_jac->p[0]; - - /* Copy child's row values to each output row */ - for (int row = 0; row < node->size; row++) - { - memcpy(node->jacobian->x + row * child_nnz, child_jac->x + child_jac->p[0], - child_nnz * sizeof(double)); - } + /* tile the child's single row into the preallocated output. 
*/ + node->left->jacobian->promote_fill_values(node->left->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -92,7 +72,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, &sum_w); /* copy values */ - memcpy(node->wsum_hess->x, node->left->wsum_hess->x, node->left->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->left->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 3d1d5cd..6fdcb21 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -118,50 +118,51 @@ static void jacobian_init_impl(expr *node) static void eval_jacobian(expr *node) { - expr *x = node->left; + expr *child = node->left; /* evaluate child's jacobian */ - x->eval_jacobian(x); + child->eval_jacobian(child); /* we have precomputed an idx map between the nonzeros of the child's jacobian and this node's jacobian, so we just accumulate accordingly */ memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator(x->jacobian->to_csr(x->jacobian), ((sum_expr *) node)->idx_map, node->jacobian->x); + accumulator(child->jacobian->x, child->jacobian->nnz, + ((sum_expr *) node)->idx_map, node->jacobian->x); } static void wsum_hess_init_impl(expr *node) { - expr *x = node->left; + expr *child = node->left; /* initialize child's wsum_hess */ - wsum_hess_init(x); + wsum_hess_init(child); /* we never have to store more than the child's nnz */ - node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); - node->work->dwork = SP_MALLOC(x->size * sizeof(double)); + node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); + node->work->dwork = SP_MALLOC(child->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { - expr *x = node->left; + expr *child = node->left; sum_expr *snode = (sum_expr *) node; int axis = snode->axis; if (axis 
== -1) { - scaled_ones(node->work->dwork, x->size, *w); + scaled_ones(node->work->dwork, child->size, *w); } else if (axis == 0) { - repeat(node->work->dwork, w, x->d2, x->d1); + repeat(node->work->dwork, w, child->d2, child->d1); } else if (axis == 1) { - tile_double(node->work->dwork, w, x->d1, x->d2); + tile_double(node->work->dwork, w, child->d1, child->d2); } - x->eval_wsum_hess(x, node->work->dwork); + child->eval_wsum_hess(child, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, + memcpy(node->wsum_hess->x, child->wsum_hess->x, node->wsum_hess->nnz * sizeof(double)); } diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index ed67e61..5040c33 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -73,16 +73,15 @@ static void eval_jacobian(expr *node) expr *child = node->left; child->eval_jacobian(child); CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); - CSR_Matrix *J = node->jacobian->to_csr(node->jacobian); int d1 = node->d1; int d2 = node->d2; int nnz = 0; - for (int row = 0; row < J->m; ++row) + for (int row = 0; row < node->jacobian->m; ++row) { int k = (row / d1) + (row % d1) * d2; int len = Jc->p[k + 1] - Jc->p[k]; - memcpy(J->x + nnz, Jc->x + Jc->p[k], len * sizeof(double)); + memcpy(node->jacobian->x + nnz, Jc->x + Jc->p[k], len * sizeof(double)); nnz += len; } } diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 82933aa..0af562e 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -47,8 +47,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Variables have zero Hessian */ - node->wsum_hess = - new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, 0); } static void wsum_hess_eval(expr *node, const double *w) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index 3fbb8d9..ee7992b 
100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -67,18 +67,11 @@ static void eval_jacobian(expr *node) expr *x = node->left; const double *a = ((vector_mult_expr *) node)->param_source->value; - /* evaluate x */ + /* evaluate jacobian of child */ x->eval_jacobian(x); - /* row-wise scale child's jacobian */ - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); - for (int i = 0; i < node->size; i++) - { - for (int j = Jx->p[i]; j < Jx->p[i + 1]; j++) - { - node->jacobian->x[j] = a[i] * Jx->x[j]; - } - } + /* row-wise scale child's jacobian: diag(a) @ Jx */ + x->jacobian->DA_fill_values(a, x->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -91,6 +84,7 @@ static void wsum_hess_init_impl(expr *node) /* same sparsity as child */ node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); + /* workspace for storing scaled weights */ node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 2f8445f..53ca955 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -516,10 +516,12 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) /* accumulate H = C + C^T + H_f + H_g */ memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(mnode->C, mnode->idx_map_C, node->wsum_hess->x); - accumulator(mnode->CT, mnode->idx_map_CT, node->wsum_hess->x); - accumulator(f->wsum_hess->to_csr(f->wsum_hess), mnode->idx_map_Hf, node->wsum_hess->x); - accumulator(g->wsum_hess->to_csr(g->wsum_hess), mnode->idx_map_Hg, node->wsum_hess->x); + accumulator(mnode->C->x, mnode->C->nnz, mnode->idx_map_C, node->wsum_hess->x); + accumulator(mnode->CT->x, mnode->CT->nnz, mnode->idx_map_CT, node->wsum_hess->x); + accumulator(f->wsum_hess->x, f->wsum_hess->nnz, mnode->idx_map_Hf, + node->wsum_hess->x); + accumulator(g->wsum_hess->x, g->wsum_hess->nnz, 
mnode->idx_map_Hg, + node->wsum_hess->x); } expr *new_matmul(expr *x, expr *y) diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index d204e98..b17f0df 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -18,6 +18,7 @@ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" #include "utils/CSR_sum.h" +#include "utils/matrix_sum.h" #include "utils/tracked_alloc.h" #include #include @@ -49,14 +50,11 @@ static void jacobian_init_impl(expr *node) { jacobian_init(node->left); jacobian_init(node->right); - CSR_Matrix *Jl = node->left->jacobian->to_csr(node->left->jacobian); - CSR_Matrix *Jr = node->right->jacobian->to_csr(node->right->jacobian); - int nnz_max = Jl->nnz + Jr->nnz; - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz_max); + int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, nnz_max); /* fill sparsity pattern */ - sum_csr_alloc(Jl, Jr, jac); - node->jacobian = new_sparse_matrix(jac); + sum_matrices_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); } static void eval_jacobian(expr *node) @@ -268,10 +266,12 @@ static void eval_wsum_hess(expr *node, const double *w) // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(C, mul_node->idx_map_C, node->wsum_hess->x); - accumulator(CT, mul_node->idx_map_CT, node->wsum_hess->x); - accumulator(x->wsum_hess->to_csr(x->wsum_hess), mul_node->idx_map_Hx, node->wsum_hess->x); - accumulator(y->wsum_hess->to_csr(y->wsum_hess), mul_node->idx_map_Hy, node->wsum_hess->x); + accumulator(C->x, C->nnz, mul_node->idx_map_C, node->wsum_hess->x); + accumulator(CT->x, CT->nnz, mul_node->idx_map_CT, node->wsum_hess->x); + accumulator(x->wsum_hess->x, x->wsum_hess->nnz, 
mul_node->idx_map_Hx, + node->wsum_hess->x); + accumulator(y->wsum_hess->x, y->wsum_hess->nnz, mul_node->idx_map_Hy, + node->wsum_hess->x); } } diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index 4cc2531..c550c6c 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -188,8 +188,7 @@ static void wsum_hess_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *H = - new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); + CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); node->wsum_hess = new_sparse_matrix(H); /* if x has lower idx than y*/ diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 8a6946e..95775e4 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -19,7 +19,7 @@ #include "subexpr.h" #include "utils/CSC_Matrix.h" #include "utils/CSR_Matrix.h" -#include "utils/CSR_sum.h" +#include "utils/matrix_sum.h" #include "utils/tracked_alloc.h" #include #include @@ -116,14 +116,13 @@ void wsum_hess_init_elementwise(expr *node) node->work->hess_term2 = child->wsum_hess->copy_sparsity(child->wsum_hess); - /* wsum_hess = term1 + term2 (sum_csr_alloc operates on CSR; extract - via to_csr on the operands). The result is stored as CSR. 
*/ - CSR_Matrix *t1 = node->work->hess_term1->to_csr(node->work->hess_term1); - CSR_Matrix *t2 = node->work->hess_term2->to_csr(node->work->hess_term2); - int max_nnz = t1->nnz + t2->nnz; - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(t1, t2, hess); - node->wsum_hess = new_sparse_matrix(hess); + /* wsum_hess = term1 + term2 */ + int max_nnz = + node->work->hess_term1->nnz + node->work->hess_term2->nnz; + node->wsum_hess = + new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz); + sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } } @@ -171,16 +170,12 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) } child->eval_wsum_hess(child, node->work->dwork); - CSR_Matrix *t2_csr = - node->work->hess_term2->to_csr(node->work->hess_term2); - memcpy(t2_csr->x, child->wsum_hess->x, + memcpy(node->work->hess_term2->x, child->wsum_hess->x, child->wsum_hess->nnz * sizeof(double)); /* wsum_hess = term1 + term2 */ - CSR_Matrix *t1_csr = - node->work->hess_term1->to_csr(node->work->hess_term1); - sum_csr_fill_values(t1_csr, t2_csr, - node->wsum_hess->to_csr(node->wsum_hess)); + sum_matrices_fill_values(node->work->hess_term1, + node->work->hess_term2, node->wsum_hess); } } } diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 703d3a8..3906dc4 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -18,7 +18,7 @@ #include "atoms/non_elementwise_full_dom.h" #include "subexpr.h" #include "utils/CSC_Matrix.h" -#include "utils/CSR_sum.h" +#include "utils/matrix_sum.h" #include "utils/cblas_wrapper.h" #include "utils/tracked_alloc.h" #include @@ -172,12 +172,12 @@ static void wsum_hess_init_impl(expr *node) node->work->hess_term2 = x->wsum_hess->copy_sparsity(x->wsum_hess); /* hess = term1 + term2 */ - CSR_Matrix *t1 = node->work->hess_term1->to_csr(node->work->hess_term1); - CSR_Matrix *t2 = 
node->work->hess_term2->to_csr(node->work->hess_term2); - int max_nnz = t1->nnz + t2->nnz; - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(t1, t2, hess); - node->wsum_hess = new_sparse_matrix(hess); + int max_nnz = + node->work->hess_term1->nnz + node->work->hess_term2->nnz; + node->wsum_hess = + new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz); + sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } @@ -210,7 +210,6 @@ static void eval_wsum_hess(expr *node, const double *w) CSC_Matrix *QJf = ((quad_form_expr *) node)->QJf; CSR_Matrix *term1 = node->work->hess_term1->to_csr(node->work->hess_term1); - CSR_Matrix *term2 = node->work->hess_term2->to_csr(node->work->hess_term2); /* term1 = J_f^T Q J_f = J_f^T B */ BA_fill_values(Q, Jf, QJf); @@ -218,14 +217,16 @@ static void eval_wsum_hess(expr *node, const double *w) /* term2 */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(term2->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->work->hess_term2->x, x->wsum_hess->x, + x->wsum_hess->nnz * sizeof(double)); /* scale both terms by 2w */ - cblas_dscal(term1->nnz, two_w, term1->x, 1); - cblas_dscal(term2->nnz, two_w, term2->x, 1); + cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x, 1); + cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x, 1); /* sum the two terms */ - sum_csr_fill_values(term1, term2, node->wsum_hess->to_csr(node->wsum_hess)); + sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } diff --git a/src/problem.c b/src/problem.c index 3abd929..d25c6ea 100644 --- a/src/problem.c +++ b/src/problem.c @@ -546,17 +546,15 @@ void problem_hessian(problem *prob, double obj_w, const double *w) memset(H->x, 0, H->nnz * sizeof(double)); /* accumulate objective function */ - CSR_Matrix *obj_hess_csr = obj->wsum_hess->to_csr(obj->wsum_hess); - accumulator(obj_hess_csr, 
idx_map, H->x); - offset = obj_hess_csr->nnz; + accumulator(obj->wsum_hess->x, obj->wsum_hess->nnz, idx_map, H->x); + offset = obj->wsum_hess->nnz; /* accumulate constraint functions */ for (int i = 0; i < prob->n_constraints; i++) { - CSR_Matrix *c_hess_csr = - constrs[i]->wsum_hess->to_csr(constrs[i]->wsum_hess); - accumulator(c_hess_csr, idx_map + offset, H->x); - offset += c_hess_csr->nnz; + Matrix *c_hess = constrs[i]->wsum_hess; + accumulator(c_hess->x, c_hess->nnz, idx_map + offset, H->x); + offset += c_hess->nnz; } clock_gettime(CLOCK_MONOTONIC, &timer.end); diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c index 0e5d4cf..ea0ff04 100644 --- a/src/utils/CSR_sum.c +++ b/src/utils/CSR_sum.c @@ -285,12 +285,12 @@ void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, } } -void accumulator(const CSR_Matrix *A, const int *idx_map, double *out) +void accumulator(const double *vals, int nnz, const int *idx_map, double *out) { /* don't forget to initialize accumulator to 0 before calling this */ - for (int j = 0; j < A->nnz; j++) + for (int j = 0; j < nnz; j++) { - out[idx_map[j]] += A->x[j]; + out[idx_map[j]] += vals[j]; } } diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c new file mode 100644 index 0000000..72bc9c5 --- /dev/null +++ b/src/utils/matrix_sum.c @@ -0,0 +1,31 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utils/matrix_sum.h" +#include "utils/CSR_sum.h" + +void sum_matrices_alloc(Matrix *A, Matrix *B, Matrix *C) +{ + CSR_Matrix *cc = C->to_csr(C); + sum_csr_alloc(A->to_csr(A), B->to_csr(B), cc); + C->nnz = cc->nnz; +} + +void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C) +{ + sum_csr_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C)); +} diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index ab86c0f..7c579a1 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -31,6 +31,7 @@ static void permuted_dense_free(Matrix *self) free(pd->X); free(pd->Y_scratch); free(pd->col_inv); + free(pd->row_inv); free_csr_matrix(pd->csr_cache); free(pd); } @@ -88,6 +89,225 @@ static CSR_Matrix *permuted_dense_to_csr(Matrix *self) return pd->csr_cache; } +static Matrix *permuted_dense_vtable_index_alloc(Matrix *self, const int *indices, + int n_idxs) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + + /* Scan indices: which output positions i hit a row in pd->row_perm? 
*/ + int *new_row_perm = (int *) SP_MALLOC(n_idxs * sizeof(int)); + int new_dense_m = 0; + for (int i = 0; i < n_idxs; i++) + { + if (pd->row_inv[indices[i]] >= 0) + { + new_row_perm[new_dense_m++] = i; + } + } + + Matrix *out = new_permuted_dense(n_idxs, pd->base.n, new_dense_m, + pd->dense_n, new_row_perm, pd->col_perm, + NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_index_fill_values(Matrix *self, + const int *indices, int n_idxs, + Matrix *out) +{ + (void) n_idxs; + const Permuted_Dense *pd = (const Permuted_Dense *) self; + Permuted_Dense *out_pd = (Permuted_Dense *) out; + int dense_n = pd->dense_n; + for (int k = 0; k < out_pd->dense_m; k++) + { + int i = out_pd->row_perm[k]; + int old_ii = pd->row_inv[indices[i]]; + memcpy(out_pd->X + (size_t) k * dense_n, pd->X + (size_t) old_ii * dense_n, + dense_n * sizeof(double)); + } +} + +static Matrix *permuted_dense_vtable_promote_alloc(Matrix *self, int size) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + assert(pd->dense_m <= 1); + + if (pd->dense_m == 0) + { + /* source row is all-zero; output is also structurally all-zero. 
*/ + return new_permuted_dense(size, pd->base.n, 0, pd->dense_n, NULL, + pd->col_perm, NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(size * sizeof(int)); + for (int i = 0; i < size; i++) + { + new_row_perm[i] = i; + } + Matrix *out = new_permuted_dense(size, pd->base.n, size, pd->dense_n, + new_row_perm, pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_promote_fill_values(Matrix *self, Matrix *out) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + Permuted_Dense *out_pd = (Permuted_Dense *) out; + if (pd->dense_m == 0) return; + int dense_n = pd->dense_n; + for (int k = 0; k < out_pd->dense_m; k++) + { + memcpy(out_pd->X + (size_t) k * dense_n, pd->X, + dense_n * sizeof(double)); + } +} + +static Matrix *permuted_dense_vtable_broadcast_alloc(Matrix *self, + broadcast_type type, int d1, + int d2) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + int out_m = d1 * d2; + + int new_dense_m; + if (type == BROADCAST_SCALAR) + { + new_dense_m = (pd->dense_m == 0) ? 
0 : out_m; + } + else if (type == BROADCAST_ROW) + { + new_dense_m = d1 * pd->dense_m; + } + else /* BROADCAST_COL */ + { + new_dense_m = d2 * pd->dense_m; + } + + if (new_dense_m == 0) + { + return new_permuted_dense(out_m, pd->base.n, 0, pd->dense_n, NULL, + pd->col_perm, NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(new_dense_m * sizeof(int)); + int k = 0; + if (type == BROADCAST_SCALAR) + { + for (int i = 0; i < out_m; i++) + { + new_row_perm[k++] = i; + } + } + else if (type == BROADCAST_ROW) + { + for (int j_ii = 0; j_ii < pd->dense_m; j_ii++) + { + int j_old = pd->row_perm[j_ii]; + for (int i = 0; i < d1; i++) + { + new_row_perm[k++] = j_old * d1 + i; + } + } + } + else /* BROADCAST_COL */ + { + for (int j = 0; j < d2; j++) + { + for (int ii_old = 0; ii_old < pd->dense_m; ii_old++) + { + new_row_perm[k++] = j * d1 + pd->row_perm[ii_old]; + } + } + } + + Matrix *out = new_permuted_dense(out_m, pd->base.n, new_dense_m, + pd->dense_n, new_row_perm, pd->col_perm, + NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_broadcast_fill_values(Matrix *self, + broadcast_type type, int d1, + int d2, Matrix *out) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + Permuted_Dense *out_pd = (Permuted_Dense *) out; + if (pd->dense_m == 0) + { + return; + } + int dense_n = pd->dense_n; + + if (type == BROADCAST_SCALAR) + { + for (int k = 0; k < out_pd->dense_m; k++) + { + memcpy(out_pd->X + (size_t) k * dense_n, pd->X, + dense_n * sizeof(double)); + } + } + else if (type == BROADCAST_ROW) + { + /* output row k corresponds to child dense row (k / d1). 
*/ + (void) d2; + for (int k = 0; k < out_pd->dense_m; k++) + { + memcpy(out_pd->X + (size_t) k * dense_n, + pd->X + (size_t) (k / d1) * dense_n, + dense_n * sizeof(double)); + } + } + else /* BROADCAST_COL */ + { + (void) d1; + size_t child_block = (size_t) pd->dense_m * (size_t) dense_n; + for (int j = 0; j < d2; j++) + { + memcpy(out_pd->X + (size_t) j * child_block, pd->X, + child_block * sizeof(double)); + } + } +} + +static Matrix *permuted_dense_vtable_diag_vec_alloc(Matrix *self) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + int n = pd->base.m; + int out_m = n * n; + + if (pd->dense_m == 0) + { + return new_permuted_dense(out_m, pd->base.n, 0, pd->dense_n, NULL, + pd->col_perm, NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(pd->dense_m * sizeof(int)); + for (int ii = 0; ii < pd->dense_m; ii++) + { + new_row_perm[ii] = pd->row_perm[ii] * (n + 1); + } + Matrix *out = new_permuted_dense(out_m, pd->base.n, pd->dense_m, pd->dense_n, + new_row_perm, pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_diag_vec_fill_values(Matrix *self, Matrix *out) +{ + const Permuted_Dense *pd = (const Permuted_Dense *) self; + Permuted_Dense *out_pd = (Permuted_Dense *) out; + if (pd->dense_m == 0) + { + return; + } + memcpy(out_pd->X, pd->X, + (size_t) pd->dense_m * (size_t) pd->dense_n * sizeof(double)); +} + Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, const int *row_perm, const int *col_perm, const double *X_data) @@ -119,6 +339,14 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; pd->base.ATDA_fill_values = permuted_dense_vtable_ATDA_fill_values; pd->base.to_csr = permuted_dense_to_csr; + pd->base.index_alloc = permuted_dense_vtable_index_alloc; + pd->base.index_fill_values = permuted_dense_vtable_index_fill_values; + pd->base.promote_alloc = permuted_dense_vtable_promote_alloc; + pd->base.promote_fill_values = 
permuted_dense_vtable_promote_fill_values; + pd->base.broadcast_alloc = permuted_dense_vtable_broadcast_alloc; + pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values; + pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc; + pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values; pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; pd->base.free_fn = permuted_dense_free; @@ -132,6 +360,7 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->base.x = pd->X; pd->Y_scratch = (double *) SP_MALLOC(sz * sizeof(double)); pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); + pd->row_inv = (int *) SP_MALLOC(m * sizeof(int)); if (dense_m > 0) { @@ -151,6 +380,15 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->col_inv[col_perm[jj]] = jj; } + for (int i = 0; i < m; i++) + { + pd->row_inv[i] = -1; + } + for (int ii = 0; ii < dense_m; ii++) + { + pd->row_inv[row_perm[ii]] = ii; + } + if (X_data != NULL && sz > 0) { memcpy(pd->X, X_data, sz * sizeof(double)); diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 53b8e38..5653b49 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -18,6 +18,7 @@ #include "utils/CSC_Matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/matrix.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include @@ -98,6 +99,188 @@ static CSR_Matrix *sparse_to_csr(Matrix *self) return ((Sparse_Matrix *) self)->csr; } +static Matrix *sparse_index_alloc(Matrix *self, const int *indices, int n_idxs) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_Matrix *J = new_csr_matrix(n_idxs, self->n, Jx->nnz); + + J->p[0] = 0; + for (int i = 0; i < n_idxs; i++) + { + int row = indices[i]; + int len = Jx->p[row + 1] - Jx->p[row]; + memcpy(J->i + J->p[i], Jx->i + Jx->p[row], len * sizeof(int)); + J->p[i + 1] = J->p[i] + len; + } + J->nnz = J->p[n_idxs]; + return 
new_sparse_matrix(J); +} + +static void sparse_index_fill_values(Matrix *self, const int *indices, int n_idxs, + Matrix *out) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_Matrix *J = ((Sparse_Matrix *) out)->csr; + for (int i = 0; i < n_idxs; i++) + { + int len = J->p[i + 1] - J->p[i]; + memcpy(J->x + J->p[i], Jx->x + Jx->p[indices[i]], len * sizeof(double)); + } +} + +static Matrix *sparse_promote_alloc(Matrix *self, int size) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + int row_nnz = Jx->nnz; + CSR_Matrix *J = new_csr_matrix(size, self->n, size * row_nnz); + + for (int row = 0; row < size; row++) + { + J->p[row] = row * row_nnz; + memcpy(J->i + row * row_nnz, Jx->i, row_nnz * sizeof(int)); + } + J->p[size] = size * row_nnz; + J->nnz = size * row_nnz; + return new_sparse_matrix(J); +} + +static void sparse_promote_fill_values(Matrix *self, Matrix *out) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + int row_nnz = Jx->nnz; + for (int row = 0; row < out->m; row++) + { + memcpy(out->x + row * row_nnz, Jx->x, row_nnz * sizeof(double)); + } +} + +static Matrix *sparse_broadcast_alloc(Matrix *self, broadcast_type type, int d1, + int d2) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + int out_m = d1 * d2; + int total_nnz; + if (type == BROADCAST_ROW) + { + total_nnz = Jx->nnz * d1; + } + else if (type == BROADCAST_COL) + { + total_nnz = Jx->nnz * d2; + } + else /* BROADCAST_SCALAR */ + { + total_nnz = Jx->nnz * out_m; + } + + CSR_Matrix *J = new_csr_matrix(out_m, self->n, total_nnz); + + if (type == BROADCAST_ROW) + { + int acc = 0; + for (int i = 0; i < d2; i++) + { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; + tile_int(J->i + acc, Jx->i + Jx->p[i], nnz_in_row, d1); + for (int rep = 0; rep < d1; rep++) + { + J->p[i * d1 + rep] = acc; + acc += nnz_in_row; + } + } + J->p[out_m] = total_nnz; + } + else if (type == BROADCAST_COL) + { + tile_int(J->i, Jx->i, Jx->nnz, d2); + int offset = 0; + for (int i = 0; i < d2; i++) + { + for (int j 
= 0; j < d1; j++) + { + int nnz_in_row = Jx->p[j + 1] - Jx->p[j]; + J->p[i * d1 + j] = offset; + offset += nnz_in_row; + } + } + J->p[out_m] = total_nnz; + } + else /* BROADCAST_SCALAR */ + { + tile_int(J->i, Jx->i, Jx->nnz, out_m); + int row_nnz = Jx->nnz; + for (int i = 0; i < out_m; i++) + { + J->p[i] = i * row_nnz; + } + J->p[out_m] = total_nnz; + } + return new_sparse_matrix(J); +} + +static void sparse_broadcast_fill_values(Matrix *self, broadcast_type type, int d1, + int d2, Matrix *out) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + if (type == BROADCAST_ROW) + { + int acc = 0; + for (int i = 0; i < d2; i++) + { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; + tile_double(out->x + acc, Jx->x + Jx->p[i], nnz_in_row, d1); + acc += nnz_in_row * d1; + } + } + else if (type == BROADCAST_COL) + { + tile_double(out->x, Jx->x, Jx->nnz, d2); + } + else /* BROADCAST_SCALAR */ + { + tile_double(out->x, Jx->x, Jx->nnz, d1 * d2); + } +} + +static Matrix *sparse_diag_vec_alloc(Matrix *self) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + int n = self->m; + int out_m = n * n; + CSR_Matrix *J = new_csr_matrix(out_m, self->n, Jx->nnz); + + int nnz = 0; + int next_diag = 0; + for (int row = 0; row < out_m; row++) + { + J->p[row] = nnz; + if (row == next_diag) + { + int child_row = row / (n + 1); + int len = Jx->p[child_row + 1] - Jx->p[child_row]; + memcpy(J->i + nnz, Jx->i + Jx->p[child_row], len * sizeof(int)); + nnz += len; + next_diag += n + 1; + } + } + J->p[out_m] = nnz; + J->nnz = nnz; + return new_sparse_matrix(J); +} + +static void sparse_diag_vec_fill_values(Matrix *self, Matrix *out) +{ + CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_Matrix *J = ((Sparse_Matrix *) out)->csr; + int n = self->m; + for (int i = 0; i < n; i++) + { + int out_row = i * (n + 1); + int len = J->p[out_row + 1] - J->p[out_row]; + memcpy(J->x + J->p[out_row], Jx->x + Jx->p[i], len * sizeof(double)); + } +} + /* Build CSC structure on first call; refill values from csr->x 
on every call. */ static void sparse_refresh_csc_values(Matrix *self) { @@ -116,6 +299,14 @@ static void wire_vtable(Sparse_Matrix *sm) sm->base.ATA_alloc = sparse_ATA_alloc; sm->base.ATDA_fill_values = sparse_ATDA_fill_values; sm->base.to_csr = sparse_to_csr; + sm->base.index_alloc = sparse_index_alloc; + sm->base.index_fill_values = sparse_index_fill_values; + sm->base.promote_alloc = sparse_promote_alloc; + sm->base.promote_fill_values = sparse_promote_fill_values; + sm->base.broadcast_alloc = sparse_broadcast_alloc; + sm->base.broadcast_fill_values = sparse_broadcast_fill_values; + sm->base.diag_vec_alloc = sparse_diag_vec_alloc; + sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values; sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; } @@ -132,6 +323,11 @@ Matrix *new_sparse_matrix(CSR_Matrix *A) return &sm->base; } +Matrix *new_sparse_matrix_alloc(int m, int n, int nnz) +{ + return new_sparse_matrix(new_csr_matrix(m, n, nnz)); +} + Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork) { CSR_Matrix *AT = transpose(self->csr, iwork); diff --git a/tests/all_tests.c b/tests/all_tests.c index 1d38759..d39b303 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -370,6 +370,12 @@ int main(void) mu_run_test(test_permuted_dense_times_csc_no_active, tests_run); mu_run_test(test_permuted_dense_to_csr_lazy, tests_run); mu_run_test(test_permuted_dense_col_inv, tests_run); + mu_run_test(test_permuted_dense_index, tests_run); + mu_run_test(test_permuted_dense_promote, tests_run); + mu_run_test(test_permuted_dense_broadcast_scalar, tests_run); + mu_run_test(test_permuted_dense_broadcast_row, tests_run); + mu_run_test(test_permuted_dense_broadcast_col, tests_run); + mu_run_test(test_permuted_dense_diag_vec, tests_run); mu_run_test(test_YT_kron_I, tests_run); mu_run_test(test_YT_kron_I_larger, tests_run); mu_run_test(test_I_kron_X, tests_run); diff --git a/tests/jacobian_tests/affine/test_broadcast.h 
b/tests/jacobian_tests/affine/test_broadcast.h index 2a5d491..a1f79ed 100644 --- a/tests/jacobian_tests/affine/test_broadcast.h +++ b/tests/jacobian_tests/affine/test_broadcast.h @@ -37,12 +37,9 @@ const char *test_broadcast_row_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 1, 1, 2, 2}; - mu_assert("broadcast row jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast row jacobian rows fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 4)); - mu_assert("broadcast row jacobian cols fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -82,12 +79,9 @@ const char *test_broadcast_col_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 0, 1, 2}; - mu_assert("broadcast col jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast col jacobian rows fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); - mu_assert("broadcast col jacobian cols fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -123,12 +117,9 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 0, 0, 0, 0}; - mu_assert("broadcast scalar jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast scalar jacobian rows fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->p, expected_p, 7)); - mu_assert("broadcast scalar 
jacobian cols fail", - cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -150,17 +141,18 @@ const char *test_double_broadcast(void) jacobian_init(sum); sum->eval_jacobian(sum); + /* TODO: what is this test? */ + /* All 6 elements depend on the single input variable */ // double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; // int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; // int expected_i[6] = {0, 0, 0, 0, 0, 0}; // - // mu_assert("broadcast scalar jacobian vals fail", - // cmp_double_array(sum->jacobian->x, expected_x, 6)); - // mu_assert("broadcast scalar jacobian rows fail", - // cmp_int_array(sum ->jacobian->p, expected_p, 7)); - // mu_assert("broadcast scalar jacobian cols fail", - // cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, expected_i, 6)); + // mu_assert("broadcast scalar jacobian vals fail", // + // cmp_double_array(sum->jacobian->x, expected_x, 6)); mu_assert("broadcast + // scalar jacobian rows fail", // cmp_int_array(sum ->jacobian->p, expected_p, + // 7)); mu_assert("broadcast scalar jacobian cols fail", // + // cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, // expected_i, 6)); free_expr(sum); return 0; diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h index a7a4ef8..ed8ffb8 100644 --- a/tests/jacobian_tests/affine/test_convolve.h +++ b/tests/jacobian_tests/affine/test_convolve.h @@ -29,20 +29,17 @@ const char *test_jacobian_convolve(void) jacobian_init(y); y->eval_jacobian(y); - mu_assert("Jacobian should have 5 rows", y->jacobian->to_csr(y->jacobian)->m == 5); - mu_assert("Jacobian should have 3 columns", y->jacobian->to_csr(y->jacobian)->n == 3); - mu_assert("Jacobian should have 9 nonzeros", y->jacobian->to_csr(y->jacobian)->nnz == 9); + 
mu_assert("Jacobian should have 5 rows", y->jacobian->m == 5); + mu_assert("Jacobian should have 3 columns", y->jacobian->n == 3); + mu_assert("Jacobian should have 9 nonzeros", y->jacobian->nnz == 9); int expected_p[6] = {0, 1, 3, 6, 8, 9}; int expected_i[9] = {0, 0, 1, 0, 1, 2, 1, 2, 2}; double expected_x[9] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0}; - mu_assert("Convolve Jacobian row pointers incorrect", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 6)); - mu_assert("Convolve Jacobian column indices incorrect", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 9)); - mu_assert("Convolve Jacobian values incorrect", - cmp_double_array(y->jacobian->x, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 5, 9)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 9)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_diag_mat.h b/tests/jacobian_tests/affine/test_diag_mat.h index 49a8de9..eb42b75 100644 --- a/tests/jacobian_tests/affine/test_diag_mat.h +++ b/tests/jacobian_tests/affine/test_diag_mat.h @@ -23,9 +23,9 @@ const char *test_diag_mat_jacobian_variable(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->x, expected_x, 2)); - mu_assert("diag_mat jac p", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->p, expected_p, 3)); - mu_assert("diag_mat jac i", cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(dm->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(dm->jacobian, expected_p, expected_i, 2, 2)); free_expr(dm); return 0; @@ -49,12 +49,12 @@ const char *test_diag_mat_jacobian_of_log(void) dm->eval_jacobian(dm); double expected_x[2] = {1.0, 0.25}; + int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat log jac vals", - cmp_double_array(dm->jacobian->x, expected_x, 2)); 
- mu_assert("diag_mat log jac cols", - cmp_int_array(dm->jacobian->to_csr(dm->jacobian)->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(dm->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(dm->jacobian, expected_p, expected_i, 2, 2)); free_expr(dm); return 0; diff --git a/tests/jacobian_tests/affine/test_hstack.h b/tests/jacobian_tests/affine/test_hstack.h index 1cc0b21..017730e 100644 --- a/tests/jacobian_tests/affine/test_hstack.h +++ b/tests/jacobian_tests/affine/test_hstack.h @@ -42,9 +42,9 @@ const char *test_jacobian_hstack_vectors(void) int expected_Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; int expected_Ap[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 9)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 9)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 10)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_Ap, expected_Ai, 9, 9)); free_expr(stack); return 0; @@ -86,9 +86,9 @@ const char *test_jacobian_hstack_matrix(void) int expected_Ap[19] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 18)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_Ai, 18)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_Ap, 19)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_Ax, 18)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_Ap, expected_Ai, 18, 18)); free_expr(stack); return 0; diff --git a/tests/jacobian_tests/affine/test_index.h b/tests/jacobian_tests/affine/test_index.h index c783448..34a316a 100644 --- a/tests/jacobian_tests/affine/test_index.h +++ 
b/tests/jacobian_tests/affine/test_index.h @@ -56,9 +56,9 @@ const char *test_index_jacobian_of_variable(void) int expected_p[3] = {0, 1, 2}; /* CSR row ptrs */ int expected_i[2] = {0, 2}; /* column indices */ - mu_assert("index jac vals", cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index jac p", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); - mu_assert("index jac i", cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -76,16 +76,17 @@ const char *test_index_jacobian_of_log(void) jacobian_init(idx); idx->eval_jacobian(idx); - /* d/dx log(x) = diag(1/x), then select rows 0 and 2 + /* d/dx log(x) = diag(1/x), then select rows 0 and 2. + * Each selected row has exactly one nonzero (the diagonal entry). * Row 0: 1/1 = 1.0 at col 0 * Row 1: 1/4 = 0.25 at col 2 */ double expected_x[2] = {1.0, 0.25}; + int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 2}; - mu_assert("index of log jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index of log jac cols", - cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -107,12 +108,9 @@ const char *test_index_jacobian_repeated(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 0}; /* Both reference col 0 */ - mu_assert("index repeated jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index repeated row ptr", - cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->p, expected_p, 3)); - mu_assert("index repeated jac i", - cmp_int_array(idx->jacobian->to_csr(idx->jacobian)->i, expected_i, 2)); + mu_assert("vals 
fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -133,12 +131,14 @@ const char *test_sum_of_index(void) jacobian_init(s); s->eval_jacobian(s); - /* Gradient: [1, 0, 1] in sparse form */ + /* Gradient: [1, 0, 1] in sparse form. Single output row holds both nnz. */ double expected_x[2] = {1.0, 1.0}; + int expected_p[2] = {0, 2}; int expected_i[2] = {0, 2}; - mu_assert("sum of index vals", cmp_double_array(s->jacobian->x, expected_x, 2)); - mu_assert("sum of index cols", cmp_int_array(s->jacobian->to_csr(s->jacobian)->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(s->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(s->jacobian, expected_p, expected_i, 1, 2)); free_expr(s); return 0; diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index 1fb7bff..c98aca7 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -61,9 +61,9 @@ const char *test_jacobian_left_matmul_log(void) int expected_Ai[7] = {0, 2, 0, 2, 0, 2, 0}; int expected_Ap[5] = {0, 2, 4, 6, 7}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 7)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, expected_Ai, 7)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 5)); + mu_assert("vals fail", cmp_values(A_log_x->jacobian, expected_Ax, 7)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 4, 7)); free_csr_matrix(A); free_expr(A_log_x); @@ -100,9 +100,9 @@ const char *test_jacobian_left_matmul_log_matrix(void) int expected_Ai[14] = {0, 2, 0, 2, 0, 2, 0, 3, 5, 3, 5, 3, 5, 3}; int expected_Ap[9] = {0, 2, 4, 6, 7, 9, 11, 13, 14}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, 
expected_Ax, 14)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->i, expected_Ai, 14)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->to_csr(A_log_x->jacobian)->p, expected_Ap, 9)); + mu_assert("vals fail", cmp_values(A_log_x->jacobian, expected_Ax, 14)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 8, 14)); free_csr_matrix(A); free_expr(A_log_x); diff --git a/tests/jacobian_tests/affine/test_neg.h b/tests/jacobian_tests/affine/test_neg.h index 3190bec..f4d859c 100644 --- a/tests/jacobian_tests/affine/test_neg.h +++ b/tests/jacobian_tests/affine/test_neg.h @@ -19,12 +19,9 @@ const char *test_neg_jacobian(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("neg jacobian vals fail", - cmp_double_array(neg_node->jacobian->x, expected_x, 3)); - mu_assert("neg jacobian rows fail", - cmp_int_array(neg_node->jacobian->to_csr(neg_node->jacobian)->p, expected_p, 4)); - mu_assert("neg jacobian cols fail", - cmp_int_array(neg_node->jacobian->to_csr(neg_node->jacobian)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(neg_node->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(neg_node->jacobian, expected_p, expected_i, 3, 3)); free_expr(neg_node); return 0; @@ -50,12 +47,9 @@ const char *test_neg_chain(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("neg chain jacobian vals fail", - cmp_double_array(neg2->jacobian->x, expected_x, 3)); - mu_assert("neg chain jacobian rows fail", - cmp_int_array(neg2->jacobian->to_csr(neg2->jacobian)->p, expected_p, 4)); - mu_assert("neg chain jacobian cols fail", - cmp_int_array(neg2->jacobian->to_csr(neg2->jacobian)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(neg2->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(neg2->jacobian, expected_p, expected_i, 3, 3)); free_expr(neg2); return 0; diff --git 
a/tests/jacobian_tests/affine/test_promote.h b/tests/jacobian_tests/affine/test_promote.h index 75f68fd..c0e31ed 100644 --- a/tests/jacobian_tests/affine/test_promote.h +++ b/tests/jacobian_tests/affine/test_promote.h @@ -22,12 +22,9 @@ const char *test_promote_scalar_jacobian(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 0, 0}; - mu_assert("promote jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 3)); - mu_assert("promote jacobian rows fail", - cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 4)); - mu_assert("promote jacobian cols fail", - cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(promote_node->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(promote_node->jacobian, expected_p, expected_i, 3, 3)); free_expr(promote_node); return 0; @@ -54,12 +51,9 @@ const char *test_promote_scalar_to_matrix_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 0, 0, 0, 0}; - mu_assert("promote matrix jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 6)); - mu_assert("promote matrix jacobian rows fail", - cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->p, expected_p, 7)); - mu_assert("promote matrix jacobian cols fail", - cmp_int_array(promote_node->jacobian->to_csr(promote_node->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(promote_node->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(promote_node->jacobian, expected_p, expected_i, 6, 6)); free_expr(promote_node); return 0; diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index ad74d60..f2b40f0 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -47,9 +47,9 @@ const char 
*test_jacobian_right_matmul_log(void) int expected_Ai[8] = {0, 2, 1, 3, 0, 2, 1, 3}; int expected_Ap[7] = {0, 2, 4, 4, 4, 6, 8}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 8)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 8)); - mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 7)); + mu_assert("vals fail", cmp_values(log_x_A->jacobian, expected_Ax, 8)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 6, 8)); free_csr_matrix(A); free_expr(log_x_A); @@ -92,9 +92,9 @@ const char *test_jacobian_right_matmul_log_vector(void) int expected_Ai[4] = {0, 1, 1, 2}; int expected_Ap[3] = {0, 2, 4}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 4)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->i, expected_Ai, 4)); - mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->to_csr(log_x_A->jacobian)->p, expected_Ap, 3)); + mu_assert("vals fail", cmp_values(log_x_A->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 2, 4)); free_csr_matrix(A); free_expr(log_x_A); diff --git a/tests/jacobian_tests/affine/test_scalar_mult.h b/tests/jacobian_tests/affine/test_scalar_mult.h index 3eda315..afdcdd4 100644 --- a/tests/jacobian_tests/affine/test_scalar_mult.h +++ b/tests/jacobian_tests/affine/test_scalar_mult.h @@ -36,12 +36,9 @@ const char *test_jacobian_scalar_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("scalar mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); - mu_assert("scalar mult log: row pointers fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); - mu_assert("scalar mult log: column indices fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 
3)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -73,12 +70,9 @@ const char *test_jacobian_scalar_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("scalar mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); - mu_assert("scalar mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); - mu_assert("scalar mult log matrix: column indices fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h index 026d3bf..150fe53 100644 --- a/tests/jacobian_tests/affine/test_sum.h +++ b/tests/jacobian_tests/affine/test_sum.h @@ -25,9 +25,9 @@ const char *test_jacobian_sum_log(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(sum_node); return 0; @@ -59,9 +59,9 @@ const char *test_jacobian_sum_mult(void) int expected_Ap[2] = {0, 6}; /* 1x10 matrix: row 0 spans all 6 nonzeros */ int expected_Ai[6] = {2, 3, 4, 6, 7, 8}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows 
fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 1, 6)); free_expr(sum_node); return 0; @@ -99,9 +99,9 @@ const char *test_jacobian_sum_log_axis_0(void) int expected_Ap[3] = {0, 3, 6}; int expected_Ai[6] = {2, 3, 4, 5, 6, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 2, 6)); free_expr(sum_node); return 0; @@ -146,9 +146,9 @@ const char *test_jacobian_sum_add_log_axis_0(void) int expected_Ai[12] = {2, 3, 4, 8, 9, 10, /* row 0 columns */ 5, 6, 7, 11, 12, 13}; /* row 1 columns */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 12)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 12)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 2, 12)); free_expr(sum_node); return 0; @@ -188,9 +188,9 @@ const char *test_jacobian_sum_log_axis_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 5, 3, 6, 4, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - 
mu_assert("rows fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->to_csr(sum_node->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(sum_node); return 0; diff --git a/tests/jacobian_tests/affine/test_trace.h b/tests/jacobian_tests/affine/test_trace.h index 0caa122..808d634 100644 --- a/tests/jacobian_tests/affine/test_trace.h +++ b/tests/jacobian_tests/affine/test_trace.h @@ -39,10 +39,9 @@ const char *test_jacobian_trace_variable(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ - mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(trace_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(trace_node); return 0; @@ -93,11 +92,10 @@ const char *test_jacobian_trace_composite(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ - mu_assert("vals match count", trace_node->jacobian->to_csr(trace_node->jacobian)->nnz == 3); - mu_assert("rows fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->to_csr(trace_node->jacobian)->i, expected_Ai, 3)); - mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); + mu_assert("vals match count", trace_node->jacobian->nnz == 3); + 
mu_assert("sparsity fail", + cmp_sparsity(trace_node->jacobian, expected_Ap, expected_Ai, 1, 3)); + mu_assert("vals fail", cmp_values(trace_node->jacobian, expected_Ax, 3)); free_expr(trace_node); return 0; diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index d2201da..523cb2e 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -33,12 +33,9 @@ const char *test_jacobian_transpose(void) int expected_p[5] = {0, 2, 4, 6, 8}; int expected_i[8] = {0, 1, 2, 3, 0, 1, 2, 3}; - mu_assert("jacobian values fail", - cmp_double_array(transpose_AX->jacobian->x, expected_x, 8)); - mu_assert("jacobian row ptr fail", - cmp_int_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->p, expected_p, 5)); - mu_assert("jacobian col idx fail", - cmp_int_array(transpose_AX->jacobian->to_csr(transpose_AX->jacobian)->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(transpose_AX->jacobian, expected_x, 8)); + mu_assert("sparsity fail", + cmp_sparsity(transpose_AX->jacobian, expected_p, expected_i, 4, 8)); free_expr(transpose_AX); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/affine/test_upper_tri.h b/tests/jacobian_tests/affine/test_upper_tri.h index 0a01a30..c4a22a1 100644 --- a/tests/jacobian_tests/affine/test_upper_tri.h +++ b/tests/jacobian_tests/affine/test_upper_tri.h @@ -27,10 +27,9 @@ const char *test_upper_tri_jacobian_variable(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {4, 8, 12, 9, 13, 14}; - mu_assert("upper_tri jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); - mu_assert("upper_tri jac p", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->p, expected_p, 7)); - mu_assert("upper_tri jac i", cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(ut->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(ut->jacobian, expected_p, 
expected_i, 6, 6)); free_expr(ut); return 0; @@ -57,12 +56,12 @@ const char *test_upper_tri_jacobian_of_log(void) ut->eval_jacobian(ut); double expected_x[6] = {0.2, 1.0 / 9.0, 1.0 / 13.0, 0.1, 1.0 / 14.0, 1.0 / 15.0}; + int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {4, 8, 12, 9, 13, 14}; - mu_assert("upper_tri log jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); - mu_assert("upper_tri log jac cols", - cmp_int_array(ut->jacobian->to_csr(ut->jacobian)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(ut->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(ut->jacobian, expected_p, expected_i, 6, 6)); free_expr(ut); return 0; diff --git a/tests/jacobian_tests/affine/test_vector_mult.h b/tests/jacobian_tests/affine/test_vector_mult.h index a686c54..8196f9e 100644 --- a/tests/jacobian_tests/affine/test_vector_mult.h +++ b/tests/jacobian_tests/affine/test_vector_mult.h @@ -35,12 +35,9 @@ const char *test_jacobian_vector_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vector mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); - mu_assert("vector mult log: row pointers fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 4)); - mu_assert("vector mult log: column indices fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -71,12 +68,9 @@ const char *test_jacobian_vector_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("vector mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); - mu_assert("vector mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->p, expected_p, 5)); - mu_assert("vector mult log 
matrix: column indices fail", - cmp_int_array(y->jacobian->to_csr(y->jacobian)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_vstack.h b/tests/jacobian_tests/affine/test_vstack.h index 2f6b2e3..86f4ac8 100644 --- a/tests/jacobian_tests/affine/test_vstack.h +++ b/tests/jacobian_tests/affine/test_vstack.h @@ -38,12 +38,9 @@ const char *test_jacobian_vstack_vectors(void) int expected_i[6] = {0, 1, 2, 0, 1, 2}; int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; - mu_assert("vstack jac vectors: vals", - cmp_double_array(stack->jacobian->x, expected_x, 6)); - mu_assert("vstack jac vectors: cols", - cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 6)); - mu_assert("vstack jac vectors: rows", - cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_p, 7)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_p, expected_i, 6, 6)); free_expr(stack); return 0; @@ -88,12 +85,9 @@ const char *test_jacobian_vstack_matrix(void) int expected_i[9] = {0, 1, 6, 2, 3, 7, 4, 5, 8}; int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vstack jac matrix: vals", - cmp_double_array(stack->jacobian->x, expected_x, 9)); - mu_assert("vstack jac matrix: cols", - cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->i, expected_i, 9)); - mu_assert("vstack jac matrix: rows", - cmp_int_array(stack->jacobian->to_csr(stack->jacobian)->p, expected_p, 10)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_p, expected_i, 9, 9)); free_expr(stack); return 0; diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h 
b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index bdf3bd9..d300d3b 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -24,9 +24,8 @@ const char *test_jacobian_elementwise_mult_1(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 6)); free_expr(node); return 0; } @@ -49,9 +48,8 @@ const char *test_jacobian_elementwise_mult_2(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 6)); free_expr(node); return 0; } @@ -112,9 +110,8 @@ const char *test_jacobian_elementwise_mult_3(void) int rows[4] = {0, 4, 10, 16}; int cols[16] = {2, 3, 7, 8, 2, 3, 4, 7, 8, 9, 2, 3, 4, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 16)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 16)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 16)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 16)); free_expr(node); free_csr_matrix(A); free_csr_matrix(B); @@ -157,9 +154,8 @@ 
const char *test_jacobian_elementwise_mult_4(void) int rows[4] = {0, 2, 5, 8}; int cols[8] = {2, 3, 2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 8)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, cols, 8)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 8)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 8)); free_expr(node); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h index e825918..6155252 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h @@ -49,15 +49,10 @@ const char *test_jacobian_matmul(void) Z->eval_jacobian(Z); /* Verify sparsity pattern */ - mu_assert("Jacobian should have 8 rows", Z->jacobian->to_csr(Z->jacobian)->m == z_size); - mu_assert("Jacobian should have 18 columns", Z->jacobian->to_csr(Z->jacobian)->n == n_vars); - mu_assert("Jacobian should have 48 nonzeros", Z->jacobian->to_csr(Z->jacobian)->nnz == 48); + mu_assert("Jacobian should have 18 columns", Z->jacobian->n == n_vars); - /* Check row pointers: each row should have 6 entries */ + /* Each row should have 6 entries. 
*/ int expected_p[9] = {0, 6, 12, 18, 24, 30, 36, 42, 48}; - mu_assert("Row pointers incorrect", - cmp_int_array(Z->jacobian->to_csr(Z->jacobian)->p, expected_p, 9)); - int expected_i[48] = {0, 2, 4, 6, 7, 8, /* row 0 */ 1, 3, 5, 6, 7, 8, /* row 1 */ 0, 2, 4, 9, 10, 11, /* row 2 */ @@ -66,8 +61,8 @@ const char *test_jacobian_matmul(void) 1, 3, 5, 12, 13, 14, /* row 5 */ 0, 2, 4, 15, 16, 17, /* row 6 */ 1, 3, 5, 15, 16, 17}; /* row 7 */ - mu_assert("Column indices incorrect", - cmp_int_array(Z->jacobian->to_csr(Z->jacobian)->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->jacobian, expected_p, expected_i, z_size, 48)); /* Verify Jacobian values row-wise: for each row, values are [Y^T row for the column, X row values] since X has lower var_id */ @@ -81,8 +76,7 @@ const char *test_jacobian_matmul(void) /* row 6 (col 3) */ 16.0, 17.0, 18.0, 1.0, 3.0, 5.0, /* row 7 (col 3) */ 16.0, 17.0, 18.0, 2.0, 4.0, 6.0}; - mu_assert("Jacobian values incorrect", - cmp_double_array(Z->jacobian->x, expected_x, 48)); + mu_assert("vals fail", cmp_values(Z->jacobian, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index 05eb5b2..7889075 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -25,9 +25,9 @@ const char *test_quad_over_lin1(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, 
expected_Ai, 1, 4)); free_expr(node); return 0; } @@ -49,9 +49,9 @@ const char *test_quad_over_lin2(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(node); return 0; } @@ -86,9 +86,9 @@ const char *test_quad_over_lin3(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); free_csr_matrix(A); free_expr(node); @@ -126,9 +126,9 @@ const char *test_quad_over_lin4(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); free_csr_matrix(A); free_expr(node); @@ -166,9 +166,9 @@ const char *test_quad_over_lin5(void) int expected_Ap[2] = {0, 7}; int expected_Ai[7] = {0, 1, 2, 3, 5, 6, 7}; - 
mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 7)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 7)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 7)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 7)); free_csr_matrix(A); free_expr(node); diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h index e432d86..e9998fa 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_jacobian_rel_entr_vector_args_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); free_expr(node); return 0; } @@ -61,9 +61,9 @@ const char *test_jacobian_rel_entr_vector_args_2(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); 
free_expr(node); return 0; } @@ -103,9 +103,9 @@ const char *test_jacobian_rel_entr_matrix_args(void) int expected_Ap[7] = {0, 2, 4, 6, 8, 10, 12}; int expected_Ai[12] = {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 7)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 6, 12)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 12)); free_expr(node); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index 2d54b3d..5236d3e 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_scalar_vector(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 1, 0, 2, 0, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 4e88988..d2f594f 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ 
b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_vector_scalar(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 3, 1, 3, 2, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index ad38123..5af53bd 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -33,9 +33,8 @@ const char *test_jacobian_composite_exp(void) double vals[6] = {3 * e10, 2 * e10, 1 * e10, 2 * e7, 1 * e7, 1 * e7}; int rows[3] = {0, 3, 6}; int cols[6] = {2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(exp_node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->p, rows, 3)); - mu_assert("cols fail", cmp_int_array(exp_node->jacobian->to_csr(exp_node->jacobian)->i, cols, 6)); + mu_assert("vals fail", cmp_values(exp_node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(exp_node->jacobian, rows, cols, 2, 6)); free_expr(exp_node); free_csr_matrix(A); return 0; diff --git a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h index d5b55c2..b671910 100644 --- a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h +++ b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h @@ -18,9 +18,9 @@ const char *test_jacobian_log(void) 
log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(log_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->jacobian, expected_Ap, expected_Ai, 3, 3)); free_expr(log_node); return 0; } @@ -36,9 +36,9 @@ const char *test_jacobian_log_matrix(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->p, expected_Ap, 5)); - mu_assert("cols fail", cmp_int_array(log_node->jacobian->to_csr(log_node->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(log_node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->jacobian, expected_Ap, expected_Ai, 4, 4)); free_expr(log_node); return 0; } diff --git a/tests/jacobian_tests/other/test_prod.h b/tests/jacobian_tests/other/test_prod.h index 0603ec4..7875437 100644 --- a/tests/jacobian_tests/other/test_prod.h +++ b/tests/jacobian_tests/other/test_prod.h @@ -25,9 +25,9 @@ const char *test_jacobian_prod_no_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, 
expected_Ap, expected_Ai, 1, 4)); free_expr(p); return 0; @@ -50,9 +50,9 @@ const char *test_jacobian_prod_one_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(p); return 0; @@ -73,9 +73,9 @@ const char *test_jacobian_prod_two_zeros(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_one.h b/tests/jacobian_tests/other/test_prod_axis_one.h index 8814cac..41cd7c8 100644 --- a/tests/jacobian_tests/other/test_prod_axis_one.h +++ b/tests/jacobian_tests/other/test_prod_axis_one.h @@ -42,9 +42,9 @@ const char *test_jacobian_prod_axis_one(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, 
expected_Ap, expected_Ai, 3, 9)); free_expr(p); return 0; @@ -85,9 +85,9 @@ const char *test_jacobian_prod_axis_one_one_zero(void) int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 9)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 3, 9)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_zero.h b/tests/jacobian_tests/other/test_prod_axis_zero.h index 5d80fb8..8db93d6 100644 --- a/tests/jacobian_tests/other/test_prod_axis_zero.h +++ b/tests/jacobian_tests/other/test_prod_axis_zero.h @@ -36,9 +36,9 @@ const char *test_jacobian_prod_axis_zero(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {1, 2, 3, 4, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->to_csr(p->jacobian)->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index c989987..d420751 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -31,9 +31,9 @@ const char *test_quad_form(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", 
cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(node); free_csr_matrix(Q); return 0; @@ -77,9 +77,9 @@ double expected_Ax[5] = {422, 2222, 3244, 3786, 120}; int expected_Ap[2] = {0, 5}; int expected_Ai[5] = {0, 2, 3, 4, 5}; -mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 5)); -mu_assert("rows fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->p, expected_Ap, 2)); -mu_assert("cols fail", cmp_int_array(node->jacobian->to_csr(node->jacobian)->i, expected_Ai, 5)); +mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 5)); +mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 5)); free_expr(node); free_expr(Au); free_csr_matrix(Q); diff --git a/tests/numerical_diff.c b/tests/numerical_diff.c index 291ae7d..3cce811 100644 --- a/tests/numerical_diff.c +++ b/tests/numerical_diff.c @@ -127,6 +127,12 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double memcpy(u_work, u, n * sizeof(double)); + /* Hoist the CSR view once. For Sparse_Matrix (the only type used by tests + that reach here), csr->x aliases node->jacobian->x, so eval_jacobian + writes inside the loop update jac->x in place. A PD-backed Jacobian + would need a per-iteration to_csr refresh; not exercised today. 
*/ + CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + for (int j = 0; j < n; j++) { /* g(u + h*e_j) */ @@ -134,14 +140,14 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double node->forward(node, u_work); node->eval_jacobian(node); memset(g_plus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian->to_csr(node->jacobian), w, g_plus); + csr_transpose_mult_vec(jac, w, g_plus); /* g(u - h*e_j) */ u_work[j] = u[j] - h; node->forward(node, u_work); node->eval_jacobian(node); memset(g_minus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian->to_csr(node->jacobian), w, g_minus); + csr_transpose_mult_vec(jac, w, g_minus); u_work[j] = u[j]; diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 2af45a9..1e4370a 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -90,7 +90,8 @@ const char *profile_log_reg(void) double *w_ones = (double *) malloc(m * sizeof(double)); for (int i = 0; i < m; i++) w_ones[i] = 1.0; - /* ---- Path B: time the manual chain rule, Jacobian and Hessian separately ---- */ + /* ---- Path B: time the manual chain rule, Jacobian and Hessian separately ---- + */ Timer t_b_jac, t_b_hess; /* dwork = sigmoid(z); used as the diagonal in DA below and (still in dwork) as sigmas read by local_wsum_hess. 
*/ @@ -99,7 +100,7 @@ const char *profile_log_reg(void) permuted_dense_DA_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); permuted_dense_to_csr_fill_values(Jlog_pd, Jlog_csr); memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); - accumulator(Jlog_csr, idx_map, Jobj_csr->x); + accumulator(Jlog_csr->x, Jlog_csr->nnz, idx_map, Jobj_csr->x); clock_gettime(CLOCK_MONOTONIC, &t_b_jac.end); clock_gettime(CLOCK_MONOTONIC, &t_b_hess.start); log_obj->local_wsum_hess(log_obj, d2, w_ones); diff --git a/tests/test_helpers.c b/tests/test_helpers.c index 90d8446..e00d3c7 100644 --- a/tests/test_helpers.c +++ b/tests/test_helpers.c @@ -5,6 +5,7 @@ #include "expr.h" #include "utils/CSR_Matrix.h" +#include "utils/matrix.h" #define EPSILON 1e-7 @@ -44,6 +45,32 @@ int cmp_int_array(const int *actual, const int *expected, int size) return 1; } +int cmp_sparsity(Matrix *M, const int *exp_p, const int *exp_i, int m, int nnz) +{ + if (M->m != m) + { + printf(" FAILED: M->m = %d, expected %d\n", M->m, m); + return 0; + } + if (M->nnz != nnz) + { + printf(" FAILED: M->nnz = %d, expected %d\n", M->nnz, nnz); + return 0; + } + CSR_Matrix *csr = M->to_csr(M); + return cmp_int_array(csr->p, exp_p, m + 1) && cmp_int_array(csr->i, exp_i, nnz); +} + +int cmp_values(const Matrix *M, const double *exp_x, int nnz) +{ + if (M->nnz != nnz) + { + printf(" FAILED: M->nnz = %d, expected %d\n", M->nnz, nnz); + return 0; + } + return cmp_double_array(M->x, exp_x, nnz); +} + #ifndef M_PI #define M_PI 3.14159265358979323846 #endif diff --git a/tests/test_helpers.h b/tests/test_helpers.h index fd47151..8d9decc 100644 --- a/tests/test_helpers.h +++ b/tests/test_helpers.h @@ -3,6 +3,7 @@ #include "expr.h" #include "utils/CSR_Matrix.h" +#include "utils/matrix.h" /* Compare two double arrays directly * Returns 1 if all values match, 0 otherwise */ @@ -12,6 +13,15 @@ int cmp_double_array(const double *actual, const double *expected, int size); * Returns 1 if all values match, 0 otherwise */ int 
cmp_int_array(const int *actual, const int *expected, int size); +/* Verify M has shape (m, *) with exactly nnz entries, and that M's CSR + * row pointers and column indices match exp_p (length m+1) and exp_i + * (length nnz). Returns 1 on full match, 0 otherwise. */ +int cmp_sparsity(Matrix *M, const int *exp_p, const int *exp_i, int m, int nnz); + +/* Verify M has nnz entries and that its value array matches exp_x of + * length nnz. Returns 1 on full match, 0 otherwise. */ +int cmp_values(const Matrix *M, const double *exp_x, int nnz); + /* Create a random m x n CSR matrix with approximate nonzero density * in [0, 1]. Nonzero values are standard Gaussian (Box-Muller). */ CSR_Matrix *new_csr_random(int m, int n, double density); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 2e385a0..2b1cddc 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -359,4 +359,213 @@ const char *test_permuted_dense_col_inv(void) return 0; } +/* PD index_alloc / index_fill_values: select rows from a PD; output must be + another PD with row_perm equal to the output positions where indices[i] + hit the source row_perm. */ +const char *test_permuted_dense_index(void) +{ + /* Source PD, shape (6, 4), dense block at rows {1, 3, 4} x cols {0, 2}. 
*/ + int row_perm[3] = {1, 3, 4}; + int col_perm[2] = {0, 2}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + Matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + + /* Index by [0, 3, 1, 5, 4]: + - position 0 -> source row 0 (not in row_perm, zero) + - position 1 -> source row 3 (in row_perm at ii=1, dense) + - position 2 -> source row 1 (in row_perm at ii=0, dense) + - position 3 -> source row 5 (not in row_perm, zero) + - position 4 -> source row 4 (in row_perm at ii=2, dense) */ + int indices[5] = {0, 3, 1, 5, 4}; + Matrix *out = M->index_alloc(M, indices, 5); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 5); + mu_assert("out n", out->n == 4); + mu_assert("out nnz", out->nnz == 6); /* dense_m=3 * dense_n=2 */ + mu_assert("dense_m", out_pd->dense_m == 3); + mu_assert("dense_n", out_pd->dense_n == 2); + + int expected_row_perm[3] = {1, 2, 4}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 3)); + int expected_col_perm[2] = {0, 2}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_col_perm, 2)); + + M->index_fill_values(M, indices, 5, out); + + /* Row 0 of out (i=1) = source row 3 = X[1, :] = {3, 4}. + Row 1 of out (i=2) = source row 1 = X[0, :] = {1, 2}. + Row 2 of out (i=4) = source row 4 = X[2, :] = {5, 6}. */ + double expected_X[6] = {3.0, 4.0, 1.0, 2.0, 5.0, 6.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 6)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD promote_alloc / promote_fill_values: tile a 1-row PD into a + `size`-row PD where every row is a copy of the source row. */ +const char *test_permuted_dense_promote(void) +{ + /* Source PD, shape (1, 5), single dense row at row 0, cols {1, 3}. 
*/ + int row_perm[1] = {0}; + int col_perm[2] = {1, 3}; + double X[2] = {7.0, 9.0}; + Matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + + Matrix *out = M->promote_alloc(M, 4); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 4); + mu_assert("out n", out->n == 5); + mu_assert("out nnz", out->nnz == 8); /* dense_m=4 * dense_n=2 */ + mu_assert("dense_m", out_pd->dense_m == 4); + mu_assert("dense_n", out_pd->dense_n == 2); + + int expected_row_perm[4] = {0, 1, 2, 3}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 4)); + int expected_col_perm[2] = {1, 3}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_col_perm, 2)); + + M->promote_fill_values(M, out); + + double expected_X[8] = {7.0, 9.0, 7.0, 9.0, 7.0, 9.0, 7.0, 9.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, SCALAR variant. + (1, 5) PD with single dense row -> (d1*d2, 5) PD with that row tiled. 
*/ +const char *test_permuted_dense_broadcast_scalar(void) +{ + int row_perm[1] = {0}; + int col_perm[2] = {1, 3}; + double X[2] = {7.0, 9.0}; + Matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + + int d1 = 2, d2 = 3; /* out shape (2, 3), m = 6 */ + Matrix *out = M->broadcast_alloc(M, BROADCAST_SCALAR, d1, d2); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("out n", out->n == 5); + mu_assert("dense_m", out_pd->dense_m == 6); + mu_assert("dense_n", out_pd->dense_n == 2); + int expected_rp[6] = {0, 1, 2, 3, 4, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 6)); + + M->broadcast_fill_values(M, BROADCAST_SCALAR, d1, d2, out); + double expected_X[12] = {7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 12)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, ROW variant. + (1, d2) input has Jacobian of shape (d2, n_vars). Source PD: m=d2=3, + row_perm={0, 2} (rows 0 and 2 dense), col_perm={1, 4}, single dense row + per dense_m. Output (d1, d2) = (2, 3): each child row replicated d1=2 + times. 
*/ +const char *test_permuted_dense_broadcast_row(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, /* row corresponding to child row 0 */ + 3.0, 4.0}; /* row corresponding to child row 2 */ + Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + int d1 = 2, d2 = 3; /* output (2, 3), out m = 6 */ + Matrix *out = M->broadcast_alloc(M, BROADCAST_ROW, d1, d2); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("dense_m", out_pd->dense_m == 4); /* d1 * 2 */ + mu_assert("dense_n", out_pd->dense_n == 2); + /* row_perm = {child_row_perm[0]*d1, +1, child_row_perm[1]*d1, +1} + = {0, 1, 4, 5} */ + int expected_rp[4] = {0, 1, 4, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 4)); + + M->broadcast_fill_values(M, BROADCAST_ROW, d1, d2, out); + /* each child row replicated d1 times */ + double expected_X[8] = {1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, COL variant. + (d1, 1) input has Jacobian of shape (d1, n_vars). Source PD: m=d1=3, + row_perm={0, 2}, col_perm={1, 4}, two dense rows. Output (d1, d2) = (3, 2), + out m = 6: each child row appears d2 times, shifted by j*d1. 
*/ +const char *test_permuted_dense_broadcast_col(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + int d1 = 3, d2 = 2; + Matrix *out = M->broadcast_alloc(M, BROADCAST_COL, d1, d2); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("dense_m", out_pd->dense_m == 4); /* d2 * 2 */ + mu_assert("dense_n", out_pd->dense_n == 2); + /* row_perm = {0+0, 0+2, 3+0, 3+2} = {0, 2, 3, 5} */ + int expected_rp[4] = {0, 2, 3, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 4)); + + M->broadcast_fill_values(M, BROADCAST_COL, d1, d2, out); + /* X = d2 copies of full source X block */ + double expected_X[8] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD diag_vec_alloc / diag_vec_fill_values. + Source PD shape (3, 6) with dense_m=2 (rows 0 and 2) -> output PD shape + (9, 6) with the same 2 dense rows mapped to positions {0, 8} = {0*4, 2*4}. 
*/ +const char *test_permuted_dense_diag_vec(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, + 3.0, 4.0}; + Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + Matrix *out = M->diag_vec_alloc(M); + Permuted_Dense *out_pd = (Permuted_Dense *) out; + + mu_assert("out m", out->m == 9); + mu_assert("out n", out->n == 6); + mu_assert("dense_m", out_pd->dense_m == 2); + mu_assert("dense_n", out_pd->dense_n == 2); + /* row_perm = {0*(n+1), 2*(n+1)} = {0, 8} */ + int expected_rp[2] = {0, 8}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 2)); + int expected_cp[2] = {1, 4}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_cp, 2)); + + M->diag_vec_fill_values(M, out); + /* X is identical to the source X */ + double expected_X[4] = {1.0, 2.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 4)); + + free_matrix(out); + free_matrix(M); + return 0; +} + #endif /* TEST_PERMUTED_DENSE_H */ diff --git a/tests/wsum_hess/affine/test_broadcast.h b/tests/wsum_hess/affine/test_broadcast.h index ba4de3b..6635a7d 100644 --- a/tests/wsum_hess/affine/test_broadcast.h +++ b/tests/wsum_hess/affine/test_broadcast.h @@ -56,12 +56,9 @@ const char *test_wsum_hess_broadcast_row(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("broadcast row wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); - mu_assert("broadcast row wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); - mu_assert("broadcast row wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(bcast); return 0; @@ -111,12 +108,9 @@ const char 
*test_wsum_hess_broadcast_col(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("broadcast col wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); - mu_assert("broadcast col wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 4)); - mu_assert("broadcast col wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(bcast); return 0; @@ -159,12 +153,9 @@ const char *test_wsum_hess_broadcast_scalar_to_matrix(void) int expected_p[2] = {0, 1}; int expected_i[1] = {0}; - mu_assert("broadcast scalar wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 1)); - mu_assert("broadcast scalar wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->p, expected_p, 2)); - mu_assert("broadcast scalar wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->to_csr(bcast->wsum_hess)->i, expected_i, 1)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 1)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 1, 1)); free_expr(bcast); return 0; diff --git a/tests/wsum_hess/affine/test_convolve.h b/tests/wsum_hess/affine/test_convolve.h index f44aaba..98dd7ab 100644 --- a/tests/wsum_hess/affine/test_convolve.h +++ b/tests/wsum_hess/affine/test_convolve.h @@ -26,10 +26,10 @@ const char *test_wsum_hess_convolve(void) wsum_hess_init(y); y->eval_wsum_hess(y, w); - mu_assert("Convolve wsum_hess should be 3x3", y->wsum_hess->to_csr(y->wsum_hess)->m == 3); - mu_assert("Convolve wsum_hess should be square", y->wsum_hess->to_csr(y->wsum_hess)->n == 3); + mu_assert("Convolve wsum_hess should be 3x3", y->wsum_hess->m == 3); + 
mu_assert("Convolve wsum_hess should be square", y->wsum_hess->n == 3); mu_assert("Convolve wsum_hess should have zero nonzeros", - y->wsum_hess->to_csr(y->wsum_hess)->nnz == 0); + y->wsum_hess->nnz == 0); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_diag_mat.h b/tests/wsum_hess/affine/test_diag_mat.h index 736f129..2774d5f 100644 --- a/tests/wsum_hess/affine/test_diag_mat.h +++ b/tests/wsum_hess/affine/test_diag_mat.h @@ -36,10 +36,9 @@ const char *test_wsum_hess_diag_mat_log(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("diag_mat log hess vals", - cmp_double_array(dm->wsum_hess->x, expected_x, 4)); - mu_assert("diag_mat log hess p", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->p, expected_p, 5)); - mu_assert("diag_mat log hess i", cmp_int_array(dm->wsum_hess->to_csr(dm->wsum_hess)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(dm->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(dm->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(dm); return 0; diff --git a/tests/wsum_hess/affine/test_hstack.h b/tests/wsum_hess/affine/test_hstack.h index 8cb1f73..49809a9 100644 --- a/tests/wsum_hess/affine/test_hstack.h +++ b/tests/wsum_hess/affine/test_hstack.h @@ -90,12 +90,9 @@ const char *test_wsum_hess_hstack(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 9)); - mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 10)); - mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->i, expected_i, 9)); + mu_assert("vals fail", cmp_values(hstack_node->wsum_hess, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(hstack_node->wsum_hess, expected_p, expected_i, 9, 9)); free_expr(hstack_node); return 0; @@ -200,12 
+197,9 @@ const char *test_wsum_hess_hstack_matrix(void) int expected_i[18] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; - mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 18)); - mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->p, expected_p, 19)); - mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->to_csr(hstack_node->wsum_hess)->i, expected_i, 18)); + mu_assert("vals fail", cmp_values(hstack_node->wsum_hess, expected_x, 18)); + mu_assert("sparsity fail", + cmp_sparsity(hstack_node->wsum_hess, expected_p, expected_i, 18, 18)); free_expr(hstack_node); return 0; diff --git a/tests/wsum_hess/affine/test_index.h b/tests/wsum_hess/affine/test_index.h index 3bf390d..6d77a39 100644 --- a/tests/wsum_hess/affine/test_index.h +++ b/tests/wsum_hess/affine/test_index.h @@ -37,10 +37,9 @@ const char *test_wsum_hess_index_log(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("index log hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); - mu_assert("index log hess p", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); - mu_assert("index log hess i", cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(idx->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(idx->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(idx); return 0; @@ -70,12 +69,9 @@ const char *test_wsum_hess_index_repeated(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("index repeated hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); - mu_assert("index repeated hess p", - cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->p, expected_p, 4)); - mu_assert("index repeated hess i", - cmp_int_array(idx->wsum_hess->to_csr(idx->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", 
cmp_values(idx->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(idx->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(idx); return 0; @@ -109,12 +105,9 @@ const char *test_wsum_hess_sum_index_log(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("sum index log hess vals", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 3)); - mu_assert("sum index log hess p", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 4)); - mu_assert("sum index log hess i", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sum_node); return 0; diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index 3ccd99c..c6d6859 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -80,10 +80,9 @@ const char *test_wsum_hess_left_matmul(void) int expected_i[3] = {0, 1, 2}; int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 4)); + mu_assert("vals fail", cmp_values(A_log_x->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 3, 3)); free_csr_matrix(A); free_expr(A_log_x); @@ -189,10 +188,9 @@ const char *test_wsum_hess_left_matmul_matrix(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - 
cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->i, expected_i, 6)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->to_csr(A_log_x->wsum_hess)->p, expected_p, 7)); + mu_assert("vals fail", cmp_values(A_log_x->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 6, 6)); free_csr_matrix(A); free_expr(A_log_x); diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index cc89881..dc843cc 100644 --- a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -50,10 +50,9 @@ const char *test_wsum_hess_right_matmul(void) int expected_i[4] = {0, 1, 2, 3}; int expected_p[5] = {0, 1, 2, 3, 4}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 4)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 4)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 5)); + mu_assert("vals fail", cmp_values(log_x_A->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 4, 4)); free_csr_matrix(A); free_expr(log_x_A); @@ -99,10 +98,9 @@ const char *test_wsum_hess_right_matmul_vector(void) int expected_i[3] = {0, 1, 2}; int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->to_csr(log_x_A->wsum_hess)->p, expected_p, 4)); + mu_assert("vals fail", cmp_values(log_x_A->wsum_hess, expected_x, 3)); + mu_assert("sparsity 
fail", + cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 3, 3)); free_csr_matrix(A); free_expr(log_x_A); diff --git a/tests/wsum_hess/affine/test_scalar_mult.h b/tests/wsum_hess/affine/test_scalar_mult.h index 88c4709..d08e134 100644 --- a/tests/wsum_hess/affine/test_scalar_mult.h +++ b/tests/wsum_hess/affine/test_scalar_mult.h @@ -38,12 +38,9 @@ const char *test_wsum_hess_scalar_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("scalar mult log hess: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 3)); - mu_assert("scalar mult log hess: row pointers fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); - mu_assert("scalar mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -76,12 +73,9 @@ const char *test_wsum_hess_scalar_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("scalar mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); - mu_assert("scalar mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); - mu_assert("scalar mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index 9767c86..cc1b69c 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -63,12 +63,9 @@ const char *test_wsum_hess_sum_log_axis0(void) int 
expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 3, 4, 5}; - mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); - mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); - mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 6, 6)); free_expr(sum_node); @@ -101,12 +98,9 @@ const char *test_wsum_hess_sum_log_axis1(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 3, 4, 5}; - mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); - mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->p, expected_p, 7)); - mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->to_csr(sum_node->wsum_hess)->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 6, 6)); free_expr(sum_node); diff --git a/tests/wsum_hess/affine/test_trace.h b/tests/wsum_hess/affine/test_trace.h index a37d558..bfbb5e0 100644 --- a/tests/wsum_hess/affine/test_trace.h +++ b/tests/wsum_hess/affine/test_trace.h @@ -35,10 +35,10 @@ const char *test_wsum_hess_trace_variable(void) trace_node->eval_wsum_hess(trace_node, &w); /* For a linear operation (variable), Hessian is zero */ - mu_assert("wsum_hess should be empty", trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->nnz == 0); + mu_assert("wsum_hess should be empty", trace_node->wsum_hess->nnz == 0); mu_assert("dims correct", - trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->m == 13 && trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->n == 13); + trace_node->wsum_hess->m == 13 
&& trace_node->wsum_hess->n == 13); free_expr(trace_node); return 0; @@ -70,11 +70,9 @@ const char *test_wsum_hess_trace_log_variable(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - CSR_Matrix *H = trace_node->wsum_hess->to_csr(trace_node->wsum_hess); - mu_assert("nnz wrong", H->nnz == 9); - mu_assert("vals match", cmp_double_array(H->x, expected_Ax, 9)); - mu_assert("cols match", cmp_int_array(H->i, expected_Ai, 9)); - mu_assert("rows fail", cmp_int_array(H->p, expected_Ap, 14)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->wsum_hess, expected_Ap, expected_Ai, 13, 9)); + mu_assert("vals fail", cmp_values(trace_node->wsum_hess, expected_Ax, 9)); free_expr(trace_node); return 0; @@ -135,11 +133,9 @@ const char *test_wsum_hess_trace_composite(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("nnz wrong", trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->nnz == 9); - mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->p, expected_Ap, 14)); - mu_assert("vals match", - cmp_double_array(trace_node->wsum_hess->x, expected_Ax, 9)); - mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->to_csr(trace_node->wsum_hess)->i, expected_Ai, 9)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->wsum_hess, expected_Ap, expected_Ai, 13, 9)); + mu_assert("vals fail", cmp_values(trace_node->wsum_hess, expected_Ax, 9)); free_expr(trace_node); return 0; } diff --git a/tests/wsum_hess/affine/test_transpose.h b/tests/wsum_hess/affine/test_transpose.h index 0911112..ad4e93f 100644 --- a/tests/wsum_hess/affine/test_transpose.h +++ b/tests/wsum_hess/affine/test_transpose.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_transpose(void) int expected_p[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16}; int expected_i[16] = {4, 6, 4, 6, 5, 7, 5, 7, 0, 1, 2, 3, 0, 1, 2, 3}; - 
mu_assert("hess values fail", - cmp_double_array(XYT->wsum_hess->x, expected_x, 8)); - mu_assert("jacobian row ptr fail", - cmp_int_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->p, expected_p, 5)); - mu_assert("jacobian col idx fail", - cmp_int_array(XYT->wsum_hess->to_csr(XYT->wsum_hess)->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(XYT->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(XYT->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(XYT); return 0; diff --git a/tests/wsum_hess/affine/test_upper_tri.h b/tests/wsum_hess/affine/test_upper_tri.h index 3c874ba..0cff5b6 100644 --- a/tests/wsum_hess/affine/test_upper_tri.h +++ b/tests/wsum_hess/affine/test_upper_tri.h @@ -49,12 +49,9 @@ const char *test_wsum_hess_upper_tri_log(void) int expected_p[17] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; int expected_i[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - mu_assert("upper_tri log hess vals", - cmp_double_array(ut->wsum_hess->x, expected_x, 16)); - mu_assert("upper_tri log hess p", - cmp_int_array(ut->wsum_hess->to_csr(ut->wsum_hess)->p, expected_p, 17)); - mu_assert("upper_tri log hess i", - cmp_int_array(ut->wsum_hess->to_csr(ut->wsum_hess)->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(ut->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(ut->wsum_hess, expected_p, expected_i, 16, 16)); free_expr(ut); return 0; diff --git a/tests/wsum_hess/affine/test_vector_mult.h b/tests/wsum_hess/affine/test_vector_mult.h index e81371f..5d0d140 100644 --- a/tests/wsum_hess/affine/test_vector_mult.h +++ b/tests/wsum_hess/affine/test_vector_mult.h @@ -38,12 +38,9 @@ const char *test_wsum_hess_vector_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vector mult log hess: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 3)); - mu_assert("vector mult log hess: row pointers fail", - 
cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 4)); - mu_assert("vector mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -76,12 +73,9 @@ const char *test_wsum_hess_vector_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("vector mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); - mu_assert("vector mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->p, expected_p, 5)); - mu_assert("vector mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->to_csr(y->wsum_hess)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_vstack.h b/tests/wsum_hess/affine/test_vstack.h index d98323e..cb473a1 100644 --- a/tests/wsum_hess/affine/test_vstack.h +++ b/tests/wsum_hess/affine/test_vstack.h @@ -39,12 +39,9 @@ const char *test_wsum_hess_vstack_vectors(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vstack hess vectors: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 3)); - mu_assert("vstack hess vectors: rows", - cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 4)); - mu_assert("vstack hess vectors: cols", - cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(stack->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(stack->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(stack); return 0; @@ -102,12 +99,9 @@ const char 
*test_wsum_hess_vstack_matrix(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - mu_assert("vstack hess matrix: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 9)); - mu_assert("vstack hess matrix: rows", - cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->p, expected_p, 10)); - mu_assert("vstack hess matrix: cols", - cmp_int_array(stack->wsum_hess->to_csr(stack->wsum_hess)->i, expected_i, 9)); + mu_assert("vals fail", cmp_values(stack->wsum_hess, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->wsum_hess, expected_p, expected_i, 9, 9)); free_expr(stack); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_matmul.h b/tests/wsum_hess/bivariate_full_dom/test_matmul.h index 357551b..9241609 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_matmul.h +++ b/tests/wsum_hess/bivariate_full_dom/test_matmul.h @@ -49,16 +49,10 @@ const char *test_wsum_hess_matmul(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->to_csr(Z->wsum_hess)->nnz == 48); + mu_assert("Hessian should be 18 cols", Z->wsum_hess->n == n_vars); int expected_p[19] = {0, 4, 8, 12, 16, 20, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48}; - - mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->p, expected_p, 19)); - int expected_i[48] = {6, 9, 12, 15, /* row 0 */ 6, 9, 12, 15, /* row 1 */ 7, 10, 13, 16, /* row 2 */ @@ -78,8 +72,8 @@ const char *test_wsum_hess_matmul(void) 2, 3, /* row 16*/ 4, 5}; - mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->wsum_hess, expected_p, expected_i, n_vars, 48)); 
double expected_x[48] = {1.0, 3.0, 5.0, 7.0, /* row 0 */ 2.0, 4.0, 6.0, 8.0, /* row 1 */ @@ -100,8 +94,7 @@ const char *test_wsum_hess_matmul(void) 7.0, 8.0, /* row 16 */ 7.0, 8.0}; /* row 17 */ - mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + mu_assert("vals fail", cmp_values(Z->wsum_hess, expected_x, 48)); free_expr(Z); return 0; @@ -150,9 +143,7 @@ const char *test_wsum_hess_matmul_yx(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->to_csr(Z->wsum_hess)->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->to_csr(Z->wsum_hess)->nnz == 48); + mu_assert("Hessian should be 18 cols", Z->wsum_hess->n == n_vars); /* Row pointers when Y < X: * Rows 0-11 (Y variables): each couples with m=2 X variables @@ -161,9 +152,6 @@ const char *test_wsum_hess_matmul_yx(void) int expected_p[19] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, 40, 44, 48}; - mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->p, expected_p, 19)); - /* Column indices when Y < X: * Y[k_idx, col] couples with X[row, k_idx] for all row * X variable index = 12 + row + k_idx*m @@ -189,8 +177,8 @@ const char *test_wsum_hess_matmul_yx(void) 2, 5, 8, 11, /* row 16: X[0,2] */ 2, 5, 8, 11}; /* row 17: X[1,2] */ - mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->to_csr(Z->wsum_hess)->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->wsum_hess, expected_p, expected_i, n_vars, 48)); double expected_x[48] = {1.0, 2.0, /* row 0 */ 1.0, 2.0, /* row 1 */ @@ -211,8 +199,7 @@ const char *test_wsum_hess_matmul_yx(void) 1.0, 3.0, 5.0, 7.0, /* row 16 */ 2.0, 4.0, 6.0, 8.0}; /* row 17 */ - mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + mu_assert("vals fail", 
cmp_values(Z->wsum_hess, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index 414755e..d1b8484 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_multiply_1(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 6)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 6)); free_expr(node); return 0; @@ -96,12 +96,9 @@ const char *test_wsum_hess_multiply_sparse_random(void) 0.361058, -0.09679721, -0.02914438, 0.02402617, 0.31718166, -0.03065625, -0.09679721, -0.18801593, 0.01371497, -0.01130641, 0.06305481}; - mu_assert("p array fails", - cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->p, expected_p, 11)); - mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 29)); - mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 29)); + mu_assert("sparsity fail", + cmp_sparsity(mult_node->wsum_hess, expected_p, expected_i, 10, 29)); + mu_assert("vals fail", cmp_values(mult_node->wsum_hess, expected_x, 29)); /* Cleanup */ free_expr(mult_node); @@ -179,12 +176,9 @@ const char *test_wsum_hess_multiply_linear_ops(void) int expected_i[7] = {0, 2, 1, 2, 0, 1, 2}; double expected_x[7] = {74.0, 75.0, 216.0, 18.0, 75.0, 18.0, 76.0}; - mu_assert("p array fails", - 
cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->p, expected_p, 4)); - mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->to_csr(mult_node->wsum_hess)->i, expected_i, 7)); - mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 7)); + mu_assert("sparsity fail", + cmp_sparsity(mult_node->wsum_hess, expected_p, expected_i, 3, 7)); + mu_assert("vals fail", cmp_values(mult_node->wsum_hess, expected_x, 7)); /* Cleanup */ free_expr(mult_node); @@ -219,9 +213,9 @@ const char *test_wsum_hess_multiply_2(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 6)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 6)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h index 37b7e55..2cb5dc4 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h @@ -26,9 +26,9 @@ const char *test_wsum_hess_quad_over_lin_xy(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 9, 10)); + 
mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; @@ -56,9 +56,9 @@ const char *test_wsum_hess_quad_over_lin_yx(void) double expected_x[10] = {0.875, -0.25, -0.5, -0.75, -0.25, 1.0, -0.5, 1.0, -0.75, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 9, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h index b1a907c..5c3adde 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_rel_entr_1(void) double expected_x[12] = {1.0, -0.25, 1.0, -0.4, 1.0, -0.5, -0.25, 0.0625, -0.4, 0.16, -0.5, 0.25}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 12)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 12)); free_expr(node); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_rel_entr_2(void) double expected_x[12] = {0.0625, -0.25, 0.16, -0.4, 0.25, -0.5, -0.25, 1.0, -0.4, 1.0, -0.5, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); - mu_assert("i array fails", 
cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 12)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 12)); free_expr(node); return 0; @@ -98,9 +98,9 @@ const char *test_wsum_hess_rel_entr_matrix(void) -1.0 / 6.0, 1.0 / 36.0, -0.4, 0.16, -0.75, 0.5625, -4.0 / 3.0, 1.7777777777777777, -2.5, 6.25, -6.0, 36.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 24)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 24)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 24)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 24)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index 8a07632..74ba8b7 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_scalar_vector(void) double expected_x[10] = {15.0, -2.0, -5.0 / 3.0, -1.5, -2.0, 1.0, -5.0 / 3.0, 5.0 / 9.0, -1.5, 0.375}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 4, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); 
free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index ced2093..d74e92b 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_vector_scalar(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 4, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/elementwise_full_dom/test_exp.h b/tests/wsum_hess/elementwise_full_dom/test_exp.h index c67e921..891eae0 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_exp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_exp.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_exp(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(exp_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(exp_node->wsum_hess->to_csr(exp_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(exp_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(exp_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(exp_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h 
b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h index b0d0101..cf543a7 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h @@ -34,12 +34,9 @@ const char *test_wsum_hess_sinh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(sinh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(sinh_node->wsum_hess->to_csr(sinh_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sinh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sinh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sinh_node); @@ -72,12 +69,9 @@ const char *test_wsum_hess_tanh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(tanh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(tanh_node->wsum_hess->to_csr(tanh_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(tanh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(tanh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(tanh_node); @@ -110,12 +104,9 @@ const char *test_wsum_hess_asinh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(asinh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(asinh_node->wsum_hess->to_csr(asinh_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", 
cmp_values(asinh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(asinh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(asinh_node); @@ -149,12 +140,9 @@ const char *test_wsum_hess_atanh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(atanh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(atanh_node->wsum_hess->to_csr(atanh_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(atanh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(atanh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(atanh_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_logistic.h b/tests/wsum_hess/elementwise_full_dom/test_logistic.h index 3433dd5..025ec8c 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_logistic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_logistic.h @@ -42,12 +42,9 @@ const char *test_wsum_hess_logistic(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(logistic_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(logistic_node->wsum_hess->to_csr(logistic_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(logistic_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(logistic_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(logistic_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_power.h b/tests/wsum_hess/elementwise_full_dom/test_power.h index 1d77ab3..1456bef 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_power.h +++ 
b/tests/wsum_hess/elementwise_full_dom/test_power.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_power(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(power_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(power_node->wsum_hess->to_csr(power_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(power_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(power_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(power_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_trig.h b/tests/wsum_hess/elementwise_full_dom/test_trig.h index 7b501bb..550c272 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_trig.h +++ b/tests/wsum_hess/elementwise_full_dom/test_trig.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_sin(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(sin_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(sin_node->wsum_hess->to_csr(sin_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sin_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sin_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sin_node); @@ -56,12 +53,9 @@ const char *test_wsum_hess_cos(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(cos_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - 
cmp_int_array(cos_node->wsum_hess->to_csr(cos_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(cos_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(cos_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(cos_node); @@ -87,12 +81,9 @@ const char *test_wsum_hess_tan(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(tan_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(tan_node->wsum_hess->to_csr(tan_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(tan_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(tan_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(tan_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_xexp.h b/tests/wsum_hess/elementwise_full_dom/test_xexp.h index bb06dc7..55e80ef 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_xexp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_xexp.h @@ -28,12 +28,9 @@ const char *test_wsum_hess_xexp(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(xexp_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(xexp_node->wsum_hess->to_csr(xexp_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(xexp_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(xexp_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(xexp_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h index cf1eef8..f7c83ca 100644 --- 
a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_entr(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(entr_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(entr_node->wsum_hess->to_csr(entr_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(entr_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(entr_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(entr_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index 1d54a1f..cd2a9d6 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -39,12 +39,9 @@ const char *test_wsum_hess_log(void) int expected_p[8] = {0, 0, 0, 1, 2, 3, 3, 3}; int expected_i[3] = {2, 3, 4}; - mu_assert("vals incorrect", - cmp_double_array(log_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->p, expected_p, 8)); - mu_assert("cols incorrect", - cmp_int_array(log_node->wsum_hess->to_csr(log_node->wsum_hess)->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(log_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->wsum_hess, expected_p, expected_i, 7, 3)); free_expr(log_node); diff --git a/tests/wsum_hess/other/test_prod.h b/tests/wsum_hess/other/test_prod.h index 928a5ab..28184c1 100644 --- a/tests/wsum_hess/other/test_prod.h +++ b/tests/wsum_hess/other/test_prod.h @@ -28,9 +28,9 @@ const char *test_wsum_hess_prod_no_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int 
expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_prod_one_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -91,9 +91,9 @@ const char *test_wsum_hess_prod_two_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -118,9 +118,9 @@ const char *test_wsum_hess_prod_many_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - 
mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_one.h b/tests/wsum_hess/other/test_prod_axis_one.h index 930c3fe..3ba1326 100644 --- a/tests/wsum_hess/other/test_prod_axis_one.h +++ b/tests/wsum_hess/other/test_prod_axis_one.h @@ -65,9 +65,9 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): cols 2,4 (excludes 6) */ 2, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 12)); free_expr(p); return 0; @@ -165,9 +165,9 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) /* Var 9 (row 2, col 2): cols 3,6 (excludes 9) */ 3, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 18)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 11)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 18)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 18)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 10, 18)); free_expr(p); return 0; @@ -317,9 +317,9 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) } } - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 
30)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 30)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 30)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 16, 30)); free_expr(p); return 0; @@ -373,9 +373,9 @@ const char *test_wsum_hess_prod_axis_one_2x2(void) 0, /* Var 2 (row 0, col 1): only col 0 */ 1}; /* Var 3 (row 1, col 1): only col 0 */ - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 4)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 5)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_zero.h b/tests/wsum_hess/other/test_prod_axis_zero.h index 0d0cd5b..4e5a8f4 100644 --- a/tests/wsum_hess/other/test_prod_axis_zero.h +++ b/tests/wsum_hess/other/test_prod_axis_zero.h @@ -63,9 +63,9 @@ const char *test_wsum_hess_prod_axis_zero_no_zeros(void) */ int expected_i[12] = {1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 12)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 12)); free_expr(p); return 0; @@ -183,9 +183,9 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) } } - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 17)); - 
mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 75)); - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 75)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 16, 75)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 75)); free_expr(p); return 0; @@ -245,9 +245,9 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) */ int expected_i[8] = {1, 2, 1, 2, 3, 4, 3, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 8)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->p, expected_p, 6)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->to_csr(p->wsum_hess)->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 8)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 5, 8)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index 859fe33..b97e682 100644 --- a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -38,9 +38,9 @@ const char *test_wsum_hess_quad_form(void) int expected_i[10] = {3, 4, 3, 4, 5, 4, 5, 6, 5, 6}; double expected_x[10] = {4.0, 8.0, 8.0, 20.0, 12.0, 12.0, 16.0, 4.0, 4.0, 24.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->to_csr(node->wsum_hess)->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); free_csr_matrix(Q); From 9a53450132d2417dbb7590db09c61a545d72d4c5 Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 12:22:25 +0200 Subject: [PATCH 07/31] remove memcpy in 
to_csr --- include/utils/matrix.h | 8 ++--- include/utils/matrix_sum.h | 5 ++++ include/utils/permuted_dense.h | 7 +++-- src/atoms/bivariate_full_dom/multiply.c | 22 ++++++-------- src/utils/matrix_sum.c | 7 +++++ src/utils/permuted_dense.c | 36 +++++++++++++---------- tests/profiling/profile_log_reg.h | 7 ++--- tests/utils/test_permuted_dense.h | 39 +++++-------------------- 8 files changed, 59 insertions(+), 72 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 268e977..2037e62 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -70,10 +70,10 @@ typedef struct Matrix int m, n, nnz; /* Non-owning pointer to the value buffer. Sparse_Matrix: csr->x. - Permuted_Dense: pd->X. Dense_Matrix: dm->x. Sparse and Permuted_Dense - share row-major layout for equal sparsity patterns (see - permuted_dense_to_csr_fill_values), so memcpy via M->x is valid between - same-shape Sparse/PD pairs. */ + Permuted_Dense: pd->X (also aliased as pd->csr_cache->x). Dense_Matrix: + dm->x. Sparse and Permuted_Dense share row-major layout for equal + sparsity patterns, so memcpy via M->x is valid between same-shape + Sparse/PD pairs. */ double *x; /* Operators for the left-multiply matrix in left_matmul. */ diff --git a/include/utils/matrix_sum.h b/include/utils/matrix_sum.h index f9f803f..4f8c8d4 100644 --- a/include/utils/matrix_sum.h +++ b/include/utils/matrix_sum.h @@ -33,4 +33,9 @@ void sum_matrices_alloc(Matrix *A, Matrix *B, Matrix *C); A and B (typically produced by sum_matrices_alloc). */ void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C); +/* Fills C's values for C = diag(d1) * A + diag(d2) * B; assumes C already + has the union sparsity pattern of A and B. 
*/ +void sum_scaled_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C, + const double *d1, const double *d2); + #endif /* MATRIX_SUM_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 3f3714d..3192a18 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -54,9 +54,10 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, const int *row_perm, const int *col_perm, const double *X_data); -/* Convert to CSR. The output has dense_m * dense_n nonzeros. */ -CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self); -void permuted_dense_to_csr_fill_values(const Permuted_Dense *self, CSR_Matrix *out); +/* CSR view: callers should use the vtable, i.e. base.to_csr(base). The PD + owns and caches the returned CSR_Matrix; its value array aliases self->X, + so values are always live with no separate fill needed. Callers must not + free the returned CSR — it's released by free_matrix on the PD. */ /* Fill out = diag(d) * self, where d has length self->base.m. out must have the same structure as self (same dimensions and same row_perm/col_perm). 
*/ diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index b17f0df..8e02cfe 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -67,8 +67,8 @@ static void eval_jacobian(expr *node) /* chain rule: the jacobian of h(x) = f(g1(x), g2(x))) is Jh = J_{f, 1} J_{g1} + * J_{f, 2} J_{g2} */ - sum_scaled_csr_matrices_fill_values(x->jacobian->to_csr(x->jacobian), y->jacobian->to_csr(y->jacobian), - node->jacobian->to_csr(node->jacobian), y->value, x->value); + sum_scaled_matrices_fill_values(x->jacobian, y->jacobian, node->jacobian, + y->value, x->value); } static void wsum_hess_init_impl(expr *node) @@ -172,7 +172,7 @@ static void wsum_hess_init_impl(expr *node) matrix in the sum) */ int *maps[4]; CSR_Matrix *hess = sum_4_csr_alloc(C, CT, x->wsum_hess->to_csr(x->wsum_hess), - y->wsum_hess->to_csr(y->wsum_hess), maps); + y->wsum_hess->to_csr(y->wsum_hess), maps); node->wsum_hess = new_sparse_matrix(hess); mul_node->idx_map_C = maps[0]; mul_node->idx_map_CT = maps[1]; @@ -190,12 +190,8 @@ static void eval_wsum_hess(expr *node, const double *w) if (x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE && x->var_id != y->var_id) { - /* node->wsum_hess is Sparse_Matrix (built explicitly above), so the CSR - view aliases its storage. Two contiguous halves [w; w] of length - node->size each, matching the (var1, var2) and (var2, var1) blocks. 
*/ - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); - memcpy(H->x, w, node->size * sizeof(double)); - memcpy(H->x + node->size, w, node->size * sizeof(double)); + memcpy(node->wsum_hess->x, w, node->size * sizeof(double)); + memcpy(node->wsum_hess->x + node->size, w, node->size * sizeof(double)); } else { @@ -209,8 +205,8 @@ static void eval_wsum_hess(expr *node, const double *w) // ---------------------------------------------------------------------- if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc, - x->work->csc_work); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), + x->work->jacobian_csc, x->work->csc_work); if (is_x_affine) { @@ -220,8 +216,8 @@ static void eval_wsum_hess(expr *node, const double *w) if (!y->work->jacobian_csc_filled) { - csr_to_csc_fill_values(y->jacobian->to_csr(y->jacobian), y->work->jacobian_csc, - y->work->csc_work); + csr_to_csc_fill_values(y->jacobian->to_csr(y->jacobian), + y->work->jacobian_csc, y->work->csc_work); if (is_y_affine) { diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c index 72bc9c5..8d0bd34 100644 --- a/src/utils/matrix_sum.c +++ b/src/utils/matrix_sum.c @@ -29,3 +29,10 @@ void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C) { sum_csr_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C)); } + +void sum_scaled_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C, + const double *d1, const double *d2) +{ + sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), + d1, d2); +} diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 7c579a1..d4d0434 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -28,11 +28,17 @@ static void permuted_dense_free(Matrix *self) Permuted_Dense *pd = (Permuted_Dense *) self; free(pd->row_perm); free(pd->col_perm); - free(pd->X); free(pd->Y_scratch); free(pd->col_inv); free(pd->row_inv); + /* csr_cache->x aliases 
pd->X (set in permuted_dense_to_csr_alloc); NULL it + so free_csr_matrix doesn't double-free the shared buffer. */ + if (pd->csr_cache != NULL) + { + pd->csr_cache->x = NULL; + } free_csr_matrix(pd->csr_cache); + free(pd->X); free(pd); } @@ -69,15 +75,12 @@ static void permuted_dense_vtable_ATDA_fill_values(const Matrix *self, (Permuted_Dense *) out); } -/* Lazy CSR view: allocate structure on first call, refill values on every call. - This means the returned CSR's values always reflect the current X. +/* Forward decl; definition lower in the file. */ +static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self); - Future optimization: pd->X and csr_cache->x have bit-identical memory layout - (row-major dense block, same offsets), so we could alias csr_cache->x = pd->X - and skip the value fill entirely. That requires a non-owning x flag on - CSR_Matrix so free_csr_matrix doesn't double-free pd->X. The current - memcpy-on-every-call is cheap (O(dense_m * dense_n) bandwidth), and revisiting - this can wait until a profile shows it matters. */ +/* Lazy CSR view: allocate structure on first call, then return the cache. + The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so + values are always live without a per-call refresh. 
*/ static CSR_Matrix *permuted_dense_to_csr(Matrix *self) { Permuted_Dense *pd = (Permuted_Dense *) self; @@ -85,7 +88,6 @@ static CSR_Matrix *permuted_dense_to_csr(Matrix *self) { pd->csr_cache = permuted_dense_to_csr_alloc(pd); } - permuted_dense_to_csr_fill_values(pd, pd->csr_cache); return pd->csr_cache; } @@ -397,13 +399,20 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, return &pd->base; } -CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) +static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) { int dense_m = self->dense_m; int dense_n = self->dense_n; int m = self->base.m; CSR_Matrix *C = new_csr_matrix(m, self->base.n, dense_m * dense_n); + /* Alias C->x to self->X: the dense block layout already matches what the + CSR view's value array would hold, so values are always live with no + memcpy needed. The PD owns the buffer; permuted_dense_free nulls + C->x before free_csr_matrix to avoid double-free. */ + free(C->x); + C->x = self->X; + /* fill column indices (each dense row contributes a copy of col_perm) */ for (int ii = 0; ii < dense_m; ii++) { @@ -425,11 +434,6 @@ CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) return C; } -void permuted_dense_to_csr_fill_values(const Permuted_Dense *self, CSR_Matrix *out) -{ - memcpy(out->x, self->X, self->dense_m * self->dense_n * sizeof(double)); -} - void permuted_dense_DA_fill_values(const double *d, const Permuted_Dense *self, Permuted_Dense *out) { diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 1e4370a..779af75 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -79,8 +79,8 @@ const char *profile_log_reg(void) free(full_rows); free(full_cols); - /* CSR scaffolding for the row-sum step. */ - CSR_Matrix *Jlog_csr = permuted_dense_to_csr_alloc(Jlog_pd); + /* CSR scaffolding for the row-sum step (PD owns the cached CSR view). 
*/ + CSR_Matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); CSR_Matrix *Jobj_csr = new_csr_matrix(1, n, n); int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); int *idx_map = (int *) malloc((size_t) m * n * sizeof(int)); @@ -98,7 +98,6 @@ const char *profile_log_reg(void) clock_gettime(CLOCK_MONOTONIC, &t_b_jac.start); log_obj->local_jacobian(log_obj, log_obj->work->dwork); permuted_dense_DA_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); - permuted_dense_to_csr_fill_values(Jlog_pd, Jlog_csr); memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); accumulator(Jlog_csr->x, Jlog_csr->nnz, idx_map, Jobj_csr->x); clock_gettime(CLOCK_MONOTONIC, &t_b_jac.end); @@ -163,7 +162,7 @@ const char *profile_log_reg(void) free(iwork); free(idx_map); free_csr_matrix(Jobj_csr); - free_csr_matrix(Jlog_csr); + /* Jlog_csr is owned by Jlog_M's cache; released by free_matrix below. */ free_matrix(H_pd_M); free_matrix(Jlog_M); free_matrix(A_pd_M); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 2b1cddc..75ca878 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -23,11 +23,8 @@ const char *test_permuted_dense_to_csr_basic(void) double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; - - CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, C); + CSR_Matrix *C = M->to_csr(M); int Cp_expected[6] = {0, 0, 2, 4, 4, 6}; int Ci_expected[6] = {0, 3, 0, 3, 0, 3}; double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; @@ -39,7 +36,6 @@ const char *test_permuted_dense_to_csr_basic(void) mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); - free_csr_matrix(C); free_matrix(M); return 0; } @@ -49,16 +45,12 @@ const char *test_permuted_dense_to_csr_basic(void) const char *test_permuted_dense_to_csr_empty(void) { Matrix *M = 
new_permuted_dense(4, 5, 0, 0, NULL, NULL, NULL); - Permuted_Dense *pd = (Permuted_Dense *) M; - - CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, C); + CSR_Matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 0, 0, 0, 0}; mu_assert("nnz", C->nnz == 0); mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); - free_csr_matrix(C); free_matrix(M); return 0; } @@ -72,11 +64,8 @@ const char *test_permuted_dense_to_csr_full(void) double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; Matrix *M = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; - - CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, C); + CSR_Matrix *C = M->to_csr(M); int Cp_expected[3] = {0, 3, 6}; int Ci_expected[6] = {0, 1, 2, 0, 1, 2}; double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; @@ -85,7 +74,6 @@ const char *test_permuted_dense_to_csr_full(void) mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); - free_csr_matrix(C); free_matrix(M); return 0; } @@ -99,11 +87,8 @@ const char *test_permuted_dense_to_csr_single_row(void) double X[2] = {7.0, 9.0}; Matrix *M = new_permuted_dense(4, 5, 1, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; - - CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, C); + CSR_Matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 0, 0, 2, 2}; int Ci_expected[2] = {1, 4}; double Cx_expected[2] = {7.0, 9.0}; @@ -112,7 +97,6 @@ const char *test_permuted_dense_to_csr_single_row(void) mu_assert("i", cmp_int_array(C->i, Ci_expected, 2)); mu_assert("x", cmp_double_array(C->x, Cx_expected, 2)); - free_csr_matrix(C); free_matrix(M); return 0; } @@ -125,11 +109,8 @@ const char *test_permuted_dense_to_csr_single_col(void) double X[3] = {1.0, 2.0, 3.0}; Matrix *M = new_permuted_dense(4, 4, 3, 1, row_perm, col_perm, X); - Permuted_Dense *pd = 
(Permuted_Dense *) M; - - CSR_Matrix *C = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, C); + CSR_Matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 1, 1, 2, 3}; int Ci_expected[3] = {2, 2, 2}; double Cx_expected[3] = {1.0, 2.0, 3.0}; @@ -138,7 +119,6 @@ const char *test_permuted_dense_to_csr_single_col(void) mu_assert("i", cmp_int_array(C->i, Ci_expected, 3)); mu_assert("x", cmp_double_array(C->x, Cx_expected, 3)); - free_csr_matrix(C); free_matrix(M); return 0; } @@ -162,19 +142,14 @@ const char *test_permuted_dense_DA_fill_values(void) permuted_dense_DA_fill_values(d, pd, pd_out); /* Ground truth: build CSR of self, run DA_fill_values, compare. */ - CSR_Matrix *csr = permuted_dense_to_csr_alloc(pd); - permuted_dense_to_csr_fill_values(pd, csr); + CSR_Matrix *csr = M->to_csr(M); CSR_Matrix *csr_expected = new_csr_copy_sparsity(csr); DA_fill_values(d, csr, csr_expected); - CSR_Matrix *csr_out = permuted_dense_to_csr_alloc(pd_out); - permuted_dense_to_csr_fill_values(pd_out, csr_out); - + CSR_Matrix *csr_out = M_out->to_csr(M_out); mu_assert("x", cmp_double_array(csr_out->x, csr_expected->x, csr->nnz)); - free_csr_matrix(csr); free_csr_matrix(csr_expected); - free_csr_matrix(csr_out); free_matrix(M); free_matrix(M_out); return 0; From 5ae3cfb0bb1d0effc7c20a93609028c321b957b7 Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 14:00:40 +0200 Subject: [PATCH 08/31] add profile for trimmed log reg --- tests/all_tests.c | 2 + tests/profiling/profile_trimmed_log_reg.h | 94 +++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 tests/profiling/profile_trimmed_log_reg.h diff --git a/tests/all_tests.c b/tests/all_tests.c index d39b303..168daf9 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -105,6 +105,7 @@ #ifdef PROFILE_ONLY #include "profiling/profile_left_matmul.h" #include "profiling/profile_log_reg.h" +#include "profiling/profile_trimmed_log_reg.h" #endif /* PROFILE_ONLY */ int main(void) @@ -422,6 
+423,7 @@ int main(void) printf("\n--- Profiling Tests ---\n"); mu_run_test(profile_left_matmul, tests_run); mu_run_test(profile_log_reg, tests_run); + mu_run_test(profile_trimmed_log_reg, tests_run); #endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/profiling/profile_trimmed_log_reg.h b/tests/profiling/profile_trimmed_log_reg.h new file mode 100644 index 0000000..4dbe9f0 --- /dev/null +++ b/tests/profiling/profile_trimmed_log_reg.h @@ -0,0 +1,94 @@ +#ifndef PROFILE_TRIMMED_LOG_REG_H +#define PROFILE_TRIMMED_LOG_REG_H + +#include +#include + +#include "atoms/affine.h" +#include "atoms/bivariate_full_dom.h" +#include "atoms/elementwise_full_dom.h" +#include "expr.h" +#include "minunit.h" +#include "subexpr.h" +#include "utils/Timer.h" + +/* Profile Jacobian + Hessian of: + obj = sum( w ∘ logistic( -(y ∘ (A·theta)) ) ) + + theta (n x 1), w (m x 1) : variables (n_vars = n + m) + A (m x n) : dense constant + y (m x 1) : constant in {-1, +1}, wrapped as PARAM_FIXED + + Forward pass is excluded from timing. */ +const char *profile_trimmed_log_reg(void) +{ + int m = 2000; + int n = 785; + int n_vars = n + m; + + /* ---- Random inputs ---- */ + srand(42); + double *A_data = (double *) malloc((size_t) m * n * sizeof(double)); + double *y_data = (double *) malloc((size_t) m * sizeof(double)); + double *u = (double *) malloc((size_t) n_vars * sizeof(double)); + for (int i = 0; i < m * n; i++) + { + A_data[i] = (double) rand() / RAND_MAX - 0.5; + } + for (int i = 0; i < m; i++) + { + y_data[i] = (rand() % 2 == 0) ? 
1.0 : -1.0; + } + for (int i = 0; i < n_vars; i++) + { + u[i] = (double) rand() / RAND_MAX - 0.5; + } + + /* ---- Build expression DAG ---- */ + expr *theta = new_variable(n, 1, 0, n_vars); + expr *w = new_variable(m, 1, n, n_vars); + + expr *y_param = new_parameter(m, 1, PARAM_FIXED, n_vars, y_data); + + expr *A_theta = new_left_matmul_dense(NULL, theta, m, n, A_data); + expr *y_A_theta = new_vector_mult(y_param, A_theta); + expr *neg_node = new_neg(y_A_theta); + expr *sig = new_logistic(neg_node); + expr *w_sig = new_elementwise_mult(w, sig); + expr *obj = new_sum(w_sig, -1); + + jacobian_init(obj); + wsum_hess_init(obj); + + /* Forward (untimed). */ + obj->forward(obj, u); + + /* ---- Time eval_jacobian and eval_wsum_hess ---- */ + double w_one = 1.0; + Timer t_jac, t_hess; + clock_gettime(CLOCK_MONOTONIC, &t_jac.start); + obj->eval_jacobian(obj); + clock_gettime(CLOCK_MONOTONIC, &t_jac.end); + + clock_gettime(CLOCK_MONOTONIC, &t_hess.start); + obj->eval_wsum_hess(obj, &w_one); + clock_gettime(CLOCK_MONOTONIC, &t_hess.end); + + double sec_jac = GET_ELAPSED_SECONDS(t_jac); + double sec_hess = GET_ELAPSED_SECONDS(t_hess); + + printf("\n"); + printf(" Jacobian Hessian Total\n"); + printf(" trimmed_log_reg: %10.6fs %10.6fs %10.6fs\n", sec_jac, + sec_hess, sec_jac + sec_hess); + + /* ---- Cleanup ---- */ + free_expr(obj); + free(A_data); + free(y_data); + free(u); + + return 0; +} + +#endif /* PROFILE_TRIMMED_LOG_REG_H */ From d2e84a7409970d9623dd950211ded5c415bbab0c Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 15:17:22 +0200 Subject: [PATCH 09/31] infrastructure for matrix multiplication when one matrix is permuted_dense --- include/subexpr.h | 4 +- include/utils/matrix.h | 9 + include/utils/permuted_dense.h | 59 +++ src/atoms/bivariate_full_dom/multiply.c | 56 ++- src/utils/dense_matrix.c | 7 + src/utils/permuted_dense.c | 456 ++++++++++++++++++++ src/utils/sparse_matrix.c | 7 + tests/all_tests.c | 13 + tests/utils/test_permuted_dense.h | 527 
++++++++++++++++++++++++ 9 files changed, 1107 insertions(+), 31 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index a1ac764..a934a93 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -109,8 +109,8 @@ typedef struct hstack_expr typedef struct elementwise_mult_expr { expr base; - CSR_Matrix *CSR_work1; /* C = Jg2^T diag(w) Jg1 */ - CSR_Matrix *CSR_work2; /* CT = C^T */ + Matrix *cross_C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ + CSR_Matrix *CSR_work2; /* CT = C^T (always CSR) */ int *idx_map_C; /* C[j] -> wsum_hess pos */ int *idx_map_CT; /* CT[j] -> wsum_hess pos */ int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 2037e62..96a8827 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -21,6 +21,10 @@ #include "CSC_Matrix.h" #include "CSR_Matrix.h" +/* Forward declaration; full definition in permuted_dense.h. Used by the + as_permuted_dense vtable getter. */ +struct Permuted_Dense; + /* Broadcast shape used by the broadcast atom and its vtable methods. */ typedef enum { @@ -99,6 +103,11 @@ typedef struct Matrix struct Matrix *out); CSR_Matrix *(*to_csr)(struct Matrix *self); + /* Returns self downcast to Permuted_Dense if self is PD-backed, NULL + otherwise. Used by bivariate dispatchers to route to type-specialized + kernels. */ + struct Permuted_Dense *(*as_permuted_dense)(struct Matrix *self); + /* Row-selection / indexing: returns a new Matrix that selects rows indices[0..n_idxs) of self. Output shape is (n_idxs, self->n). The returned type matches self's concrete type. 
index_alloc sets up diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 3192a18..99fd318 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -19,6 +19,7 @@ #define PERMUTED_DENSE_H #include "matrix.h" +#include /* Permuted_Dense represents a matrix whose only nonzeros lie in a dense block, after rows and columns are restricted to chosen subsets of the @@ -45,6 +46,13 @@ typedef struct Permuted_Dense CSR_Matrix *csr_cache; /* lazy CSR view built by to_csr; structure */ /* allocated on first call, values refilled */ /* on every call. NULL until first call. */ + /* Scratch buffer for BTA_csr_pd / BTA_pd_csr fill kernels. Owned by the + output PD, allocated by the corresponding BTA *_alloc so per-call + BTA / BTDA fill kernels can reuse it across solver iterations + (avoids malloc/free of large dense buffers per Hessian eval). Sized + at alloc time; NULL on PDs not produced by those allocators. */ + double *gather_X_scratch; + size_t gather_X_size; } Permuted_Dense; /* Constructor. row_perm and col_perm must be strictly increasing in their @@ -77,6 +85,57 @@ Matrix *permuted_dense_ATA_alloc(const Permuted_Dense *self); void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d, Permuted_Dense *out); +/* Allocate a new Permuted_Dense for C = B^T @ A where A and B are both PD. + Output shape (B->base.n, A->base.n) with dense block (B->dense_n, + A->dense_n), row_perm = B->col_perm, col_perm = A->col_perm. Values + uninitialized. The output structure does not depend on row_perm_A or + row_perm_B (only the values do; see permuted_dense_BTA_fill_values). */ +Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, + const Permuted_Dense *B); + +/* Fill out->X = B->X^T @ A->X restricted to rows in row_perm_A ∩ row_perm_B. + out must have the structure produced by permuted_dense_BTA_alloc(A, B). 
+ For matching row_perms, this is a single cblas_dgemm; otherwise the + intersecting rows are first gathered into contiguous scratch buffers. */ +void permuted_dense_BTA_fill_values(const Permuted_Dense *A, + const Permuted_Dense *B, + Permuted_Dense *out); + +/* Allocate a new Permuted_Dense for C = B^T @ A where A is Sparse (CSR) + and B is PD. Output is PD with row_perm = B->col_perm and col_perm = the + sorted union of columns appearing in A's rows at positions row_perm_B. + Dense block size = (B->dense_n, |col_active|). Values uninitialized. */ +Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B); + +/* Fill out->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at + positions row_perm_B, columns restricted to out's col_perm, scattered + to a dense buffer. out must have the structure produced by + BTA_csr_pd_alloc(A_csr, B). */ +void BTA_csr_pd_fill_values(const CSR_Matrix *A_csr, const Permuted_Dense *B, + Permuted_Dense *out); + +/* Allocate a new Permuted_Dense for C = B^T @ A where A is PD and B is + Sparse (CSR). Output is PD with row_perm = the sorted union of columns + appearing in B's rows at positions row_perm_A, and col_perm = col_perm_A. + Dense block size = (|row_active|, dense_n_A). Values uninitialized. */ +Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr); + +/* Fill out->X = B_sub^T @ X_A, where B_sub is B's rows at positions + row_perm_A, columns restricted to out's row_perm, scattered to a dense + buffer. out must have the structure produced by BTA_pd_csr_alloc(A, B). */ +void BTA_pd_csr_fill_values(const Permuted_Dense *A, const CSR_Matrix *B_csr, + Permuted_Dense *out); + +/* BTDA variants — fold a diagonal d into the BTA computation. Each fills + out->X = B^T diag(d) A (d may be NULL for plain B^T A). out must have + the structure produced by the corresponding BTA *_alloc function. 
*/ +void BTDA_csr_pd_fill_values(const CSR_Matrix *A_csr, const double *d, + const Permuted_Dense *B, Permuted_Dense *out); +void BTDA_pd_csr_fill_values(const Permuted_Dense *A, const double *d, + const CSR_Matrix *B_csr, Permuted_Dense *out); +void BTDA_pd_pd_fill_values(const Permuted_Dense *A, const double *d, + const Permuted_Dense *B, Permuted_Dense *out); + /* Allocate a new Permuted_Dense for C = self @ J. C has global shape (self->base.m, J->n) with row_perm = self->row_perm and col_perm equal to the sorted list of columns of J that have at least one structural diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 8e02cfe..575ac46 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -18,6 +18,7 @@ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" #include "utils/CSR_sum.h" +#include "utils/matrix_BTA.h" #include "utils/matrix_sum.h" #include "utils/tracked_alloc.h" #include @@ -148,30 +149,34 @@ static void wsum_hess_init_impl(expr *node) node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } - /* prepare sparsity pattern of csc conversion */ + /* CSC scaffolding is still needed for the (Sparse, Sparse) fast path + through BTA_matrices_* / BTDA_matrices_* — those route through + Sparse_Matrix's csc_cache. For PD operands, refresh_csc_values is + a no-op so the call is harmless. 
*/ jacobian_csc_init(x); jacobian_csc_init(y); - CSC_Matrix *Jg1 = x->work->jacobian_csc; - CSC_Matrix *Jg2 = y->work->jacobian_csc; - - /* compute sparsity of C and prepare CT */ - CSR_Matrix *C = BTA_alloc(Jg1, Jg2); - node->work->iwork = (int *) SP_MALLOC(C->m * sizeof(int)); - CSR_Matrix *CT = AT_alloc(C, node->work->iwork); /* initialize wsum_hessians of children */ wsum_hess_init(x); wsum_hess_init(y); elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - mul_node->CSR_work1 = C; + + /* compute sparsity of C polymorphically (Sparse, PD-CSR, CSR-PD, PD-PD). */ + mul_node->cross_C = BTA_matrices_alloc(x->jacobian, y->jacobian); + + /* CT structure is always CSR (via AT on C's CSR view). */ + CSR_Matrix *C_csr = mul_node->cross_C->to_csr(mul_node->cross_C); + node->work->iwork = (int *) SP_MALLOC(C_csr->m * sizeof(int)); + CSR_Matrix *CT = AT_alloc(C_csr, node->work->iwork); mul_node->CSR_work2 = CT; /* compute sparsity pattern of H = C + C^T + term2 + term3 (we also fill index maps telling us where to accumulate each element of each matrix in the sum) */ int *maps[4]; - CSR_Matrix *hess = sum_4_csr_alloc(C, CT, x->wsum_hess->to_csr(x->wsum_hess), + CSR_Matrix *hess = sum_4_csr_alloc(C_csr, CT, + x->wsum_hess->to_csr(x->wsum_hess), y->wsum_hess->to_csr(y->wsum_hess), maps); node->wsum_hess = new_sparse_matrix(hess); mul_node->idx_map_C = maps[0]; @@ -198,44 +203,36 @@ static void eval_wsum_hess(expr *node, const double *w) bool is_x_affine = x->is_affine(x); bool is_y_affine = y->is_affine(y); // ---------------------------------------------------------------------- - // convert Jacobians of children to CSC format - // (we only need to do this once if the child is affine) - // TODO: what if we have parameters? Should we set jacobian_csc_filled - // to false whenever parameters change value? + // Refresh each operand's CSC cache as needed for the (Sparse, Sparse) + // dispatch path. For PD operands, refresh_csc_values is a no-op. 
The + // jacobian_csc_filled flag preserves the affine optimization: we only + // refresh on the first eval for affine children. // ---------------------------------------------------------------------- if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), - x->work->jacobian_csc, x->work->csc_work); - + x->jacobian->refresh_csc_values(x->jacobian); if (is_x_affine) { x->work->jacobian_csc_filled = true; } } - if (!y->work->jacobian_csc_filled) { - csr_to_csc_fill_values(y->jacobian->to_csr(y->jacobian), - y->work->jacobian_csc, y->work->csc_work); - + y->jacobian->refresh_csc_values(y->jacobian); if (is_y_affine) { y->work->jacobian_csc_filled = true; } } - CSC_Matrix *Jg1 = x->work->jacobian_csc; - CSC_Matrix *Jg2 = y->work->jacobian_csc; - // --------------------------------------------------------------- // compute C and CT // --------------------------------------------------------------- elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - CSR_Matrix *C = mul_node->CSR_work1; CSR_Matrix *CT = mul_node->CSR_work2; - BTDA_fill_values(Jg1, Jg2, w, C); - AT_fill_values(C, CT, node->work->iwork); + BTDA_matrices_fill_values(x->jacobian, w, y->jacobian, mul_node->cross_C); + AT_fill_values(mul_node->cross_C->to_csr(mul_node->cross_C), CT, + node->work->iwork); // --------------------------------------------------------------- // compute term2 and term 3 @@ -262,7 +259,8 @@ static void eval_wsum_hess(expr *node, const double *w) // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(C->x, C->nnz, mul_node->idx_map_C, node->wsum_hess->x); + accumulator(mul_node->cross_C->x, mul_node->cross_C->nnz, + mul_node->idx_map_C, node->wsum_hess->x); accumulator(CT->x, CT->nnz, mul_node->idx_map_CT, node->wsum_hess->x); accumulator(x->wsum_hess->x, x->wsum_hess->nnz, mul_node->idx_map_Hx, 
node->wsum_hess->x); @@ -274,7 +272,7 @@ static void eval_wsum_hess(expr *node, const double *w) static void free_type_data(expr *node) { elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - free_csr_matrix(mul_node->CSR_work1); + free_matrix(mul_node->cross_C); free_csr_matrix(mul_node->CSR_work2); free(mul_node->idx_map_C); free(mul_node->idx_map_CT); diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c index 5060359..7d6b9a6 100644 --- a/src/utils/dense_matrix.c +++ b/src/utils/dense_matrix.c @@ -22,6 +22,12 @@ #include #include +static struct Permuted_Dense *dense_as_permuted_dense(Matrix *self) +{ + (void) self; + return NULL; +} + static void dense_block_left_mult_vec(const Matrix *A, const double *x, double *y, int p) { @@ -57,6 +63,7 @@ Matrix *new_dense_matrix(int m, int n, const double *data) dm->base.n = n; dm->base.nnz = m * n; dm->base.block_left_mult_vec = dense_block_left_mult_vec; + dm->base.as_permuted_dense = dense_as_permuted_dense; dm->base.block_left_mult_sparsity = I_kron_A_alloc; dm->base.block_left_mult_values = I_kron_A_fill_values; dm->base.free_fn = dense_free; diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index d4d0434..5d4b3eb 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -39,6 +39,7 @@ static void permuted_dense_free(Matrix *self) } free_csr_matrix(pd->csr_cache); free(pd->X); + free(pd->gather_X_scratch); free(pd); } @@ -81,6 +82,11 @@ static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self); /* Lazy CSR view: allocate structure on first call, then return the cache. The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so values are always live without a per-call refresh. 
*/ +static struct Permuted_Dense *permuted_dense_as_permuted_dense(Matrix *self) +{ + return (Permuted_Dense *) self; +} + static CSR_Matrix *permuted_dense_to_csr(Matrix *self) { Permuted_Dense *pd = (Permuted_Dense *) self; @@ -341,6 +347,7 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; pd->base.ATDA_fill_values = permuted_dense_vtable_ATDA_fill_values; pd->base.to_csr = permuted_dense_to_csr; + pd->base.as_permuted_dense = permuted_dense_as_permuted_dense; pd->base.index_alloc = permuted_dense_vtable_index_alloc; pd->base.index_fill_values = permuted_dense_vtable_index_fill_values; pd->base.promote_alloc = permuted_dense_vtable_promote_alloc; @@ -474,6 +481,455 @@ void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d dense_n); } +Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, const Permuted_Dense *B) +{ + return new_permuted_dense(B->base.n, A->base.n, B->dense_n, A->dense_n, + B->col_perm, A->col_perm, NULL); +} + +/* Return 1 iff arrays a and b of length n are element-wise equal. */ +static int int_arrays_equal(const int *a, const int *b, int n) +{ + for (int i = 0; i < n; i++) + { + if (a[i] != b[i]) return 0; + } + return 1; +} + +void permuted_dense_BTA_fill_values(const Permuted_Dense *A, const Permuted_Dense *B, + Permuted_Dense *out) +{ + int dn_A = A->dense_n; + int dn_B = B->dense_n; + + /* Fast path: matching row_perms (the common case). One dgemm on the + full X buffers. */ + if (A->dense_m == B->dense_m && + int_arrays_equal(A->row_perm, B->row_perm, A->dense_m)) + { + int s = A->dense_m; + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, + B->X, dn_B, A->X, dn_A, 0.0, out->X, dn_A); + return; + } + + /* General path: intersect row_perm_A and row_perm_B via sorted merge, + gather the matching rows into contiguous scratch buffers, dgemm. */ + int max_s = A->dense_m < B->dense_m ? 
A->dense_m : B->dense_m; + int *idx_A = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); + int *idx_B = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); + int s = 0; + int ii = 0, jj = 0; + while (ii < A->dense_m && jj < B->dense_m) + { + int ra = A->row_perm[ii]; + int rb = B->row_perm[jj]; + if (ra == rb) + { + idx_A[s] = ii; + idx_B[s] = jj; + s++; + ii++; + jj++; + } + else if (ra < rb) + { + ii++; + } + else + { + jj++; + } + } + + if (s == 0) + { + memset(out->X, 0, + (size_t) out->dense_m * (size_t) out->dense_n * sizeof(double)); + free(idx_A); + free(idx_B); + return; + } + + double *XA_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_A * sizeof(double)); + double *XB_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_B * sizeof(double)); + for (int k = 0; k < s; k++) + { + memcpy(XA_sub + (size_t) k * dn_A, A->X + (size_t) idx_A[k] * dn_A, + (size_t) dn_A * sizeof(double)); + memcpy(XB_sub + (size_t) k * dn_B, B->X + (size_t) idx_B[k] * dn_B, + (size_t) dn_B * sizeof(double)); + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, + XB_sub, dn_B, XA_sub, dn_A, 0.0, out->X, dn_A); + + free(XA_sub); + free(XB_sub); + free(idx_A); + free(idx_B); +} + +Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B) +{ + /* Gather the union of columns appearing in A's rows at positions + row_perm_B. Use a bitmap of size A_csr->n for O(nnz) collection. */ + int p = A_csr->n; + char *seen = (char *) SP_CALLOC((size_t) p, sizeof(char)); + int s_A = 0; + for (int kk = 0; kk < B->dense_m; kk++) + { + int row = B->row_perm[kk]; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + if (!seen[j]) + { + seen[j] = 1; + s_A++; + } + } + } + + int *col_active = (int *) SP_MALLOC((size_t) (s_A > 0 ? 
s_A : 1) * sizeof(int)); + int idx = 0; + for (int j = 0; j < p; j++) + { + if (seen[j]) + { + col_active[idx++] = j; + } + } + + Matrix *out = new_permuted_dense(B->base.n, p, B->dense_n, s_A, + B->col_perm, col_active, NULL); + free(col_active); + free(seen); + + /* Persistent scratch for BTA_csr_pd_fill_values / BTDA_csr_pd_fill_values: + A_sub_dense (B->dense_m x s_A row-major doubles). Pre-zeroed at alloc; each + fill re-zeroes the whole buffer before scattering fresh values. */ + Permuted_Dense *out_pd = (Permuted_Dense *) out; + out_pd->gather_X_size = (size_t) B->dense_m * (size_t) s_A; + if (out_pd->gather_X_size > 0) + { + out_pd->gather_X_scratch = + (double *) SP_CALLOC(out_pd->gather_X_size, sizeof(double)); + } + return out; +} + +/* Note: when A_csr is a leaf-variable Jacobian (each row has a single entry + at column var_id + k, value 1), A_sub_dense is a permuted identity and + the dgemm reduces to X_C = X_B^T — a pure transpose with no multiplication + needed. A fast path can detect this and skip the dgemm; deferred until a + workload shows the savings matter. */ +void BTA_csr_pd_fill_values(const CSR_Matrix *A_csr, const Permuted_Dense *B, + Permuted_Dense *out) +{ + int dense_m = B->dense_m; + int dn_B = B->dense_n; + int s_A = out->dense_n; + + if (s_A == 0 || dense_m == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use out->col_inv (pre-built by new_permuted_dense) as col_inv_out and + out->gather_X_scratch as A_sub_dense; both are owned by out. */ + double *A_sub_dense = out->gather_X_scratch; + memset(A_sub_dense, 0, out->gather_X_size * sizeof(double)); + + for (int kk = 0; kk < dense_m; kk++) + { + int row = B->row_perm[kk]; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = out->col_inv[j]; + /* jj should always be valid (we built col_perm from these entries), + but guard against asymmetry between alloc and fill calls.
*/ + if (jj >= 0) + { + A_sub_dense[(size_t) kk * s_A + jj] = A_csr->x[e]; + } + } + } + + /* out->X = X_B^T @ A_sub_dense */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, dense_m, + 1.0, B->X, dn_B, A_sub_dense, s_A, 0.0, out->X, s_A); +} + +Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr) +{ + /* Gather the union of columns appearing in B's rows at positions + row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. */ + int q = B_csr->n; + char *seen = (char *) SP_CALLOC((size_t) q, sizeof(char)); + int r_B = 0; + for (int kk = 0; kk < A->dense_m; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + if (!seen[i]) + { + seen[i] = 1; + r_B++; + } + } + } + + int *row_active = (int *) SP_MALLOC((size_t) (r_B > 0 ? r_B : 1) * sizeof(int)); + int idx = 0; + for (int i = 0; i < q; i++) + { + if (seen[i]) + { + row_active[idx++] = i; + } + } + + Matrix *out = new_permuted_dense(q, A->base.n, r_B, A->dense_n, + row_active, A->col_perm, NULL); + free(row_active); + free(seen); + + /* Persistent scratch for BTA_pd_csr_fill_values / BTDA_pd_csr_fill_values: + B_sub_dense (A->dense_m x r_B row-major doubles). */ + Permuted_Dense *out_pd = (Permuted_Dense *) out; + out_pd->gather_X_size = (size_t) A->dense_m * (size_t) r_B; + if (out_pd->gather_X_size > 0) + { + out_pd->gather_X_scratch = + (double *) SP_CALLOC(out_pd->gather_X_size, sizeof(double)); + } + return out; +} + +/* Note: when B_csr is a leaf-variable Jacobian (each row has a single entry + at column var_id + k, value 1), B_sub_dense is an identity matrix and + the dgemm reduces to X_C = X_A — a pure copy with no multiplication + needed. A fast path can detect this and skip the dgemm; deferred until a + workload shows the savings matter. 
*/ +void BTA_pd_csr_fill_values(const Permuted_Dense *A, const CSR_Matrix *B_csr, + Permuted_Dense *out) +{ + int dense_m = A->dense_m; + int dn_A = A->dense_n; + int r_B = out->dense_m; + + if (r_B == 0 || dense_m == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use out->row_inv (pre-built by new_permuted_dense) as row_inv_out and + out->gather_X_scratch as B_sub_dense; both are owned by out. */ + double *B_sub_dense = out->gather_X_scratch; + memset(B_sub_dense, 0, out->gather_X_size * sizeof(double)); + + for (int kk = 0; kk < dense_m; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = out->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[(size_t) kk * r_B + ii] = B_csr->x[e]; + } + } + } + + /* out->X = B_sub_dense^T @ X_A */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, dense_m, + 1.0, B_sub_dense, r_B, A->X, dn_A, 0.0, out->X, dn_A); +} + +/* BTDA variant of BTA_csr_pd: out->X = X_B^T diag(d) A_sub_dense. Folds d + into the scatter step. */ +void BTDA_csr_pd_fill_values(const CSR_Matrix *A_csr, const double *d, + const Permuted_Dense *B, Permuted_Dense *out) +{ + int dense_m = B->dense_m; + int dn_B = B->dense_n; + int s_A = out->dense_n; + + if (s_A == 0 || dense_m == 0) + { + return; + } + + double *A_sub_dense = out->gather_X_scratch; + memset(A_sub_dense, 0, out->gather_X_size * sizeof(double)); + + for (int kk = 0; kk < dense_m; kk++) + { + int row = B->row_perm[kk]; + double dk = d ? d[row] : 1.0; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = out->col_inv[j]; + if (jj >= 0) + { + A_sub_dense[(size_t) kk * s_A + jj] = dk * A_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, dense_m, + 1.0, B->X, dn_B, A_sub_dense, s_A, 0.0, out->X, s_A); +} + +/* BTDA variant of BTA_pd_csr: out->X = B_sub_dense^T diag(d) X_A. 
Folds d + into the scatter step. */ +void BTDA_pd_csr_fill_values(const Permuted_Dense *A, const double *d, + const CSR_Matrix *B_csr, Permuted_Dense *out) +{ + int dense_m = A->dense_m; + int dn_A = A->dense_n; + int r_B = out->dense_m; + + if (r_B == 0 || dense_m == 0) + { + return; + } + + double *B_sub_dense = out->gather_X_scratch; + memset(B_sub_dense, 0, out->gather_X_size * sizeof(double)); + + for (int kk = 0; kk < dense_m; kk++) + { + int row = A->row_perm[kk]; + double dk = d ? d[row] : 1.0; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = out->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[(size_t) kk * r_B + ii] = dk * B_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, dense_m, + 1.0, B_sub_dense, r_B, A->X, dn_A, 0.0, out->X, dn_A); +} + +/* BTDA(PD, PD): out->X = X_B^T diag(d) X_A, restricted to row_perm_A ∩ + row_perm_B. When d == NULL, this is just permuted_dense_BTA_fill_values. + Otherwise we first row-scale A's X into A's Y_scratch by d, then run the + same intersect-and-gather logic as the BTA case using Y_scratch in place + of X_A. */ +void BTDA_pd_pd_fill_values(const Permuted_Dense *A, const double *d, + const Permuted_Dense *B, Permuted_Dense *out) +{ + if (d == NULL) + { + permuted_dense_BTA_fill_values(A, B, out); + return; + } + + int dn_A = A->dense_n; + int dn_B = B->dense_n; + int dense_m_A = A->dense_m; + + /* Build Y = diag(d_perm_A) X_A in A's Y_scratch (mutates only the + Y_scratch buffer, so const A is preserved in spirit). */ + cblas_dcopy(dense_m_A * dn_A, A->X, 1, A->Y_scratch, 1); + for (int kk = 0; kk < dense_m_A; kk++) + { + cblas_dscal(dn_A, d[A->row_perm[kk]], A->Y_scratch + kk * dn_A, 1); + } + + /* Fast path: matching row_perms. One dgemm using Y_scratch as A. 
*/ + int match = (A->dense_m == B->dense_m); + if (match) + { + for (int kk = 0; kk < A->dense_m; kk++) + { + if (A->row_perm[kk] != B->row_perm[kk]) + { + match = 0; + break; + } + } + } + if (match) + { + int s = A->dense_m; + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, + B->X, dn_B, A->Y_scratch, dn_A, 0.0, out->X, dn_A); + return; + } + + /* General path: intersect row_perm_A and row_perm_B, gather Y_scratch's + and B's matched rows, then dgemm. */ + int max_s = A->dense_m < B->dense_m ? A->dense_m : B->dense_m; + int *idx_A = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); + int *idx_B = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); + int s = 0; + int ii = 0, jj = 0; + while (ii < A->dense_m && jj < B->dense_m) + { + int ra = A->row_perm[ii]; + int rb = B->row_perm[jj]; + if (ra == rb) + { + idx_A[s] = ii; + idx_B[s] = jj; + s++; + ii++; + jj++; + } + else if (ra < rb) + { + ii++; + } + else + { + jj++; + } + } + + if (s == 0) + { + memset(out->X, 0, + (size_t) out->dense_m * (size_t) out->dense_n * sizeof(double)); + free(idx_A); + free(idx_B); + return; + } + + double *YA_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_A * sizeof(double)); + double *XB_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_B * sizeof(double)); + for (int k = 0; k < s; k++) + { + memcpy(YA_sub + (size_t) k * dn_A, A->Y_scratch + (size_t) idx_A[k] * dn_A, + (size_t) dn_A * sizeof(double)); + memcpy(XB_sub + (size_t) k * dn_B, B->X + (size_t) idx_B[k] * dn_B, + (size_t) dn_B * sizeof(double)); + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, + XB_sub, dn_B, YA_sub, dn_A, 0.0, out->X, dn_A); + + free(YA_sub); + free(XB_sub); + free(idx_A); + free(idx_B); +} + Matrix *permuted_dense_times_csc_alloc(const Permuted_Dense *self, const CSC_Matrix *J) { diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 5653b49..d4fdc7c 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -99,6 
+99,12 @@ static CSR_Matrix *sparse_to_csr(Matrix *self) return ((Sparse_Matrix *) self)->csr; } +static struct Permuted_Dense *sparse_as_permuted_dense(Matrix *self) +{ + (void) self; + return NULL; +} + static Matrix *sparse_index_alloc(Matrix *self, const int *indices, int n_idxs) { CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; @@ -299,6 +305,7 @@ static void wire_vtable(Sparse_Matrix *sm) sm->base.ATA_alloc = sparse_ATA_alloc; sm->base.ATDA_fill_values = sparse_ATDA_fill_values; sm->base.to_csr = sparse_to_csr; + sm->base.as_permuted_dense = sparse_as_permuted_dense; sm->base.index_alloc = sparse_index_alloc; sm->base.index_fill_values = sparse_index_fill_values; sm->base.promote_alloc = sparse_promote_alloc; diff --git a/tests/all_tests.c b/tests/all_tests.c index 168daf9..efb1c95 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -66,6 +66,7 @@ #include "utils/test_linalg_sparse_matmuls.h" #include "utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" +#include "utils/test_matrix_BTA.h" #include "utils/test_permuted_dense.h" #include "wsum_hess/affine/test_broadcast.h" #include "wsum_hess/affine/test_convolve.h" @@ -377,6 +378,18 @@ int main(void) mu_run_test(test_permuted_dense_broadcast_row, tests_run); mu_run_test(test_permuted_dense_broadcast_col, tests_run); mu_run_test(test_permuted_dense_diag_vec, tests_run); + mu_run_test(test_permuted_dense_BTA_matching_row_perm, tests_run); + mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); + mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); + mu_run_test(test_BTA_csr_pd_basic, tests_run); + mu_run_test(test_BTA_csr_pd_leaf_variable, tests_run); + mu_run_test(test_BTA_csr_pd_no_overlap, tests_run); + mu_run_test(test_BTA_pd_csr_basic, tests_run); + mu_run_test(test_BTA_pd_csr_leaf_variable, tests_run); + mu_run_test(test_BTA_pd_csr_no_overlap, tests_run); + mu_run_test(test_BTDA_matrices_pd_pd, tests_run); + mu_run_test(test_BTDA_matrices_csr_pd, 
tests_run); + mu_run_test(test_BTDA_matrices_pd_csr, tests_run); mu_run_test(test_YT_kron_I, tests_run); mu_run_test(test_YT_kron_I_larger, tests_run); mu_run_test(test_I_kron_X, tests_run); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 75ca878..387c273 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -543,4 +543,531 @@ const char *test_permuted_dense_diag_vec(void) return 0; } +/* ---- Helpers for BTA / BTDA tests ---- */ + +/* Scatter a PD into a dense m x n_global buffer (row-major), zero-filled. + Buffer is allocated by the caller. */ +static void scatter_pd_to_dense(const Permuted_Dense *pd, int n_global, double *dense) +{ + int m = pd->base.m; + memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); + for (int ii = 0; ii < pd->dense_m; ii++) + { + int row = pd->row_perm[ii]; + for (int jj = 0; jj < pd->dense_n; jj++) + { + int col = pd->col_perm[jj]; + dense[row * n_global + col] = pd->X[ii * pd->dense_n + jj]; + } + } +} + +/* BTA: A and B share row_perm = [1, 3]; both have m=4, distinct col_perms. + C = B^T A is computed via the primitive and compared against a hand + reference X_B^T X_A. */ +const char *test_permuted_dense_BTA_matching_row_perm(void) +{ + int row_perm[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + /* X_A is (2, 2), X_B is (2, 2), both row-major. 
*/ + double XA[4] = {1.0, 2.0, 3.0, 4.0}; /* rows: [1,2], [3,4] */ + double XB[4] = {5.0, 6.0, 7.0, 8.0}; /* rows: [5,6], [7,8] */ + Matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_A, XA); + Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_B, XB); + Permuted_Dense *A = (Permuted_Dense *) A_m; + Permuted_Dense *B = (Permuted_Dense *) B_m; + + Matrix *C_m = permuted_dense_BTA_alloc(A, B); + Permuted_Dense *C = (Permuted_Dense *) C_m; + + mu_assert("out m", C_m->m == 4); /* B.n */ + mu_assert("out n", C_m->n == 4); /* A.n */ + mu_assert("dense_m", C->dense_m == 2); + mu_assert("dense_n", C->dense_n == 2); + mu_assert("row_perm", cmp_int_array(C->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, col_perm_A, 2)); + + permuted_dense_BTA_fill_values(A, B, C); + + /* Reference: X_B^T X_A. With X_B = [[5,6],[7,8]], X_A = [[1,2],[3,4]]: + X_B^T = [[5,7],[6,8]]. X_B^T X_A = [[5*1+7*3, 5*2+7*4], [6*1+8*3, 6*2+8*4]] + = [[26, 38], [30, 44]]. */ + double expected[4] = {26.0, 38.0, 30.0, 44.0}; + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* BTA with partial overlap: row_perm_A = [1, 3, 5], row_perm_B = [3, 5, 7]. + Intersection = {3, 5}. */ +const char *test_permuted_dense_BTA_partial_overlap(void) +{ + int row_perm_A[3] = {1, 3, 5}; + int row_perm_B[3] = {3, 5, 7}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + /* X_A rows correspond to A row_perm order: row 0 -> source row 1, row 1 -> 3, row 2 -> 5. */ + double XA[6] = {1.0, 2.0, /* row 1 (NOT in B) */ + 3.0, 4.0, /* row 3 (in B at pos 0) */ + 5.0, 6.0}; /* row 5 (in B at pos 1) */ + /* X_B rows: row 0 -> source row 3, row 1 -> 5, row 2 -> 7. 
*/ + double XB[6] = {10.0, 20.0, /* row 3 (in A at pos 1) */ + 30.0, 40.0, /* row 5 (in A at pos 2) */ + 50.0, 60.0}; /* row 7 (NOT in A) */ + Matrix *A_m = new_permuted_dense(8, 4, 3, 2, row_perm_A, col_perm_A, XA); + Matrix *B_m = new_permuted_dense(8, 4, 3, 2, row_perm_B, col_perm_B, XB); + Permuted_Dense *A = (Permuted_Dense *) A_m; + Permuted_Dense *B = (Permuted_Dense *) B_m; + + Matrix *C_m = permuted_dense_BTA_alloc(A, B); + Permuted_Dense *C = (Permuted_Dense *) C_m; + permuted_dense_BTA_fill_values(A, B, C); + + /* Reference: scatter A, B to dense 8x4, compute B^T A, compare block at + (col_perm_B, col_perm_A). */ + double *A_d = (double *) calloc((size_t) 8 * 4, sizeof(double)); + double *B_d = (double *) calloc((size_t) 8 * 4, sizeof(double)); + scatter_pd_to_dense(A, 4, A_d); + scatter_pd_to_dense(B, 4, B_d); + + /* Reference C_ref is 4x4 = B_d^T (4x8) * A_d (8x4). */ + double C_ref[16]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + double s = 0.0; + for (int k = 0; k < 8; k++) + { + s += B_d[k * 4 + i] * A_d[k * 4 + j]; + } + C_ref[i * 4 + j] = s; + } + } + + /* Extract reference block at (col_perm_B, col_perm_A) and compare to C->X. */ + double expected[4]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected[ii * 2 + jj] = C_ref[col_perm_B[ii] * 4 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free(A_d); + free(B_d); + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* Full BTDA decomposition: tmp = diag(w) A; C = B^T tmp. Compare against a + dense triple product B_d^T diag(w) A_d. 
*/ +const char *test_permuted_dense_BTDA_decomposition(void) +{ + int row_perm[3] = {0, 1, 2}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double XB[6] = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double w[3] = {2.0, -1.0, 3.0}; + + Matrix *A_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_A, XA); + Matrix *B_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_B, XB); + Permuted_Dense *A = (Permuted_Dense *) A_m; + Permuted_Dense *B = (Permuted_Dense *) B_m; + + /* tmp has the same sparsity as A. */ + Matrix *tmp_m = A_m->copy_sparsity(A_m); + Permuted_Dense *tmp = (Permuted_Dense *) tmp_m; + permuted_dense_DA_fill_values(w, A, tmp); + + Matrix *C_m = permuted_dense_BTA_alloc(tmp, B); + Permuted_Dense *C = (Permuted_Dense *) C_m; + permuted_dense_BTA_fill_values(tmp, B, C); + + /* Reference: dense B_d^T diag(w) A_d, extract (col_perm_B, col_perm_A) block. */ + double *A_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); + double *B_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); + scatter_pd_to_dense(A, 4, A_d); + scatter_pd_to_dense(B, 4, B_d); + + double C_ref[16]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + double s = 0.0; + for (int k = 0; k < 3; k++) + { + s += B_d[k * 4 + i] * w[k] * A_d[k * 4 + j]; + } + C_ref[i * 4 + j] = s; + } + } + double expected[4]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected[ii * 2 + jj] = C_ref[col_perm_B[ii] * 4 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free(A_d); + free(B_d); + free_matrix(C_m); + free_matrix(tmp_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* Scatter a CSR matrix into a dense m x n_global buffer (row-major). + Caller allocates and zero-fills. 
*/ +static void scatter_csr_to_dense(const CSR_Matrix *A_csr, int n_global, + double *dense) +{ + int m = A_csr->m; + memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); + for (int i = 0; i < m; i++) + { + for (int e = A_csr->p[i]; e < A_csr->p[i + 1]; e++) + { + int j = A_csr->i[e]; + dense[i * n_global + j] = A_csr->x[e]; + } + } +} + +/* BTA(CSR A, PD B): basic correctness against a dense reference. + A is (4, 5) CSR with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], + col_perm = [0, 2], dense block (2, 2). */ +const char *test_BTA_csr_pd_basic(void) +{ + /* CSR A: m=4, n=5, with nonzeros: + row 0: cols {1, 4} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {1, 4} */ + CSR_Matrix *A = new_csr_matrix(4, 5, 7); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 4; + A->p[3] = 5; + A->p[4] = 7; + int Ai[7] = {1, 4, 0, 2, 2, 1, 4}; + double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, 40]]. */ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + Permuted_Dense *B = (Permuted_Dense *) B_m; + + Matrix *out_m = BTA_csr_pd_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + /* Expected col_active: union of A's columns in rows 1 and 3 + = {0, 2} ∪ {1, 4} = {0, 1, 2, 4}, size 4. 
*/ + int expected_col_perm[4] = {0, 1, 2, 4}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("dense_m", out->dense_m == 2); + mu_assert("dense_n", out->dense_n == 4); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 4)); + + BTA_csr_pd_fill_values(A, B, out); + + /* Reference: scatter A and B to dense 4x{5,4}, compute B^T A, extract + block at (col_perm_B × out->col_perm). */ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + scatter_csr_to_dense(A, 5, A_d); + scatter_pd_to_dense(B, 4, B_d); + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[8]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 4; jj++) + { + expected_X[ii * 4 + jj] = C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 8)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_matrix(B_m); + free_csr_matrix(A); + return 0; +} + +/* BTA(CSR A, PD B) where A is a leaf-variable Jacobian (identity-in-block). + A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = X_B^T. 
*/ +const char *test_BTA_csr_pd_leaf_variable(void) +{ + CSR_Matrix *A = new_csr_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + A->p[k] = k; + A->i[k] = 4 + k; + A->x[k] = 1.0; + } + A->p[4] = 4; + + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; /* row-major (2, 2) */ + Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + Permuted_Dense *B = (Permuted_Dense *) B_m; + + Matrix *out_m = BTA_csr_pd_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + int expected_col_perm[2] = {5, 7}; + mu_assert("dense_m", out->dense_m == 2); + mu_assert("dense_n", out->dense_n == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 2)); + + BTA_csr_pd_fill_values(A, B, out); + + /* X_C should be X_B^T = [[10, 30], [20, 40]] row-major. */ + double expected_X[4] = {10.0, 30.0, 20.0, 40.0}; + mu_assert("values", cmp_double_array(out->X, expected_X, 4)); + + free_matrix(out_m); + free_matrix(B_m); + free_csr_matrix(A); + return 0; +} + +/* BTA(CSR A, PD B) where A has no entries in any row of row_perm_B. + Output dense block should have dense_n = 0. */ +const char *test_BTA_csr_pd_no_overlap(void) +{ + /* A: rows 0 and 2 have entries; rows 1 and 3 (row_perm_B) are empty. 
*/ + CSR_Matrix *A = new_csr_matrix(4, 5, 3); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 2; + A->p[3] = 3; + A->p[4] = 3; + int Ai[3] = {1, 4, 2}; + double Ax[3] = {1.0, 2.0, 3.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + int row_perm_B[2] = {1, 3}; /* rows that ARE empty in A */ + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + Permuted_Dense *B = (Permuted_Dense *) B_m; + + Matrix *out_m = BTA_csr_pd_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + mu_assert("dense_m", out->dense_m == 2); + mu_assert("dense_n", out->dense_n == 0); + + /* Fill should be a no-op (0-sized dense block). */ + BTA_csr_pd_fill_values(A, B, out); + + free_matrix(out_m); + free_matrix(B_m); + free_csr_matrix(A); + return 0; +} + +/* BTA(PD A, CSR B): basic correctness against a dense reference. + A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). + B is (4, 4) CSR with arbitrary sparsity. */ +const char *test_BTA_pd_csr_basic(void) +{ + /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. + X = [[1, 2], [3, 4]] (2 x 2 row-major). */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + Permuted_Dense *A = (Permuted_Dense *) A_m; + + /* CSR B: m=4, n=4. + row 0: cols {1, 3} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {0, 3} */ + CSR_Matrix *B = new_csr_matrix(4, 4, 7); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 4; + B->p[3] = 5; + B->p[4] = 7; + int Bi[7] = {1, 3, 0, 2, 2, 0, 3}; + double Bx[7] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + Matrix *out_m = BTA_pd_csr_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + /* row_active = union of B's cols in rows 1 and 3 + = {0, 2} ∪ {0, 3} = {0, 2, 3}, size 3. 
*/ + int expected_row_perm[3] = {0, 2, 3}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("dense_m", out->dense_m == 3); + mu_assert("dense_n", out->dense_n == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_pd_csr_fill_values(A, B, out); + + /* Reference: dense B^T A, extract block at (row_active × col_perm_A). */ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + scatter_pd_to_dense(A, 5, A_d); + scatter_csr_to_dense(B, 4, B_d); + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[6]; + for (int ii = 0; ii < 3; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected_X[ii * 2 + jj] = + C_ref[expected_row_perm[ii] * 5 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 6)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_csr_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(PD A, CSR B) where B is a leaf-variable Jacobian (identity-in-block). + B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. 
*/ +const char *test_BTA_pd_csr_leaf_variable(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + Permuted_Dense *A = (Permuted_Dense *) A_m; + + CSR_Matrix *B = new_csr_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + B->p[k] = k; + B->i[k] = 4 + k; + B->x[k] = 1.0; + } + B->p[4] = 4; + + Matrix *out_m = BTA_pd_csr_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + int expected_row_perm[2] = {5, 7}; + mu_assert("dense_m", out->dense_m == 2); + mu_assert("dense_n", out->dense_n == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_pd_csr_fill_values(A, B, out); + + /* X_C should equal X_A. */ + mu_assert("values", cmp_double_array(out->X, XA, 4)); + + free_matrix(out_m); + free_csr_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(PD A, CSR B) where B has no entries in any row of row_perm_A. + Output dense block should have dense_m = 0. */ +const char *test_BTA_pd_csr_no_overlap(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + Permuted_Dense *A = (Permuted_Dense *) A_m; + + /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. */ + CSR_Matrix *B = new_csr_matrix(4, 4, 3); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 2; + B->p[3] = 3; + B->p[4] = 3; + int Bi[3] = {0, 1, 2}; + double Bx[3] = {1.0, 2.0, 3.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + Matrix *out_m = BTA_pd_csr_alloc(A, B); + Permuted_Dense *out = (Permuted_Dense *) out_m; + + mu_assert("dense_m", out->dense_m == 0); + mu_assert("dense_n", out->dense_n == 2); + + /* Fill should be a no-op (0-sized dense block on the row axis). 
*/ + BTA_pd_csr_fill_values(A, B, out); + + free_matrix(out_m); + free_csr_matrix(B); + free_matrix(A_m); + return 0; +} + #endif /* TEST_PERMUTED_DENSE_H */ From 12ba841dacfa55cf9be87d5bbf331a3d2c16ca2f Mon Sep 17 00:00:00 2001 From: dance858 Date: Mon, 11 May 2026 15:17:56 +0200 Subject: [PATCH 10/31] ran formatter --- include/utils/matrix_BTA.h | 32 +++++ include/utils/permuted_dense.h | 6 +- src/utils/matrix_BTA.c | 77 +++++++++++ tests/profiling/profile_trimmed_log_reg.h | 4 +- tests/utils/test_matrix_BTA.h | 157 ++++++++++++++++++++++ tests/utils/test_permuted_dense.h | 30 +++-- 6 files changed, 287 insertions(+), 19 deletions(-) create mode 100644 include/utils/matrix_BTA.h create mode 100644 src/utils/matrix_BTA.c create mode 100644 tests/utils/test_matrix_BTA.h diff --git a/include/utils/matrix_BTA.h b/include/utils/matrix_BTA.h new file mode 100644 index 0000000..8d06069 --- /dev/null +++ b/include/utils/matrix_BTA.h @@ -0,0 +1,32 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +#ifndef MATRIX_BTA_H +#define MATRIX_BTA_H + +#include "matrix.h" + +/* Polymorphic dispatchers for C = B^T A and C = B^T diag(d) A. The output + type depends on the input types: (PD, PD) → PD, (Sparse, PD) → PD, + (PD, Sparse) → PD, (Sparse, Sparse) → Sparse. Dispatched via + as_permuted_dense() on both operands. */ + +/* Allocate sparsity for C = B^T A. */ +Matrix *BTA_matrices_alloc(Matrix *A, Matrix *B); + +/* Fill out->x = B^T diag(d) A (d may be NULL for plain B^T A). out must + have the structure produced by BTA_matrices_alloc(A, B). 
For the + (Sparse, Sparse) path, the caller must ensure both operands' csc_caches + are fresh (via refresh_csc_values) before calling; the dispatcher does + not refresh. */ +void BTDA_matrices_fill_values(Matrix *A, const double *d, Matrix *B, Matrix *C); + +#endif /* MATRIX_BTA_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 99fd318..2d8f9dd 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -90,15 +90,13 @@ void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d A->dense_n), row_perm = B->col_perm, col_perm = A->col_perm. Values uninitialized. The output structure does not depend on row_perm_A or row_perm_B (only the values do; see permuted_dense_BTA_fill_values). */ -Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, - const Permuted_Dense *B); +Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, const Permuted_Dense *B); /* Fill out->X = B->X^T @ A->X restricted to rows in row_perm_A ∩ row_perm_B. out must have the structure produced by permuted_dense_BTA_alloc(A, B). For matching row_perms, this is a single cblas_dgemm; otherwise the intersecting rows are first gathered into contiguous scratch buffers. */ -void permuted_dense_BTA_fill_values(const Permuted_Dense *A, - const Permuted_Dense *B, +void permuted_dense_BTA_fill_values(const Permuted_Dense *A, const Permuted_Dense *B, Permuted_Dense *out); /* Allocate a new Permuted_Dense for C = B^T @ A where A is Sparse (CSR) diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c new file mode 100644 index 0000000..bd36c00 --- /dev/null +++ b/src/utils/matrix_BTA.c @@ -0,0 +1,77 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +#include "utils/matrix_BTA.h" + +#include "utils/CSC_Matrix.h" +#include "utils/CSR_Matrix.h" +#include "utils/permuted_dense.h" + +Matrix *BTA_matrices_alloc(Matrix *A, Matrix *B) +{ + Permuted_Dense *pd_A = A->as_permuted_dense(A); + Permuted_Dense *pd_B = B->as_permuted_dense(B); + + if (pd_A && pd_B) + { + return permuted_dense_BTA_alloc(pd_A, pd_B); + } + if (pd_B) + { + /* A is Sparse, B is PD */ + CSR_Matrix *A_csr = A->to_csr(A); + return BTA_csr_pd_alloc(A_csr, pd_B); + } + if (pd_A) + { + /* A is PD, B is Sparse */ + CSR_Matrix *B_csr = B->to_csr(B); + return BTA_pd_csr_alloc(pd_A, B_csr); + } + + /* Both Sparse: delegate to CSC BTA. Caller must ensure caches are fresh. */ + Sparse_Matrix *sm_A = (Sparse_Matrix *) A; + Sparse_Matrix *sm_B = (Sparse_Matrix *) B; + A->refresh_csc_values(A); + B->refresh_csc_values(B); + CSR_Matrix *C_csr = BTA_alloc(sm_A->csc_cache, sm_B->csc_cache); + return new_sparse_matrix(C_csr); +} + +void BTDA_matrices_fill_values(Matrix *A, const double *d, Matrix *B, Matrix *C) +{ + Permuted_Dense *pd_A = A->as_permuted_dense(A); + Permuted_Dense *pd_B = B->as_permuted_dense(B); + + if (pd_A && pd_B) + { + BTDA_pd_pd_fill_values(pd_A, d, pd_B, (Permuted_Dense *) C); + return; + } + if (pd_B) + { + CSR_Matrix *A_csr = A->to_csr(A); + BTDA_csr_pd_fill_values(A_csr, d, pd_B, (Permuted_Dense *) C); + return; + } + if (pd_A) + { + CSR_Matrix *B_csr = B->to_csr(B); + BTDA_pd_csr_fill_values(pd_A, d, B_csr, (Permuted_Dense *) C); + return; + } + + /* Both Sparse: delegate to CSC BTDA. 
*/ + Sparse_Matrix *sm_A = (Sparse_Matrix *) A; + Sparse_Matrix *sm_B = (Sparse_Matrix *) B; + Sparse_Matrix *sm_C = (Sparse_Matrix *) C; + BTDA_fill_values(sm_A->csc_cache, sm_B->csc_cache, d, sm_C->csr); +} diff --git a/tests/profiling/profile_trimmed_log_reg.h b/tests/profiling/profile_trimmed_log_reg.h index 4dbe9f0..d010d00 100644 --- a/tests/profiling/profile_trimmed_log_reg.h +++ b/tests/profiling/profile_trimmed_log_reg.h @@ -79,8 +79,8 @@ const char *profile_trimmed_log_reg(void) printf("\n"); printf(" Jacobian Hessian Total\n"); - printf(" trimmed_log_reg: %10.6fs %10.6fs %10.6fs\n", sec_jac, - sec_hess, sec_jac + sec_hess); + printf(" trimmed_log_reg: %10.6fs %10.6fs %10.6fs\n", sec_jac, sec_hess, + sec_jac + sec_hess); /* ---- Cleanup ---- */ free_expr(obj); diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h new file mode 100644 index 0000000..7528484 --- /dev/null +++ b/tests/utils/test_matrix_BTA.h @@ -0,0 +1,157 @@ +#ifndef TEST_MATRIX_BTA_H +#define TEST_MATRIX_BTA_H + +#include "minunit.h" +#include "test_helpers.h" +#include "utils/CSR_Matrix.h" +#include "utils/matrix_BTA.h" +#include "utils/permuted_dense.h" +#include +#include + +/* Wrapper dispatch sanity: (PD, PD). Compare against direct + BTDA_pd_pd_fill_values. */ +const char *test_BTDA_matrices_pd_pd(void) +{ + int row_perm[2] = {0, 1}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + double XB[4] = {5.0, 6.0, 7.0, 8.0}; + double d[2] = {2.0, -1.5}; + + Matrix *A_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + Matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + + /* Wrapper path. */ + Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path on independent operands. 
*/ + Matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + Matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + Matrix *C2 = + permuted_dense_BTA_alloc((Permuted_Dense *) A2, (Permuted_Dense *) B2); + BTDA_pd_pd_fill_values((Permuted_Dense *) A2, d, (Permuted_Dense *) B2, + (Permuted_Dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_matrix(B2); + free_matrix(A2); + return 0; +} + +/* Wrapper dispatch sanity: (CSR, PD). Compare against direct + BTDA_csr_pd_fill_values. */ +const char *test_BTDA_matrices_csr_pd(void) +{ + /* A: 4x5 CSR */ + CSR_Matrix *A = new_csr_matrix(4, 5, 5); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 3; + A->p[3] = 4; + A->p[4] = 5; + int Ai[5] = {0, 3, 2, 1, 4}; + double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + Matrix *A_m = new_sparse_matrix(A); + + /* B: 4x4 PD, row_perm = [1, 3], col_perm = [0, 2]. */ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + + double d[4] = {1.0, -2.0, 0.5, 3.0}; + + /* Wrapper path. */ + Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path. 
*/ + CSR_Matrix *A2 = new_csr_matrix(4, 5, 5); + A2->p[0] = 0; + A2->p[1] = 2; + A2->p[2] = 3; + A2->p[3] = 4; + A2->p[4] = 5; + memcpy(A2->i, Ai, sizeof Ai); + memcpy(A2->x, Ax, sizeof Ax); + Matrix *B2_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + Permuted_Dense *B2 = (Permuted_Dense *) B2_m; + Matrix *C2 = BTA_csr_pd_alloc(A2, B2); + BTDA_csr_pd_fill_values(A2, d, B2, (Permuted_Dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_matrix(B2_m); + free_csr_matrix(A2); + return 0; +} + +/* Wrapper dispatch sanity: (PD, CSR). Compare against direct + BTDA_pd_csr_fill_values. */ +const char *test_BTDA_matrices_pd_csr(void) +{ + /* A: 4x5 PD, row_perm = [1, 3], col_perm = [0, 2]. */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + + /* B: 4x4 CSR. */ + CSR_Matrix *B = new_csr_matrix(4, 4, 5); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 3; + B->p[3] = 4; + B->p[4] = 5; + int Bi[5] = {0, 2, 1, 0, 3}; + double Bx[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + Matrix *B_m = new_sparse_matrix(B); + + double d[4] = {1.0, -2.0, 0.5, 3.0}; + + /* Wrapper path. */ + Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path. 
*/ + Matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + Permuted_Dense *A2 = (Permuted_Dense *) A2_m; + CSR_Matrix *B2 = new_csr_matrix(4, 4, 5); + B2->p[0] = 0; + B2->p[1] = 2; + B2->p[2] = 3; + B2->p[3] = 4; + B2->p[4] = 5; + memcpy(B2->i, Bi, sizeof Bi); + memcpy(B2->x, Bx, sizeof Bx); + Matrix *C2 = BTA_pd_csr_alloc(A2, B2); + BTDA_pd_csr_fill_values(A2, d, B2, (Permuted_Dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_csr_matrix(B2); + free_matrix(A2_m); + return 0; +} + +#endif /* TEST_MATRIX_BTA_H */ diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 387c273..81b63d6 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -516,8 +516,7 @@ const char *test_permuted_dense_diag_vec(void) { int row_perm[2] = {0, 2}; int col_perm[2] = {1, 4}; - double X[4] = {1.0, 2.0, - 3.0, 4.0}; + double X[4] = {1.0, 2.0, 3.0, 4.0}; Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); Matrix *out = M->diag_vec_alloc(M); @@ -547,7 +546,8 @@ const char *test_permuted_dense_diag_vec(void) /* Scatter a PD into a dense m x n_global buffer (row-major), zero-filled. Buffer is allocated by the caller. */ -static void scatter_pd_to_dense(const Permuted_Dense *pd, int n_global, double *dense) +static void scatter_pd_to_dense(const Permuted_Dense *pd, int n_global, + double *dense) { int m = pd->base.m; memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); @@ -610,13 +610,14 @@ const char *test_permuted_dense_BTA_partial_overlap(void) int row_perm_B[3] = {3, 5, 7}; int col_perm_A[2] = {0, 2}; int col_perm_B[2] = {1, 3}; - /* X_A rows correspond to A row_perm order: row 0 -> source row 1, row 1 -> 3, row 2 -> 5. 
*/ - double XA[6] = {1.0, 2.0, /* row 1 (NOT in B) */ - 3.0, 4.0, /* row 3 (in B at pos 0) */ - 5.0, 6.0}; /* row 5 (in B at pos 1) */ + /* X_A rows correspond to A row_perm order: row 0 -> source row 1, row 1 -> 3, + * row 2 -> 5. */ + double XA[6] = {1.0, 2.0, /* row 1 (NOT in B) */ + 3.0, 4.0, /* row 3 (in B at pos 0) */ + 5.0, 6.0}; /* row 5 (in B at pos 1) */ /* X_B rows: row 0 -> source row 3, row 1 -> 5, row 2 -> 7. */ - double XB[6] = {10.0, 20.0, /* row 3 (in A at pos 1) */ - 30.0, 40.0, /* row 5 (in A at pos 2) */ + double XB[6] = {10.0, 20.0, /* row 3 (in A at pos 1) */ + 30.0, 40.0, /* row 5 (in A at pos 2) */ 50.0, 60.0}; /* row 7 (NOT in A) */ Matrix *A_m = new_permuted_dense(8, 4, 3, 2, row_perm_A, col_perm_A, XA); Matrix *B_m = new_permuted_dense(8, 4, 3, 2, row_perm_B, col_perm_B, XB); @@ -736,7 +737,7 @@ const char *test_permuted_dense_BTDA_decomposition(void) /* Scatter a CSR matrix into a dense m x n_global buffer (row-major). Caller allocates and zero-fills. */ static void scatter_csr_to_dense(const CSR_Matrix *A_csr, int n_global, - double *dense) + double *dense) { int m = A_csr->m; memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); @@ -771,7 +772,8 @@ const char *test_BTA_csr_pd_basic(void) memcpy(A->i, Ai, sizeof Ai); memcpy(A->x, Ax, sizeof Ax); - /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, 40]]. */ + /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, + * 40]]. 
*/ int row_perm_B[2] = {1, 3}; int col_perm_B[2] = {0, 2}; double XB[4] = {10.0, 20.0, 30.0, 40.0}; @@ -819,7 +821,8 @@ const char *test_BTA_csr_pd_basic(void) { for (int jj = 0; jj < 4; jj++) { - expected_X[ii * 4 + jj] = C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; + expected_X[ii * 4 + jj] = + C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; } } mu_assert("values", cmp_double_array(out->X, expected_X, 8)); @@ -834,7 +837,8 @@ const char *test_BTA_csr_pd_basic(void) /* BTA(CSR A, PD B) where A is a leaf-variable Jacobian (identity-in-block). A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). - Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = X_B^T. */ + Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = + X_B^T. */ const char *test_BTA_csr_pd_leaf_variable(void) { CSR_Matrix *A = new_csr_matrix(4, 8, 4); From 74a0a42504c93c22e28219e49e8b72e87cef1939 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 09:03:02 +0200 Subject: [PATCH 11/31] better BTDA for permuted dense x permuted_dense --- include/atoms/affine.h | 6 +- include/atoms/bivariate_full_dom.h | 2 +- include/atoms/non_elementwise_full_dom.h | 4 +- include/expr.h | 18 +- include/old-code/old_CSR.h | 18 +- include/old-code/old_CSR_sum.h | 22 +- include/old-code/old_affine.h | 4 +- include/problem.h | 6 +- include/subexpr.h | 52 +- include/utils/COO_Matrix.h | 16 +- include/utils/{CSC_Matrix.h => CSC_matrix.h} | 44 +- include/utils/{CSR_Matrix.h => CSR_matrix.h} | 34 +- include/utils/CSR_sum.h | 28 +- include/utils/dense_matrix.h | 10 +- include/utils/linalg_dense_sparse_matmuls.h | 24 +- include/utils/linalg_sparse_matmuls.h | 28 +- include/utils/matrix.h | 124 +-- include/utils/matrix_BTA.h | 4 +- include/utils/matrix_sum.h | 12 +- include/utils/mini_numpy.h | 6 +- include/utils/permuted_dense.h | 195 ++--- include/utils/sparse_matrix.h | 46 + include/utils/utils.h | 15 + src/atoms/affine/add.c | 1 + 
src/atoms/affine/convolve.c | 3 +- src/atoms/affine/diag_mat.c | 2 +- src/atoms/affine/hstack.c | 11 +- src/atoms/affine/left_matmul.c | 35 +- src/atoms/affine/parameter.c | 1 + src/atoms/affine/right_matmul.c | 14 +- src/atoms/affine/sum.c | 5 +- src/atoms/affine/trace.c | 5 +- src/atoms/affine/transpose.c | 7 +- src/atoms/affine/upper_tri.c | 2 +- src/atoms/affine/variable.c | 3 +- src/atoms/bivariate_full_dom/matmul.c | 33 +- src/atoms/bivariate_full_dom/multiply.c | 21 +- .../bivariate_restricted_dom/quad_over_lin.c | 17 +- src/atoms/bivariate_restricted_dom/rel_entr.c | 5 +- .../rel_entr_scalar_vector.c | 5 +- .../rel_entr_vector_scalar.c | 5 +- src/atoms/elementwise_full_dom/common.c | 11 +- src/atoms/elementwise_restricted_dom/common.c | 5 +- src/atoms/other/prod.c | 7 +- src/atoms/other/prod_axis_one.c | 13 +- src/atoms/other/prod_axis_zero.c | 5 +- src/atoms/other/quad_form.c | 35 +- src/expr.c | 2 +- src/old-code/linear_op.c | 7 +- src/old-code/old_CSR.c | 18 +- src/old-code/old_CSR_sum.c | 14 +- src/problem.c | 20 +- src/utils/COO_Matrix.c | 4 +- src/utils/{CSC_Matrix.c => CSC_matrix.c} | 48 +- src/utils/{CSR_Matrix.c => CSR_matrix.c} | 38 +- src/utils/CSR_sum.c | 32 +- src/utils/dense_matrix.c | 18 +- src/utils/linalg_dense_sparse_matmuls.c | 34 +- src/utils/linalg_sparse_matmuls.c | 48 +- src/utils/matrix_BTA.c | 47 +- src/utils/matrix_sum.c | 8 +- src/utils/mini_numpy.c | 4 +- src/utils/permuted_dense.c | 806 ++++++++---------- src/utils/sparse_matrix.c | 124 +-- src/utils/utils.c | 19 + tests/all_tests.c | 1 + tests/forward_pass/affine/test_diag_mat.h | 2 +- tests/forward_pass/affine/test_linear_op.h | 4 +- tests/forward_pass/affine/test_upper_tri.h | 2 +- tests/jacobian_tests/affine/test_convolve.h | 2 +- tests/jacobian_tests/affine/test_diag_mat.h | 2 +- tests/jacobian_tests/affine/test_index.h | 2 +- .../jacobian_tests/affine/test_left_matmul.h | 14 +- .../jacobian_tests/affine/test_right_matmul.h | 8 +- 
tests/jacobian_tests/affine/test_transpose.h | 2 +- tests/jacobian_tests/affine/test_upper_tri.h | 2 +- .../test_elementwise_mult.h | 6 +- .../test_quad_over_lin.h | 6 +- .../composite/test_chain_rule_jacobian.h | 24 +- .../composite/test_composite_exp.h | 6 +- .../jacobian_tests/other/test_prod_axis_one.h | 4 +- .../other/test_prod_axis_zero.h | 2 +- tests/jacobian_tests/other/test_quad_form.h | 6 +- tests/numerical_diff.c | 10 +- tests/numerical_diff/test_numerical_diff.h | 4 +- tests/problem/test_problem.h | 6 +- tests/profiling/profile_left_matmul.h | 4 +- tests/profiling/profile_log_reg.h | 36 +- tests/test_helpers.c | 12 +- tests/test_helpers.h | 12 +- tests/utils/test_coo_matrix.h | 8 +- tests/utils/test_csc_matrix.h | 34 +- tests/utils/test_csr_csc_conversion.h | 44 +- tests/utils/test_csr_matrix.h | 60 +- tests/utils/test_linalg_sparse_matmuls.h | 70 +- .../test_linalg_utils_matmul_chain_rule.h | 44 +- tests/utils/test_matrix.h | 25 +- tests/utils/test_matrix_BTA.h | 65 +- tests/utils/test_permuted_dense.h | 388 +++++---- tests/wsum_hess/affine/test_left_matmul.h | 12 +- tests/wsum_hess/affine/test_right_matmul.h | 8 +- tests/wsum_hess/affine/test_sum.h | 2 +- .../bivariate_full_dom/test_multiply.h | 20 +- .../composite/test_chain_rule_wsum_hess.h | 30 +- .../elementwise_restricted_dom/test_log.h | 2 +- tests/wsum_hess/other/test_prod_axis_one.h | 8 +- tests/wsum_hess/other/test_prod_axis_zero.h | 4 +- tests/wsum_hess/other/test_quad_form.h | 2 +- 108 files changed, 1656 insertions(+), 1609 deletions(-) rename include/utils/{CSC_Matrix.h => CSC_matrix.h} (58%) rename include/utils/{CSR_Matrix.h => CSR_matrix.h} (64%) create mode 100644 include/utils/sparse_matrix.h rename src/utils/{CSC_Matrix.c => CSC_matrix.c} (90%) rename src/utils/{CSR_Matrix.c => CSR_matrix.c} (86%) diff --git a/include/atoms/affine.h b/include/atoms/affine.h index 9527d3c..3fd9118 100644 --- a/include/atoms/affine.h +++ b/include/atoms/affine.h @@ -20,7 +20,7 @@ #include "expr.h" 
#include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" expr *new_add(expr *left, expr *right); expr *new_neg(expr *child); @@ -45,7 +45,7 @@ expr *new_transpose(expr *child); /* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix. param_node is NULL for fixed constants. We currently do not support sparse parameters, so param_node should always be null. */ -expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A); +expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A); /* Left matrix multiplication: A @ f(x) where A is a constant dense matrix (in row-major, m x n, with values given by 'data') or a parameter @@ -59,7 +59,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, /* Right matrix multiplication: f(x) @ A where A is a constant sparse matrix. We currently do not support sparse parameters, so param_node should always be null. */ -expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A); +expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A); /* Right matrix multiplication: f(x) @ A where A is a constant dense matrix (in row-major, m x n, with values given by 'data') or a parameter diff --git a/include/atoms/bivariate_full_dom.h b/include/atoms/bivariate_full_dom.h index a864c7b..44e3f01 100644 --- a/include/atoms/bivariate_full_dom.h +++ b/include/atoms/bivariate_full_dom.h @@ -22,7 +22,7 @@ expr *new_elementwise_mult(expr *left, expr *right); -/* Matrix multiplication: Z = X @ Y */ +/* matrix multiplication: Z = X @ Y */ expr *new_matmul(expr *x, expr *y); #endif /* BIVARIATE_FULL_DOM_H */ diff --git a/include/atoms/non_elementwise_full_dom.h b/include/atoms/non_elementwise_full_dom.h index 7de046c..65c6070 100644 --- a/include/atoms/non_elementwise_full_dom.h +++ b/include/atoms/non_elementwise_full_dom.h @@ -20,9 +20,9 @@ #include "expr.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -expr 
*new_quad_form(expr *child, CSR_Matrix *Q); +expr *new_quad_form(expr *child, CSR_matrix *Q); /* product of all entries, without axis argument */ expr *new_prod(expr *child); diff --git a/include/expr.h b/include/expr.h index 19c6e12..fbe5a67 100644 --- a/include/expr.h +++ b/include/expr.h @@ -18,8 +18,8 @@ #ifndef EXPR_H #define EXPR_H -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix.h" #include #include @@ -45,16 +45,16 @@ typedef struct { double *dwork; int *iwork; - CSC_Matrix *jacobian_csc; - int *csc_work; /* for CSR-CSC conversion */ + CSC_matrix *jacobian_csc; + int *csc_work; /* for CSR_matrix-CSC_matrix conversion */ /* jacobian_csc_filled is only used for affine functions to avoid redundant conversions. Could become relevant for non-affine functions if we start supporting common subexpressions on the Python side. */ bool jacobian_csc_filled; double *local_jac_diag; /* cached f'(g(x)) diagonal */ - Matrix *hess_term1; /* Jg^T D Jg workspace */ - Matrix *hess_term2; /* child wsum_hess workspace */ + matrix *hess_term1; /* Jg^T D Jg workspace */ + matrix *hess_term2; /* child wsum_hess workspace */ } Expr_Work; /* Base expression node structure */ @@ -71,8 +71,8 @@ typedef struct expr // oracle related quantities // ------------------------------------------------------------------------ double *value; - Matrix *jacobian; - Matrix *wsum_hess; + matrix *jacobian; + matrix *wsum_hess; forward_fn forward; jacobian_init_fn jacobian_init_impl; wsum_hess_init_fn wsum_hess_init_impl; @@ -111,7 +111,7 @@ void free_expr(expr *node); void jacobian_init(expr *node); void wsum_hess_init(expr *node); -/* Initialize CSC form of the Jacobian from the CSR Jacobian. +/* Initialize CSC_matrix form of the Jacobian from the CSR_matrix Jacobian. * Must be called after jacobian_init. 
*/ void jacobian_csc_init(expr *node); diff --git a/include/old-code/old_CSR.h b/include/old-code/old_CSR.h index 41af333..7033246 100644 --- a/include/old-code/old_CSR.h +++ b/include/old-code/old_CSR.h @@ -18,29 +18,29 @@ #ifndef OLD_CSR_H #define OLD_CSR_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* Build (I_p kron A) = blkdiag(A, A, ..., A) of size (p*A->m) x (p*A->n) */ -CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p); +CSR_matrix *block_diag_repeat_csr(const CSR_matrix *A, int p); /* Build (A kron I_p) of size (A->m * p) x (A->n * p) with nnz = A->nnz * p. */ -CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p); +CSR_matrix *kron_identity_csr(const CSR_matrix *A, int p); /* Computes values of the row matrix C = z^T A (column indices must have been pre-computed) and transposed matrix AT must be provided) */ -void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C); +void Ax_csr_fill_values(const CSR_matrix *AT, const double *z, CSR_matrix *C); -/* Insert value into CSR matrix A with just one row at col_idx. Assumes that A +/* Insert value into CSR_matrix matrix A with just one row at col_idx. Assumes that A has enough space and that A does not have an element at col_idx. It does update nnz. 
*/ -void csr_insert_value(CSR_Matrix *A, int col_idx, double value); +void csr_insert_value(CSR_matrix *A, int col_idx, double value); -/* Compute C = diag(d) * A where d is an array and A, C are CSR matrices +/* Compute C = diag(d) * A where d is an array and A, C are CSR_matrix matrices * d must have length m * C must be pre-allocated with same dimensions as A */ -void diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C); +void diag_csr_mult(const double *d, const CSR_matrix *A, CSR_matrix *C); /* y = Ax, where y is returned as dense (no column offset) */ -void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y); +void Ax_csr_wo_offset(const CSR_matrix *A, const double *x, double *y); #endif /* OLD_CSR_H */ diff --git a/include/old-code/old_CSR_sum.h b/include/old-code/old_CSR_sum.h index 72d0b30..3582f31 100644 --- a/include/old-code/old_CSR_sum.h +++ b/include/old-code/old_CSR_sum.h @@ -18,44 +18,44 @@ #ifndef OLD_CSR_SUM_H #define OLD_CSR_SUM_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -/* Compute C = A + B where A, B, C are CSR matrices +/* Compute C = A + B where A, B, C are CSR_matrix matrices * A and B must have same dimensions * C must be pre-allocated with sufficient nnz capacity. 
* C must be different from A and B */ -void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); -/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR matrices */ -void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C, +/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR_matrix matrices */ +void sum_scaled_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C, const double *d1, const double *d2); /* forward declaration */ struct int_double_pair; /* Sum all rows of A into a single row matrix C */ -void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_all_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs); /* Sum blocks of rows of A into a matrix C */ -void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int row_block_size); /* Sum evenly spaced rows of A into a matrix C */ -void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int row_spacing); /* Sum evenly spaced rows of A starting at offset into a row matrix C */ -void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int offset, int spacing); /* Fill values of summed rows using precomputed idx_map and sparsity of C */ -void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C, +void sum_all_rows_csr_fill_values(const CSR_matrix *A, CSR_matrix *C, const int *idx_map); /* Fill values of summed block rows using precomputed idx_map */ -void sum_block_of_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_fill_values(const CSR_matrix 
*A, CSR_matrix *C, const int *idx_map); #endif /* OLD_CSR_SUM_H */ diff --git a/include/old-code/old_affine.h b/include/old-code/old_affine.h index 3b08491..cf0e598 100644 --- a/include/old-code/old_affine.h +++ b/include/old-code/old_affine.h @@ -19,8 +19,8 @@ #define OLD_AFFINE_H #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -expr *new_linear(expr *u, const CSR_Matrix *A, const double *b); +expr *new_linear(expr *u, const CSR_matrix *A, const double *b); #endif /* OLD_AFFINE_H */ diff --git a/include/problem.h b/include/problem.h index 7e68a28..1c2a77d 100644 --- a/include/problem.h +++ b/include/problem.h @@ -20,7 +20,7 @@ #include "expr.h" #include "utils/COO_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/Timer.h" #include @@ -59,8 +59,8 @@ typedef struct problem double *gradient_values; /* allocated by problem_init_derivatives */ - CSR_Matrix *jacobian; - CSR_Matrix *lagrange_hessian; + CSR_matrix *jacobian; + CSR_matrix *lagrange_hessian; int *hess_idx_map; /* maps all wsum_hess nnz to lagrange_hessian */ COO_Matrix *jacobian_coo; COO_Matrix *lagrange_hessian_coo; /* lower triangular part stored in COO */ diff --git a/include/subexpr.h b/include/subexpr.h index a934a93..c2a3662 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -19,8 +19,8 @@ #define SUBEXPR_H #include "expr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix.h" /* Forward declaration */ @@ -40,7 +40,7 @@ typedef struct parameter_expr } parameter_expr; /* Linear operator: y = A * x + b - * The matrix A is stored as node->jacobian (CSR). */ + * The matrix A is stored as node->jacobian (CSR_matrix). 
*/ typedef struct linear_op_expr { expr base; @@ -58,8 +58,8 @@ typedef struct power_expr typedef struct quad_form_expr { expr base; - CSR_Matrix *Q; - CSC_Matrix *QJf; /* Q * J_f in CSC (for chain rule hessian) */ + CSR_matrix *Q; + CSC_matrix *QJf; /* Q * J_f in CSC_matrix (for chain rule hessian) */ } quad_form_expr; /* Sum reduction along an axis */ @@ -102,15 +102,15 @@ typedef struct hstack_expr expr base; expr **args; int n_args; - CSR_Matrix *CSR_work; /* for summing Hessians of children */ + CSR_matrix *CSR_work; /* for summing Hessians of children */ } hstack_expr; /* Elementwise multiplication */ typedef struct elementwise_mult_expr { expr base; - Matrix *cross_C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ - CSR_Matrix *CSR_work2; /* CT = C^T (always CSR) */ + matrix *cross_C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ + CSR_matrix *CSR_work2; /* CT = C^T (always CSR_matrix) */ int *idx_map_C; /* C[j] -> wsum_hess pos */ int *idx_map_CT; /* CT[j] -> wsum_hess pos */ int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ @@ -123,20 +123,20 @@ important distinction compared to linear_op_expr. */ typedef struct left_matmul_expr { expr base; - Matrix *A; - Matrix *AT; + matrix *A; + matrix *AT; int n_blocks; - CSC_Matrix *Jchild_CSC; - CSC_Matrix *J_CSC; + CSC_matrix *Jchild_CSC; + CSC_matrix *J_CSC; int *csc_to_csr_work; expr *param_source; void (*refresh_param_values)(struct left_matmul_expr *); - /* When true, jacobian_init_impl produces a Permuted_Dense node->jacobian - directly (skipping the CSC mirror machinery). Set by new_left_matmul_dense + /* When true, jacobian_init_impl produces a permuted_dense node->jacobian + directly (skipping the CSC_matrix mirror machinery). Set by new_left_matmul_dense when A is a constant dense matrix, child is a leaf variable, and n_blocks == 1 — in that case the Jacobian is exactly A placed in the - variable's column slot, which is naturally a full-dense Permuted_Dense. 
*/ + variable's column slot, which is naturally a full-dense permuted_dense. */ bool produce_pd_jacobian; } left_matmul_expr; @@ -158,7 +158,7 @@ typedef struct vector_mult_expr /* 1D convolution: y = conv(a, child) where a is a length-m kernel held by * param_source. Output has size (m + n - 1) where n is the child length. * Forward and wsum_hess backprop are computed as direct loops; for Jacobian - * we materialize T(a) as a CSR once at jacobian_init and reuse the engine's + * we materialize T(a) as a CSR_matrix once at jacobian_init and reuse the engine's * block-left-mult machinery for composite children. */ typedef struct convolve_expr { @@ -166,8 +166,8 @@ typedef struct convolve_expr expr *param_source; /* length-m kernel */ int m; /* kernel length */ int n; /* input length */ - CSR_Matrix *T; /* (m+n-1) x n convolution matrix */ - CSC_Matrix *Jchild_CSC; + CSR_matrix *T; /* (m+n-1) x n convolution matrix */ + CSC_matrix *Jchild_CSC; } convolve_expr; /* Bivariate matrix multiplication: Z = f(u) @ g(u) where both children @@ -176,16 +176,16 @@ typedef struct matmul_expr { expr base; /* Jacobian workspace */ - CSR_Matrix *term1_CSR; /* (Y^T x I_m) @ J_f */ - CSR_Matrix *term2_CSR; /* (I_n x X) @ J_g */ + CSR_matrix *term1_CSR; /* (Y^T x I_m) @ J_f */ + CSR_matrix *term2_CSR; /* (I_n x X) @ J_g */ /* Hessian workspace (composite only) */ - CSR_Matrix *B; /* cross-Hessian B(w), mk x kn */ - CSR_Matrix *BJg; /* B @ J_g */ - CSC_Matrix *BJg_CSC; /* BJg in CSC */ - int *BJg_csc_work; /* CSR-to-CSC workspace */ - CSR_Matrix *C; /* J_f^T @ B @ J_g */ - CSR_Matrix *CT; /* C^T */ + CSR_matrix *B; /* cross-Hessian B(w), mk x kn */ + CSR_matrix *BJg; /* B @ J_g */ + CSC_matrix *BJg_CSC; /* BJg in CSC_matrix */ + int *BJg_csc_work; /* CSR_matrix-to-CSC_matrix workspace */ + CSR_matrix *C; /* J_f^T @ B @ J_g */ + CSR_matrix *CT; /* C^T */ int *idx_map_C; int *idx_map_CT; int *idx_map_Hf; diff --git a/include/utils/COO_Matrix.h b/include/utils/COO_Matrix.h index 
51a39e9..549bb1a 100644 --- a/include/utils/COO_Matrix.h +++ b/include/utils/COO_Matrix.h @@ -18,15 +18,15 @@ #ifndef COO_MATRIX_H #define COO_MATRIX_H -#include "CSR_Matrix.h" +#include "CSR_matrix.h" -/* COO (Coordinate) Sparse Matrix Format +/* COO (Coordinate) Sparse matrix Format * * For an m x n matrix with nnz nonzeros: * - rows: array of size nnz containing row indices * - cols: array of size nnz containing column indices * - x: array of size nnz containing values - * - value_map: array of size nnz mapping CSR entries to COO entries (for + * - value_map: array of size nnz mapping CSR_matrix entries to COO entries (for * lower-triangular COO) * - m: number of rows * - n: number of columns @@ -43,16 +43,16 @@ typedef struct COO_Matrix int nnz; } COO_Matrix; -/* Construct a COO matrix from a CSR matrix */ -COO_Matrix *new_coo_matrix(const CSR_Matrix *A); +/* Construct a COO matrix from a CSR_matrix matrix */ +COO_Matrix *new_coo_matrix(const CSR_matrix *A); /* Construct a COO matrix containing only the lower-triangular - * entries (col <= row) of a symmetric CSR matrix. Populates + * entries (col <= row) of a symmetric CSR_matrix matrix. Populates * value_map so that refresh_lower_triangular_coo can update * values without recomputing structure. 
*/ -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A); +COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A); -/* Refresh COO values from a new CSR value array using value_map */ +/* Refresh COO values from a new CSR_matrix value array using value_map */ void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals); void free_coo_matrix(COO_Matrix *matrix); diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_matrix.h similarity index 58% rename from include/utils/CSC_Matrix.h rename to include/utils/CSC_matrix.h index f5236f6..3d8beed 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_matrix.h @@ -18,9 +18,9 @@ #ifndef CSC_MATRIX_H #define CSC_MATRIX_H -#include "CSR_Matrix.h" +#include "CSR_matrix.h" -/* CSC (Compressed Sparse Column) Matrix Format +/* CSC_matrix (Compressed Sparse Column) matrix Format * * For an m x n matrix with nnz nonzeros: * - p: array of size (n + 1) indicating start of each column @@ -30,7 +30,7 @@ * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct CSC_Matrix +typedef struct CSC_matrix { int *p; int *i; @@ -38,43 +38,43 @@ typedef struct CSC_Matrix int m; int n; int nnz; -} CSC_Matrix; +} CSC_matrix; /* constructor and destructor */ -CSC_Matrix *new_csc_matrix(int m, int n, int nnz); -void free_csc_matrix(CSC_Matrix *matrix); +CSC_matrix *new_csc_matrix(int m, int n, int nnz); +void free_csc_matrix(CSC_matrix *matrix); /* Fill sparsity of C = A^T D A for diagonal D */ -CSR_Matrix *ATA_alloc(const CSC_Matrix *A); +CSR_matrix *ATA_alloc(const CSC_matrix *A); /* Fill sparsity of C = B^T D A for diagonal D */ -CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B); +CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B); /* Fill sparsity of C = BA, where B is symmetric. 
*/ -CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A); +CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A); /* Compute values for C = A^T D A (null d corresponds to D as identity) */ -void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C); +void ATDA_fill_values(const CSC_matrix *A, const double *d, CSR_matrix *C); /* Compute values for C = B^T D A (null d corresonds to D as identity) */ -void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, - CSR_Matrix *C); +void BTDA_fill_values(const CSC_matrix *A, const CSC_matrix *B, const double *d, + CSR_matrix *C); /* Fill values of C = BA. The matrix B does not have to be symmetric */ -void BA_fill_values(const CSR_Matrix *B, const CSC_Matrix *A, CSC_Matrix *C); +void BA_fill_values(const CSR_matrix *B, const CSC_matrix *A, CSC_matrix *C); /* Fill values of C = x^T A. The matrix C must have filled sparsity. */ -void yTA_fill_values(const CSC_Matrix *A, const double *x, CSR_Matrix *C); +void yTA_fill_values(const CSC_matrix *A, const double *x, CSR_matrix *C); -/* Count nonzero columns of a CSC matrix */ -int count_nonzero_cols_csc(const CSC_Matrix *A); +/* Count nonzero columns of a CSC_matrix matrix */ +int count_nonzero_cols_csc(const CSC_matrix *A); -/* convert from CSR to CSC format */ -CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork); -void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); +/* convert from CSR_matrix to CSC_matrix format */ +CSC_matrix *csr_to_csc_alloc(const CSR_matrix *A, int *iwork); +void csr_to_csc_fill_values(const CSR_matrix *A, CSC_matrix *C, int *iwork); -/* convert from CSC to CSR format */ -CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork); -void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork); +/* convert from CSC_matrix to CSR_matrix format */ +CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork); +void csc_to_csr_fill_values(const 
CSC_matrix *A, CSR_matrix *C, int *iwork); #endif /* CSC_MATRIX_H */ diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_matrix.h similarity index 64% rename from include/utils/CSR_Matrix.h rename to include/utils/CSR_matrix.h index dd5ad91..32a03d2 100644 --- a/include/utils/CSR_Matrix.h +++ b/include/utils/CSR_matrix.h @@ -19,7 +19,7 @@ #define CSR_MATRIX_H #include -/* CSR (Compressed Sparse Row) Matrix Format +/* CSR_matrix (Compressed Sparse Row) matrix Format * * For an m x n matrix with nnz nonzeros: * - p: array of size (m + 1) indicating start of each row @@ -29,7 +29,7 @@ * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct CSR_Matrix +typedef struct CSR_matrix { int *p; int *i; @@ -37,37 +37,37 @@ typedef struct CSR_Matrix int m; int n; int nnz; -} CSR_Matrix; +} CSR_matrix; /* constructors and destructors */ -CSR_Matrix *new_csr_matrix(int m, int n, int nnz); -CSR_Matrix *new_csr(const CSR_Matrix *A); -CSR_Matrix *new_csr_copy_sparsity(const CSR_Matrix *A); -void free_csr_matrix(CSR_Matrix *matrix); -void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C); +CSR_matrix *new_csr_matrix(int m, int n, int nnz); +CSR_matrix *new_csr(const CSR_matrix *A); +CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A); +void free_csr_matrix(CSR_matrix *matrix); +void copy_csr_matrix(const CSR_matrix *A, CSR_matrix *C); /* transpose functionality (iwork must be of size A->n) */ -CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork); -CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork); -void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork); +CSR_matrix *transpose(const CSR_matrix *A, int *iwork); +CSR_matrix *AT_alloc(const CSR_matrix *A, int *iwork); +void AT_fill_values(const CSR_matrix *A, CSR_matrix *AT, int *iwork); /* computes dense y = Ax */ -void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset); +void Ax_csr(const CSR_matrix *A, const double *x, double *y, int col_offset); /* fills 
values of C = diag(d) @ A */ -void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C); +void DA_fill_values(const double *d, const CSR_matrix *A, CSR_matrix *C); /* Count number of columns with nonzero entries in A and marks them in col_nz */ -int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz); +int count_nonzero_cols(const CSR_matrix *A, bool *col_nz); /* inserts 'idx' into array 'arr' in sorted order, and moves the other elements */ void insert_idx(int idx, int *arr, int len); /* get value at position (row, col) in A */ -double csr_get_value(const CSR_Matrix *A, int row, int col); +double csr_get_value(const CSR_matrix *A, int row, int col); -/* Expand symmetric CSR matrix A to full matrix C. A is assumed to store +/* Expand symmetric CSR_matrix matrix A to full matrix C. A is assumed to store only upper triangle. C must be pre-allocated with sufficient nnz */ -void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C); +void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C); #endif /* CSR_MATRIX_H */ diff --git a/include/utils/CSR_sum.h b/include/utils/CSR_sum.h index 5350b59..97c4718 100644 --- a/include/utils/CSR_sum.h +++ b/include/utils/CSR_sum.h @@ -18,24 +18,24 @@ #ifndef CSR_SUM_H #define CSR_SUM_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* forward declaration */ struct int_double_pair; /* Compute sparsity pattern of C = A + B (and sets C->nnz) */ -void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); /* Fills values of C = A + B (assuming C's sparsity pattern is set) */ -void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_fill_values(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); /* Fills values of C = diag(d1) * A + diag(d2) * B (assuming C's sparsity is set)*/ -void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, 
const CSR_Matrix *B, - CSR_Matrix *C, const double *d1, +void sum_scaled_csr_matrices_fill_values(const CSR_matrix *A, const CSR_matrix *B, + CSR_matrix *C, const double *d1, const double *d2); -/* The following five functions are used for summing either more than two CSR - matrices or rows of CSR matrices. To implement the filling of values efficiently, +/* The following five functions are used for summing either more than two CSR_matrix + matrices or rows of CSR_matrix matrices. To implement the filling of values efficiently, we compute an idx_map when we fill the sparsity pattern of the output matrix, which maps each nonzero entry in the input matrix to its position in the output matrix. This allows us to fill the values with a single pass of the output matrix @@ -48,21 +48,21 @@ void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix * Output matrix C, input matrix A, iwork->size = max(A->n, A->nnz) for the first four functions. The last function allocates the output matrix and returns it. 
*/ // ------------------------------------------------------------------------------------ -void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, +void sum_all_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int *iwork, int *idx_map); -void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_block_size, int *iwork, int *idx_map); -void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_spacing, int *iwork, int *idx_map); -void sum_spaced_rows_into_row_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int spacing, int *iwork, int *idx_map); /* Compute sparsity pattern of out = A + B + C + D */ -CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, - const CSR_Matrix *C, const CSR_Matrix *D, +CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, + const CSR_matrix *C, const CSR_matrix *D, int *idx_maps[4]); // ------------------------------------------------------------------------------------ @@ -72,7 +72,7 @@ void accumulator(const double *vals, int nnz, const int *idx_map, double *out); /* Accumulates values from A according to map with spacing. Must memset to zero * before calling. 
*/ -void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *out, +void accumulator_with_spacing(const CSR_matrix *A, const int *idx_map, double *out, int spacing); #endif /* CSR_SUM_H */ diff --git a/include/utils/dense_matrix.h b/include/utils/dense_matrix.h index bd1693b..3de84b1 100644 --- a/include/utils/dense_matrix.h +++ b/include/utils/dense_matrix.h @@ -21,19 +21,19 @@ #include "matrix.h" /* Dense matrix (row-major) */ -typedef struct Dense_Matrix +typedef struct dense_matrix { - Matrix base; + matrix base; double *x; double *work; /* scratch buffer, length n */ -} Dense_Matrix; +} dense_matrix; /* Constructors. If data is NULL, the value buffer is allocated but left uninitialized; otherwise m*n entries are copied from data. */ -Matrix *new_dense_matrix(int m, int n, const double *data); +matrix *new_dense_matrix(int m, int n, const double *data); /* Transpose helper */ -Matrix *dense_matrix_trans(const Dense_Matrix *self); +matrix *dense_matrix_trans(const dense_matrix *self); void A_transpose(double *AT, const double *A, int m, int n); diff --git a/include/utils/linalg_dense_sparse_matmuls.h b/include/utils/linalg_dense_sparse_matmuls.h index 949b2da..24d8f8c 100644 --- a/include/utils/linalg_dense_sparse_matmuls.h +++ b/include/utils/linalg_dense_sparse_matmuls.h @@ -18,26 +18,26 @@ #ifndef LINALG_DENSE_SPARSE_H #define LINALG_DENSE_SPARSE_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" #include "matrix.h" -/* C = (I_p kron A) @ J via the polymorphic Matrix interface. - * A is dense m x n, J is (n*p) x k in CSC, C is (m*p) x k in CSC. */ +/* C = (I_p kron A) @ J via the polymorphic matrix interface. + * A is dense m x n, J is (n*p) x k in CSC_matrix, C is (m*p) x k in CSC_matrix. */ // TODO: maybe we can replace these with I_kron_X functionality? 
-CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p); -void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C); +CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p); +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C); /* Sparsity and values of C = (Y^T kron I_m) @ J where Y is k x n, J is (m*k) x p, and C is (m*n) x p. Y is given in column-major dense format. */ -CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J); -void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matrix *J, - CSR_Matrix *C); +CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J); +void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_matrix *J, + CSR_matrix *C); /* Sparsity and values of C = (I_n kron X) @ J where X is m x k (col-major dense), J is (k*n) x p, and C is (m*n) x p. */ -CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J); -void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_Matrix *J, - CSR_Matrix *C); +CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J); +void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_matrix *J, + CSR_matrix *C); #endif /* LINALG_DENSE_SPARSE_H */ diff --git a/include/utils/linalg_sparse_matmuls.h b/include/utils/linalg_sparse_matmuls.h index a833078..516ff18 100644 --- a/include/utils/linalg_sparse_matmuls.h +++ b/include/utils/linalg_sparse_matmuls.h @@ -18,41 +18,41 @@ #ifndef LINALG_H #define LINALG_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" /* Compute sparsity pattern and values for the matrix-matrix multiplication C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k, without relying on generic sparse matrix-matrix multiplication. Specialized logic for this is much faster (50-100x) than generic sparse matmul. 
- * J is provided in CSC format and is split into p blocks of n rows each - * C is returned in CSC format + * J is provided in CSC_matrix format and is split into p blocks of n rows each + * C is returned in CSC_matrix format * Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp], where J = [J1; J2; ...; Jp] */ -CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, - const CSC_Matrix *J, int p); +CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, + const CSC_matrix *J, int p); -void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, - CSC_Matrix *C); +void block_left_multiply_fill_values(const CSR_matrix *A, const CSC_matrix *J, + CSC_matrix *C); /* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector. The output y is m*p-length vector corresponding to y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n elements. */ -void block_left_multiply_vec(const CSR_Matrix *A, const double *x, double *y, int p); +void block_left_multiply_vec(const CSR_matrix *A, const double *x, double *y, int p); -/* Fill values of C = A @ B where A is CSR, B is CSC. +/* Fill values of C = A @ B where A is CSR_matrix, B is CSC_matrix. * C must have sparsity pattern already computed. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C); +void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B, + CSR_matrix *C); -/* C = A @ B where A is CSR, B is CSC. Result C is CSR. +/* C = A @ B where A is CSR_matrix, B is CSC_matrix. Result C is CSR_matrix. * Allocates and precomputes sparsity pattern. No workspace required. 
*/ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B); +CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B); #endif /* LINALG_H */ diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 96a8827..a20980a 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -18,12 +18,12 @@ #ifndef MATRIX_H #define MATRIX_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" /* Forward declaration; full definition in permuted_dense.h. Used by the as_permuted_dense vtable getter. */ -struct Permuted_Dense; +struct permuted_dense; /* Broadcast shape used by the broadcast atom and its vtable methods. */ typedef enum @@ -35,7 +35,7 @@ typedef enum /* We implement three different types of matrices. - 1. 'sparse_matrix' represents a generic CSR matrix. + 1. 'sparse_matrix' represents a generic CSR_matrix matrix. 2. 'permuted_dense' represents a matrix that only consists of a dense block (potentially after permuting columns). 3. 'blkdiag_dense' represents a block diagonal matrix with a constant dense @@ -55,129 +55,97 @@ typedef enum * to_csr_values * - 1. sparse_matrix: generic CSR matrix. + 1. sparse_matrix: generic CSR_matrix matrix. 2. permuted_dense: */ -/* Base matrix type with function pointers for polymorphic dispatch. There are - two types of matrices: 'sparse_matrix' and 'permuted_dense'. Each type - implements the same set of operations, but with different algorithms. - The following operations are implemented: TODO +/* Base matrix type with function pointers for polymorphic dispatch. There are two + types of matrices: 'sparse_matrix' and 'permuted_dense'. Each type implements the + same set of operations, but with different algorithms. The following operations + are implemented: TODO */ -typedef struct Matrix +typedef struct matrix { - /* Dimensions and number of explicitly stored entries. 
For Sparse_Matrix - nnz is the CSR nnz; for Dense_Matrix it is m * n; for Permuted_Dense it - is dense_m * dense_n (the size of the stored dense block). */ - int m, n, nnz; - - /* Non-owning pointer to the value buffer. Sparse_Matrix: csr->x. - Permuted_Dense: pd->X (also aliased as pd->csr_cache->x). Dense_Matrix: - dm->x. Sparse and Permuted_Dense share row-major layout for equal - sparsity patterns, so memcpy via M->x is valid between same-shape - Sparse/PD pairs. */ - double *x; + int m, n, nnz; /* shape and nnz*/ + double *x; /* non-owning pointer to the value buffer */ /* Operators for the left-multiply matrix in left_matmul. */ - void (*block_left_mult_vec)(const struct Matrix *self, const double *x, + void (*block_left_mult_vec)(const struct matrix *self, const double *x, double *y, int p); - CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self, - const CSC_Matrix *J, int p); - void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J, - CSC_Matrix *C); + CSC_matrix *(*block_left_mult_sparsity)(const struct matrix *self, + const CSC_matrix *J, int p); + void (*block_left_mult_values)(const struct matrix *self, const CSC_matrix *J, + CSC_matrix *C); /* Chain-rule operations used by transformer atoms (elementwise, etc.). All chain-rule outputs are the same concrete type as self (uniform polymorphism). copy_sparsity returns a matrix of same shape and type as self; DA_fill_values writes diag(d) * self into out; ATA_alloc allocates a matrix with sparsity of self^T * self; ATDA_fill_values fills out with - self^T * diag(d) * self; to_csr returns a CSR view of self (constant-time - for Sparse_Matrix, lazily built/refreshed for other types). 
*/ - struct Matrix *(*copy_sparsity)(const struct Matrix *self); - void (*DA_fill_values)(const double *d, const struct Matrix *self, - struct Matrix *out); - struct Matrix *(*ATA_alloc)(struct Matrix *self); - void (*ATDA_fill_values)(const struct Matrix *self, const double *d, - struct Matrix *out); - CSR_Matrix *(*to_csr)(struct Matrix *self); - - /* Returns self downcast to Permuted_Dense if self is PD-backed, NULL + self^T * diag(d) * self; to_csr returns a CSR_matrix view of self (constant-time + for sparse_matrix, lazily built/refreshed for other types). */ + struct matrix *(*copy_sparsity)(const struct matrix *self); + void (*DA_fill_values)(const double *d, const struct matrix *self, + struct matrix *out); + struct matrix *(*ATA_alloc)(struct matrix *self); + void (*ATDA_fill_values)(const struct matrix *self, const double *d, + struct matrix *out); + CSR_matrix *(*to_csr)(struct matrix *self); + + /* Returns self downcast to permuted_dense if self is PD-backed, NULL otherwise. Used by bivariate dispatchers to route to type-specialized kernels. */ - struct Permuted_Dense *(*as_permuted_dense)(struct Matrix *self); + struct permuted_dense *(*as_permuted_dense)(struct matrix *self); - /* Row-selection / indexing: returns a new Matrix that selects rows + /* Row-selection / indexing: returns a new matrix that selects rows indices[0..n_idxs) of self. Output shape is (n_idxs, self->n). The returned type matches self's concrete type. index_alloc sets up sparsity (values uninitialized); index_fill_values fills values into out, which must have been produced by a prior index_alloc with the same indices/n_idxs. 
*/ - struct Matrix *(*index_alloc)(struct Matrix *self, const int *indices, + struct matrix *(*index_alloc)(struct matrix *self, const int *indices, int n_idxs); - void (*index_fill_values)(struct Matrix *self, const int *indices, int n_idxs, - struct Matrix *out); + void (*index_fill_values)(struct matrix *self, const int *indices, int n_idxs, + struct matrix *out); /* Row-tiling for the promote atom: self must be a 1-row matrix; returns - a new Matrix of shape (size, self->n) where every row is a copy of + a new matrix of shape (size, self->n) where every row is a copy of self's single row. Output type matches self's concrete type. promote_alloc sets sparsity; promote_fill_values fills values. */ - struct Matrix *(*promote_alloc)(struct Matrix *self, int size); - void (*promote_fill_values)(struct Matrix *self, struct Matrix *out); + struct matrix *(*promote_alloc)(struct matrix *self, int size); + void (*promote_fill_values)(struct matrix *self, struct matrix *out); /* Broadcast: lift the child Jacobian of a broadcast atom into the output Jacobian. `type` is the broadcast variant; (d1, d2) is the output shape. Output type matches self's concrete type. broadcast_alloc sets sparsity; broadcast_fill_values fills values into out. */ - struct Matrix *(*broadcast_alloc)(struct Matrix *self, broadcast_type type, + struct matrix *(*broadcast_alloc)(struct matrix *self, broadcast_type type, int d1, int d2); - void (*broadcast_fill_values)(struct Matrix *self, broadcast_type type, int d1, - int d2, struct Matrix *out); + void (*broadcast_fill_values)(struct matrix *self, broadcast_type type, int d1, + int d2, struct matrix *out); /* diag_vec: child is an (n, self->n) Jacobian for a length-n vector; output is (n*n, self->n) where child row i lands at output row i*(n+1) (column-major diagonal positions). Other output rows are structurally zero. Output type matches self's concrete type. 
*/ - struct Matrix *(*diag_vec_alloc)(struct Matrix *self); - void (*diag_vec_fill_values)(struct Matrix *self, struct Matrix *out); + struct matrix *(*diag_vec_alloc)(struct matrix *self); + void (*diag_vec_fill_values)(struct matrix *self, struct matrix *out); - /* Refresh any internal caches (e.g. a CSC mirror) so subsequent ATA / ATDA + /* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / ATDA calls reflect the current values. Atoms whose child Jacobian is affine can skip this on iterations after the first; non-affine children must call it before every chain-rule call. No-op for types that don't have a cache (e.g. permuted_dense). */ - void (*refresh_csc_values)(struct Matrix *self); + void (*refresh_csc_values)(struct matrix *self); /* Lifecycle. */ - void (*free_fn)(struct Matrix *self); -} Matrix; - -/* Sparse matrix wrapping CSR. csc_cache is a lazily-built CSC mirror used by - the chain-rule ATA / ATDA paths; it's allocated on first need and refilled - by refresh_csc_values. csc_iwork is the workspace for csr_to_csc. */ -typedef struct Sparse_Matrix -{ - Matrix base; - CSR_Matrix *csr; - CSC_Matrix *csc_cache; - int *csc_iwork; -} Sparse_Matrix; - -/* Constructor. Takes ownership of A; the caller must not free A separately - (free_matrix on the returned Matrix frees A). */ -Matrix *new_sparse_matrix(CSR_Matrix *A); - -/* Convenience: allocate a Sparse_Matrix of shape (m, n) with capacity for - nnz entries. Equivalent to new_sparse_matrix(new_csr_matrix(m, n, nnz)). - Sparsity pattern and values are uninitialized. 
*/ -Matrix *new_sparse_matrix_alloc(int m, int n, int nnz); - -/* Transpose helper */ -Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork); + void (*free_fn)(struct matrix *self); +} matrix; /* Free helper */ -static inline void free_matrix(Matrix *m) +static inline void free_matrix(matrix *m) { if (m) { diff --git a/include/utils/matrix_BTA.h b/include/utils/matrix_BTA.h index 8d06069..bbf68ab 100644 --- a/include/utils/matrix_BTA.h +++ b/include/utils/matrix_BTA.h @@ -20,13 +20,13 @@ as_permuted_dense() on both operands. */ /* Allocate sparsity for C = B^T A. */ -Matrix *BTA_matrices_alloc(Matrix *A, Matrix *B); +matrix *BTA_matrices_alloc(matrix *A, matrix *B); /* Fill out->x = B^T diag(d) A (d may be NULL for plain B^T A). out must have the structure produced by BTA_matrices_alloc(A, B). For the (Sparse, Sparse) path, the caller must ensure both operands' csc_caches are fresh (via refresh_csc_values) before calling; the dispatcher does not refresh. */ -void BTDA_matrices_fill_values(Matrix *A, const double *d, Matrix *B, Matrix *C); +void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C); #endif /* MATRIX_BTA_H */ diff --git a/include/utils/matrix_sum.h b/include/utils/matrix_sum.h index 4f8c8d4..31e5a56 100644 --- a/include/utils/matrix_sum.h +++ b/include/utils/matrix_sum.h @@ -20,22 +20,22 @@ #include "matrix.h" -/* Polymorphic wrappers over CSR_sum. A, B, and C must all be Sparse_Matrix- +/* Polymorphic wrappers over CSR_sum. A, B, and C must all be sparse_matrix- backed for now; the union sparsity of A+B is general sparse, so a - Permuted_Dense output is not supported. + permuted_dense output is not supported. sum_matrices_alloc fills C's sparsity pattern and re-syncs C's base.nnz - from the underlying CSR (sum_csr_alloc may shrink nnz below the + from the underlying CSR_matrix (sum_csr_alloc may shrink nnz below the over-allocated max). 
*/ -void sum_matrices_alloc(Matrix *A, Matrix *B, Matrix *C); +void sum_matrices_alloc(matrix *A, matrix *B, matrix *C); /* Fills C's values; assumes C already has the union sparsity pattern of A and B (typically produced by sum_matrices_alloc). */ -void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C); +void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C); /* Fills C's values for C = diag(d1) * A + diag(d2) * B; assumes C already has the union sparsity pattern of A and B. */ -void sum_scaled_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C, +void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, const double *d1, const double *d2); #endif /* MATRIX_SUM_H */ diff --git a/include/utils/mini_numpy.h b/include/utils/mini_numpy.h index c87f80a..a17ca9e 100644 --- a/include/utils/mini_numpy.h +++ b/include/utils/mini_numpy.h @@ -18,7 +18,7 @@ #ifndef MINI_NUMPY_H #define MINI_NUMPY_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* Example: a = [1, 2], len = 2, repeats = 3, result = [1, 1, 1, 2, 2, 2] */ void repeat(double *result, const double *a, int len, int repeats); @@ -46,10 +46,10 @@ void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double /* Fill T_csr's row pointers and column indices for the 1D full-convolution Toeplitz matrix T(a), sized (m+n-1) x n with m*n nonzeros. Values (x) are not written; call conv_matrix_fill_values to populate them. */ -void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n); +void conv_matrix_fill_sparsity(CSR_matrix *T_csr, int m, int n); /* Overwrite T_csr->x from kernel a, using the sparsity already written by conv_matrix_fill_sparsity. T[r, col] = a[r - col]. 
*/ -void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a); +void conv_matrix_fill_values(CSR_matrix *T_csr, const double *a); #endif /* MINI_NUMPY_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 2d8f9dd..0157da0 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -21,130 +21,133 @@ #include "matrix.h" #include -/* Permuted_Dense represents a matrix whose only nonzeros lie in a dense +/* permuted_dense represents a matrix whose only nonzeros lie in a dense block, after rows and columns are restricted to chosen subsets of the - global index space. For local indices (ii, jj) with 0 <= ii < dense_m - and 0 <= jj < dense_n, + global index space. For local indices (ii, jj) with 0 <= ii < m0 + and 0 <= jj < n0, M[row_perm[ii], col_perm[jj]] = X[ii, jj]. All other entries of M are zero. row_perm and col_perm are stored in strictly increasing order; the constructor asserts this. */ -typedef struct Permuted_Dense +typedef struct permuted_dense { - Matrix base; /* base.m, base.n = global ambient dimensions */ - int dense_m; /* rows of dense block (= len(row_perm)) */ - int dense_n; /* cols of dense block (= len(col_perm)) */ + matrix base; /* base.m, base.n = global ambient dimensions */ + int m0; /* rows of dense block (= len(row_perm)) */ + int n0; /* cols of dense block (= len(col_perm)) */ int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ int *col_perm; /* col_perm[jj] in [0, base.n), sorted */ - double *X; /* dense_m * dense_n, row-major */ - double *Y_scratch; /* dense_m * dense_n, used by ATDA */ + double *X; /* m0 * n0, row-major */ int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ - /* otherwise -1; used by `x CSC` allocation. */ + /* otherwise -1; used by `x CSC_matrix` allocation. */ int *row_inv; /* length base.m: row_inv[row_perm[ii]] = ii, */ /* otherwise -1; used by index_alloc. 
*/ - CSR_Matrix *csr_cache; /* lazy CSR view built by to_csr; structure */ + CSR_matrix *csr_cache; /* lazy CSR_matrix view built by to_csr; structure */ /* allocated on first call, values refilled */ /* on every call. NULL until first call. */ - /* Scratch buffer for BTA_csr_pd / BTA_pd_csr fill kernels. Owned by the - output PD, allocated by the corresponding BTA *_alloc so per-call - BTA / BTDA fill kernels can reuse it across solver iterations - (avoids malloc/free of large dense buffers per Hessian eval). Sized - at alloc time; NULL on PDs not produced by those allocators. */ - double *gather_X_scratch; - size_t gather_X_size; -} Permuted_Dense; + /* Mutable double-precision BLAS scratch shared across kernels that + operate on this PD. Two non-overlapping roles (a given fill call uses + at most one): + - Y-buffer: holds diag(d_perm) X for ATDA / BTDA_pd_pd (size m0*n0). + - gather: holds densified CSR rows for BTA/BTDA_csr_pd / + _pd_csr (size depends on the input PD's dimensions). + Sized at alloc time for the largest role this PD could play. Functions + taking a const permuted_dense * may still mutate `dwork`. */ + double *dwork; + size_t dwork_size; + + /* Mutable int scratch. Currently only used to hold the row-intersection + index arrays idx_A / idx_B in BTA_pd_pd_fill_values and the + slow path of BTDA_pd_pd_fill_values; allocated by + permuted_dense_BTA_alloc for those outputs (NULL on PDs from other + allocators). Fill kernels fall back to a per-call SP_MALLOC if + iwork_size is too small. */ + int *iwork; + size_t iwork_size; + + /* CONTRACT: `dwork` and `iwork` are freely overwritten by every kernel + that takes this PD as input or output — contents do NOT survive + across calls. Do not use them to cache precomputed factors or carry + state between kernel invocations: any subsequent call (ATDA, BTDA, + BTA gather, …) may clobber them without warning. If you need + persistence, add a dedicated field. */ +} permuted_dense; /* Constructor. 
row_perm and col_perm must be strictly increasing in their respective ranges. If X_data is NULL the value buffer is allocated but - left uninitialized; otherwise dense_m * dense_n entries are copied. */ -Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, - const int *row_perm, const int *col_perm, - const double *X_data); + left uninitialized; otherwise m0 * n0 entries are copied. */ +matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, + const int *col_perm, const double *X_data); -/* CSR view: callers should use the vtable, i.e. base.to_csr(base). The PD - owns and caches the returned CSR_Matrix; its value array aliases self->X, +/* CSR_matrix view: callers should use the vtable, i.e. base.to_csr(base). The PD + owns and caches the returned CSR_matrix; its value array aliases self->X, so values are always live with no separate fill needed. Callers must not - free the returned CSR — it's released by free_matrix on the PD. */ - -/* Fill out = diag(d) * self, where d has length self->base.m. out must have - the same structure as self (same dimensions and same row_perm/col_perm). */ -void permuted_dense_DA_fill_values(const double *d, const Permuted_Dense *self, - Permuted_Dense *out); - -/* Allocate a new Permuted_Dense for C = self^T @ self. C is square of global - size self->base.n, with dense block self->dense_n x self->dense_n and both - permutations equal to self->col_perm. Values are uninitialized; the caller - is expected to fill them via permuted_dense_ATDA_fill_values. */ -Matrix *permuted_dense_ATA_alloc(const Permuted_Dense *self); - -/* Fill out.X = self.X^T diag(d) self.X, where d has length self->base.m. - out must have the structure produced by permuted_dense_ATA_alloc(self). - Uses self->Y_scratch as workspace; const-correctness is preserved because - only the buffer pointed to by Y_scratch is mutated. 
*/ -void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d, - Permuted_Dense *out); - -/* Allocate a new Permuted_Dense for C = B^T @ A where A and B are both PD. - Output shape (B->base.n, A->base.n) with dense block (B->dense_n, - A->dense_n), row_perm = B->col_perm, col_perm = A->col_perm. Values - uninitialized. The output structure does not depend on row_perm_A or - row_perm_B (only the values do; see permuted_dense_BTA_fill_values). */ -Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, const Permuted_Dense *B); - -/* Fill out->X = B->X^T @ A->X restricted to rows in row_perm_A ∩ row_perm_B. - out must have the structure produced by permuted_dense_BTA_alloc(A, B). - For matching row_perms, this is a single cblas_dgemm; otherwise the - intersecting rows are first gathered into contiguous scratch buffers. */ -void permuted_dense_BTA_fill_values(const Permuted_Dense *A, const Permuted_Dense *B, - Permuted_Dense *out); - -/* Allocate a new Permuted_Dense for C = B^T @ A where A is Sparse (CSR) + free the returned CSR_matrix — it's released by free_matrix on the PD. */ + +/* Fill values of C = diag(d) @ A where len(d) = number of (global) rows of A */ +void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, + permuted_dense *C); + +/* Fill sparsity of C = AT @ A */ +matrix *permuted_dense_ATA_alloc(const permuted_dense *A); + +/* Fill values of C = AT @ diag(d) @ A */ +void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, + permuted_dense *C); + +/* Fill sparsity of C = BT @ A where A and B are both permuted_dense. + (If B and A have no overlapping rows, then C is empty) */ +matrix *permuted_dense_BTA_alloc(const permuted_dense *A, const permuted_dense *B); + +/* Fill values of C = BT @ A where A and B are both permuted dense. 
*/ +void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, + permuted_dense *C); + +/* Allocate a new permuted_dense for C = B^T @ A where A is Sparse (CSR_matrix) and B is PD. Output is PD with row_perm = B->col_perm and col_perm = the sorted union of columns appearing in A's rows at positions row_perm_B. - Dense block size = (B->dense_n, |col_active|). Values uninitialized. */ -Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B); + Dense block size = (B->n0, |col_active|). Values uninitialized. */ +matrix *BTA_csr_pd_alloc(const CSR_matrix *A_csr, const permuted_dense *B); -/* Fill out->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at - positions row_perm_B, columns restricted to out's col_perm, scattered - to a dense buffer. out must have the structure produced by +/* Fill C->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at + positions row_perm_B, columns restricted to C's col_perm, scattered + to a dense buffer. C must have the structure produced by BTA_csr_pd_alloc(A_csr, B). */ -void BTA_csr_pd_fill_values(const CSR_Matrix *A_csr, const Permuted_Dense *B, - Permuted_Dense *out); +void BTA_csr_pd_fill_values(const CSR_matrix *A_csr, const permuted_dense *B, + permuted_dense *C); -/* Allocate a new Permuted_Dense for C = B^T @ A where A is PD and B is - Sparse (CSR). Output is PD with row_perm = the sorted union of columns +/* Allocate a new permuted_dense for C = B^T @ A where A is PD and B is + Sparse (CSR_matrix). Output is PD with row_perm = the sorted union of columns appearing in B's rows at positions row_perm_A, and col_perm = col_perm_A. - Dense block size = (|row_active|, dense_n_A). Values uninitialized. */ -Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr); + Dense block size = (|row_active|, n0_A). Values uninitialized. 
*/ +matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr); -/* Fill out->X = B_sub^T @ X_A, where B_sub is B's rows at positions - row_perm_A, columns restricted to out's row_perm, scattered to a dense - buffer. out must have the structure produced by BTA_pd_csr_alloc(A, B). */ -void BTA_pd_csr_fill_values(const Permuted_Dense *A, const CSR_Matrix *B_csr, - Permuted_Dense *out); +/* Fill C->X = B_sub^T @ X_A, where B_sub is B's rows at positions + row_perm_A, columns restricted to C's row_perm, scattered to a dense + buffer. C must have the structure produced by BTA_pd_csr_alloc(A, B). */ +void BTA_pd_csr_fill_values(const permuted_dense *A, const CSR_matrix *B_csr, + permuted_dense *C); /* BTDA variants — fold a diagonal d into the BTA computation. Each fills - out->X = B^T diag(d) A (d may be NULL for plain B^T A). out must have - the structure produced by the corresponding BTA *_alloc function. */ -void BTDA_csr_pd_fill_values(const CSR_Matrix *A_csr, const double *d, - const Permuted_Dense *B, Permuted_Dense *out); -void BTDA_pd_csr_fill_values(const Permuted_Dense *A, const double *d, - const CSR_Matrix *B_csr, Permuted_Dense *out); -void BTDA_pd_pd_fill_values(const Permuted_Dense *A, const double *d, - const Permuted_Dense *B, Permuted_Dense *out); - -/* Allocate a new Permuted_Dense for C = self @ J. C has global shape - (self->base.m, J->n) with row_perm = self->row_perm and col_perm equal + C->X = B^T diag(d) A (d may be NULL for plain B^T A). C must have the + structure produced by the corresponding BTA *_alloc function. */ +void BTDA_csr_pd_fill_values(const CSR_matrix *A_csr, const double *d, + const permuted_dense *B, permuted_dense *C); +void BTDA_pd_csr_fill_values(const permuted_dense *A, const double *d, + const CSR_matrix *B_csr, permuted_dense *C); +void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, + const permuted_dense *B, permuted_dense *C); + +/* Allocate a new permuted_dense for C = A @ J. 
C has global shape + (A->base.m, J->n) with row_perm = A->row_perm and col_perm equal to the sorted list of columns of J that have at least one structural - nonzero in some row in self->col_perm. Values are uninitialized. */ -Matrix *permuted_dense_times_csc_alloc(const Permuted_Dense *self, - const CSC_Matrix *J); - -/* Fill out.X[ii, jj] = sum_kk self.X[ii, kk] * J[col_perm_self[kk], - col_perm_out[jj]]. out must have the structure produced by - permuted_dense_times_csc_alloc(self, J). */ -void permuted_dense_times_csc_fill_values(const Permuted_Dense *self, - const CSC_Matrix *J, Permuted_Dense *out); + nonzero in some row in A->col_perm. Values are uninitialized. */ +matrix *permuted_dense_times_csc_alloc(const permuted_dense *A, const CSC_matrix *J); + +/* Fill C.X[ii, jj] = sum_kk A.X[ii, kk] * J[col_perm_A[kk], + col_perm_C[jj]]. C must have the structure produced by + permuted_dense_times_csc_alloc(A, J). */ +void permuted_dense_times_csc_fill_values(const permuted_dense *A, + const CSC_matrix *J, permuted_dense *C); #endif /* PERMUTED_DENSE_H */ diff --git a/include/utils/sparse_matrix.h b/include/utils/sparse_matrix.h new file mode 100644 index 0000000..7ee97aa --- /dev/null +++ b/include/utils/sparse_matrix.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SPARSE_MATRIX_H +#define SPARSE_MATRIX_H + +#include "matrix.h" + +/* Sparse matrix wrapping CSR_matrix. csc_cache is a lazily-built CSC_matrix mirror used by + the chain-rule ATA / ATDA paths; it's allocated on first need and refilled + by refresh_csc_values. csc_iwork is the workspace for csr_to_csc. */ +typedef struct sparse_matrix +{ + matrix base; + CSR_matrix *csr; + CSC_matrix *csc_cache; + int *csc_iwork; +} sparse_matrix; + +/* Constructor. Takes ownership of A; the caller must not free A separately + (free_matrix on the returned matrix frees A). */ +matrix *new_sparse_matrix(CSR_matrix *A); + +/* Convenience: allocate a sparse_matrix of shape (m, n) with capacity for + nnz entries. Equivalent to new_sparse_matrix(new_csr_matrix(m, n, nnz)). + Sparsity pattern and values are uninitialized. */ +matrix *new_sparse_matrix_alloc(int m, int n, int nnz); + +/* Transpose helper */ +matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork); + +#endif /* SPARSE_MATRIX_H */ diff --git a/include/utils/utils.h b/include/utils/utils.h index 8346858..1b9a6bd 100644 --- a/include/utils/utils.h +++ b/include/utils/utils.h @@ -18,9 +18,24 @@ #ifndef UTILS_H #define UTILS_H +#include <stdbool.h> + +#ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif /* Sort an array of integers in ascending order */ void sort_int_array(int *array, int size); +/* Return true iff sorted index arrays a_idx and b_idx (lengths a_len, b_len) + share any value, where b_idx entries are shifted by b_offset before + comparison (a_idx[ai] == b_idx[bi] - b_offset). Use b_offset = 0 for a + plain intersection check. O(a_len + b_len) sorted-merge scan with early + termination on first match. 
*/ +bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len, + int b_offset); + #endif // UTILS_H diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index d49c879..288bfde 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -17,6 +17,7 @@ */ #include "atoms/affine.h" #include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 1496f95..27ffa54 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -17,9 +17,10 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include diff --git a/src/atoms/affine/diag_mat.c b/src/atoms/affine/diag_mat.c index e168d02..0d5a7c1 100644 --- a/src/atoms/affine/diag_mat.c +++ b/src/atoms/affine/diag_mat.c @@ -28,7 +28,7 @@ expr *new_diag_mat(expr *child) assert(child->d1 == child->d2); int n = child->d1; - int *indices = (int *) malloc((size_t) n * sizeof(int)); + int *indices = (int *) malloc(n * sizeof(int)); for (int i = 0; i < n; i++) { indices[i] = i * (n + 1); diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index c4e589b..419b23f 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -17,6 +17,7 @@ */ #include "atoms/affine.h" #include "utils/CSR_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -56,7 +57,7 @@ static void jacobian_init_impl(expr *node) nnz += hnode->args[i]->jacobian->nnz; } - CSR_Matrix *A = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *A = new_csr_matrix(node->size, node->n_vars, nnz); /* precompute sparsity pattern of this node's jacobian */ int row_offset = 0; @@ -65,7 +66,7 @@ static void 
jacobian_init_impl(expr *node) for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; - CSR_Matrix *B = child->jacobian->to_csr(child->jacobian); + CSR_matrix *B = child->jacobian->to_csr(child->jacobian); /* copy columns */ memcpy(A->i + A->nnz, B->i, B->nnz * sizeof(int)); @@ -111,14 +112,14 @@ static void wsum_hess_init_impl(expr *node) /* worst-case scenario the nnz of node->wsum_hess is the sum of children's nnz */ - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); hnode->CSR_work = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ H->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { - Matrix *child_hess = hnode->args[i]->wsum_hess; + matrix *child_hess = hnode->args[i]->wsum_hess; copy_csr_matrix(H, hnode->CSR_work); sum_csr_alloc(hnode->CSR_work, child_hess->to_csr(child_hess), H); } @@ -128,7 +129,7 @@ static void wsum_hess_init_impl(expr *node) static void wsum_hess_eval(expr *node, const double *w) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); int row_offset = 0; memset(H->x, 0, H->nnz * sizeof(double)); diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 44e462c..e799a42 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -19,6 +19,7 @@ #include "subexpr.h" #include "utils/dense_matrix.h" #include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include #include #include @@ -79,7 +80,7 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A_kron @ vec(f(x)) */ - Matrix *A = lnode->A; + matrix *A = lnode->A; int n_blocks = lnode->n_blocks; A->block_left_mult_vec(A, x->value, node->value, n_blocks); } @@ -117,9 +118,9 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian */ 
jacobian_init(x); - /* Fast path: A is a constant Dense_Matrix, child is a leaf variable, and + /* Fast path: A is a constant dense_matrix, child is a leaf variable, and there are no Kronecker blocks. The Jacobian is A placed at the variable's - column slot — a full-dense Permuted_Dense. Skip the CSC mirror entirely. */ + column slot — a full-dense permuted_dense. Skip the CSC_matrix mirror entirely. */ if (lnode->produce_pd_jacobian) { int m_loc = lnode->A->m; @@ -127,7 +128,7 @@ static void jacobian_init_impl(expr *node) int *col_perm = (int *) SP_MALLOC(lnode->A->n * sizeof(int)); for (int i = 0; i < m_loc; i++) row_perm[i] = i; for (int j = 0; j < lnode->A->n; j++) col_perm[j] = x->var_id + j; - Dense_Matrix *dm = (Dense_Matrix *) lnode->A; + dense_matrix *dm = (dense_matrix *) lnode->A; node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, lnode->A->n, row_perm, col_perm, dm->x); free(row_perm); @@ -135,11 +136,11 @@ static void jacobian_init_impl(expr *node) return; } - /* General path via CSC mirror. */ + /* General path via CSC_matrix mirror. */ lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); - /* precompute sparsity of this node's jacobian in CSC and CSR */ + /* precompute sparsity of this node's jacobian in CSC_matrix and CSR_matrix */ lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC, lnode->n_blocks); node->jacobian = @@ -154,10 +155,10 @@ static void eval_jacobian(expr *node) /* Fast path: PD Jacobian backed by constant A. Values never change. 
*/ if (lnode->produce_pd_jacobian) return; - CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC; - CSC_Matrix *J_CSC = lnode->J_CSC; + CSC_matrix *Jchild_CSC = lnode->Jchild_CSC; + CSC_matrix *J_CSC = lnode->J_CSC; - /* evaluate child's jacobian and convert to CSC */ + /* evaluate child's jacobian and convert to CSC_matrix */ x->eval_jacobian(x); csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, node->work->iwork); @@ -186,7 +187,7 @@ static void eval_wsum_hess(expr *node, const double *w) left_matmul_expr *lnode = (left_matmul_expr *) node; /* compute A^T w*/ - Matrix *AT = lnode->AT; + matrix *AT = lnode->AT; int n_blocks = lnode->n_blocks; AT->block_left_mult_vec(AT, w, node->work->dwork, n_blocks); @@ -197,8 +198,8 @@ static void eval_wsum_hess(expr *node, const double *w) static void refresh_dense_left(left_matmul_expr *lnode) { - Dense_Matrix *dm_A = (Dense_Matrix *) lnode->A; - Dense_Matrix *dm_AT = (Dense_Matrix *) lnode->AT; + dense_matrix *dm_A = (dense_matrix *) lnode->A; + dense_matrix *dm_AT = (dense_matrix *) lnode->AT; int m = dm_A->base.m; int n = dm_A->base.n; @@ -209,7 +210,7 @@ static void refresh_dense_left(left_matmul_expr *lnode) A_transpose(dm_A->x, dm_AT->x, n, m); } -expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) +expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) { /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do A @ u where u is (n, ) which in C is actually (1, n). In that case @@ -245,7 +246,7 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) /* allocate workspace. iwork is used for converting J_child csr to csc (requiring size node->n_vars) and for transposing A (requiring size A->n). 
- csc_to_csr_work is used for converting J_CSC to CSR (requiring + csc_to_csr_work is used for converting J_CSC to CSR_matrix (requiring node->size) */ node->work->iwork = (int *) SP_MALLOC(MAX(A->n, node->n_vars) * sizeof(int)); lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int)); @@ -254,7 +255,7 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) /* store A and AT. new_sparse_matrix takes ownership, so clone first. */ lnode->A = new_sparse_matrix(new_csr(A)); lnode->AT = - sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->work->iwork); + sparse_matrix_trans((const sparse_matrix *) lnode->A, node->work->iwork); /* parameter support */ lnode->param_source = param_node; @@ -333,11 +334,11 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, } lnode->A = new_dense_matrix(m, n, data); - lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A); + lnode->AT = dense_matrix_trans((const dense_matrix *) lnode->A); /* If the child is a leaf variable and there are no blocks, the Jacobian is exactly A placed in the variable's column slot — a full-dense - Permuted_Dense. Enable the fast path. */ + permuted_dense. Enable the fast path. 
*/ if (u->var_id != NOT_A_VARIABLE && n_blocks == 1) { lnode->produce_pd_jacobian = true; diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index 7d9c2d5..0b9ffbe 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -17,6 +17,7 @@ */ #include "atoms/affine.h" #include "subexpr.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c index c3344f2..a892235 100644 --- a/src/atoms/affine/right_matmul.c +++ b/src/atoms/affine/right_matmul.c @@ -17,7 +17,7 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/dense_matrix.h" #include "utils/tracked_alloc.h" #include @@ -29,12 +29,12 @@ transpose: f(x) @ A = (A^T @ f(x)^T)^T. For the parameter case: - - param_source stores A values in CSR data order + - param_source stores A values in CSR data order - inner left_matmul stores AT as its A-matrix and A as its AT-matrix - on refresh: update AT (inner's AT, the original A) from param_source, then recompute A^T (inner's A) from the updated A. */ -/* Refresh for sparse right_matmul: param stores A in CSR data order. +/* Refresh for sparse right_matmul: param stores A in CSR data order. Inner left_matmul: lnode->A = AT (transposed), lnode->AT = A (original). So: update lnode->AT from param values, then recompute lnode->A. 
*/ static void refresh_dense_right(left_matmul_expr *lnode) @@ -44,8 +44,8 @@ static void refresh_dense_right(left_matmul_expr *lnode) Furthermore, lnode->param_source->value corresponds to the column-major version of A, which is BT (an m x n matrix) */ - Dense_Matrix *B = (Dense_Matrix *) lnode->AT; - Dense_Matrix *BT = (Dense_Matrix *) lnode->A; + dense_matrix *B = (dense_matrix *) lnode->AT; + dense_matrix *BT = (dense_matrix *) lnode->A; int m = B->base.n; int n = B->base.m; @@ -53,12 +53,12 @@ static void refresh_dense_right(left_matmul_expr *lnode) A_transpose(B->x, BT->x, m, n); } -expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A) +expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A) { /* We can express right matmul using left matmul and transpose: u @ A = (A^T @ u^T)^T. */ int *work_transpose = (int *) SP_MALLOC(A->n * sizeof(int)); - CSR_Matrix *AT = transpose(A, work_transpose); + CSR_matrix *AT = transpose(A, work_transpose); expr *u_transpose = new_transpose(u); expr *left_matmul = new_left_matmul(NULL, u_transpose, AT); diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 6fdcb21..ce3343f 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -19,6 +19,7 @@ #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -86,10 +87,10 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian */ jacobian_init(x); - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); + CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); /* we never have to store more than the child's nnz */ - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, Jx->nnz); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, Jx->nnz); node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int)); snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int)); diff --git 
a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index 7c1900a..c1ebdb6 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -18,6 +18,7 @@ #include "atoms/affine.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -55,7 +56,7 @@ static void jacobian_init_impl(expr *node) // --------------------------------------------------------------- // count total nnz and allocate matrix with sufficient space // --------------------------------------------------------------- - const CSR_Matrix *A = x->jacobian->to_csr(x->jacobian); + const CSR_matrix *A = x->jacobian->to_csr(x->jacobian); int total_nnz = 0; int row_spacing = x->d1 + 1; @@ -64,7 +65,7 @@ static void jacobian_init_impl(expr *node) total_nnz += A->p[row + 1] - A->p[row]; } - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, total_nnz); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, total_nnz); // --------------------------------------------------------------- // fill sparsity pattern and idx_map diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 5040c33..999af08 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/affine.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -45,8 +46,8 @@ static void jacobian_init_impl(expr *node) { expr *child = node->left; jacobian_init(child); - CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jc->nnz); + CSR_matrix *Jc = child->jacobian->to_csr(child->jacobian); + CSR_matrix *J = new_csr_matrix(node->size, node->n_vars, Jc->nnz); /* fill sparsity */ int d1 = node->d1; @@ -72,7 +73,7 @@ static void eval_jacobian(expr *node) { expr *child = node->left; child->eval_jacobian(child); - CSR_Matrix *Jc = child->jacobian->to_csr(child->jacobian); + CSR_matrix *Jc = child->jacobian->to_csr(child->jacobian); int d1 = node->d1; int d2 = node->d2; diff --git a/src/atoms/affine/upper_tri.c b/src/atoms/affine/upper_tri.c index 51c339b..2da4d36 100644 --- a/src/atoms/affine/upper_tri.c +++ b/src/atoms/affine/upper_tri.c @@ -40,7 +40,7 @@ expr *new_upper_tri(expr *child) int *indices = NULL; if (n_elems > 0) { - indices = (int *) malloc((size_t) n_elems * sizeof(int)); + indices = (int *) malloc(n_elems * sizeof(int)); int k = 0; for (int i = 0; i < n; i++) { diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 0af562e..1f6887d 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/affine.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -27,7 +28,7 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 53ca955..746ba23 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -17,12 +17,13 @@ */ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" #include "utils/linalg_dense_sparse_matmuls.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -41,10 +42,10 @@ // column positions (offset by j in the Y-variable indexing). 
// ------------------------------------------------------------------------------ -static CSR_Matrix *build_cross_hessian_sparsity(int m, int k, int n) +static CSR_matrix *build_cross_hessian_sparsity(int m, int k, int n) { int total_nnz = m * k * n; - CSR_Matrix *B = new_csr_matrix(m * k, k * n, total_nnz); + CSR_matrix *B = new_csr_matrix(m * k, k * n, total_nnz); int idx = 0; for (int j = 0; j < k; j++) @@ -64,7 +65,7 @@ static CSR_Matrix *build_cross_hessian_sparsity(int m, int k, int n) } static void fill_cross_hessian_values(int m, int k, int n, const double *w, - CSR_Matrix *B) + CSR_matrix *B) { int idx = 0; for (int j = 0; j < k; j++) @@ -139,7 +140,7 @@ static void jacobian_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int nnz = m * n * 2 * k; - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz); int nnz_idx = 0; for (int i = 0; i < node->size; i++) @@ -183,7 +184,7 @@ static void eval_jacobian_no_chain_rule(expr *node) expr *y = node->right; int m = x->d1; int k = x->d2; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); double *Jx = jac->x; for (int i = 0; i < node->size; i++) @@ -236,7 +237,7 @@ static void jacobian_init_chain_rule(expr *node) mnode->term1_CSR = YT_kron_I_alloc(m, k, n, f->work->jacobian_csc); mnode->term2_CSR = I_kron_X_alloc(m, k, n, g->work->jacobian_csc); int max_nnz = mnode->term1_CSR->nnz + mnode->term2_CSR->nnz; - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, max_nnz); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, max_nnz); sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, jac); node->jacobian = new_sparse_matrix(jac); } @@ -277,7 +278,7 @@ static void wsum_hess_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int total_nnz = 2 * m * k * n; - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); + CSR_matrix 
*hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); int nnz = 0; int *Hi = hess->i; int *Hp = hess->p; @@ -420,8 +421,8 @@ static void wsum_hess_init_chain_rule(expr *node) int m = f->d1; int k = f->d2; int n = g->d2; - CSC_Matrix *Jf = f->work->jacobian_csc; - CSC_Matrix *Jg = g->work->jacobian_csc; + CSC_matrix *Jf = f->work->jacobian_csc; + CSC_matrix *Jg = g->work->jacobian_csc; /* initialize C = Jf^T @ B @ Jg = Jf^T @ (B @ Jg) */ mnode->B = build_cross_hessian_sparsity(m, k, n); @@ -441,7 +442,7 @@ static void wsum_hess_init_chain_rule(expr *node) /* sum the four terms and fill idx maps */ int *maps[4]; - CSR_Matrix *hess = + CSR_matrix *hess = sum_4_csr_alloc(mnode->C, mnode->CT, f->wsum_hess->to_csr(f->wsum_hess), g->wsum_hess->to_csr(g->wsum_hess), maps); node->wsum_hess = new_sparse_matrix(hess); @@ -468,10 +469,10 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) int n = g->d2; bool is_f_affine = f->is_affine(f); bool is_g_affine = g->is_affine(g); - CSC_Matrix *Jf = f->work->jacobian_csc; - CSC_Matrix *Jg = g->work->jacobian_csc; + CSC_matrix *Jf = f->work->jacobian_csc; + CSC_matrix *Jg = g->work->jacobian_csc; - /* refresh child Jacobian CSC values (cache if affine) */ + /* refresh child Jacobian CSC_matrix values (cache if affine) */ if (!f->work->jacobian_csc_filled) { csr_to_csc_fill_values(f->jacobian->to_csr(f->jacobian), Jf, f->work->csc_work); @@ -481,7 +482,7 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) } } - /* refresh child Jacobian CSC values (cache if affine) */ + /* refresh child Jacobian CSC_matrix values (cache if affine) */ if (!g->work->jacobian_csc_filled) { csr_to_csc_fill_values(g->jacobian->to_csr(g->jacobian), Jg, g->work->csc_work); diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 575ac46..a2e2b63 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -20,6 +20,7 @@ #include 
"utils/CSR_sum.h" #include "utils/matrix_BTA.h" #include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -82,7 +83,7 @@ static void wsum_hess_init_impl(expr *node) x->var_id != y->var_id) { assert(y->var_id != NOT_A_VARIABLE); - CSR_Matrix *hess = + CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, 2 * node->size); int i, var1_id, var2_id; @@ -149,9 +150,9 @@ static void wsum_hess_init_impl(expr *node) node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } - /* CSC scaffolding is still needed for the (Sparse, Sparse) fast path + /* CSC scaffolding is still needed for the (Sparse, Sparse) fast path through BTA_matrices_* / BTDA_matrices_* — those route through - Sparse_Matrix's csc_cache. For PD operands, refresh_csc_values is + sparse_matrix's csc_cache. For PD operands, refresh_csc_values is a no-op so the call is harmless. */ jacobian_csc_init(x); jacobian_csc_init(y); @@ -162,20 +163,20 @@ static void wsum_hess_init_impl(expr *node) elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - /* compute sparsity of C polymorphically (Sparse, PD-CSR, CSR-PD, PD-PD). */ + /* compute sparsity of C polymorphically (Sparse, PD-CSR, CSR-PD, PD-PD). */ mul_node->cross_C = BTA_matrices_alloc(x->jacobian, y->jacobian); - /* CT structure is always CSR (via AT on C's CSR view). */ - CSR_Matrix *C_csr = mul_node->cross_C->to_csr(mul_node->cross_C); + /* CT structure is always CSR (via AT on C's CSR view). 
*/ + CSR_matrix *C_csr = mul_node->cross_C->to_csr(mul_node->cross_C); node->work->iwork = (int *) SP_MALLOC(C_csr->m * sizeof(int)); - CSR_Matrix *CT = AT_alloc(C_csr, node->work->iwork); + CSR_matrix *CT = AT_alloc(C_csr, node->work->iwork); mul_node->CSR_work2 = CT; /* compute sparsity pattern of H = C + C^T + term2 + term3 (we also fill index maps telling us where to accumulate each element of each matrix in the sum) */ int *maps[4]; - CSR_Matrix *hess = sum_4_csr_alloc(C_csr, CT, + CSR_matrix *hess = sum_4_csr_alloc(C_csr, CT, x->wsum_hess->to_csr(x->wsum_hess), y->wsum_hess->to_csr(y->wsum_hess), maps); node->wsum_hess = new_sparse_matrix(hess); @@ -203,7 +204,7 @@ static void eval_wsum_hess(expr *node, const double *w) bool is_x_affine = x->is_affine(x); bool is_y_affine = y->is_affine(y); // ---------------------------------------------------------------------- - // Refresh each operand's CSC cache as needed for the (Sparse, Sparse) + // Refresh each operand's CSC cache as needed for the (Sparse, Sparse) // dispatch path. For PD operands, refresh_csc_values is a no-op. The // jacobian_csc_filled flag preserves the affine optimization: we only // refresh on the first eval for affine children. 
@@ -229,7 +230,7 @@ static void eval_wsum_hess(expr *node, const double *w) // compute C and CT // --------------------------------------------------------------- elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - CSR_Matrix *CT = mul_node->CSR_work2; + CSR_matrix *CT = mul_node->CSR_work2; BTDA_matrices_fill_values(x->jacobian, w, y->jacobian, mul_node->cross_C); AT_fill_values(mul_node->cross_C->to_csr(mul_node->cross_C), CT, node->work->iwork); diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index c550c6c..07b08df 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -17,7 +17,8 @@ */ #include "atoms/bivariate_restricted_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -58,7 +59,7 @@ static void jacobian_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size + 1); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size + 1); jac->p[0] = 0; jac->p[1] = x->size + 1; @@ -88,9 +89,9 @@ static void jacobian_init_impl(expr *node) /* compute required allocation and allocate jacobian */ bool *col_nz = (bool *) SP_CALLOC( node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ - CSR_Matrix *Jx = x->jacobian->to_csr(x->jacobian); + CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); int nonzero_cols = count_nonzero_cols(Jx, col_nz); - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); /* precompute column indices */ jac->nnz = 0; @@ -125,7 +126,7 @@ static void jacobian_init_impl(expr *node) node->jacobian = new_sparse_matrix(jac); - /* prepare CSC form of child jacobian for chain rule. 
+ /* prepare CSC_matrix form of child jacobian for chain rule. * For a linear operator the values are constant, so fill * them once here. */ jacobian_csc_init(x); @@ -137,7 +138,7 @@ static void eval_jacobian(expr *node) { expr *x = node->left; expr *y = node->right; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) @@ -168,7 +169,7 @@ static void eval_jacobian(expr *node) node->work->dwork[j] = (2.0 * x->value[j]) / y->value[0]; } - /* chain rule (no derivative wrt y) using CSC format */ + /* chain rule (no derivative wrt y) using CSC_matrix format */ yTA_fill_values(x->work->jacobian_csc, node->work->dwork, jac); /* insert derivative wrt y at right place (for correctness this assumes @@ -188,7 +189,7 @@ static void wsum_hess_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); node->wsum_hess = new_sparse_matrix(H); /* if x has lower idx than y*/ diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index ad7a1f7..0c7ea2a 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -45,7 +46,7 @@ static void forward_vector_args(expr *node, const double *u) static void jacobian_init_vectors_args(expr *node) { - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); expr *x = node->left; expr *y = node->right; @@ -101,7 +102,7 @@ static void eval_jacobian_vector_args(expr *node) static void wsum_hess_init_vector_args(expr *node) { - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); expr *x = node->left; expr *y = node->right; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index a120718..6dbc5b4 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -50,7 +51,7 @@ static void jacobian_init_scalar_vector(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { @@ -104,7 +105,7 @@ static void wsum_hess_init_scalar_vector(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - CSR_Matrix *H = + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index 1fabd35..99cfe08 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -50,7 +51,7 @@ static void jacobian_init_vector_scalar(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { @@ -104,7 +105,7 @@ static void wsum_hess_init_vector_scalar(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - CSR_Matrix *H = + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 95775e4..94740b1 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -17,9 +17,10 @@ */ #include "atoms/elementwise_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -32,7 +33,7 @@ void jacobian_init_elementwise(expr *node) /* if the variable is a child */ if (child->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; @@ -81,7 +82,7 @@ void wsum_hess_init_elementwise(expr *node) /* if the variable is a child */ if (id != NOT_A_VARIABLE) { - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { @@ -139,7 +140,7 @@ void eval_wsum_hess_elementwise(expr *node, const double 
*w) { if (child->is_affine(child)) { - /* Refresh the child Jacobian's CSC mirror once; subsequent calls + /* Refresh the child Jacobian's CSC_matrix mirror once; subsequent calls skip since the affine child's values don't change. */ if (!child->work->jacobian_csc_filled) { diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index ddb4ddb..2ce3af0 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/elementwise_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include @@ -23,7 +24,7 @@ void jacobian_init_restricted(expr *node) { expr *child = node->left; - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; @@ -39,7 +40,7 @@ void wsum_hess_init_restricted(expr *node) int id = child->var_id; int i; - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 8e1477e..4a380c2 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -16,6 +16,7 @@ * limitations under the License. 
 */ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -71,7 +72,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size); jac->p[0] = 0; jac->p[1] = x->size; for (int j = 0; j < x->size; j++) @@ -130,10 +131,10 @@ static void wsum_hess_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - /* allocate n_vars x n_vars CSR matrix with dense block */ + /* allocate n_vars x n_vars CSR matrix with dense block */ int block_size = x->size; int nnz = block_size * block_size; - CSR_Matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the dense block */ for (int i = 0; i < block_size; i++) diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index b310d95..49234df 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -81,7 +82,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d2 nnzs) */ for (int row = 0; row < x->d1; row++) @@ -162,7 +163,7 @@ static void wsum_hess_init_impl(expr *node) /* each row i has d2-1 non-zero entries, with column indices corresponding to the columns in that row (except the diagonal element).
*/ int nnz = x->d1 * x->d2 * (x->d2 - 1); - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ int nnz_per_row = x->d2 - 1; @@ -206,7 +207,7 @@ static inline void wsum_hess_row_no_zeros(expr *node, const double *w, int row, int d2) { expr *x = node->left; - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double scale = w[row] * node->value[row]; /* for each variable xk in this row, fill in Hessian entries @@ -233,7 +234,7 @@ static inline void wsum_hess_row_one_zero(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; int p = pnode->zero_index[row]; /* zero column index */ double w_prod = w[row] * pnode->prod_nonzero[row]; @@ -278,7 +279,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; /* find indices p and q where row has zeros */ @@ -331,7 +332,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, static inline void wsum_hess_row_many_zeros(expr *node, int row, int d2) { - CSR_Matrix *H = node->wsum_hess->to_csr(node->wsum_hess); + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; expr *x = node->left; diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index 61352fb..a5eebae 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -76,7 +77,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d1 nnzs) */ for (int row = 0; row < x->d2; row++) @@ -155,7 +156,7 @@ static void wsum_hess_init_impl(expr *node) { /* Hessian has block diagonal structure: d2 blocks of size d1 x d1 */ int nnz = x->d2 * x->d1 * x->d1; - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the variable's rows (block diagonal) */ for (int i = 0; i < x->size; i++) diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 3906dc4..3c4cac0 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -17,9 +17,10 @@ */ #include "atoms/non_elementwise_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/matrix_sum.h" #include "utils/cblas_wrapper.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -35,7 +36,7 @@ static void forward(expr *node, const double *u) x->forward(x, u); /* local forward pass */ - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; Ax_csr(Q, x->value, node->work->dwork, 0); node->value[0] = 0.0; @@ -51,7 +52,7 @@ static void jacobian_init_impl(expr *node) if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size); jac->p[0] = 0; jac->p[1] = x->size; @@ -66,11 +67,11 @@ static void jacobian_init_impl(expr *node) /* chain rule: J = 2 * (Q @ f(x))^T * J_f */ jacobian_init(x); 
jacobian_csc_init(x); - CSC_Matrix *J_csc = x->work->jacobian_csc; + CSC_matrix *J_csc = x->work->jacobian_csc; /* allocate the right number of nnz */ int nnz = count_nonzero_cols_csc(J_csc); - CSR_Matrix *jac = new_csr_matrix(1, node->n_vars, nnz); + CSR_matrix *jac = new_csr_matrix(1, node->n_vars, nnz); jac->p[0] = 0; jac->p[1] = nnz; @@ -90,8 +91,8 @@ static void jacobian_init_impl(expr *node) static void eval_jacobian(expr *node) { expr *x = node->left; - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); if (x->var_id != NOT_A_VARIABLE) { @@ -125,12 +126,12 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; expr *x = node->left; if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, Q->nnz); + CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, Q->nnz); /* set global row pointers */ memcpy(H->p + x->var_id, Q->p, (x->size + 1) * sizeof(int)); @@ -160,10 +161,10 @@ static void wsum_hess_init_impl(expr *node) /* jacobian_csc_init(x) already called in jacobian_init */ quad_form_expr *qnode = (quad_form_expr *) node; - CSC_Matrix *Jf = x->work->jacobian_csc; + CSC_matrix *Jf = x->work->jacobian_csc; /* term1 = Jf^T W Jf = Jf^T B*/ - CSC_Matrix *B = symBA_alloc(Q, Jf); + CSC_matrix *B = symBA_alloc(Q, Jf); qnode->QJf = B; node->work->hess_term1 = new_sparse_matrix(BTA_alloc(Jf, B)); @@ -183,7 +184,7 @@ static void wsum_hess_init_impl(expr *node) static void eval_wsum_hess(expr *node, const double *w) { - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; expr *x = node->left; double two_w = 2.0 * w[0]; @@ -196,8 +197,8 @@ static void eval_wsum_hess(expr *node, const double *w) } else { 
- /* fill the CSC representation of the Jacobian of the child */ - CSC_Matrix *Jf = x->work->jacobian_csc; + /* fill the CSC_matrix representation of the Jacobian of the child */ + CSC_matrix *Jf = x->work->jacobian_csc; if (!x->work->jacobian_csc_filled) { csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, x->work->csc_work); @@ -208,8 +209,8 @@ static void eval_wsum_hess(expr *node, const double *w) } } - CSC_Matrix *QJf = ((quad_form_expr *) node)->QJf; - CSR_Matrix *term1 = node->work->hess_term1->to_csr(node->work->hess_term1); + CSC_matrix *QJf = ((quad_form_expr *) node)->QJf; + CSR_matrix *term1 = node->work->hess_term1->to_csr(node->work->hess_term1); /* term1 = J_f^T Q J_f = J_f^T B */ BA_fill_values(Q, Jf, QJf); @@ -249,7 +250,7 @@ static bool is_affine(const expr *node) return false; } -expr *new_quad_form(expr *left, CSR_Matrix *Q) +expr *new_quad_form(expr *left, CSR_matrix *Q) { assert(left->d1 == 1 || left->d2 == 1); /* left must be a vector */ quad_form_expr *qnode = (quad_form_expr *) SP_CALLOC(1, sizeof(quad_form_expr)); diff --git a/src/expr.c b/src/expr.c index 9b300d4..8aa199e 100644 --- a/src/expr.c +++ b/src/expr.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ #include "expr.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c index 33bfed5..65b67b5 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -17,7 +17,8 @@ */ #include "old-code/old_affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -85,7 +86,7 @@ static void eval_wsum_hess(expr *node, const double *w) (void) w; } -expr *new_linear(expr *u, const CSR_Matrix *A, const double *b) +expr *new_linear(expr *u, const CSR_matrix *A, const double *b) { assert(u->d2 == 1); /* Allocate the type-specific struct */ @@ -98,7 +99,7 @@ expr *new_linear(expr *u, const CSR_Matrix *A, const double *b) expr_retain(u); /* Store A directly as the jacobian (linear op jacobian is constant) */ - CSR_Matrix *jac = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *jac = new_csr_matrix(A->m, A->n, A->nnz); copy_csr_matrix(A, jac); node->jacobian = new_sparse_matrix(jac); diff --git a/src/old-code/old_CSR.c b/src/old-code/old_CSR.c index 09225ea..420e825 100644 --- a/src/old-code/old_CSR.c +++ b/src/old-code/old_CSR.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ #include "old-code/old_CSR.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include #include -CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) +CSR_matrix *block_diag_repeat_csr(const CSR_matrix *A, int p) { assert(p > 0); @@ -28,7 +28,7 @@ CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int block = 0; block < p; block++) @@ -55,7 +55,7 @@ CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) return A_kron; } -CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) +CSR_matrix *kron_identity_csr(const CSR_matrix *A, int p) { assert(p > 0); @@ -63,7 +63,7 @@ CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int row_block = 0; row_block < m; row_block++) @@ -90,7 +90,7 @@ CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) return A_kron; } -void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C) +void Ax_csr_fill_values(const CSR_matrix *AT, const double *z, CSR_matrix *C) { int A_ncols = AT->m; @@ -116,7 +116,7 @@ void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C) } } -void csr_insert_value(CSR_Matrix *A, int col_idx, double value) +void csr_insert_value(CSR_matrix *A, int col_idx, double value) { assert(A->m == 1); @@ -144,7 +144,7 @@ void csr_insert_value(CSR_Matrix *A, int col_idx, double value) A->nnz++; } -void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y) +void Ax_csr_wo_offset(const CSR_matrix *A, const double *x, double *y) { for (int row = 0; row < A->m; row++) { @@ -157,7 +157,7 @@ void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y) } } -void 
diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C) +void diag_csr_mult(const double *d, const CSR_matrix *A, CSR_matrix *C) { copy_csr_matrix(A, C); diff --git a/src/old-code/old_CSR_sum.c b/src/old-code/old_CSR_sum.c index e1cea50..c6a659d 100644 --- a/src/old-code/old_CSR_sum.c +++ b/src/old-code/old_CSR_sum.c @@ -16,13 +16,13 @@ * limitations under the License. */ #include "old-code/old_CSR_sum.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include #include #include -void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* A and B must be different from C */ assert(A != C && B != C); @@ -84,7 +84,7 @@ void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) C->p[A->m] = C->nnz; } -void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C, +void sum_scaled_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C, const double *d1, const double *d2) { C->nnz = 0; @@ -150,7 +150,7 @@ void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matri C->p[A->m] = C->nnz; } -void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, int_double_pair *pairs) +void sum_all_rows_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs) { assert(C->m == 1); C->n = A->n; @@ -185,7 +185,7 @@ void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, int_double_pair *pairs C->p[1] = C->nnz; } -void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs, int row_block_size) { assert(A->m % row_block_size == 0); @@ -238,7 +238,7 @@ void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, } } -void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr(const CSR_matrix *A, CSR_matrix *C, 
int_double_pair *pairs, int row_spacing) { assert(C->m == row_spacing); @@ -286,7 +286,7 @@ void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, } } -void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs, int offset, int spacing) { assert(C->m == 1); diff --git a/src/problem.c b/src/problem.c index d25c6ea..a6ea358 100644 --- a/src/problem.c +++ b/src/problem.c @@ -83,10 +83,10 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) int *cols = iwork; int *col_to_pos = iwork; /* reused after qsort */ int nnz = 0; - CSR_Matrix *H_obj = + CSR_matrix *H_obj = prob->objective->wsum_hess->to_csr(prob->objective->wsum_hess); - CSR_Matrix *H_c; - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H_c; + CSR_matrix *H = prob->lagrange_hessian; H->p[0] = 0; // ---------------------------------------------------------------------- @@ -177,7 +177,7 @@ void problem_init_jacobian(problem *prob) { expr *c = prob->constraints[i]; jacobian_init(c); - CSR_Matrix *Jc = c->jacobian->to_csr(c->jacobian); + CSR_matrix *Jc = c->jacobian->to_csr(c->jacobian); nnz += Jc->nnz; if (c->is_affine(c)) @@ -193,14 +193,14 @@ void problem_init_jacobian(problem *prob) prob->jacobian = new_csr_matrix(prob->total_constraint_size, prob->n_vars, nnz); /* set sparsity pattern of jacobian */ - CSR_Matrix *H = prob->jacobian; + CSR_matrix *H = prob->jacobian; H->p[0] = 0; int row_offset = 0; int nnz_offset = 0; for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; - CSR_Matrix *Jc = c->jacobian->to_csr(c->jacobian); + CSR_matrix *Jc = c->jacobian->to_csr(c->jacobian); for (int r = 1; r <= Jc->m; r++) { @@ -469,7 +469,7 @@ void problem_gradient(problem *prob) /* copy sparse jacobian to dense gradient */ memset(prob->gradient_values, 0, prob->n_vars * sizeof(double)); - CSR_Matrix *jac = 
prob->objective->jacobian->to_csr(prob->objective->jacobian); + CSR_matrix *jac = prob->objective->jacobian->to_csr(prob->objective->jacobian); for (int k = jac->p[0]; k < jac->p[1]; k++) { prob->gradient_values[jac->i[k]] = jac->x[k]; @@ -485,7 +485,7 @@ void problem_jacobian(problem *prob) clock_gettime(CLOCK_MONOTONIC, &timer.start); bool first_call = !prob->jacobian_called; - CSR_Matrix *J = prob->jacobian; + CSR_matrix *J = prob->jacobian; int nnz_offset = 0; for (int i = 0; i < prob->n_constraints; i++) @@ -539,7 +539,7 @@ void problem_hessian(problem *prob, double obj_w, const double *w) // ------------------------------------------------------------------------ // assemble Lagrange hessian using index map // ------------------------------------------------------------------------ - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H = prob->lagrange_hessian; int *idx_map = prob->hess_idx_map; /* zero out hessian before adding contribution from obj and constraints */ @@ -552,7 +552,7 @@ void problem_hessian(problem *prob, double obj_w, const double *w) /* accumulate constraint functions */ for (int i = 0; i < prob->n_constraints; i++) { - Matrix *c_hess = constrs[i]->wsum_hess; + matrix *c_hess = constrs[i]->wsum_hess; accumulator(c_hess->x, c_hess->nnz, idx_map + offset, H->x); offset += c_hess->nnz; } diff --git a/src/utils/COO_Matrix.c b/src/utils/COO_Matrix.c index 727be25..9b67291 100644 --- a/src/utils/COO_Matrix.c +++ b/src/utils/COO_Matrix.c @@ -20,7 +20,7 @@ #include #include -COO_Matrix *new_coo_matrix(const CSR_Matrix *A) +COO_Matrix *new_coo_matrix(const CSR_matrix *A) { COO_Matrix *coo = (COO_Matrix *) SP_MALLOC(sizeof(COO_Matrix)); coo->m = A->m; @@ -45,7 +45,7 @@ COO_Matrix *new_coo_matrix(const CSR_Matrix *A) return coo; } -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A) +COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A) { /* Pass 1: count lower-triangular entries (col <= row) */ int count = 0; diff --git 
a/src/utils/CSC_Matrix.c b/src/utils/CSC_matrix.c similarity index 90% rename from src/utils/CSC_Matrix.c rename to src/utils/CSC_matrix.c index 4718040..09a3aca 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_matrix.c @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/iVec.h" #include "utils/tracked_alloc.h" #include #include #include -CSC_Matrix *new_csc_matrix(int m, int n, int nnz) +CSC_matrix *new_csc_matrix(int m, int n, int nnz) { - CSC_Matrix *matrix = (CSC_Matrix *) SP_MALLOC(sizeof(CSC_Matrix)); + CSC_matrix *matrix = (CSC_matrix *) SP_MALLOC(sizeof(CSC_matrix)); if (!matrix) return NULL; matrix->p = (int *) SP_MALLOC((n + 1) * sizeof(int)); @@ -47,7 +47,7 @@ CSC_Matrix *new_csc_matrix(int m, int n, int nnz) return matrix; } -void free_csc_matrix(CSC_Matrix *matrix) +void free_csc_matrix(CSC_matrix *matrix) { if (matrix) { @@ -58,7 +58,7 @@ void free_csc_matrix(CSC_Matrix *matrix) } } -CSR_Matrix *ATA_alloc(const CSC_Matrix *A) +CSR_matrix *ATA_alloc(const CSC_matrix *A) { /* A is m x n, A^T A is n x n */ int n = A->n; @@ -102,7 +102,7 @@ CSR_Matrix *ATA_alloc(const CSC_Matrix *A) } /* Allocate C and symmetrize it */ - CSR_Matrix *C = new_csr_matrix(n, n, nnz); + CSR_matrix *C = new_csr_matrix(n, n, nnz); symmetrize_csr(Cp, Ci->data, n, C); /* free workspace */ @@ -169,7 +169,7 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz, return sum; } -void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) +void ATDA_fill_values(const CSC_matrix *A, const double *d, CSR_matrix *C) { int j, ii, jj; for (ii = 0; ii < C->m; ii++) @@ -203,9 +203,9 @@ void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) } } -CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork) +CSC_matrix *csr_to_csc_alloc(const CSR_matrix *A, int *iwork) { - 
CSC_Matrix *C = new_csc_matrix(A->m, A->n, A->nnz); + CSC_matrix *C = new_csc_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -247,7 +247,7 @@ CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork) return C; } -void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) +void csr_to_csc_fill_values(const CSR_matrix *A, CSC_matrix *C, int *iwork) { int i, j; int *count = iwork; @@ -266,9 +266,9 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) } } -CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork) +CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork) { - CSR_Matrix *C = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *C = new_csr_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -312,7 +312,7 @@ CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork) return C; } -void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) +void csc_to_csr_fill_values(const CSC_matrix *A, CSR_matrix *C, int *iwork) { int i, j; int *count = iwork; @@ -332,7 +332,7 @@ void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) } } -CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) +CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B) { /* A is m x n, B is m x p, C = B^T A is p x n */ int n = A->n; @@ -378,7 +378,7 @@ CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) } /* Allocate C */ - CSR_Matrix *C = new_csr_matrix(p, n, nnz); + CSR_matrix *C = new_csr_matrix(p, n, nnz); memcpy(C->p, Cp, (p + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); @@ -389,7 +389,7 @@ CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) return C; } -void yTA_fill_values(const CSC_Matrix *A, const double *y, CSR_Matrix *C) +void yTA_fill_values(const CSC_matrix *A, const double *y, CSR_matrix *C) { for (int col = 0; col < A->n; col++) { @@ -413,9 +413,9 @@ void yTA_fill_values(const CSC_Matrix *A, const double *y, 
CSR_matrix *C) } } -/* computes C = B^T * D * A in CSR */ -void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, - CSR_Matrix *C) +/* computes C = B^T * D * A in CSR */ +void BTDA_fill_values(const CSC_matrix *A, const CSC_matrix *B, const double *d, + CSR_matrix *C) { int i, j, jj; for (i = 0; i < C->m; i++) @@ -446,7 +446,7 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, * faster when Q is dense, since it touches each Q entry exactly once. * The sparse_dot approach below is simpler but redundantly scans * column j of A for each nonzero row of C. */ -void BA_fill_values(const CSR_Matrix *Q, const CSC_Matrix *A, CSC_Matrix *C) +void BA_fill_values(const CSR_matrix *Q, const CSC_matrix *A, CSC_matrix *C) { /* fill values of C = Q * A, given the sparsity pattern of C. */ int i, j, ii; @@ -467,10 +467,10 @@ void BA_fill_values(const CSR_Matrix *Q, const CSC_Matrix *A, CSC_Matrix *C) } } -CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) +CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A) { /* Allocate C = B * A (sparsity only). B must be symmetric. - * B is CSR (m x m), A is CSC (m x n), C is CSC (m x n). + * B is CSR (m x m), A is CSC (m x n), C is CSC (m x n). * * Column j of C is B * a_j = sum_k A_{k,j} B[:, k], so the nonzero * rows of column j of C are the union of the nonzero rows of B[:, k].
@@ -525,7 +525,7 @@ CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) /* allocate C and copy the computed structure */ int total_nnz = Cp[n]; - CSC_Matrix *C = new_csc_matrix(m, n, total_nnz); + CSC_matrix *C = new_csc_matrix(m, n, total_nnz); memcpy(C->p, Cp, (n + 1) * sizeof(int)); memcpy(C->i, Ci->data, total_nnz * sizeof(int)); @@ -536,7 +536,7 @@ CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) return C; } -int count_nonzero_cols_csc(const CSC_Matrix *A) +int count_nonzero_cols_csc(const CSC_matrix *A) { int count = 0; for (int j = 0; j < A->n; j++) diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_matrix.c similarity index 86% rename from src/utils/CSR_Matrix.c rename to src/utils/CSR_matrix.c index c52b97d..63847ab 100644 --- a/src/utils/CSR_Matrix.c +++ b/src/utils/CSR_matrix.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" @@ -25,9 +25,9 @@ #include #include -CSR_Matrix *new_csr_matrix(int m, int n, int nnz) +CSR_matrix *new_csr_matrix(int m, int n, int nnz) { - CSR_Matrix *matrix = (CSR_Matrix *) SP_MALLOC(sizeof(CSR_Matrix)); + CSR_matrix *matrix = (CSR_matrix *) SP_MALLOC(sizeof(CSR_matrix)); matrix->p = (int *) SP_CALLOC(m + 1, sizeof(int)); matrix->i = (int *) SP_CALLOC(nnz, sizeof(int)); matrix->x = (double *) SP_MALLOC(nnz * sizeof(double)); @@ -37,24 +37,24 @@ CSR_Matrix *new_csr_matrix(int m, int n, int nnz) return matrix; } -CSR_Matrix *new_csr(const CSR_Matrix *A) +CSR_matrix *new_csr(const CSR_matrix *A) { - CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); memcpy(copy->x, A->x, A->nnz * sizeof(double)); return copy; } 
-CSR_Matrix *new_csr_copy_sparsity(const CSR_Matrix *A) +CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A) { - CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); return copy; } -void free_csr_matrix(CSR_Matrix *matrix) +void free_csr_matrix(CSR_matrix *matrix) { if (matrix) { @@ -65,7 +65,7 @@ void free_csr_matrix(CSR_Matrix *matrix) } } -void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C) +void copy_csr_matrix(const CSR_matrix *A, CSR_matrix *C) { C->m = A->m; C->n = A->n; @@ -75,7 +75,7 @@ void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C) memcpy(C->x, A->x, A->nnz * sizeof(double)); } -void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset) +void Ax_csr(const CSR_matrix *A, const double *x, double *y, int col_offset) { for (int row = 0; row < A->m; row++) { @@ -88,7 +88,7 @@ void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset) } } -int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz) +int count_nonzero_cols(const CSR_matrix *A, bool *col_nz) { for (int row = 0; row < A->m; row++) { @@ -120,7 +120,7 @@ void insert_idx(int idx, int *arr, int len) arr[j] = idx; } -void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C) +void DA_fill_values(const double *d, const CSR_matrix *A, CSR_matrix *C) { memcpy(C->x, A->x, A->nnz * sizeof(double)); @@ -133,9 +133,9 @@ void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C) } } -CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork) +CSR_matrix *transpose(const CSR_matrix *A, int *iwork) { - CSR_Matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; @@ -178,10 +178,10 @@ CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork) return AT; } -CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork) 
+CSR_matrix *AT_alloc(const CSR_matrix *A, int *iwork) { /* Allocate A^T and compute sparsity pattern without filling values */ - CSR_Matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; @@ -223,7 +223,7 @@ CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork) return AT; } -void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork) +void AT_fill_values(const CSR_matrix *A, CSR_matrix *AT, int *iwork) { /* Fill values of A^T given sparsity pattern is already computed */ int i, j; @@ -241,7 +241,7 @@ void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork) } } -double csr_get_value(const CSR_Matrix *A, int row, int col) +double csr_get_value(const CSR_matrix *A, int row, int col) { for (int j = A->p[row]; j < A->p[row + 1]; j++) { @@ -253,7 +253,7 @@ double csr_get_value(const CSR_Matrix *A, int row, int col) return 0.0; } -void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C) +void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C) { int i, j, col; diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c index ea0ff04..9d8ba79 100644 --- a/src/utils/CSR_sum.c +++ b/src/utils/CSR_sum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ #include "utils/CSR_sum.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" @@ -24,7 +24,7 @@ #include #include -void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* A and B must be different from C */ assert(A != C && B != C); @@ -81,7 +81,7 @@ void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) C->p[A->m] = C->nnz; } -void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_fill_values(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* Assumes C->p and C->i already contain the sparsity pattern of A+B. Fills only C->x accordingly. */ @@ -113,8 +113,8 @@ void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C } } -void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, - CSR_Matrix *C, const double *d1, +void sum_scaled_csr_matrices_fill_values(const CSR_matrix *A, const CSR_matrix *B, + CSR_matrix *C, const double *d1, const double *d2) { /* Assumes C->p and C->i already contain the sparsity pattern of A+B. 
@@ -148,7 +148,7 @@ void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix * } /* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */ -void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_block_size, int *iwork, int *idx_map) { assert(A->m % row_block_size == 0); @@ -220,7 +220,7 @@ void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, } /* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */ -void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_spacing, int *iwork, int *idx_map) { assert(C->m == row_spacing); @@ -294,7 +294,7 @@ void accumulator(const double *vals, int nnz, const int *idx_map, double *out) } } -void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *out, +void accumulator_with_spacing(const CSR_matrix *A, const int *idx_map, double *out, int spacing) { /* don't forget to initialze accumulator to 0 before calling this */ @@ -307,7 +307,7 @@ void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *o } } -void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, +void sum_all_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int *iwork, int *idx_map) { // ------------------------------------------------------------------- @@ -354,23 +354,23 @@ void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, /* * Sums evenly spaced rows from A into a single row in C and fills an index map. 
- * A: input CSR matrix - * C: output CSR matrix (must have m=1) + * A: input CSR_matrix matrix + * C: output CSR_matrix matrix (must have m=1) * spacing: row spacing * iwork: workspace of size at least max(A->n, A->nnz) * idx_map: output index map, size at least A->nnz */ -CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, - const CSR_Matrix *C, const CSR_Matrix *D, +CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, + const CSR_matrix *C, const CSR_matrix *D, int *idx_maps[4]) { - const CSR_Matrix *inputs[4] = {A, B, C, D}; + const CSR_matrix *inputs[4] = {A, B, C, D}; int m = A->m; int n = A->n; int nnz_ub = A->nnz + B->nnz + C->nnz + D->nnz; /* allocate output and index maps */ - CSR_Matrix *out = new_csr_matrix(m, n, nnz_ub); + CSR_matrix *out = new_csr_matrix(m, n, nnz_ub); for (int k = 0; k < 4; k++) { idx_maps[k] = (int *) SP_MALLOC(inputs[k]->nnz * sizeof(int)); @@ -430,7 +430,7 @@ CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, return out; } -void sum_spaced_rows_into_row_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int spacing, int *iwork, int *idx_map) { assert(C->m == 1); diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c index 7d6b9a6..cb090f2 100644 --- a/src/utils/dense_matrix.c +++ b/src/utils/dense_matrix.c @@ -22,16 +22,16 @@ #include #include -static struct Permuted_Dense *dense_as_permuted_dense(Matrix *self) +static struct permuted_dense *dense_as_permuted_dense(matrix *self) { (void) self; return NULL; } -static void dense_block_left_mult_vec(const Matrix *A, const double *x, double *y, +static void dense_block_left_mult_vec(const matrix *A, const double *x, double *y, int p) { - const Dense_Matrix *dm = (const Dense_Matrix *) A; + const dense_matrix *dm = (const dense_matrix *) A; int m = dm->base.m; int n = dm->base.n; @@ -48,17 +48,17 @@ static void dense_block_left_mult_vec(const Matrix 
*A, const double *x, double * n, 0.0, y, m); } -static void dense_free(Matrix *A) +static void dense_free(matrix *A) { - Dense_Matrix *dm = (Dense_Matrix *) A; + dense_matrix *dm = (dense_matrix *) A; free(dm->x); free(dm->work); free(dm); } -Matrix *new_dense_matrix(int m, int n, const double *data) +matrix *new_dense_matrix(int m, int n, const double *data) { - Dense_Matrix *dm = (Dense_Matrix *) SP_CALLOC(1, sizeof(Dense_Matrix)); + dense_matrix *dm = (dense_matrix *) SP_CALLOC(1, sizeof(dense_matrix)); dm->base.m = m; dm->base.n = n; dm->base.nnz = m * n; @@ -77,7 +77,7 @@ Matrix *new_dense_matrix(int m, int n, const double *data) return &dm->base; } -Matrix *dense_matrix_trans(const Dense_Matrix *A) +matrix *dense_matrix_trans(const dense_matrix *A) { int m = A->base.m; int n = A->base.n; @@ -85,7 +85,7 @@ Matrix *dense_matrix_trans(const Dense_Matrix *A) A_transpose(AT_x, A->x, m, n); - Matrix *result = new_dense_matrix(n, m, AT_x); + matrix *result = new_dense_matrix(n, m, AT_x); free(AT_x); return result; } diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index 74cd5fe..1695f2d 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ b/src/utils/linalg_dense_sparse_matmuls.c @@ -15,8 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/cblas_wrapper.h" #include "utils/dense_matrix.h" #include "utils/iVec.h" @@ -26,10 +26,10 @@ #include /* --------------------------------------------------------------- - * C = (I_p kron A) @ J via the polymorphic Matrix interface. - * A is dense m x n, J is (n*p) x k CSC, C is (m*p) x k CSC. + * C = (I_p kron A) @ J via the polymorphic matrix interface. + * A is dense m x n, J is (n*p) x k CSC_matrix, C is (m*p) x k CSC_matrix. 
* --------------------------------------------------------------- */ -CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) +CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p) { int m = A->m; int n = A->n; @@ -82,7 +82,7 @@ CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) Cp[j + 1] = Ci->len; } - CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_csc_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -91,9 +91,9 @@ CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) return C; } -void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C) { - const Dense_Matrix *dm = (const Dense_Matrix *) A; + const dense_matrix *dm = (const dense_matrix *) A; int m = dm->base.m; int n = dm->base.n; int k = J->n; @@ -156,9 +156,9 @@ void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) /* --------------------------------------------------------------- * C = (Y^T kron I_m) @ J - * Y is k x n (col-major), J is (m*k) x p CSC, C is (m*n) x p CSR + * Y is k x n (col-major), J is (m*k) x p CSC_matrix, C is (m*n) x p CSR_matrix * --------------------------------------------------------------- */ -CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) +CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J) { (void) k; /* C has n blocks of m rows. 
All rows at the same position within @@ -198,7 +198,7 @@ CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) // --------------------------------------------------------------- // replicate sparsity pattern across blocks // --------------------------------------------------------------- - CSR_Matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_csr_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { @@ -219,8 +219,8 @@ CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) return C; } -void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matrix *J, - CSR_Matrix *C) +void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_matrix *J, + CSR_matrix *C) { (void) n; assert(C->m == m * n); @@ -256,7 +256,7 @@ void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matri } } -CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) +CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J) { /* Step 1: for each block, find which columns of J have any * nonzero in row range [blk*k, blk*k + k). */ @@ -287,7 +287,7 @@ CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) /* Step 2: replicate each block's pattern for all m rows * within that block. 
*/ - CSR_Matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_csr_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { @@ -308,8 +308,8 @@ CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) return C; } -void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_Matrix *J, - CSR_Matrix *C) +void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_matrix *J, + CSR_matrix *C) { (void) n; assert(C->m == m * n); diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c index 42ee992..b8fe39c 100644 --- a/src/utils/linalg_sparse_matmuls.c +++ b/src/utils/linalg_sparse_matmuls.c @@ -15,35 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/iVec.h" #include "utils/tracked_alloc.h" +#include "utils/utils.h" #include #include #include #include #include -static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx, - int b_len, int b_offset) -{ - int ai = 0, bi = 0; - while (ai < a_len && bi < b_len) - { - if (a_idx[ai] == b_idx[bi] - b_offset) return true; - if (a_idx[ai] < b_idx[bi] - b_offset) - { - ai++; - } - else - { - bi++; - } - } - return false; -} - /* Unweighted sparse dot product of two sorted index arrays */ static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, const double *b_x, const int *b_i, int b_nnz, @@ -104,8 +86,8 @@ static inline double sparse_dot_offset(const double *a_x, const int *a_idx, return sum; } -CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, - const CSC_Matrix *J, int p) +CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, + const CSC_matrix *J, int p) { /* A is m x n, J is (n*p) x k, C is (m*p) x k */ int m = A->m; @@ -175,7 +157,7 @@ CSC_Matrix 
*block_left_multiply_fill_sparsity(const CSR_Matrix *A, Cp[j + 1] = Ci->len; } - CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_csc_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -184,8 +166,8 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, return C; } -void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, - CSC_Matrix *C) +void block_left_multiply_fill_values(const CSR_matrix *A, const CSC_matrix *J, + CSC_matrix *C) { /* A is m x n, J is (n*p) x k, C is (m*p) x k */ int m = A->m; @@ -246,9 +228,9 @@ void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, } } -/* Fill values of C = A @ B where A is CSR, B is CSC. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C) +/* Fill values of C = A @ B where A is CSR_matrix, B is CSC_matrix. */ +void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B, + CSR_matrix *C) { for (int i = 0; i < A->m; i++) { @@ -268,9 +250,9 @@ void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, } } -/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) +/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). 
Result C is CSR_matrix (m x p) with precomputed sparsity pattern */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) +CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B) { int m = A->m; int p = B->n; @@ -304,7 +286,7 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) Cp[i + 1] = nnz; } - CSR_Matrix *C = new_csr_matrix(m, p, nnz); + CSR_matrix *C = new_csr_matrix(m, p, nnz); memcpy(C->p, Cp, (m + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); free(Cp); @@ -317,7 +299,7 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector. * x is split into p blocks of n elements each. */ -void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, +void block_left_multiply_vec(const struct CSR_matrix *A, const double *x, double *y, int p) { /* For each block */ diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index bd36c00..51261a2 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -11,14 +11,15 @@ */ #include "utils/matrix_BTA.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" -Matrix *BTA_matrices_alloc(Matrix *A, Matrix *B) +matrix *BTA_matrices_alloc(matrix *A, matrix *B) { - Permuted_Dense *pd_A = A->as_permuted_dense(A); - Permuted_Dense *pd_B = B->as_permuted_dense(B); + permuted_dense *pd_A = A->as_permuted_dense(A); + permuted_dense *pd_B = B->as_permuted_dense(B); if (pd_A && pd_B) { @@ -27,51 +28,51 @@ Matrix *BTA_matrices_alloc(Matrix *A, Matrix *B) if (pd_B) { /* A is Sparse, B is PD */ - CSR_Matrix *A_csr = A->to_csr(A); + CSR_matrix *A_csr = A->to_csr(A); return BTA_csr_pd_alloc(A_csr, pd_B); } if (pd_A) { /* A is PD, B is Sparse */ - CSR_Matrix *B_csr = B->to_csr(B); + 
CSR_matrix *B_csr = B->to_csr(B); return BTA_pd_csr_alloc(pd_A, B_csr); } - /* Both Sparse: delegate to CSC BTA. Caller must ensure caches are fresh. */ - Sparse_Matrix *sm_A = (Sparse_Matrix *) A; - Sparse_Matrix *sm_B = (Sparse_Matrix *) B; + /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. */ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix *sm_B = (sparse_matrix *) B; A->refresh_csc_values(A); B->refresh_csc_values(B); - CSR_Matrix *C_csr = BTA_alloc(sm_A->csc_cache, sm_B->csc_cache); + CSR_matrix *C_csr = BTA_alloc(sm_A->csc_cache, sm_B->csc_cache); return new_sparse_matrix(C_csr); } -void BTDA_matrices_fill_values(Matrix *A, const double *d, Matrix *B, Matrix *C) +void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) { - Permuted_Dense *pd_A = A->as_permuted_dense(A); - Permuted_Dense *pd_B = B->as_permuted_dense(B); + permuted_dense *pd_A = A->as_permuted_dense(A); + permuted_dense *pd_B = B->as_permuted_dense(B); if (pd_A && pd_B) { - BTDA_pd_pd_fill_values(pd_A, d, pd_B, (Permuted_Dense *) C); + BTDA_pd_pd_fill_values(pd_A, d, pd_B, (permuted_dense *) C); return; } if (pd_B) { - CSR_Matrix *A_csr = A->to_csr(A); - BTDA_csr_pd_fill_values(A_csr, d, pd_B, (Permuted_Dense *) C); + CSR_matrix *A_csr = A->to_csr(A); + BTDA_csr_pd_fill_values(A_csr, d, pd_B, (permuted_dense *) C); return; } if (pd_A) { - CSR_Matrix *B_csr = B->to_csr(B); - BTDA_pd_csr_fill_values(pd_A, d, B_csr, (Permuted_Dense *) C); + CSR_matrix *B_csr = B->to_csr(B); + BTDA_pd_csr_fill_values(pd_A, d, B_csr, (permuted_dense *) C); return; } - /* Both Sparse: delegate to CSC BTDA. */ - Sparse_Matrix *sm_A = (Sparse_Matrix *) A; - Sparse_Matrix *sm_B = (Sparse_Matrix *) B; - Sparse_Matrix *sm_C = (Sparse_Matrix *) C; + /* Both Sparse: delegate to CSC_matrix BTDA. 
*/ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix *sm_B = (sparse_matrix *) B; + sparse_matrix *sm_C = (sparse_matrix *) C; BTDA_fill_values(sm_A->csc_cache, sm_B->csc_cache, d, sm_C->csr); } diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c index 8d0bd34..c8b12b5 100644 --- a/src/utils/matrix_sum.c +++ b/src/utils/matrix_sum.c @@ -18,19 +18,19 @@ #include "utils/matrix_sum.h" #include "utils/CSR_sum.h" -void sum_matrices_alloc(Matrix *A, Matrix *B, Matrix *C) +void sum_matrices_alloc(matrix *A, matrix *B, matrix *C) { - CSR_Matrix *cc = C->to_csr(C); + CSR_matrix *cc = C->to_csr(C); sum_csr_alloc(A->to_csr(A), B->to_csr(B), cc); C->nnz = cc->nnz; } -void sum_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C) +void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C) { sum_csr_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C)); } -void sum_scaled_matrices_fill_values(Matrix *A, Matrix *B, Matrix *C, +void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, const double *d1, const double *d2) { sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), diff --git a/src/utils/mini_numpy.c b/src/utils/mini_numpy.c index baa00c5..de0d345 100644 --- a/src/utils/mini_numpy.c +++ b/src/utils/mini_numpy.c @@ -101,7 +101,7 @@ void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double } } -void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n) +void conv_matrix_fill_sparsity(CSR_matrix *T_csr, int m, int n) { int nnz = 0; for (int r = 0; r < T_csr->m; r++) @@ -118,7 +118,7 @@ void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n) T_csr->p[T_csr->m] = nnz; } -void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a) +void conv_matrix_fill_values(CSR_matrix *T_csr, const double *a) { for (int r = 0; r < T_csr->m; r++) { diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 5d4b3eb..65c9e04 100644 --- a/src/utils/permuted_dense.c +++ 
b/src/utils/permuted_dense.c @@ -19,16 +19,16 @@ #include "utils/cblas_wrapper.h" #include "utils/iVec.h" #include "utils/tracked_alloc.h" +#include "utils/utils.h" #include #include #include -static void permuted_dense_free(Matrix *self) +static void permuted_dense_free(matrix *self) { - Permuted_Dense *pd = (Permuted_Dense *) self; + permuted_dense *pd = (permuted_dense *) self; free(pd->row_perm); free(pd->col_perm); - free(pd->Y_scratch); free(pd->col_inv); free(pd->row_inv); /* csr_cache->x aliases pd->X (set in permuted_dense_to_csr_alloc); NULL it @@ -39,57 +39,59 @@ static void permuted_dense_free(Matrix *self) } free_csr_matrix(pd->csr_cache); free(pd->X); - free(pd->gather_X_scratch); + free(pd->dwork); + free(pd->iwork); free(pd); } -/* Permuted_Dense has no CSC mirror; chain-rule kernels operate on X directly. */ -static void permuted_dense_refresh_csc_values(Matrix *self) +/* permuted_dense has no CSC_matrix mirror; chain-rule kernels operate on X directly. + */ +static void permuted_dense_refresh_csc_values(matrix *self) { (void) self; } /* Vtable adapters — each delegates to the existing permuted_dense_* kernel. 
*/ -static Matrix *permuted_dense_vtable_copy_sparsity(const Matrix *self) +static matrix *permuted_dense_vtable_copy_sparsity(const matrix *self) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; - return new_permuted_dense(pd->base.m, pd->base.n, pd->dense_m, pd->dense_n, - pd->row_perm, pd->col_perm, NULL); + const permuted_dense *pd = (const permuted_dense *) self; + return new_permuted_dense(pd->base.m, pd->base.n, pd->m0, pd->n0, pd->row_perm, + pd->col_perm, NULL); } -static void permuted_dense_vtable_DA_fill_values(const double *d, const Matrix *self, - Matrix *out) +static void permuted_dense_vtable_DA_fill_values(const double *d, const matrix *self, + matrix *out) { - permuted_dense_DA_fill_values(d, (const Permuted_Dense *) self, - (Permuted_Dense *) out); + permuted_dense_DA_fill_values(d, (const permuted_dense *) self, + (permuted_dense *) out); } -static Matrix *permuted_dense_vtable_ATA_alloc(Matrix *self) +static matrix *permuted_dense_vtable_ATA_alloc(matrix *self) { - return permuted_dense_ATA_alloc((const Permuted_Dense *) self); + return permuted_dense_ATA_alloc((const permuted_dense *) self); } -static void permuted_dense_vtable_ATDA_fill_values(const Matrix *self, - const double *d, Matrix *out) +static void permuted_dense_vtable_ATDA_fill_values(const matrix *self, + const double *d, matrix *out) { - permuted_dense_ATDA_fill_values((const Permuted_Dense *) self, d, - (Permuted_Dense *) out); + permuted_dense_ATDA_fill_values((const permuted_dense *) self, d, + (permuted_dense *) out); } /* Forward decl; definition lower in the file. */ -static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self); +static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A); -/* Lazy CSR view: allocate structure on first call, then return the cache. +/* Lazy CSR_matrix view: allocate structure on first call, then return the cache. 
The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so values are always live without a per-call refresh. */ -static struct Permuted_Dense *permuted_dense_as_permuted_dense(Matrix *self) +static struct permuted_dense *permuted_dense_as_permuted_dense(matrix *self) { - return (Permuted_Dense *) self; + return (permuted_dense *) self; } -static CSR_Matrix *permuted_dense_to_csr(Matrix *self) +static CSR_matrix *permuted_dense_to_csr(matrix *self) { - Permuted_Dense *pd = (Permuted_Dense *) self; + permuted_dense *pd = (permuted_dense *) self; if (pd->csr_cache == NULL) { pd->csr_cache = permuted_dense_to_csr_alloc(pd); @@ -97,56 +99,53 @@ static CSR_Matrix *permuted_dense_to_csr(Matrix *self) return pd->csr_cache; } -static Matrix *permuted_dense_vtable_index_alloc(Matrix *self, const int *indices, +static matrix *permuted_dense_vtable_index_alloc(matrix *self, const int *indices, int n_idxs) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; + const permuted_dense *pd = (const permuted_dense *) self; /* Scan indices: which output positions i hit a row in pd->row_perm? 
*/ int *new_row_perm = (int *) SP_MALLOC(n_idxs * sizeof(int)); - int new_dense_m = 0; + int new_m0 = 0; for (int i = 0; i < n_idxs; i++) { if (pd->row_inv[indices[i]] >= 0) { - new_row_perm[new_dense_m++] = i; + new_row_perm[new_m0++] = i; } } - Matrix *out = new_permuted_dense(n_idxs, pd->base.n, new_dense_m, - pd->dense_n, new_row_perm, pd->col_perm, - NULL); + matrix *out = new_permuted_dense(n_idxs, pd->base.n, new_m0, pd->n0, + new_row_perm, pd->col_perm, NULL); free(new_row_perm); return out; } -static void permuted_dense_vtable_index_fill_values(Matrix *self, - const int *indices, int n_idxs, - Matrix *out) +static void permuted_dense_vtable_index_fill_values(matrix *self, const int *indices, + int n_idxs, matrix *out) { (void) n_idxs; - const Permuted_Dense *pd = (const Permuted_Dense *) self; - Permuted_Dense *out_pd = (Permuted_Dense *) out; - int dense_n = pd->dense_n; - for (int k = 0; k < out_pd->dense_m; k++) + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + int n0 = pd->n0; + for (int k = 0; k < out_pd->m0; k++) { int i = out_pd->row_perm[k]; int old_ii = pd->row_inv[indices[i]]; - memcpy(out_pd->X + (size_t) k * dense_n, pd->X + (size_t) old_ii * dense_n, - dense_n * sizeof(double)); + memcpy(out_pd->X + k * n0, pd->X + old_ii * n0, n0 * sizeof(double)); } } -static Matrix *permuted_dense_vtable_promote_alloc(Matrix *self, int size) +static matrix *permuted_dense_vtable_promote_alloc(matrix *self, int size) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; - assert(pd->dense_m <= 1); + const permuted_dense *pd = (const permuted_dense *) self; + assert(pd->m0 <= 1); - if (pd->dense_m == 0) + if (pd->m0 == 0) { /* source row is all-zero; output is also structurally all-zero. 
*/ - return new_permuted_dense(size, pd->base.n, 0, pd->dense_n, NULL, - pd->col_perm, NULL); + return new_permuted_dense(size, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); } int *new_row_perm = (int *) SP_MALLOC(size * sizeof(int)); @@ -154,53 +153,52 @@ static Matrix *permuted_dense_vtable_promote_alloc(Matrix *self, int size) { new_row_perm[i] = i; } - Matrix *out = new_permuted_dense(size, pd->base.n, size, pd->dense_n, - new_row_perm, pd->col_perm, NULL); + matrix *out = new_permuted_dense(size, pd->base.n, size, pd->n0, new_row_perm, + pd->col_perm, NULL); free(new_row_perm); return out; } -static void permuted_dense_vtable_promote_fill_values(Matrix *self, Matrix *out) +static void permuted_dense_vtable_promote_fill_values(matrix *self, matrix *out) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; - Permuted_Dense *out_pd = (Permuted_Dense *) out; - if (pd->dense_m == 0) return; - int dense_n = pd->dense_n; - for (int k = 0; k < out_pd->dense_m; k++) + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) return; + int n0 = pd->n0; + for (int k = 0; k < out_pd->m0; k++) { - memcpy(out_pd->X + (size_t) k * dense_n, pd->X, - dense_n * sizeof(double)); + memcpy(out_pd->X + k * n0, pd->X, n0 * sizeof(double)); } } -static Matrix *permuted_dense_vtable_broadcast_alloc(Matrix *self, +static matrix *permuted_dense_vtable_broadcast_alloc(matrix *self, broadcast_type type, int d1, int d2) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; + const permuted_dense *pd = (const permuted_dense *) self; int out_m = d1 * d2; - int new_dense_m; + int new_m0; if (type == BROADCAST_SCALAR) { - new_dense_m = (pd->dense_m == 0) ? 0 : out_m; + new_m0 = (pd->m0 == 0) ? 
0 : out_m; } else if (type == BROADCAST_ROW) { - new_dense_m = d1 * pd->dense_m; + new_m0 = d1 * pd->m0; } else /* BROADCAST_COL */ { - new_dense_m = d2 * pd->dense_m; + new_m0 = d2 * pd->m0; } - if (new_dense_m == 0) + if (new_m0 == 0) { - return new_permuted_dense(out_m, pd->base.n, 0, pd->dense_n, NULL, - pd->col_perm, NULL); + return new_permuted_dense(out_m, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); } - int *new_row_perm = (int *) SP_MALLOC(new_dense_m * sizeof(int)); + int *new_row_perm = (int *) SP_MALLOC(new_m0 * sizeof(int)); int k = 0; if (type == BROADCAST_SCALAR) { @@ -211,7 +209,7 @@ static Matrix *permuted_dense_vtable_broadcast_alloc(Matrix *self, } else if (type == BROADCAST_ROW) { - for (int j_ii = 0; j_ii < pd->dense_m; j_ii++) + for (int j_ii = 0; j_ii < pd->m0; j_ii++) { int j_old = pd->row_perm[j_ii]; for (int i = 0; i < d1; i++) @@ -224,124 +222,117 @@ static Matrix *permuted_dense_vtable_broadcast_alloc(Matrix *self, { for (int j = 0; j < d2; j++) { - for (int ii_old = 0; ii_old < pd->dense_m; ii_old++) + for (int ii_old = 0; ii_old < pd->m0; ii_old++) { new_row_perm[k++] = j * d1 + pd->row_perm[ii_old]; } } } - Matrix *out = new_permuted_dense(out_m, pd->base.n, new_dense_m, - pd->dense_n, new_row_perm, pd->col_perm, - NULL); + matrix *out = new_permuted_dense(out_m, pd->base.n, new_m0, pd->n0, new_row_perm, + pd->col_perm, NULL); free(new_row_perm); return out; } -static void permuted_dense_vtable_broadcast_fill_values(Matrix *self, +static void permuted_dense_vtable_broadcast_fill_values(matrix *self, broadcast_type type, int d1, - int d2, Matrix *out) + int d2, matrix *out) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; - Permuted_Dense *out_pd = (Permuted_Dense *) out; - if (pd->dense_m == 0) + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) { return; } - int dense_n = pd->dense_n; + int n0 = pd->n0; if (type == BROADCAST_SCALAR) { - 
for (int k = 0; k < out_pd->dense_m; k++) + for (int k = 0; k < out_pd->m0; k++) { - memcpy(out_pd->X + (size_t) k * dense_n, pd->X, - dense_n * sizeof(double)); + memcpy(out_pd->X + k * n0, pd->X, n0 * sizeof(double)); } } else if (type == BROADCAST_ROW) { /* output row k corresponds to child dense row (k / d1). */ (void) d2; - for (int k = 0; k < out_pd->dense_m; k++) + for (int k = 0; k < out_pd->m0; k++) { - memcpy(out_pd->X + (size_t) k * dense_n, - pd->X + (size_t) (k / d1) * dense_n, - dense_n * sizeof(double)); + memcpy(out_pd->X + k * n0, pd->X + (k / d1) * n0, n0 * sizeof(double)); } } else /* BROADCAST_COL */ { (void) d1; - size_t child_block = (size_t) pd->dense_m * (size_t) dense_n; + size_t child_block = pd->m0 * n0; for (int j = 0; j < d2; j++) { - memcpy(out_pd->X + (size_t) j * child_block, pd->X, - child_block * sizeof(double)); + memcpy(out_pd->X + j * child_block, pd->X, child_block * sizeof(double)); } } } -static Matrix *permuted_dense_vtable_diag_vec_alloc(Matrix *self) +static matrix *permuted_dense_vtable_diag_vec_alloc(matrix *self) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; + const permuted_dense *pd = (const permuted_dense *) self; int n = pd->base.m; int out_m = n * n; - if (pd->dense_m == 0) + if (pd->m0 == 0) { - return new_permuted_dense(out_m, pd->base.n, 0, pd->dense_n, NULL, - pd->col_perm, NULL); + return new_permuted_dense(out_m, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); } - int *new_row_perm = (int *) SP_MALLOC(pd->dense_m * sizeof(int)); - for (int ii = 0; ii < pd->dense_m; ii++) + int *new_row_perm = (int *) SP_MALLOC(pd->m0 * sizeof(int)); + for (int ii = 0; ii < pd->m0; ii++) { new_row_perm[ii] = pd->row_perm[ii] * (n + 1); } - Matrix *out = new_permuted_dense(out_m, pd->base.n, pd->dense_m, pd->dense_n, - new_row_perm, pd->col_perm, NULL); + matrix *out = new_permuted_dense(out_m, pd->base.n, pd->m0, pd->n0, new_row_perm, + pd->col_perm, NULL); free(new_row_perm); return out; } -static void 
permuted_dense_vtable_diag_vec_fill_values(Matrix *self, Matrix *out) +static void permuted_dense_vtable_diag_vec_fill_values(matrix *self, matrix *out) { - const Permuted_Dense *pd = (const Permuted_Dense *) self; - Permuted_Dense *out_pd = (Permuted_Dense *) out; - if (pd->dense_m == 0) + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) { return; } - memcpy(out_pd->X, pd->X, - (size_t) pd->dense_m * (size_t) pd->dense_n * sizeof(double)); + memcpy(out_pd->X, pd->X, pd->m0 * pd->n0 * sizeof(double)); } -Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, - const int *row_perm, const int *col_perm, - const double *X_data) +matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, + const int *col_perm, const double *X_data) { /* Validate sorted invariants. */ - for (int ii = 1; ii < dense_m; ii++) + for (int ii = 1; ii < m0; ii++) { assert(row_perm[ii] > row_perm[ii - 1]); } - for (int jj = 1; jj < dense_n; jj++) + for (int jj = 1; jj < n0; jj++) { assert(col_perm[jj] > col_perm[jj - 1]); } - if (dense_m > 0) + if (m0 > 0) { - assert(row_perm[0] >= 0 && row_perm[dense_m - 1] < m); + assert(row_perm[0] >= 0 && row_perm[m0 - 1] < m); } - if (dense_n > 0) + if (n0 > 0) { - assert(col_perm[0] >= 0 && col_perm[dense_n - 1] < n); + assert(col_perm[0] >= 0 && col_perm[n0 - 1] < n); } - Permuted_Dense *pd = (Permuted_Dense *) SP_CALLOC(1, sizeof(Permuted_Dense)); + permuted_dense *pd = (permuted_dense *) SP_CALLOC(1, sizeof(permuted_dense)); pd->base.m = m; pd->base.n = n; - pd->base.nnz = dense_m * dense_n; + pd->base.nnz = m0 * n0; pd->base.copy_sparsity = permuted_dense_vtable_copy_sparsity; pd->base.DA_fill_values = permuted_dense_vtable_DA_fill_values; pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; @@ -359,32 +350,36 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; 
pd->base.free_fn = permuted_dense_free; - pd->dense_m = dense_m; - pd->dense_n = dense_n; + pd->m0 = m0; + pd->n0 = n0; - int sz = dense_m * dense_n; - pd->row_perm = (int *) SP_MALLOC(dense_m * sizeof(int)); - pd->col_perm = (int *) SP_MALLOC(dense_n * sizeof(int)); + int sz = m0 * n0; + pd->row_perm = (int *) SP_MALLOC(m0 * sizeof(int)); + pd->col_perm = (int *) SP_MALLOC(n0 * sizeof(int)); pd->X = (double *) SP_MALLOC(sz * sizeof(double)); pd->base.x = pd->X; - pd->Y_scratch = (double *) SP_MALLOC(sz * sizeof(double)); + /* `dwork` sized for the Y-buffer role (Y = diag(d_perm) X) used by ATDA / + BTDA_pd_pd. BTA_csr_pd_alloc / BTA_pd_csr_alloc upgrade this to a + larger gather buffer when their output PD will instead play that role. */ + pd->dwork_size = sz; + pd->dwork = (double *) SP_MALLOC(pd->dwork_size * sizeof(double)); pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); pd->row_inv = (int *) SP_MALLOC(m * sizeof(int)); - if (dense_m > 0) + if (m0 > 0) { - memcpy(pd->row_perm, row_perm, dense_m * sizeof(int)); + memcpy(pd->row_perm, row_perm, m0 * sizeof(int)); } - if (dense_n > 0) + if (n0 > 0) { - memcpy(pd->col_perm, col_perm, dense_n * sizeof(int)); + memcpy(pd->col_perm, col_perm, n0 * sizeof(int)); } for (int j = 0; j < n; j++) { pd->col_inv[j] = -1; } - for (int jj = 0; jj < dense_n; jj++) + for (int jj = 0; jj < n0; jj++) { pd->col_inv[col_perm[jj]] = jj; } @@ -393,7 +388,7 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, { pd->row_inv[i] = -1; } - for (int ii = 0; ii < dense_m; ii++) + for (int ii = 0; ii < m0; ii++) { pd->row_inv[row_perm[ii]] = ii; } @@ -406,31 +401,31 @@ Matrix *new_permuted_dense(int m, int n, int dense_m, int dense_n, return &pd->base; } -static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) +static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A) { - int dense_m = self->dense_m; - int dense_n = self->dense_n; - int m = self->base.m; - CSR_Matrix *C = new_csr_matrix(m, 
self->base.n, dense_m * dense_n); + int m0 = A->m0; + int n0 = A->n0; + int m = A->base.m; + CSR_matrix *C = new_csr_matrix(m, A->base.n, m0 * n0); - /* Alias C->x to self->X: the dense block layout already matches what the - CSR view's value array would hold, so values are always live with no + /* Alias C->x to A->X: the dense block layout already matches what the + CSR_matrix view's value array would hold, so values are always live with no memcpy needed. The PD owns the buffer; permuted_dense_free nulls C->x before free_csr_matrix to avoid double-free. */ free(C->x); - C->x = self->X; + C->x = A->X; /* fill column indices (each dense row contributes a copy of col_perm) */ - for (int ii = 0; ii < dense_m; ii++) + for (int ii = 0; ii < m0; ii++) { - memcpy(C->i + ii * dense_n, self->col_perm, dense_n * sizeof(int)); + memcpy(C->i + ii * n0, A->col_perm, n0 * sizeof(int)); } /* set row pointers via count and then cumulative sum */ memset(C->p, 0, (m + 1) * sizeof(int)); - for (int ii = 0; ii < dense_m; ii++) + for (int ii = 0; ii < m0; ii++) { - C->p[self->row_perm[ii] + 1] = dense_n; + C->p[A->row_perm[ii] + 1] = n0; } for (int i = 0; i < m; i++) @@ -441,50 +436,69 @@ static CSR_Matrix *permuted_dense_to_csr_alloc(const Permuted_Dense *self) return C; } -void permuted_dense_DA_fill_values(const double *d, const Permuted_Dense *self, - Permuted_Dense *out) +void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, + permuted_dense *C) { - int dense_m = self->dense_m; - int dense_n = self->dense_n; - cblas_dcopy(dense_m * dense_n, self->X, 1, out->X, 1); - for (int ii = 0; ii < dense_m; ii++) + int m0 = A->m0; + int n0 = A->n0; + cblas_dcopy(m0 * n0, A->X, 1, C->X, 1); + for (int ii = 0; ii < m0; ii++) { - cblas_dscal(dense_n, d[self->row_perm[ii]], out->X + ii * dense_n, 1); + cblas_dscal(n0, d[A->row_perm[ii]], C->X + ii * n0, 1); } } -Matrix *permuted_dense_ATA_alloc(const Permuted_Dense *self) +matrix *permuted_dense_ATA_alloc(const 
permuted_dense *A) { - int n = self->base.n; - int dense_n = self->dense_n; - return new_permuted_dense(n, n, dense_n, dense_n, self->col_perm, self->col_perm, - NULL); + int n = A->base.n; + /* C = AT @ A has a dense block of size n0 x n0, with row and column index + sets given by A's col_perm. (This follows from Cij = ai^T aj where + ai and aj are columns of A. Here, ai and aj always have overlapping entries, + so Cij != 0 for (i, j) in A->col_perm x A->col_perm) */ + return new_permuted_dense(n, n, A->n0, A->n0, A->col_perm, A->col_perm, NULL); } -void permuted_dense_ATDA_fill_values(const Permuted_Dense *self, const double *d, - Permuted_Dense *out) +void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, + permuted_dense *C) { - int dense_m = self->dense_m; - int dense_n = self->dense_n; + int m0 = A->m0; + int n0 = A->n0; - /* Y_scratch = diag(d_perm) X, where d_perm[kk] = d[row_perm[kk]]. */ - cblas_dcopy(dense_m * dense_n, self->X, 1, self->Y_scratch, 1); - for (int ii = 0; ii < dense_m; ii++) + /* dwork = diag(d_perm) @ X, where d_perm[ii] = d[row_perm[ii]]. */ + cblas_dcopy(m0 * n0, A->X, 1, A->dwork, 1); + for (int ii = 0; ii < m0; ii++) { - cblas_dscal(dense_n, d[self->row_perm[ii]], self->Y_scratch + ii * dense_n, - 1); + cblas_dscal(n0, d[A->row_perm[ii]], A->dwork + ii * n0, 1); } - /* out.X = X^T Y_scratch. 
*/ - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dense_n, dense_n, dense_m, - 1.0, self->X, dense_n, self->Y_scratch, dense_n, 0.0, out->X, - dense_n); + /* C = XT @ dwork = XT @ diag(d_perm) @ X */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n0, n0, m0, 1.0, A->X, n0, + A->dwork, n0, 0.0, C->X, n0); } -Matrix *permuted_dense_BTA_alloc(const Permuted_Dense *A, const Permuted_Dense *B) +matrix *permuted_dense_BTA_alloc(const permuted_dense *A, const permuted_dense *B) { - return new_permuted_dense(B->base.n, A->base.n, B->dense_n, A->dense_n, - B->col_perm, A->col_perm, NULL); + /* if A and B have no overlapping rows, then C = BT @ A is empty */ + if (!has_overlap(A->row_perm, A->m0, B->row_perm, B->m0, 0)) + { + return new_permuted_dense(B->base.n, A->base.n, 0, 0, NULL, NULL, NULL); + } + + /* otherwise C has a dense block of size B->n0 x A->n0, with row and column + index sets given by B->col_perm and A->col_perm, respectively */ + matrix *C = new_permuted_dense(B->base.n, A->base.n, B->n0, A->n0, B->col_perm, + A->col_perm, NULL); + + /* Pre-allocate C->iwork for idx_A + idx_B in BTA / BTDA_pd_pd slow paths + (each needs at most max_s = MIN(A->m0, B->m0) ints; we store both + arrays back-to-back in iwork, hence 2 * max_s). */ + permuted_dense *C_pd = (permuted_dense *) C; + C_pd->iwork_size = (size_t) 2 * MIN(A->m0, B->m0); + if (C_pd->iwork_size > 0) + { + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + } + return C; } /* Return 1 iff arrays a and b of length n are element-wise equal. */ @@ -497,38 +511,23 @@ static int int_arrays_equal(const int *a, const int *b, int n) return 1; } -void permuted_dense_BTA_fill_values(const Permuted_Dense *A, const Permuted_Dense *B, - Permuted_Dense *out) +/* Find intersection of two sorted, ascending int arrays. For each pair of positions + (ii, jj) where a[ii] == b[jj], write ii into idx_a and jj into idx_b. Returns the + count of matches. 
Buffers idx_a and idx_b must have capacity >= min(a_len, b_len); + no allocation is performed. */ +static inline int sorted_intersect_indices(const int *a, int a_len, const int *b, + int b_len, int *idx_a, int *idx_b) { - int dn_A = A->dense_n; - int dn_B = B->dense_n; - - /* Fast path: matching row_perms (the common case). One dgemm on the - full X buffers. */ - if (A->dense_m == B->dense_m && - int_arrays_equal(A->row_perm, B->row_perm, A->dense_m)) - { - int s = A->dense_m; - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, - B->X, dn_B, A->X, dn_A, 0.0, out->X, dn_A); - return; - } - - /* General path: intersect row_perm_A and row_perm_B via sorted merge, - gather the matching rows into contiguous scratch buffers, dgemm. */ - int max_s = A->dense_m < B->dense_m ? A->dense_m : B->dense_m; - int *idx_A = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); - int *idx_B = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); int s = 0; int ii = 0, jj = 0; - while (ii < A->dense_m && jj < B->dense_m) + while (ii < a_len && jj < b_len) { - int ra = A->row_perm[ii]; - int rb = B->row_perm[jj]; + int ra = a[ii]; + int rb = b[jj]; if (ra == rb) { - idx_A[s] = ii; - idx_B[s] = jj; + idx_a[s] = ii; + idx_b[s] = jj; s++; ii++; jj++; @@ -542,43 +541,89 @@ void permuted_dense_BTA_fill_values(const Permuted_Dense *A, const Permuted_Dens jj++; } } + return s; +} - if (s == 0) +void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, + permuted_dense *C) +{ + /* C may be empty if there is no overlap in row permutations */ + if (C->base.nnz == 0) { - memset(out->X, 0, - (size_t) out->dense_m * (size_t) out->dense_n * sizeof(double)); - free(idx_A); - free(idx_B); return; } - double *XA_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_A * sizeof(double)); - double *XB_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_B * sizeof(double)); + /* if B and A have identical row_perms, one matmul suffices */ + if (A->m0 == B->m0 && 
int_arrays_equal(A->row_perm, B->row_perm, A->m0)) + { + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, B->n0, A->n0, A->m0, + 1.0, B->X, B->n0, A->X, A->n0, 0.0, C->X, A->n0); + return; + } + + /* find intersection of row permutations. We use C->iwork as the storage + for idx_A | idx_B (back-to-back) and grow it in place if too small */ + int max_s = MIN(A->m0, B->m0); + size_t needed = 2 * (size_t) max_s; + if (C->iwork_size < needed) + { + free(C->iwork); + C->iwork = (int *) SP_MALLOC(needed * sizeof(int)); + C->iwork_size = needed; + } + int *idx_A = C->iwork; + int *idx_B = C->iwork + max_s; + int s = sorted_intersect_indices(A->row_perm, A->m0, B->row_perm, B->m0, idx_A, + idx_B); + assert(s > 0); + + /* Gather the matching rows into A->dwork and B->dwork (space is sufficient + since A->dwork has at least space for A's full block, and we only need part + of it. Same comment applies to B->dwork). */ for (int k = 0; k < s; k++) { - memcpy(XA_sub + (size_t) k * dn_A, A->X + (size_t) idx_A[k] * dn_A, - (size_t) dn_A * sizeof(double)); - memcpy(XB_sub + (size_t) k * dn_B, B->X + (size_t) idx_B[k] * dn_B, - (size_t) dn_B * sizeof(double)); + memcpy(A->dwork + k * A->n0, A->X + idx_A[k] * A->n0, + A->n0 * sizeof(double)); + memcpy(B->dwork + k * B->n0, B->X + idx_B[k] * B->n0, + B->n0 * sizeof(double)); } - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, - XB_sub, dn_B, XA_sub, dn_A, 0.0, out->X, dn_A); + /* matmul on the gathered rows */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, B->n0, A->n0, s, 1.0, + B->dwork, B->n0, A->dwork, A->n0, 0.0, C->X, A->n0); +} + +void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, + const permuted_dense *B, permuted_dense *C) +{ + /* C may be empty if there is no overlap in row permutations of A and B */ + if (C->base.nnz == 0) + { + return; + } - free(XA_sub); - free(XB_sub); - free(idx_A); - free(idx_B); + /* d == NULL means plain BT @ A */ + if (d == NULL) + { + 
BTA_pd_pd_fill_values(A, B, C); + return; + } + + /* C = BT @ (DA) */ + permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); + permuted_dense_DA_fill_values(d, A, DA); + BTA_pd_pd_fill_values(DA, B, C); + free_matrix(&DA->base); } -Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B) +matrix *BTA_csr_pd_alloc(const CSR_matrix *A_csr, const permuted_dense *B) { /* Gather the union of columns appearing in A's rows at positions row_perm_B. Use a bitmap of size A_csr->n for O(nnz) collection. */ int p = A_csr->n; - char *seen = (char *) SP_CALLOC((size_t) p, sizeof(char)); + char *seen = (char *) SP_CALLOC(p, sizeof(char)); int s_A = 0; - for (int kk = 0; kk < B->dense_m; kk++) + for (int kk = 0; kk < B->m0; kk++) { int row = B->row_perm[kk]; for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) @@ -592,7 +637,7 @@ Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B) } } - int *col_active = (int *) SP_MALLOC((size_t) (s_A > 0 ? s_A : 1) * sizeof(int)); + int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int)); int idx = 0; for (int j = 0; j < p; j++) { @@ -602,22 +647,24 @@ Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B) } } - Matrix *out = new_permuted_dense(B->base.n, p, B->dense_n, s_A, - B->col_perm, col_active, NULL); + matrix *C = + new_permuted_dense(B->base.n, p, B->n0, s_A, B->col_perm, col_active, NULL); free(col_active); free(seen); - /* Persistent scratch for BTA_csr_pd_fill_values / BTDA_csr_pd_fill_values: - A_sub_dense (B->dense_m x s_A row-major doubles). Pre-zeroed; each fill - memsets only the slots it touches by re-zeroing the whole buffer. 
*/ - Permuted_Dense *out_pd = (Permuted_Dense *) out; - out_pd->gather_X_size = (size_t) B->dense_m * (size_t) s_A; - if (out_pd->gather_X_size > 0) + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = B->n0 * + s_A) to fit the gather buffer A_sub_dense used by BTA_csr_pd / + BTDA_csr_pd_fill_values: shape (B->m0, s_A) row-major. The dgemm + reads it as (B->m0, s_A), so size B->m0 * s_A doubles suffices. */ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = B->m0 * s_A; + if (gather_size > C_pd->dwork_size) { - out_pd->gather_X_scratch = - (double *) SP_CALLOC(out_pd->gather_X_size, sizeof(double)); + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); } - return out; + return C; } /* Note: when A_csr is a leaf-variable Jacobian (each row has a single entry @@ -625,53 +672,55 @@ Matrix *BTA_csr_pd_alloc(const CSR_Matrix *A_csr, const Permuted_Dense *B) the dgemm reduces to X_C = X_B^T — a pure transpose with no multiplication needed. A fast path can detect this and skip the dgemm; deferred until a workload shows the savings matter. */ -void BTA_csr_pd_fill_values(const CSR_Matrix *A_csr, const Permuted_Dense *B, - Permuted_Dense *out) +void BTA_csr_pd_fill_values(const CSR_matrix *A_csr, const permuted_dense *B, + permuted_dense *C) { - int dense_m = B->dense_m; - int dn_B = B->dense_n; - int s_A = out->dense_n; + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; - if (s_A == 0 || dense_m == 0) + if (s_A == 0 || m0 == 0) { /* Output dense block is empty; nothing to fill. */ return; } - /* Use out->col_inv (pre-built by new_permuted_dense) as col_inv_out and - out->gather_X_scratch as A_sub_dense; both are owned by out. */ - double *A_sub_dense = out->gather_X_scratch; - memset(A_sub_dense, 0, out->gather_X_size * sizeof(double)); + /* Use C->col_inv (pre-built by new_permuted_dense) as col_inv_out and + C->dwork as A_sub_dense; both are owned by C. 
dwork is sized at alloc + time to cover m0 * s_A; only that prefix is touched. */ + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); - for (int kk = 0; kk < dense_m; kk++) + for (int kk = 0; kk < m0; kk++) { int row = B->row_perm[kk]; for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) { int j = A_csr->i[e]; - int jj = out->col_inv[j]; + int jj = C->col_inv[j]; /* jj should always be valid (we built col_perm from these entries), but guard against asymmetry between alloc and fill calls. */ if (jj >= 0) { - A_sub_dense[(size_t) kk * s_A + jj] = A_csr->x[e]; + A_sub_dense[kk * s_A + jj] = A_csr->x[e]; } } } - /* out->X = X_B^T @ A_sub_dense */ - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, dense_m, - 1.0, B->X, dn_B, A_sub_dense, s_A, 0.0, out->X, s_A); + /* C->X = X_B^T @ A_sub_dense */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); } -Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr) +matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr) { /* Gather the union of columns appearing in B's rows at positions row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. */ int q = B_csr->n; - char *seen = (char *) SP_CALLOC((size_t) q, sizeof(char)); + char *seen = (char *) SP_CALLOC(q, sizeof(char)); int r_B = 0; - for (int kk = 0; kk < A->dense_m; kk++) + for (int kk = 0; kk < A->m0; kk++) { int row = A->row_perm[kk]; for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) @@ -685,7 +734,7 @@ Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr) } } - int *row_active = (int *) SP_MALLOC((size_t) (r_B > 0 ? r_B : 1) * sizeof(int)); + int *row_active = (int *) SP_MALLOC((r_B > 0 ? 
r_B : 1) * sizeof(int)); int idx = 0; for (int i = 0; i < q; i++) { @@ -695,21 +744,23 @@ Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr) } } - Matrix *out = new_permuted_dense(q, A->base.n, r_B, A->dense_n, - row_active, A->col_perm, NULL); + matrix *C = + new_permuted_dense(q, A->base.n, r_B, A->n0, row_active, A->col_perm, NULL); free(row_active); free(seen); - /* Persistent scratch for BTA_pd_csr_fill_values / BTDA_pd_csr_fill_values: - B_sub_dense (A->dense_m x r_B row-major doubles). */ - Permuted_Dense *out_pd = (Permuted_Dense *) out; - out_pd->gather_X_size = (size_t) A->dense_m * (size_t) r_B; - if (out_pd->gather_X_size > 0) + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = r_B * + A->n0) to fit the gather buffer B_sub_dense used by BTA_pd_csr / + BTDA_pd_csr_fill_values: shape (A->m0, r_B) row-major. */ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = A->m0 * r_B; + if (gather_size > C_pd->dwork_size) { - out_pd->gather_X_scratch = - (double *) SP_CALLOC(out_pd->gather_X_size, sizeof(double)); + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); } - return out; + return C; } /* Note: when B_csr is a leaf-variable Jacobian (each row has a single entry @@ -717,265 +768,162 @@ Matrix *BTA_pd_csr_alloc(const Permuted_Dense *A, const CSR_Matrix *B_csr) the dgemm reduces to X_C = X_A — a pure copy with no multiplication needed. A fast path can detect this and skip the dgemm; deferred until a workload shows the savings matter. 
*/ -void BTA_pd_csr_fill_values(const Permuted_Dense *A, const CSR_Matrix *B_csr, - Permuted_Dense *out) +void BTA_pd_csr_fill_values(const permuted_dense *A, const CSR_matrix *B_csr, + permuted_dense *C) { - int dense_m = A->dense_m; - int dn_A = A->dense_n; - int r_B = out->dense_m; + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; - if (r_B == 0 || dense_m == 0) + if (r_B == 0 || m0 == 0) { /* Output dense block is empty; nothing to fill. */ return; } - /* Use out->row_inv (pre-built by new_permuted_dense) as row_inv_out and - out->gather_X_scratch as B_sub_dense; both are owned by out. */ - double *B_sub_dense = out->gather_X_scratch; - memset(B_sub_dense, 0, out->gather_X_size * sizeof(double)); + /* Use C->row_inv (pre-built by new_permuted_dense) as row_inv_out and + C->dwork as B_sub_dense; both are owned by C. dwork is sized at alloc + time to cover m0 * r_B; only that prefix is touched. */ + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); - for (int kk = 0; kk < dense_m; kk++) + for (int kk = 0; kk < m0; kk++) { int row = A->row_perm[kk]; for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) { int i = B_csr->i[e]; - int ii = out->row_inv[i]; + int ii = C->row_inv[i]; if (ii >= 0) { - B_sub_dense[(size_t) kk * r_B + ii] = B_csr->x[e]; + B_sub_dense[kk * r_B + ii] = B_csr->x[e]; } } } - /* out->X = B_sub_dense^T @ X_A */ - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, dense_m, - 1.0, B_sub_dense, r_B, A->X, dn_A, 0.0, out->X, dn_A); + /* C->X = B_sub_dense^T @ X_A */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); } -/* BTDA variant of BTA_csr_pd: out->X = X_B^T diag(d) A_sub_dense. Folds d +/* BTDA variant of BTA_csr_pd: C->X = X_B^T diag(d) A_sub_dense. Folds d into the scatter step. 
*/ -void BTDA_csr_pd_fill_values(const CSR_Matrix *A_csr, const double *d, - const Permuted_Dense *B, Permuted_Dense *out) +void BTDA_csr_pd_fill_values(const CSR_matrix *A_csr, const double *d, + const permuted_dense *B, permuted_dense *C) { - int dense_m = B->dense_m; - int dn_B = B->dense_n; - int s_A = out->dense_n; + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; - if (s_A == 0 || dense_m == 0) + if (s_A == 0 || m0 == 0) { return; } - double *A_sub_dense = out->gather_X_scratch; - memset(A_sub_dense, 0, out->gather_X_size * sizeof(double)); + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); - for (int kk = 0; kk < dense_m; kk++) + for (int kk = 0; kk < m0; kk++) { int row = B->row_perm[kk]; double dk = d ? d[row] : 1.0; for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) { int j = A_csr->i[e]; - int jj = out->col_inv[j]; + int jj = C->col_inv[j]; if (jj >= 0) { - A_sub_dense[(size_t) kk * s_A + jj] = dk * A_csr->x[e]; + A_sub_dense[kk * s_A + jj] = dk * A_csr->x[e]; } } } - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, dense_m, - 1.0, B->X, dn_B, A_sub_dense, s_A, 0.0, out->X, s_A); + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); } -/* BTDA variant of BTA_pd_csr: out->X = B_sub_dense^T diag(d) X_A. Folds d +/* BTDA variant of BTA_pd_csr: C->X = B_sub_dense^T diag(d) X_A. Folds d into the scatter step. 
*/ -void BTDA_pd_csr_fill_values(const Permuted_Dense *A, const double *d, - const CSR_Matrix *B_csr, Permuted_Dense *out) +void BTDA_pd_csr_fill_values(const permuted_dense *A, const double *d, + const CSR_matrix *B_csr, permuted_dense *C) { - int dense_m = A->dense_m; - int dn_A = A->dense_n; - int r_B = out->dense_m; + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; - if (r_B == 0 || dense_m == 0) + if (r_B == 0 || m0 == 0) { return; } - double *B_sub_dense = out->gather_X_scratch; - memset(B_sub_dense, 0, out->gather_X_size * sizeof(double)); + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); - for (int kk = 0; kk < dense_m; kk++) + for (int kk = 0; kk < m0; kk++) { int row = A->row_perm[kk]; double dk = d ? d[row] : 1.0; for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) { int i = B_csr->i[e]; - int ii = out->row_inv[i]; + int ii = C->row_inv[i]; if (ii >= 0) { - B_sub_dense[(size_t) kk * r_B + ii] = dk * B_csr->x[e]; - } - } - } - - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, dense_m, - 1.0, B_sub_dense, r_B, A->X, dn_A, 0.0, out->X, dn_A); -} - -/* BTDA(PD, PD): out->X = X_B^T diag(d) X_A, restricted to row_perm_A ∩ - row_perm_B. When d == NULL, this is just permuted_dense_BTA_fill_values. - Otherwise we first row-scale A's X into A's Y_scratch by d, then run the - same intersect-and-gather logic as the BTA case using Y_scratch in place - of X_A. */ -void BTDA_pd_pd_fill_values(const Permuted_Dense *A, const double *d, - const Permuted_Dense *B, Permuted_Dense *out) -{ - if (d == NULL) - { - permuted_dense_BTA_fill_values(A, B, out); - return; - } - - int dn_A = A->dense_n; - int dn_B = B->dense_n; - int dense_m_A = A->dense_m; - - /* Build Y = diag(d_perm_A) X_A in A's Y_scratch (mutates only the - Y_scratch buffer, so const A is preserved in spirit). 
*/ - cblas_dcopy(dense_m_A * dn_A, A->X, 1, A->Y_scratch, 1); - for (int kk = 0; kk < dense_m_A; kk++) - { - cblas_dscal(dn_A, d[A->row_perm[kk]], A->Y_scratch + kk * dn_A, 1); - } - - /* Fast path: matching row_perms. One dgemm using Y_scratch as A. */ - int match = (A->dense_m == B->dense_m); - if (match) - { - for (int kk = 0; kk < A->dense_m; kk++) - { - if (A->row_perm[kk] != B->row_perm[kk]) - { - match = 0; - break; + B_sub_dense[kk * r_B + ii] = dk * B_csr->x[e]; } } } - if (match) - { - int s = A->dense_m; - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, - B->X, dn_B, A->Y_scratch, dn_A, 0.0, out->X, dn_A); - return; - } - - /* General path: intersect row_perm_A and row_perm_B, gather Y_scratch's - and B's matched rows, then dgemm. */ - int max_s = A->dense_m < B->dense_m ? A->dense_m : B->dense_m; - int *idx_A = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); - int *idx_B = (int *) SP_MALLOC((size_t) max_s * sizeof(int)); - int s = 0; - int ii = 0, jj = 0; - while (ii < A->dense_m && jj < B->dense_m) - { - int ra = A->row_perm[ii]; - int rb = B->row_perm[jj]; - if (ra == rb) - { - idx_A[s] = ii; - idx_B[s] = jj; - s++; - ii++; - jj++; - } - else if (ra < rb) - { - ii++; - } - else - { - jj++; - } - } - - if (s == 0) - { - memset(out->X, 0, - (size_t) out->dense_m * (size_t) out->dense_n * sizeof(double)); - free(idx_A); - free(idx_B); - return; - } - double *YA_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_A * sizeof(double)); - double *XB_sub = (double *) SP_MALLOC((size_t) s * (size_t) dn_B * sizeof(double)); - for (int k = 0; k < s; k++) - { - memcpy(YA_sub + (size_t) k * dn_A, A->Y_scratch + (size_t) idx_A[k] * dn_A, - (size_t) dn_A * sizeof(double)); - memcpy(XB_sub + (size_t) k * dn_B, B->X + (size_t) idx_B[k] * dn_B, - (size_t) dn_B * sizeof(double)); - } - - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, dn_A, s, 1.0, - XB_sub, dn_B, YA_sub, dn_A, 0.0, out->X, dn_A); - - free(YA_sub); - free(XB_sub); - 
free(idx_A); - free(idx_B); + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); } -Matrix *permuted_dense_times_csc_alloc(const Permuted_Dense *self, - const CSC_Matrix *J) +matrix *permuted_dense_times_csc_alloc(const permuted_dense *A, const CSC_matrix *J) { /* Active columns: those with at least one structural nonzero in a row - of col_perm_self. col_inv[r] != -1 iff r is in col_perm_self. */ - iVec *col_perm_out = iVec_new(8); + of col_perm_A. col_inv[r] != -1 iff r is in col_perm_A. */ + iVec *col_perm_C = iVec_new(8); for (int j = 0; j < J->n; j++) { for (int e = J->p[j]; e < J->p[j + 1]; e++) { - if (self->col_inv[J->i[e]] != -1) + if (A->col_inv[J->i[e]] != -1) { - iVec_append(col_perm_out, j); + iVec_append(col_perm_C, j); break; } } } - Matrix *M_out = - new_permuted_dense(self->base.m, J->n, self->dense_m, col_perm_out->len, - self->row_perm, col_perm_out->data, NULL); - iVec_free(col_perm_out); - return M_out; + matrix *C = new_permuted_dense(A->base.m, J->n, A->m0, col_perm_C->len, + A->row_perm, col_perm_C->data, NULL); + iVec_free(col_perm_C); + return C; } -void permuted_dense_times_csc_fill_values(const Permuted_Dense *self, - const CSC_Matrix *J, Permuted_Dense *out) +void permuted_dense_times_csc_fill_values(const permuted_dense *A, + const CSC_matrix *J, permuted_dense *C) { - int dense_m = self->dense_m; - int dense_n_self = self->dense_n; - int dense_n_out = out->dense_n; + int m0 = A->m0; + int n0_A = A->n0; + int n0_C = C->n0; - /* Each entry (r, val) of J in active columns with r in col_perm_self - contributes val * self.X[:, kk] to out.X[:, jj], where kk = col_inv[r] - and jj is the position of the column in col_perm_out. Columns of X - and out.X are accessed via row-major strides. 
*/ - memset(out->X, 0, dense_m * dense_n_out * sizeof(double)); - for (int jj = 0; jj < dense_n_out; jj++) + /* Each entry (r, val) of J in active columns with r in col_perm_A + contributes val * A.X[:, kk] to C.X[:, jj], where kk = col_inv[r] + and jj is the position of the column in col_perm_C. Columns of X + and C.X are accessed via row-major strides. */ + memset(C->X, 0, m0 * n0_C * sizeof(double)); + for (int jj = 0; jj < n0_C; jj++) { - int col = out->col_perm[jj]; + int col = C->col_perm[jj]; for (int e = J->p[col]; e < J->p[col + 1]; e++) { - int kk = self->col_inv[J->i[e]]; + int kk = A->col_inv[J->i[e]]; if (kk == -1) continue; - cblas_daxpy(dense_m, J->x[e], self->X + kk, dense_n_self, out->X + jj, - dense_n_out); + cblas_daxpy(m0, J->x[e], A->X + kk, n0_A, C->X + jj, n0_C); } } } diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index d4fdc7c..6a83bc1 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -15,7 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/CSC_Matrix.h" +#include "utils/sparse_matrix.h" + +#include "utils/CSC_matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/matrix.h" #include "utils/mini_numpy.h" @@ -23,30 +25,30 @@ #include #include -static void sparse_block_left_mult_vec(const Matrix *self, const double *x, +static void sparse_block_left_mult_vec(const matrix *self, const double *x, double *y, int p) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; block_left_multiply_vec(sm->csr, x, y, p); } -static CSC_Matrix *sparse_block_left_mult_sparsity(const Matrix *self, - const CSC_Matrix *J, int p) +static CSC_matrix *sparse_block_left_mult_sparsity(const matrix *self, + const CSC_matrix *J, int p) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; return block_left_multiply_fill_sparsity(sm->csr, J, p); } -static void sparse_block_left_mult_values(const Matrix *self, const CSC_Matrix *J, - CSC_Matrix *C) +static void sparse_block_left_mult_values(const matrix *self, const CSC_matrix *J, + CSC_matrix *C) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; block_left_multiply_fill_values(sm->csr, J, C); } -static void sparse_free(Matrix *self) +static void sparse_free(matrix *self) { - Sparse_Matrix *sm = (Sparse_Matrix *) self; + sparse_matrix *sm = (sparse_matrix *) self; free_csr_matrix(sm->csr); free_csc_matrix(sm->csc_cache); free(sm->csc_iwork); @@ -54,61 +56,61 @@ static void sparse_free(Matrix *self) } /* Forward decl: ctor is referenced by copy_sparsity below. */ -Matrix *new_sparse_matrix(CSR_Matrix *A); +matrix *new_sparse_matrix(CSR_matrix *A); -/* Build the CSC cache structure if absent. Values are NOT filled here; caller +/* Build the CSC_matrix cache structure if absent. Values are NOT filled here; caller must call refresh_csc_values before consuming. 
ATA_alloc only needs structure, so it's safe to call after build_csc_structure alone. */ -static void build_csc_structure_if_absent(Sparse_Matrix *sm) +static void build_csc_structure_if_absent(sparse_matrix *sm) { if (sm->csc_cache != NULL) return; sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); sm->csc_cache = csr_to_csc_alloc(sm->csr, sm->csc_iwork); } -static Matrix *sparse_copy_sparsity(const Matrix *self) +static matrix *sparse_copy_sparsity(const matrix *self) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; return new_sparse_matrix(new_csr_copy_sparsity(sm->csr)); } -static void sparse_DA_fill_values(const double *d, const Matrix *self, Matrix *out) +static void sparse_DA_fill_values(const double *d, const matrix *self, matrix *out) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; - Sparse_Matrix *sm_out = (Sparse_Matrix *) out; + const sparse_matrix *sm = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; DA_fill_values(d, sm->csr, sm_out->csr); } -static Matrix *sparse_ATA_alloc(Matrix *self) +static matrix *sparse_ATA_alloc(matrix *self) { - Sparse_Matrix *sm = (Sparse_Matrix *) self; + sparse_matrix *sm = (sparse_matrix *) self; build_csc_structure_if_absent(sm); return new_sparse_matrix(ATA_alloc(sm->csc_cache)); } /* Caller must have called refresh_csc_values since the last change to csr->x. 
*/ -static void sparse_ATDA_fill_values(const Matrix *self, const double *d, Matrix *out) +static void sparse_ATDA_fill_values(const matrix *self, const double *d, matrix *out) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; - Sparse_Matrix *sm_out = (Sparse_Matrix *) out; + const sparse_matrix *sm = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; ATDA_fill_values(sm->csc_cache, d, sm_out->csr); } -static CSR_Matrix *sparse_to_csr(Matrix *self) +static CSR_matrix *sparse_to_csr(matrix *self) { - return ((Sparse_Matrix *) self)->csr; + return ((sparse_matrix *) self)->csr; } -static struct Permuted_Dense *sparse_as_permuted_dense(Matrix *self) +static struct permuted_dense *sparse_as_permuted_dense(matrix *self) { (void) self; return NULL; } -static Matrix *sparse_index_alloc(Matrix *self, const int *indices, int n_idxs) +static matrix *sparse_index_alloc(matrix *self, const int *indices, int n_idxs) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; - CSR_Matrix *J = new_csr_matrix(n_idxs, self->n, Jx->nnz); + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = new_csr_matrix(n_idxs, self->n, Jx->nnz); J->p[0] = 0; for (int i = 0; i < n_idxs; i++) @@ -122,11 +124,11 @@ static Matrix *sparse_index_alloc(Matrix *self, const int *indices, int n_idxs) return new_sparse_matrix(J); } -static void sparse_index_fill_values(Matrix *self, const int *indices, int n_idxs, - Matrix *out) +static void sparse_index_fill_values(matrix *self, const int *indices, int n_idxs, + matrix *out) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; - CSR_Matrix *J = ((Sparse_Matrix *) out)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = ((sparse_matrix *) out)->csr; for (int i = 0; i < n_idxs; i++) { int len = J->p[i + 1] - J->p[i]; @@ -134,11 +136,11 @@ static void sparse_index_fill_values(Matrix *self, const int *indices, int n_idx } } -static Matrix *sparse_promote_alloc(Matrix *self, int size) +static 
matrix *sparse_promote_alloc(matrix *self, int size) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int row_nnz = Jx->nnz; - CSR_Matrix *J = new_csr_matrix(size, self->n, size * row_nnz); + CSR_matrix *J = new_csr_matrix(size, self->n, size * row_nnz); for (int row = 0; row < size; row++) { @@ -150,9 +152,9 @@ static Matrix *sparse_promote_alloc(Matrix *self, int size) return new_sparse_matrix(J); } -static void sparse_promote_fill_values(Matrix *self, Matrix *out) +static void sparse_promote_fill_values(matrix *self, matrix *out) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int row_nnz = Jx->nnz; for (int row = 0; row < out->m; row++) { @@ -160,10 +162,10 @@ static void sparse_promote_fill_values(Matrix *self, Matrix *out) } } -static Matrix *sparse_broadcast_alloc(Matrix *self, broadcast_type type, int d1, +static matrix *sparse_broadcast_alloc(matrix *self, broadcast_type type, int d1, int d2) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int out_m = d1 * d2; int total_nnz; if (type == BROADCAST_ROW) @@ -179,7 +181,7 @@ static Matrix *sparse_broadcast_alloc(Matrix *self, broadcast_type type, int d1, total_nnz = Jx->nnz * out_m; } - CSR_Matrix *J = new_csr_matrix(out_m, self->n, total_nnz); + CSR_matrix *J = new_csr_matrix(out_m, self->n, total_nnz); if (type == BROADCAST_ROW) { @@ -224,10 +226,10 @@ static Matrix *sparse_broadcast_alloc(Matrix *self, broadcast_type type, int d1, return new_sparse_matrix(J); } -static void sparse_broadcast_fill_values(Matrix *self, broadcast_type type, int d1, - int d2, Matrix *out) +static void sparse_broadcast_fill_values(matrix *self, broadcast_type type, int d1, + int d2, matrix *out) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; if (type == BROADCAST_ROW) { int acc = 0; @@ -248,12 +250,12 @@ static void 
sparse_broadcast_fill_values(Matrix *self, broadcast_type type, int } } -static Matrix *sparse_diag_vec_alloc(Matrix *self) +static matrix *sparse_diag_vec_alloc(matrix *self) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int n = self->m; int out_m = n * n; - CSR_Matrix *J = new_csr_matrix(out_m, self->n, Jx->nnz); + CSR_matrix *J = new_csr_matrix(out_m, self->n, Jx->nnz); int nnz = 0; int next_diag = 0; @@ -274,10 +276,10 @@ static Matrix *sparse_diag_vec_alloc(Matrix *self) return new_sparse_matrix(J); } -static void sparse_diag_vec_fill_values(Matrix *self, Matrix *out) +static void sparse_diag_vec_fill_values(matrix *self, matrix *out) { - CSR_Matrix *Jx = ((Sparse_Matrix *) self)->csr; - CSR_Matrix *J = ((Sparse_Matrix *) out)->csr; + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = ((sparse_matrix *) out)->csr; int n = self->m; for (int i = 0; i < n; i++) { @@ -287,15 +289,15 @@ static void sparse_diag_vec_fill_values(Matrix *self, Matrix *out) } } -/* Build CSC structure on first call; refill values from csr->x on every call. */ -static void sparse_refresh_csc_values(Matrix *self) +/* Build CSC_matrix structure on first call; refill values from csr->x on every call. 
*/ +static void sparse_refresh_csc_values(matrix *self) { - Sparse_Matrix *sm = (Sparse_Matrix *) self; + sparse_matrix *sm = (sparse_matrix *) self; build_csc_structure_if_absent(sm); csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); } -static void wire_vtable(Sparse_Matrix *sm) +static void wire_vtable(sparse_matrix *sm) { sm->base.block_left_mult_vec = sparse_block_left_mult_vec; sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; @@ -318,9 +320,9 @@ static void wire_vtable(Sparse_Matrix *sm) sm->base.free_fn = sparse_free; } -Matrix *new_sparse_matrix(CSR_Matrix *A) +matrix *new_sparse_matrix(CSR_matrix *A) { - Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); + sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); sm->base.m = A->m; sm->base.n = A->n; sm->base.nnz = A->nnz; @@ -330,15 +332,15 @@ Matrix *new_sparse_matrix(CSR_Matrix *A) return &sm->base; } -Matrix *new_sparse_matrix_alloc(int m, int n, int nnz) +matrix *new_sparse_matrix_alloc(int m, int n, int nnz) { return new_sparse_matrix(new_csr_matrix(m, n, nnz)); } -Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork) +matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork) { - CSR_Matrix *AT = transpose(self->csr, iwork); - Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); + CSR_matrix *AT = transpose(self->csr, iwork); + sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); sm->base.m = AT->m; sm->base.n = AT->n; sm->base.nnz = AT->nnz; diff --git a/src/utils/utils.c b/src/utils/utils.c index 5163b3c..9d504a1 100644 --- a/src/utils/utils.c +++ b/src/utils/utils.c @@ -30,3 +30,22 @@ void sort_int_array(int *array, int size) { qsort(array, size, sizeof(int), compare_int_asc); } + +bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len, + int b_offset) +{ + int ai = 0, bi = 0; + while (ai < a_len && bi < b_len) + { + if (a_idx[ai] == 
b_idx[bi] - b_offset) return true; + if (a_idx[ai] < b_idx[bi] - b_offset) + { + ai++; + } + else + { + bi++; + } + } + return false; +} diff --git a/tests/all_tests.c b/tests/all_tests.c index efb1c95..f6cd7d8 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -379,6 +379,7 @@ int main(void) mu_run_test(test_permuted_dense_broadcast_col, tests_run); mu_run_test(test_permuted_dense_diag_vec, tests_run); mu_run_test(test_permuted_dense_BTA_matching_row_perm, tests_run); + mu_run_test(test_permuted_dense_BTA_empty_overlap, tests_run); mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); mu_run_test(test_BTA_csr_pd_basic, tests_run); diff --git a/tests/forward_pass/affine/test_diag_mat.h b/tests/forward_pass/affine/test_diag_mat.h index cb20a05..d5703ac 100644 --- a/tests/forward_pass/affine/test_diag_mat.h +++ b/tests/forward_pass/affine/test_diag_mat.h @@ -8,7 +8,7 @@ const char *test_diag_mat_forward(void) { /* 3x3 matrix variable (column-major): [1,2,3,4,5,6,7,8,9] - * Matrix: 1 4 7 + * matrix: 1 4 7 * 2 5 8 * 3 6 9 * Diagonal: (0,0)=1, (1,1)=5, (2,2)=9 */ diff --git a/tests/forward_pass/affine/test_linear_op.h b/tests/forward_pass/affine/test_linear_op.h index 801a403..d235cd7 100644 --- a/tests/forward_pass/affine/test_linear_op.h +++ b/tests/forward_pass/affine/test_linear_op.h @@ -11,14 +11,14 @@ const char *test_linear_op(void) { - /* create CSR matrix + /* create CSR_matrix matrix A = [0 0 2 3 0 0] [0 0 1 0 2 0] [0 0 3 4 5 0] */ double Ax[7] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[7] = {2, 3, 2, 4, 2, 3, 4}; int Ap[4] = {0, 2, 4, 7}; - CSR_Matrix *A = new_csr_matrix(3, 6, 7); + CSR_matrix *A = new_csr_matrix(3, 6, 7); memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); diff --git a/tests/forward_pass/affine/test_upper_tri.h b/tests/forward_pass/affine/test_upper_tri.h index 056c3a5..3db43e8 100644 --- 
a/tests/forward_pass/affine/test_upper_tri.h +++ b/tests/forward_pass/affine/test_upper_tri.h @@ -8,7 +8,7 @@ const char *test_upper_tri_forward_4x4(void) { /* 4x4 matrix variable (column-major): [1..16] - * Matrix: 1 5 9 13 + * matrix: 1 5 9 13 * 2 6 10 14 * 3 7 11 15 * 4 8 12 16 diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h index ed8ffb8..0eb4db1 100644 --- a/tests/jacobian_tests/affine/test_convolve.h +++ b/tests/jacobian_tests/affine/test_convolve.h @@ -18,7 +18,7 @@ const char *test_jacobian_convolve(void) * [3, 2, 1], * [0, 3, 2], * [0, 0, 3]] - * stored in CSR with nnz = 9, shape 5 x 3. */ + * stored in CSR_matrix with nnz = 9, shape 5 x 3. */ double kernel[3] = {1.0, 2.0, 3.0}; expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel); expr *x = new_variable(3, 1, 0, 3); diff --git a/tests/jacobian_tests/affine/test_diag_mat.h b/tests/jacobian_tests/affine/test_diag_mat.h index eb42b75..9b669a0 100644 --- a/tests/jacobian_tests/affine/test_diag_mat.h +++ b/tests/jacobian_tests/affine/test_diag_mat.h @@ -10,7 +10,7 @@ const char *test_diag_mat_jacobian_variable(void) { /* diag_mat of a 2x2 variable (4 vars total) * Diagonal indices in column-major: [0, 3] - * Jacobian is 2x4 CSR: row 0 has col 0, row 1 has col 3 */ + * Jacobian is 2x4 CSR_matrix: row 0 has col 0, row 1 has col 3 */ double u[4] = {1.0, 2.0, 3.0, 4.0}; expr *var = new_variable(2, 2, 0, 4); expr *dm = new_diag_mat(var); diff --git a/tests/jacobian_tests/affine/test_index.h b/tests/jacobian_tests/affine/test_index.h index 34a316a..eeda8c2 100644 --- a/tests/jacobian_tests/affine/test_index.h +++ b/tests/jacobian_tests/affine/test_index.h @@ -53,7 +53,7 @@ const char *test_index_jacobian_of_variable(void) /* Jacobian is 2x3 with pattern: row 0 selects col 0, row 1 selects col 2 */ double expected_x[2] = {1.0, 1.0}; - int expected_p[3] = {0, 1, 2}; /* CSR row ptrs */ + int expected_p[3] = {0, 1, 2}; /* CSR_matrix row ptrs */ int 
expected_i[2] = {0, 2}; /* column indices */ mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index c98aca7..85ee558 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -23,7 +23,7 @@ const char *test_jacobian_left_matmul_log(void) * [5, 0, 2 ] * [7, 0, 0 ] * - * Stored in CSR format (4x3 sparse): + * Stored in CSR_matrix format (4x3 sparse): * nnz = 7 * p = [0, 2, 4, 6, 7] * i = [0, 2, 0, 2, 0, 2, 0] @@ -32,8 +32,8 @@ const char *test_jacobian_left_matmul_log(void) double x_vals[3] = {1.0, 2.0, 3.0}; expr *x = new_variable(3, 1, 0, 3); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format */ + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -76,8 +76,8 @@ const char *test_jacobian_left_matmul_log_matrix(void) double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; expr *x = new_variable(3, 2, 0, 6); - /* Create sparse matrix A in CSR format (4x3) */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format (4x3) */ + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -116,7 +116,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_Matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_csr_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -125,7 +125,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix */ 
- CSR_Matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1, 2, 3, 4, 5, 6, 7}; diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index f2b40f0..0925857 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -17,8 +17,8 @@ const char *test_jacobian_right_matmul_log(void) double x_vals[4] = {1.0, 3.0, 2.0, 4.0}; // column-wise vectorization expr *x = new_variable(2, 2, 0, 4); - /* Create sparse matrix A in CSR format (2x3) */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + /* Create sparse matrix A in CSR_matrix format (2x3) */ + CSR_matrix *A = new_csr_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -66,8 +66,8 @@ const char *test_jacobian_right_matmul_log_vector(void) double x_vals[3] = {1.0, 2.0, 3.0}; expr *x = new_variable(1, 3, 0, 3); - /* Create sparse matrix A in CSR format (3x2) */ - CSR_Matrix *A = new_csr_matrix(3, 2, 4); + /* Create sparse matrix A in CSR_matrix format (3x2) */ + CSR_matrix *A = new_csr_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index 523cb2e..2d16837 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -11,7 +11,7 @@ const char *test_jacobian_transpose(void) { // A = [1 2; 3 4] - CSR_Matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_csr_matrix(2, 2, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 1, 0, 1}; double A_x[4] = {1, 2, 3, 4}; diff --git a/tests/jacobian_tests/affine/test_upper_tri.h b/tests/jacobian_tests/affine/test_upper_tri.h index c4a22a1..82d7419 100644 --- a/tests/jacobian_tests/affine/test_upper_tri.h 
+++ b/tests/jacobian_tests/affine/test_upper_tri.h @@ -10,7 +10,7 @@ const char *test_upper_tri_jacobian_variable(void) { /* upper_tri of a 4x4 variable (16 vars total) * Row-major upper tri indices: [4, 8, 12, 9, 13, 14] - * Jacobian is 6x16 CSR: row k has a single 1.0 at col indices[k] */ + * Jacobian is 6x16 CSR_matrix: row k has a single 1.0 at col indices[k] */ double u[16]; for (int k = 0; k < 16; k++) { diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index d300d3b..35304c5 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -64,7 +64,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_Matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_csr_matrix(3, 10, 9); double Ax_vals[9] = {1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; @@ -78,7 +78,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 0 0 0 0 0 1 -2 1] */ - CSR_Matrix *B = new_csr_matrix(3, 10, 9); + CSR_matrix *B = new_csr_matrix(3, 10, 9); double Bx_vals[9] = {1.0, 3.0, 1.0, 1.0, 4.0, 1.0, -2.0, 1.0}; int Bi[9] = {7, 8, 7, 8, 9, 7, 8, 9}; int Bp[4] = {0, 2, 5, 8}; @@ -128,7 +128,7 @@ const char *test_jacobian_elementwise_mult_4(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_Matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_csr_matrix(3, 10, 9); double Ax_vals[9] = {1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index 7889075..12accf1 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ 
b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -63,7 +63,7 @@ const char *test_quad_over_lin3(void) // A = [0 0 1 2 3 0 0 0 // 0 0 4 5 6 0 0] - CSR_Matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A = new_csr_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -103,7 +103,7 @@ const char *test_quad_over_lin4(void) // 0 0 0 0 0 4 5 6 // - CSR_Matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A = new_csr_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {5, 6, 7, 5, 6, 7}; int Ap[3] = {0, 3, 6}; @@ -143,7 +143,7 @@ const char *test_quad_over_lin5(void) // 0 2 0 0 0 4 5 6 // - CSR_Matrix *A = new_csr_matrix(2, 8, 9); + CSR_matrix *A = new_csr_matrix(2, 8, 9); double Ax[9] = {1, 3, 1.0, 2.0, 3.0, 2, 4.0, 5.0, 6.0}; int Ai[9] = {0, 3, 5, 6, 7, 1, 5, 6, 7}; int Ap[3] = {0, 5, 9}; diff --git a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h index bb7a23c..49eb8b3 100644 --- a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h +++ b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h @@ -6,7 +6,7 @@ #include "minunit.h" #include "numerical_diff.h" #include "test_helpers.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" const char *test_jacobian_exp_sum(void) { @@ -79,8 +79,8 @@ const char *test_jacobian_Ax_Bx_multiply(void) /* the first and last values are not used, but good to include them in test */ double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *x = new_variable(2, 1, 1, 4); expr *Ax = new_left_matmul(NULL, x, A); expr *Bx = new_left_matmul(NULL, x, B); @@ -99,8 +99,8 @@ const char *test_jacobian_AX_BX_multiply(void) { double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = 
new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 4); expr *AX = new_left_matmul(NULL, X, A); expr *BX = new_left_matmul(NULL, X, B); @@ -120,10 +120,10 @@ const char *test_jacobian_quad_form_Ax(void) /* (Ax)^T Q (Ax) where Q is symmetric */ double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -151,7 +151,7 @@ const char *test_jacobian_quad_form_exp(void) double u_vals[3] = {0.5, 1.0, 1.5}; /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -212,8 +212,8 @@ const char *test_jacobian_matmul_Ax_By(void) /* Z = (A @ X) @ (B @ Y) with constant matrices A, B */ double u_vals[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - CSR_Matrix *A = new_csr_random(3, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(3, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(2, 2, 0, 10); /* 2x2, vars 0-3 */ expr *Y = new_variable(3, 2, 4, 10); /* 3x2, vars 4-9 */ @@ -235,8 +235,8 @@ const char *test_jacobian_matmul_sin_Ax_cos_Bx(void) /* Z = sin(A @ X) @ cos(B @ X), shared variable X */ double u_vals[6] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0}; - CSR_Matrix *A = new_csr_random(2, 3, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(2, 3, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(3, 2, 0, 6); /* 3x2, vars 0-5 */ expr *AX = new_left_matmul(NULL, X, A); 
/* 2x2 */ diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index 5af53bd..b83d32b 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -11,7 +11,7 @@ const char *test_jacobian_composite_exp(void) { double u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_Matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_csr_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -45,7 +45,7 @@ const char *test_jacobian_composite_exp_add(void) { double u_vals[7] = {0, 0, 1, 1, 1, 2, 2}; - CSR_Matrix *A = new_csr_matrix(3, 7, 9); + CSR_matrix *A = new_csr_matrix(3, 7, 9); double Ax[9] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; int Ai[9] = {2, 3, 4, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 3, 6, 9}; @@ -53,7 +53,7 @@ const char *test_jacobian_composite_exp_add(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 7, 6); + CSR_matrix *B = new_csr_matrix(3, 7, 6); double Bx[6] = {1, 1, 2, 2, 3, 3}; int Bi[6] = {5, 6, 5, 6, 5, 6}; int Bp[4] = {0, 2, 4, 6}; diff --git a/tests/jacobian_tests/other/test_prod_axis_one.h b/tests/jacobian_tests/other/test_prod_axis_one.h index 41cd7c8..8310f86 100644 --- a/tests/jacobian_tests/other/test_prod_axis_one.h +++ b/tests/jacobian_tests/other/test_prod_axis_one.h @@ -37,7 +37,7 @@ const char *test_jacobian_prod_axis_one(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x10 Jacobian with row-strided structure */ + /* CSR_matrix format for 3x10 Jacobian with row-strided structure */ double expected_Ax[9] = {28.0, 7.0, 4.0, 40.0, 16.0, 10.0, 54.0, 27.0, 18.0}; int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; @@ -80,7 +80,7 @@ const char *test_jacobian_prod_axis_one_one_zero(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x10 Jacobian with row-strided 
structure */ + /* CSR_matrix format for 3x10 Jacobian with row-strided structure */ double expected_Ax[9] = {28.0, 7.0, 4.0, 0.0, 16.0, 0.0, 54.0, 27.0, 18.0}; int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; diff --git a/tests/jacobian_tests/other/test_prod_axis_zero.h b/tests/jacobian_tests/other/test_prod_axis_zero.h index 8db93d6..519a40c 100644 --- a/tests/jacobian_tests/other/test_prod_axis_zero.h +++ b/tests/jacobian_tests/other/test_prod_axis_zero.h @@ -31,7 +31,7 @@ const char *test_jacobian_prod_axis_zero(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x8 Jacobian with block diagonal structure */ + /* CSR_matrix format for 3x8 Jacobian with block diagonal structure */ double expected_Ax[6] = {2.0, 1.0, 4.0, 3.0, 6.0, 5.0}; int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {1, 2, 3, 4, 5, 6}; diff --git a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index d420751..1a224fe 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -14,7 +14,7 @@ const char *test_quad_form(void) // Q = [1 2 0; 2 3 0; 0 0 4] double u_vals[5] = {0, 0, 1, 2, 3}; expr *x = new_variable(3, 1, 2, 5); - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -51,7 +51,7 @@ src/other/quad_form.c. 
const char *test_quad_form2(void) // 1 0 0 2 0 1] double u_vals[6] = {1, 2, 3, 4, 5, 6}; expr *u = new_variable(6, 1, 0, 6); -CSR_Matrix *Q = new_csr_matrix(3, 3, 5); +CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -59,7 +59,7 @@ memcpy(Q->x, Qx, 5 * sizeof(double)); memcpy(Q->i, Qi, 5 * sizeof(int)); memcpy(Q->p, Qp, 4 * sizeof(int)); -CSR_Matrix *A = new_csr_matrix(3, 6, 10); +CSR_matrix *A = new_csr_matrix(3, 6, 10); double Ax[10] = {1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6, 1.0, 2.0, 1.0}; int Ai[10] = {0, 2, 3, 4, 2, 3, 4, 0, 3, 5}; int Ap[4] = {0, 4, 7, 10}; diff --git a/tests/numerical_diff.c b/tests/numerical_diff.c index 3cce811..b79547d 100644 --- a/tests/numerical_diff.c +++ b/tests/numerical_diff.c @@ -13,7 +13,7 @@ static int is_close(double a, double b) return fabs(a - b) <= fmax(ABS_TOL, REL_TOL * fmax(fabs(a), fabs(b))); } -static void csr_to_dense(const CSR_Matrix *A, double *dense) +static void csr_to_dense(const CSR_matrix *A, double *dense) { for (int row = 0; row < A->m; row++) { @@ -98,9 +98,9 @@ int check_jacobian_num(expr *node, const double *u, double h) return result; } -/* Compute g = J^T w where J is CSR (m x n) and w has m entries. +/* Compute g = J^T w where J is CSR_matrix (m x n) and w has m entries. * Result written into g (size n), which must be zero-initialized. */ -static void csr_transpose_mult_vec(const CSR_Matrix *J, const double *w, double *g) +static void csr_transpose_mult_vec(const CSR_matrix *J, const double *w, double *g) { for (int row = 0; row < J->m; row++) { @@ -127,11 +127,11 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double memcpy(u_work, u, n * sizeof(double)); - /* Hoist the CSR view once. For Sparse_Matrix (the only type used by tests + /* Hoist the CSR_matrix view once. 
For sparse_matrix (the only type used by tests that reach here), csr->x aliases node->jacobian->x, so eval_jacobian writes inside the loop update jac->x in place. A PD-backed Jacobian would need a per-iteration to_csr refresh; not exercised today. */ - CSR_Matrix *jac = node->jacobian->to_csr(node->jacobian); + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); for (int j = 0; j < n; j++) { diff --git a/tests/numerical_diff/test_numerical_diff.h b/tests/numerical_diff/test_numerical_diff.h index 8af633d..e98616d 100644 --- a/tests/numerical_diff/test_numerical_diff.h +++ b/tests/numerical_diff/test_numerical_diff.h @@ -9,7 +9,7 @@ const char *test_check_jacobian_composite_exp(void) { double u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_Matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_csr_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -36,7 +36,7 @@ const char *test_check_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_Matrix *A_csr = new_csr_matrix(3, 5, 15); + CSR_matrix *A_csr = new_csr_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); diff --git a/tests/problem/test_problem.h b/tests/problem/test_problem.h index fbb512f..d245cfa 100644 --- a/tests/problem/test_problem.h +++ b/tests/problem/test_problem.h @@ -175,7 +175,7 @@ const char *test_problem_jacobian(void) problem_constraint_forward(prob, u); problem_jacobian(prob); - CSR_Matrix *jac = prob->jacobian; + CSR_matrix *jac = prob->jacobian; /* Check dimensions */ mu_assert("jac rows wrong", jac->m == 2); @@ -239,7 +239,7 @@ const char *test_problem_jacobian_multi(void) problem_constraint_forward(prob, u); problem_jacobian(prob); - CSR_Matrix *jac = prob->jacobian; + CSR_matrix *jac = prob->jacobian; /* Check 
dimensions: 4 rows (2 + 2), 2 cols */ mu_assert("jac rows wrong", jac->m == 4); @@ -322,7 +322,7 @@ const char *test_problem_hessian(void) double w_obj = 2.0; problem_hessian(prob, w_obj, w); - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H = prob->lagrange_hessian; /* Check dimensions: 3x3 symmetric */ mu_assert("H rows wrong", H->m == 3); diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index 4644a41..df6d96b 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -13,10 +13,10 @@ const char *profile_left_matmul(void) { - /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ + /* A @ X where A is 50 x 50 dense stored in CSR_matrix and X is 50 x 50 variable */ int n = 100; expr *X = new_variable(n, n, 0, n * n); - CSR_Matrix *A = new_csr_matrix(n, n, n * n); + CSR_matrix *A = new_csr_matrix(n, n, n * n); for (int i = 0; i < n * n; i++) { A->x[i] = 1.0; /* dense matrix of all ones */ diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index 779af75..e89cf0c 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -16,9 +16,9 @@ /* Profile and validate Jacobian + Hessian of obj = sum(logistic(A x)). - Path A: the engine's expression DAG (CSR/CSC chain rule). - Path B: hardcoded chain rule using Permuted_Dense kernels for the dense - linear algebra (DA and ATDA), plus the engine's CSR row-sum + Path A: the engine's expression DAG (CSR_matrix/CSC_matrix chain rule). + Path B: hardcoded chain rule using permuted_dense kernels for the dense + linear algebra (DA and ATDA), plus the engine's CSR_matrix row-sum primitives for J_sum. Forward pass is excluded from timing. 
*/ @@ -69,19 +69,19 @@ const char *profile_log_reg(void) for (int i = 0; i < m; i++) full_rows[i] = i; for (int j = 0; j < n; j++) full_cols[j] = j; - Matrix *A_pd_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, A_data); - Permuted_Dense *A_pd = (Permuted_Dense *) A_pd_M; - Matrix *Jlog_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, NULL); - Permuted_Dense *Jlog_pd = (Permuted_Dense *) Jlog_M; - Matrix *H_pd_M = permuted_dense_ATA_alloc(A_pd); - Permuted_Dense *H_pd = (Permuted_Dense *) H_pd_M; + matrix *A_pd_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, A_data); + permuted_dense *A_pd = (permuted_dense *) A_pd_M; + matrix *Jlog_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, NULL); + permuted_dense *Jlog_pd = (permuted_dense *) Jlog_M; + matrix *H_pd_M = permuted_dense_ATA_alloc(A_pd); + permuted_dense *H_pd = (permuted_dense *) H_pd_M; free(full_rows); free(full_cols); - /* CSR scaffolding for the row-sum step (PD owns the cached CSR view). */ - CSR_Matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); - CSR_Matrix *Jobj_csr = new_csr_matrix(1, n, n); + /* CSR_matrix scaffolding for the row-sum step (PD owns the cached CSR_matrix view). 
*/ + CSR_matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); + CSR_matrix *Jobj_csr = new_csr_matrix(1, n, n); int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); int *idx_map = (int *) malloc((size_t) m * n * sizeof(int)); sum_all_rows_csr_alloc(Jlog_csr, Jobj_csr, iwork, idx_map); @@ -110,16 +110,16 @@ const char *profile_log_reg(void) printf("\n"); printf(" Jacobian Hessian Total\n"); - printf(" Path A (engine CSR/CSC): %10.6fs %10.6fs %10.6fs\n", sec_a_jac, + printf(" Path A (engine CSR_matrix/CSC_matrix): %10.6fs %10.6fs %10.6fs\n", sec_a_jac, sec_a_hess, sec_a_jac + sec_a_hess); - printf(" Path B (Permuted_Dense): %10.6fs %10.6fs %10.6fs\n", sec_b_jac, + printf(" Path B (permuted_dense): %10.6fs %10.6fs %10.6fs\n", sec_b_jac, sec_b_hess, sec_b_jac + sec_b_hess); printf(" Speedup (A / B): %10.2fx %10.2fx %10.2fx\n", sec_a_jac / sec_b_jac, sec_a_hess / sec_b_hess, (sec_a_jac + sec_a_hess) / (sec_b_jac + sec_b_hess)); /* ---- Compare Jacobian (1 x n, both have full sparsity) ---- */ - CSR_Matrix *J_a = obj->jacobian->to_csr(obj->jacobian); + CSR_matrix *J_a = obj->jacobian->to_csr(obj->jacobian); mu_assert("J n mismatch", J_a->n == Jobj_csr->n); mu_assert("J nnz mismatch", J_a->nnz == Jobj_csr->nnz); double max_J_diff = 0.0; @@ -131,12 +131,12 @@ const char *profile_log_reg(void) printf(" Jacobian max abs diff: %10.3e\n", max_J_diff); mu_assert("Jacobian mismatch", max_J_diff < 1e-10); - /* ---- Compare Hessian (n x n): scatter Path A's CSR into a dense + /* ---- Compare Hessian (n x n): scatter Path A's CSR_matrix into a dense n x n array, compare to H_pd->X (already dense row-major). - Extract the CSR view ONCE: PD's to_csr does an O(dense_m * dense_n) + Extract the CSR_matrix view ONCE: PD's to_csr does an O(m0 * n0) memcpy refresh per call, so calling it inside the inner loop is quadratically expensive. 
---- */ - CSR_Matrix *H_a = obj->wsum_hess->to_csr(obj->wsum_hess); + CSR_matrix *H_a = obj->wsum_hess->to_csr(obj->wsum_hess); double *H_a_dense = (double *) calloc((size_t) n * n, sizeof(double)); for (int i = 0; i < n; i++) { diff --git a/tests/test_helpers.c b/tests/test_helpers.c index e00d3c7..43b3707 100644 --- a/tests/test_helpers.c +++ b/tests/test_helpers.c @@ -4,7 +4,7 @@ #include #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix.h" #define EPSILON 1e-7 @@ -45,7 +45,7 @@ int cmp_int_array(const int *actual, const int *expected, int size) return 1; } -int cmp_sparsity(Matrix *M, const int *exp_p, const int *exp_i, int m, int nnz) +int cmp_sparsity(matrix *M, const int *exp_p, const int *exp_i, int m, int nnz) { if (M->m != m) { @@ -57,11 +57,11 @@ int cmp_sparsity(Matrix *M, const int *exp_p, const int *exp_i, int m, int nnz) printf(" FAILED: M->nnz = %d, expected %d\n", M->nnz, nnz); return 0; } - CSR_Matrix *csr = M->to_csr(M); + CSR_matrix *csr = M->to_csr(M); return cmp_int_array(csr->p, exp_p, m + 1) && cmp_int_array(csr->i, exp_i, nnz); } -int cmp_values(const Matrix *M, const double *exp_x, int nnz) +int cmp_values(const matrix *M, const double *exp_x, int nnz) { if (M->nnz != nnz) { @@ -83,7 +83,7 @@ static double randn(void) return sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2); } -CSR_Matrix *new_csr_random(int m, int n, double density) +CSR_matrix *new_csr_random(int m, int n, double density) { /* Single pass: over-allocate, fill, then copy to exact size */ int cap = (int) ((double) m * (double) n * density * 1.5) + m; @@ -114,7 +114,7 @@ CSR_Matrix *new_csr_random(int m, int n, double density) } tmp_p[m] = nnz; - CSR_Matrix *A = new_csr_matrix(m, n, nnz); + CSR_matrix *A = new_csr_matrix(m, n, nnz); memcpy(A->p, tmp_p, ((size_t) m + 1) * sizeof(int)); memcpy(A->i, tmp_i, (size_t) nnz * sizeof(int)); memcpy(A->x, tmp_x, (size_t) nnz * sizeof(double)); diff --git a/tests/test_helpers.h 
b/tests/test_helpers.h index 8d9decc..b3e96ce 100644 --- a/tests/test_helpers.h +++ b/tests/test_helpers.h @@ -2,7 +2,7 @@ #define TEST_HELPERS_H #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix.h" /* Compare two double arrays directly @@ -13,17 +13,17 @@ int cmp_double_array(const double *actual, const double *expected, int size); * Returns 1 if all values match, 0 otherwise */ int cmp_int_array(const int *actual, const int *expected, int size); -/* Verify M has shape (m, *) with exactly nnz entries, and that M's CSR +/* Verify M has shape (m, *) with exactly nnz entries, and that M's CSR_matrix * row pointers and column indices match exp_p (length m+1) and exp_i * (length nnz). Returns 1 on full match, 0 otherwise. */ -int cmp_sparsity(Matrix *M, const int *exp_p, const int *exp_i, int m, int nnz); +int cmp_sparsity(matrix *M, const int *exp_p, const int *exp_i, int m, int nnz); /* Verify M has nnz entries and that its value array matches exp_x of * length nnz. Returns 1 on full match, 0 otherwise. */ -int cmp_values(const Matrix *M, const double *exp_x, int nnz); +int cmp_values(const matrix *M, const double *exp_x, int nnz); -/* Create a random m x n CSR matrix with approximate nonzero density +/* Create a random m x n CSR_matrix matrix with approximate nonzero density * in [0, 1]. Nonzero values are standard Gaussian (Box-Muller). 
*/ -CSR_Matrix *new_csr_random(int m, int n, double density); +CSR_matrix *new_csr_random(int m, int n, double density); #endif /* TEST_HELPERS_H */ diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_coo_matrix.h index 53fa307..eb48a3d 100644 --- a/tests/utils/test_coo_matrix.h +++ b/tests/utils/test_coo_matrix.h @@ -8,12 +8,12 @@ const char *test_csr_to_coo(void) { - /* Create a 3x3 CSR matrix A: + /* Create a 3x3 CSR_matrix matrix A: * [1.0 2.0 0.0] * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_csr_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -48,7 +48,7 @@ const char *test_csr_to_coo_lower_triangular(void) * [2 5 6] * [3 6 9] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_csr_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; @@ -81,7 +81,7 @@ const char *test_csr_to_coo_lower_triangular(void) const char *test_refresh_lower_triangular_coo(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_csr_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; diff --git a/tests/utils/test_csc_matrix.h b/tests/utils/test_csc_matrix.h index 5459021..9b6841a 100644 --- a/tests/utils/test_csc_matrix.h +++ b/tests/utils/test_csc_matrix.h @@ -5,7 +5,7 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" /* Test ATA_alloc with a simple 3x3 example * A is 4x3 (4 rows, 3 columns): @@ -21,14 +21,14 @@ */ const char *test_ATA_alloc_simple(void) { - CSC_Matrix *A = new_csc_matrix(4, 3, 6); + CSC_matrix *A = new_csc_matrix(4, 3, 6); int Ap[4] = {0, 2, 3, 6}; int Ai[5] = {0, 2, 1, 2, 1}; memcpy(A->p, Ap, 4 * sizeof(int)); memcpy(A->i, Ai, 5 * sizeof(int)); /* Compute C = A^T A */ - CSR_Matrix *C = 
ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[4] = {0, 2, 3, 5}; int expected_i[5] = {0, 2, 1, 0, 2}; @@ -57,13 +57,13 @@ const char *test_ATA_alloc_simple(void) */ const char *test_ATA_alloc_diagonal_like(void) { - /* Create A in CSC format (3 rows, 4 cols, 4 nonzeros) */ - CSC_Matrix *A = new_csc_matrix(3, 4, 4); + /* Create A in CSC_matrix format (3 rows, 4 cols, 4 nonzeros) */ + CSC_matrix *A = new_csc_matrix(3, 4, 4); int Ap[5] = {0, 1, 2, 3, 4}; int Ai[4] = {0, 1, 2, 0}; memcpy(A->p, Ap, 5 * sizeof(int)); memcpy(A->i, Ai, 4 * sizeof(int)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[5] = {0, 2, 3, 4, 6}; int expected_i[6] = {0, 3, 1, 2, 0, 3}; @@ -80,15 +80,15 @@ const char *test_ATA_alloc_diagonal_like(void) const char *test_ATA_alloc_random(void) { - /* Create A in CSC format */ - CSC_Matrix *A = new_csc_matrix(10, 15, 15); + /* Create A in CSC_matrix format */ + CSC_matrix *A = new_csc_matrix(10, 15, 15); int Ap[16] = {0, 1, 1, 1, 1, 4, 5, 6, 7, 8, 9, 11, 11, 11, 13, 15}; int Ai[15] = {5, 0, 6, 9, 0, 5, 1, 3, 6, 0, 6, 3, 6, 6, 8}; double Ax[15] = {7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3, 6, 5}; memcpy(A->p, Ap, 16 * sizeof(int)); memcpy(A->i, Ai, 15 * sizeof(int)); memcpy(A->x, Ax, 15 * sizeof(double)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[16] = {0, 2, 2, 2, 2, 8, 11, 13, 14, 16, 21, 27, 27, 27, 33, 38}; int expected_i[38] = {0, 6, 4, 5, 9, 10, 13, 14, 4, 5, 10, 0, 6, @@ -117,10 +117,10 @@ const char *test_ATA_alloc_random(void) const char *test_ATA_alloc_random2(void) { - /* Create A in CSC format */ + /* Create A in CSC_matrix format */ int m = 15; int n = 10; - CSC_Matrix *A = new_csc_matrix(m, n, 15); + CSC_matrix *A = new_csc_matrix(m, n, 15); int Ap[11] = {0, 2, 4, 6, 6, 9, 12, 12, 14, 14, 15}; int Ai[15] = {9, 12, 3, 4, 1, 6, 4, 8, 13, 1, 3, 7, 5, 13, 6}; double Ax[15] = {0.99, 0.9, 0.51, 0.64, 0.39, 0.29, 0.26, 0.91, @@ -128,7 +128,7 @@ const char 
*test_ATA_alloc_random2(void) memcpy(A->p, Ap, 11 * sizeof(int)); memcpy(A->i, Ai, 15 * sizeof(int)); memcpy(A->x, Ax, 15 * sizeof(double)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[11] = {0, 1, 4, 7, 7, 10, 13, 13, 15, 15, 17}; int expected_i[17] = {0, 1, 4, 5, 2, 5, 9, 1, 4, 7, 1, 2, 5, 4, 7, 2, 9}; @@ -154,7 +154,7 @@ const char *test_ATA_alloc_random2(void) } const char *test_BTA_alloc_and_BTDA_fill(void) { - /* Create A: 4x3 CSC matrix + /* Create A: 4x3 CSC_matrix matrix * [1.0 0.0 2.0] * [0.0 3.0 0.0] * [4.0 0.0 5.0] @@ -162,7 +162,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) */ int m = 4; int n = 3; - CSC_Matrix *A = new_csc_matrix(m, n, 6); + CSC_matrix *A = new_csc_matrix(m, n, 6); int Ap_A[4] = {0, 2, 4, 6}; int Ai_A[6] = {0, 2, 1, 3, 0, 2}; double Ax_A[6] = {1.0, 4.0, 3.0, 6.0, 2.0, 5.0}; @@ -170,14 +170,14 @@ const char *test_BTA_alloc_and_BTDA_fill(void) memcpy(A->i, Ai_A, 6 * sizeof(int)); memcpy(A->x, Ax_A, 6 * sizeof(double)); - /* Create B: 4x2 CSC matrix + /* Create B: 4x2 CSC_matrix matrix * [1.0 0.0] * [0.0 2.0] * [3.0 0.0] * [0.0 4.0] */ int p = 2; - CSC_Matrix *B = new_csc_matrix(m, p, 4); + CSC_matrix *B = new_csc_matrix(m, p, 4); int Bp[3] = {0, 2, 4}; int Bi[4] = {0, 2, 1, 3}; double Bx[4] = {1.0, 3.0, 2.0, 4.0}; @@ -186,7 +186,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) memcpy(B->x, Bx, 4 * sizeof(double)); /* Allocate C = B^T A (should be 2x3) */ - CSR_Matrix *C = BTA_alloc(A, B); + CSR_matrix *C = BTA_alloc(A, B); /* Sparsity pattern check before filling values */ int expected_p[3] = {0, 2, 3}; diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h index efbdc9e..0a00121 100644 --- a/tests/utils/test_csr_csc_conversion.h +++ b/tests/utils/test_csr_csc_conversion.h @@ -5,19 +5,19 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" 
-/* Test CSR to CSC conversion with fill_sparsity and fill_values */ +/* Test CSR_matrix to CSC_matrix conversion with fill_sparsity and fill_values */ const char *test_csr_to_csc_split(void) { - /* Create a 4x5 CSR matrix A: + /* Create a 4x5 CSR_matrix matrix A: * [1.0 0.0 0.0 0.0 1.0] * [0.0 0.0 3.0 0.0 0.0] * [0.0 2.0 0.0 0.0 0.0] * [0.0 0.0 0.0 4.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_csr_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 2, 1, 3}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -29,7 +29,7 @@ const char *test_csr_to_csc_split(void) int *iwork = (int *) malloc(A->n * sizeof(int)); /* First, fill sparsity pattern */ - CSC_Matrix *C = csr_to_csc_alloc(A, iwork); + CSC_matrix *C = csr_to_csc_alloc(A, iwork); /* Check sparsity pattern */ int Cp_correct[6] = {0, 1, 2, 3, 4, 5}; @@ -53,16 +53,16 @@ const char *test_csr_to_csc_split(void) return 0; } -/* Test CSC to CSR conversion with fill_sparsity */ +/* Test CSC_matrix to CSR_matrix conversion with fill_sparsity */ const char *test_csc_to_csr_sparsity(void) { - /* Create a 4x5 CSC matrix A: + /* Create a 4x5 CSC_matrix matrix A: * [1.0 0.0 0.0 0.0 2.0] * [0.0 0.0 3.0 0.0 0.0] * [0.0 4.0 0.0 0.0 0.0] * [0.0 0.0 0.0 5.0 0.0] */ - CSC_Matrix *A = new_csc_matrix(4, 5, 5); + CSC_matrix *A = new_csc_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -74,9 +74,9 @@ const char *test_csc_to_csr_sparsity(void) int *iwork = (int *) malloc(A->m * sizeof(int)); /* Fill sparsity pattern */ - CSR_Matrix *C = csc_to_csr_alloc(A, iwork); + CSR_matrix *C = csc_to_csr_alloc(A, iwork); - /* Expected CSR format: + /* Expected CSR_matrix format: * Row 0: [1.0 at col 0, 2.0 at col 4] * Row 1: [3.0 at col 2] * Row 2: [4.0 at col 1] @@ -97,11 +97,11 @@ const char *test_csc_to_csr_sparsity(void) return 0; } -/* Test CSC to CSR conversion with fill_values */ +/* Test CSC_matrix to CSR_matrix conversion with 
fill_values */ const char *test_csc_to_csr_values(void) { - /* Create a 4x5 CSC matrix A */ - CSC_Matrix *A = new_csc_matrix(4, 5, 5); + /* Create a 4x5 CSC_matrix matrix A */ + CSC_matrix *A = new_csc_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -113,7 +113,7 @@ const char *test_csc_to_csr_values(void) int *iwork = (int *) malloc(A->m * sizeof(int)); /* Fill sparsity pattern */ - CSR_Matrix *C = csc_to_csr_alloc(A, iwork); + CSR_matrix *C = csc_to_csr_alloc(A, iwork); /* Fill values */ csc_to_csr_fill_values(A, C, iwork); @@ -130,15 +130,15 @@ const char *test_csc_to_csr_values(void) return 0; } -/* Test round-trip conversion: CSR -> CSC -> CSR */ +/* Test round-trip conversion: CSR_matrix -> CSC_matrix -> CSR_matrix */ const char *test_csr_csc_csr_roundtrip(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 2.0 0.0 3.0] * [0.0 4.0 5.0 0.0] * [6.0 0.0 7.0 8.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 8); + CSR_matrix *A = new_csr_matrix(3, 4, 8); double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3}; int Ap[4] = {0, 3, 5, 8}; @@ -146,14 +146,14 @@ const char *test_csr_csc_csr_roundtrip(void) memcpy(A->i, Ai, 8 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - /* Convert CSR to CSC */ + /* Convert CSR_matrix to CSC_matrix */ int *iwork_csc = (int *) malloc(A->n * sizeof(int)); - CSC_Matrix *B = csr_to_csc_alloc(A, iwork_csc); + CSC_matrix *B = csr_to_csc_alloc(A, iwork_csc); csr_to_csc_fill_values(A, B, iwork_csc); - /* Convert CSC back to CSR */ + /* Convert CSC_matrix back to CSR_matrix */ int *iwork_csr = (int *) malloc(B->m * sizeof(int)); - CSR_Matrix *C = csc_to_csr_alloc(B, iwork_csr); + CSR_matrix *C = csc_to_csr_alloc(B, iwork_csr); csc_to_csr_fill_values(B, C, iwork_csr); /* C should match A */ diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h index 68e0128..cb6231e 100644 --- 
a/tests/utils/test_csr_matrix.h +++ b/tests/utils/test_csr_matrix.h @@ -6,18 +6,18 @@ #include "old-code/old_CSR.h" #include "old-code/old_CSR_sum.h" #include "test_helpers.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" const char *test_diag_csr_mult(void) { - /* Create a 3x3 CSR matrix A: + /* Create a 3x3 CSR_matrix matrix A: * [1.0 2.0 0.0] * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_csr_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -31,7 +31,7 @@ const char *test_diag_csr_mult(void) * [0.0 9.0 12.0] * [2.5 0.0 3.0] */ - CSR_Matrix *C = new_csr_matrix(3, 3, 6); + CSR_matrix *C = new_csr_matrix(3, 3, 6); diag_csr_mult(d, A, C); double Ax_correct[6] = {2.0, 4.0, 9.0, 12.0, 2.5, 3.0}; @@ -55,7 +55,7 @@ const char *test_diag_csr_mult(void) */ const char *test_csr_sum(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_csr_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[5] = {0, 2, 1, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -63,7 +63,7 @@ const char *test_csr_sum(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_csr_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -71,7 +71,7 @@ const char *test_csr_sum(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(3, 3, 9); + CSR_matrix *C = new_csr_matrix(3, 3, 9); sum_csr_matrices(A, B, C); double Cx_correct[9] = {1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 5.0}; @@ -97,7 +97,7 @@ const char *test_csr_sum(void) */ const char *test_csr_sum2(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_csr_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int 
Ai[5] = {0, 2, 2, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -105,7 +105,7 @@ const char *test_csr_sum2(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_csr_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -113,7 +113,7 @@ const char *test_csr_sum2(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(3, 3, 8); + CSR_matrix *C = new_csr_matrix(3, 3, 8); sum_csr_matrices(A, B, C); double Cx_correct[8] = {1, 1, 2, 2, 6, 4, 6, 5}; @@ -134,7 +134,7 @@ const char *test_csr_sum2(void) const char *test_transpose(void) { - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_csr_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -143,7 +143,7 @@ const char *test_transpose(void) memcpy(A->p, Ap, 5 * sizeof(int)); int iwork[5]; - CSR_Matrix *AT = transpose(A, iwork); + CSR_matrix *AT = transpose(A, iwork); double ATx_correct[5] = {1.0, 2.0, 3.0, 4.0, 1.0}; int ATi_correct[5] = {0, 2, 1, 3, 0}; int ATp_correct[6] = {0, 2, 4, 4, 4, 5}; @@ -167,7 +167,7 @@ A = [1 0 0 0 1 */ const char *test_csr_vecmat_values_sparse(void) { - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_csr_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -177,7 +177,7 @@ const char *test_csr_vecmat_values_sparse(void) double z[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = new_csr_matrix(1, 3, 3); + CSR_matrix *C = new_csr_matrix(1, 3, 3); double Cx[3] = {0.0, 0.0, 0.0}; int Ci[3] = {0, 1, 4}; int Cp[2] = {0, 3}; @@ -187,7 +187,7 @@ const char *test_csr_vecmat_values_sparse(void) int iwork[5]; - CSR_Matrix *AT = transpose(A, iwork); + CSR_matrix *AT = transpose(A, iwork); Ax_csr_fill_values(AT, z, C); @@ -204,7 +204,7 @@ const char 
*test_csr_vecmat_values_sparse(void) } const char *test_sum_all_rows_csr(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 2.0 0.0 0.0] * [0.0 3.0 4.0 0.0] * [5.0 0.0 6.0 7.0] @@ -212,14 +212,14 @@ const char *test_sum_all_rows_csr(void) * Sum all rows should give: * [6.0 5.0 10.0 7.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 7); + CSR_matrix *A = new_csr_matrix(3, 4, 7); double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; int Ai[7] = {0, 1, 1, 2, 0, 2, 3}; int Ap[4] = {0, 2, 4, 7}; memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(1, 4, 4); + CSR_matrix *C = new_csr_matrix(1, 4, 4); int_double_pair *pairs = new_int_double_pair_array(7); sum_all_rows_csr(A, C, pairs); double Cx_correct[4] = {6.0, 5.0, 10.0, 7.0}; @@ -239,7 +239,7 @@ const char *test_sum_all_rows_csr(void) } const char *test_sum_block_of_rows_csr(void) { - /* Create a 9x4 CSR matrix A and sum blocks of size 3 + /* Create a 9x4 CSR_matrix matrix A and sum blocks of size 3 * Block 0 (rows 0-2): * [1.0 2.0 0.0 0.0] * [0.0 3.0 1.0 0.0] @@ -260,7 +260,7 @@ const char *test_sum_block_of_rows_csr(void) * * Result C should be 3x4 matrix with the sums above */ - CSR_Matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_csr_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -289,7 +289,7 @@ const char *test_sum_block_of_rows_csr(void) memcpy(A->p, Ap, 10 * sizeof(int)); /* Allocate C for 3 blocks and enough space for all nonzeros */ - CSR_Matrix *C = new_csr_matrix(3, 4, 12); + CSR_matrix *C = new_csr_matrix(3, 4, 12); int_double_pair *pairs = new_int_double_pair_array(18); sum_block_of_rows_csr(A, C, pairs, 3); @@ -318,11 +318,11 @@ const char *test_sum_block_of_rows_csr(void) } const char *test_sum_evenly_spaced_rows_csr(void) { - /* Create a 9x4 CSR matrix A (same as test_sum_block_of_rows_csr) and sum evenly + /* Create a 9x4 CSR_matrix matrix 
A (same as test_sum_block_of_rows_csr) and sum evenly * spaced rows With row_spacing=3: - A = 9x4 CSR matrix: + A = 9x4 CSR_matrix matrix: 1 2 0 0 0 3 1 0 0 0 4 5 @@ -338,7 +338,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) row 1: sum of rows 1, 4, 7 = [1 4 6 0] row 2: sum of rows 2, 5, 8 = [3 2 4 11] */ - CSR_Matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_csr_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -367,7 +367,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) memcpy(A->p, Ap, 10 * sizeof(int)); /* Allocate C for 3 rows (row_spacing=3) and enough space for all nonzeros */ - CSR_Matrix *C = new_csr_matrix(3, 4, 10); + CSR_matrix *C = new_csr_matrix(3, 4, 10); int_double_pair *pairs = new_int_double_pair_array(18); sum_evenly_spaced_rows_csr(A, C, pairs, 3); @@ -396,12 +396,12 @@ const char *test_sum_evenly_spaced_rows_csr(void) } const char *test_AT_alloc_and_fill(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 0.0 2.0 0.0] * [0.0 3.0 0.0 4.0] * [5.0 0.0 6.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 6); + CSR_matrix *A = new_csr_matrix(3, 4, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 1, 3, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -411,7 +411,7 @@ const char *test_AT_alloc_and_fill(void) /* Allocate A^T (should be 4x3) */ int *iwork = (int *) malloc(A->n * sizeof(int)); - CSR_Matrix *AT = AT_alloc(A, iwork); + CSR_matrix *AT = AT_alloc(A, iwork); /* Fill values of A^T */ AT_fill_values(A, AT, iwork); @@ -454,7 +454,7 @@ const char *test_kron_identity_csr(void) * [3 0 | 0 0 | 4 0] * [0 3 | 0 0 | 0 4] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_csr_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 2, 0, 2}; int Ap[3] = {0, 2, 4}; @@ -462,7 +462,7 @@ const char *test_kron_identity_csr(void) memcpy(A->i, Ai, 4 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - CSR_Matrix *result = 
kron_identity_csr(A, 2); + CSR_matrix *result = kron_identity_csr(A, 2); /* Expected: 4x6 with 8 nonzeros * Row 0: [1, 0, 0, 0, 2, 0] -> cols {0, 4}, vals {1, 2} diff --git a/tests/utils/test_linalg_sparse_matmuls.h b/tests/utils/test_linalg_sparse_matmuls.h index 120f99b..6e9753f 100644 --- a/tests/utils/test_linalg_sparse_matmuls.h +++ b/tests/utils/test_linalg_sparse_matmuls.h @@ -5,18 +5,18 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_sparse_matmuls.h" /* Test block_left_multiply_fill_sparsity with simple case: single block */ const char *test_block_left_multiply_single_block(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 0.0] * [0.0 1.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_csr_matrix(2, 3, 3); double Ax[3] = {1.0, 1.0, 1.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 1, 3}; @@ -24,12 +24,12 @@ const char *test_block_left_multiply_single_block(void) memcpy(A->i, Ai, 3 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - /* J is 3x2 CSC (single block, so p=1): + /* J is 3x2 CSC_matrix (single block, so p=1): * [1.0 0.0] * [1.0 0.0] * [0.0 1.0] */ - CSC_Matrix *J = new_csc_matrix(3, 2, 3); + CSC_matrix *J = new_csc_matrix(3, 2, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 1, 2}; int Jp[3] = {0, 2, 3}; @@ -38,7 +38,7 @@ const char *test_block_left_multiply_single_block(void) memcpy(J->p, Jp, 3 * sizeof(int)); /* Compute C = A @ J1 (p=1 means just one block) */ - CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 1); /* Expected C is 2x2: * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has @@ -63,11 +63,11 @@ const char *test_block_left_multiply_single_block(void) /* Test block_left_multiply_fill_sparsity with two blocks */ const char 
*test_block_left_multiply_two_blocks(void) { - /* A is 2x2 CSR: + /* A is 2x2 CSR_matrix: * [1.0 0.0] * [0.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 2, 2); + CSR_matrix *A = new_csr_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -75,7 +75,7 @@ const char *test_block_left_multiply_two_blocks(void) memcpy(A->i, Ai, 2 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - /* J is 4x3 CSC (two blocks of 2 rows each): + /* J is 4x3 CSC_matrix (two blocks of 2 rows each): * Block 1 rows [0,1]: * [1.0 0.0 0.0] * [0.0 0.0 0.0] @@ -88,7 +88,7 @@ const char *test_block_left_multiply_two_blocks(void) * [0.0 1.0 0.0] * [0.0 0.0 1.0] */ - CSC_Matrix *J = new_csc_matrix(4, 3, 3); + CSC_matrix *J = new_csc_matrix(4, 3, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 2, 3}; int Jp[4] = {0, 1, 2, 3}; @@ -109,7 +109,7 @@ const char *test_block_left_multiply_two_blocks(void) * [0.0 0.0 0.0] * [0.0 1.0 1.0] */ - CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 2); block_left_multiply_fill_values(A, J, C); int expected_p2[4] = {0, 1, 2, 3}; @@ -130,8 +130,8 @@ const char *test_block_left_multiply_two_blocks(void) /* Test block_left_multiply_fill_sparsity with all zero column in J */ const char *test_block_left_multiply_zero_column(void) { - /* A is 2x2 CSR (identity) */ - CSR_Matrix *A = new_csr_matrix(2, 2, 2); + /* A is 2x2 CSR_matrix (identity) */ + CSR_matrix *A = new_csr_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -143,7 +143,7 @@ const char *test_block_left_multiply_zero_column(void) * [1.0 0.0] * [0.0 0.0] */ - CSC_Matrix *J = new_csc_matrix(2, 2, 1); + CSC_matrix *J = new_csc_matrix(2, 2, 1); double Jx[1] = {1.0}; int Ji[1] = {0}; int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ @@ -151,7 +151,7 @@ const char *test_block_left_multiply_zero_column(void) memcpy(J->i, Ji, 1 * sizeof(int)); 
memcpy(J->p, Jp, 3 * sizeof(int)); - CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 1); int expected_p3[3] = {0, 1, 1}; int expected_i3[1] = {0}; @@ -166,15 +166,15 @@ const char *test_block_left_multiply_zero_column(void) return NULL; } -/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */ +/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR_matrix and B is CSC_matrix */ const char *test_csr_csc_matmul_alloc_basic(void) { - /* A is 3x2 CSR: + /* A is 3x2 CSR_matrix: * [1.0 0.0] * [0.0 1.0] * [1.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(3, 2, 4); + CSR_matrix *A = new_csr_matrix(3, 2, 4); double Ax[4] = {1.0, 1.0, 1.0, 1.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[4] = {0, 1, 2, 4}; @@ -182,11 +182,11 @@ const char *test_csr_csc_matmul_alloc_basic(void) memcpy(A->i, Ai, 4 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - /* B is 2x3 CSC: + /* B is 2x3 CSC_matrix: * [1.0 0.0 1.0] * [0.0 1.0 1.0] */ - CSC_Matrix *B = new_csc_matrix(2, 3, 4); + CSC_matrix *B = new_csc_matrix(2, 3, 4); double Bx[4] = {1.0, 1.0, 1.0, 1.0}; int Bi[4] = {0, 1, 0, 1}; int Bp[4] = {0, 1, 2, 4}; @@ -199,7 +199,7 @@ const char *test_csr_csc_matmul_alloc_basic(void) * [0, 1, 1], * [1, 1, 2]] */ - CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + CSR_matrix *C = csr_csc_matmul_alloc(A, B); int expected_p4[4] = {0, 2, 4, 7}; int expected_i4[7] = {0, 2, 1, 2, 0, 1, 2}; @@ -217,11 +217,11 @@ const char *test_csr_csc_matmul_alloc_basic(void) /* Test csr_csc_matmul_alloc with sparse result */ const char *test_csr_csc_matmul_alloc_sparse(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 0.0] * [0.0 0.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 2); + CSR_matrix *A = new_csr_matrix(2, 3, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 2}; int Ap[3] = {0, 1, 2}; @@ -229,12 +229,12 @@ const char *test_csr_csc_matmul_alloc_sparse(void) memcpy(A->i, Ai, 2 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); 
- /* B is 3x2 CSC: + /* B is 3x2 CSC_matrix: * [1.0 0.0] * [0.0 0.0] * [0.0 1.0] */ - CSC_Matrix *B = new_csc_matrix(3, 2, 2); + CSC_matrix *B = new_csc_matrix(3, 2, 2); double Bx[2] = {1.0, 1.0}; int Bi[2] = {0, 2}; int Bp[3] = {0, 1, 2}; @@ -246,7 +246,7 @@ const char *test_csr_csc_matmul_alloc_sparse(void) * C = [[1, 0], * [0, 1]] */ - CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + CSR_matrix *C = csr_csc_matmul_alloc(A, B); int expected_p5[3] = {0, 1, 2}; int expected_i5[2] = {0, 1}; @@ -264,11 +264,11 @@ const char *test_csr_csc_matmul_alloc_sparse(void) /* Test block_left_multiply_vec with single block: y = A @ x */ const char *test_block_left_multiply_vec_single_block(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 2.0] * [0.0 3.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_csr_matrix(2, 3, 3); double Ax[3] = {1.0, 3.0, 2.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 2, 3}; @@ -296,11 +296,11 @@ const char *test_block_left_multiply_vec_single_block(void) /* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */ const char *test_block_left_multiply_vec_two_blocks(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 2.0 0.0] * [0.0 3.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_csr_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 1, 2}; int Ap[3] = {0, 2, 4}; @@ -330,12 +330,12 @@ const char *test_block_left_multiply_vec_two_blocks(void) /* Test block_left_multiply_vec with sparse matrix and multiple blocks */ const char *test_block_left_multiply_vec_sparse(void) { - /* A is 3x4 CSR (very sparse): + /* A is 3x4 CSR_matrix (very sparse): * [2.0 0.0 0.0 0.0] * [0.0 0.0 3.0 0.0] * [0.0 0.0 0.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 3); + CSR_matrix *A = new_csr_matrix(3, 4, 3); double Ax[3] = {2.0, 3.0, 4.0}; int Ai[3] = {0, 2, 3}; int Ap[4] = {0, 1, 2, 3}; @@ -365,11 +365,11 @@ const char *test_block_left_multiply_vec_sparse(void) /* Test 
block_left_multiply_vec with three blocks */ const char *test_block_left_multiply_vec_three_blocks(void) { - /* A is 2x2 CSR: + /* A is 2x2 CSR_matrix: * [1.0 2.0] * [3.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_csr_matrix(2, 2, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[3] = {0, 2, 4}; diff --git a/tests/utils/test_linalg_utils_matmul_chain_rule.h b/tests/utils/test_linalg_utils_matmul_chain_rule.h index 0c66589..8da9d05 100644 --- a/tests/utils/test_linalg_utils_matmul_chain_rule.h +++ b/tests/utils/test_linalg_utils_matmul_chain_rule.h @@ -4,8 +4,8 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_dense_sparse_matmuls.h" /* Test YT_kron_I_alloc and YT_kron_I_fill_values @@ -17,7 +17,7 @@ * [1 3] * [2 4] * - * J (mk=4 x p=3, CSC): + * J (mk=4 x p=3, CSC_matrix): * [1 0 2] * [0 1 0] * [3 0 0] @@ -33,8 +33,8 @@ const char *test_YT_kron_I(void) { int m = 2, k = 2, n = 2; - /* J is 4x3 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 3, 5); + /* J is 4x3 CSC_matrix */ + CSC_matrix *J = new_csc_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -45,9 +45,9 @@ const char *test_YT_kron_I(void) /* Y col-major: Y[0,0]=1, Y[1,0]=2, Y[0,1]=3, Y[1,1]=4 */ double Y[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = YT_kron_I_alloc(m, k, n, J); + CSR_matrix *C = YT_kron_I_alloc(m, k, n, J); - /* Expected CSR (from scipy) */ + /* Expected CSR_matrix (from scipy) */ int exp_p[5] = {0, 2, 4, 6, 8}; int exp_i[8] = {0, 2, 1, 2, 0, 2, 1, 2}; double exp_x[8] = {7.0, 2.0, 1.0, 2.0, 15.0, 6.0, 3.0, 4.0}; @@ -71,7 +71,7 @@ const char *test_YT_kron_I(void) * [1.0 0.5 2.0] * [3.0 1.0 0.5] * - * J (mk=6 x p=4, CSC): + * J (mk=6 x p=4, CSC_matrix): * [1 0 0 2] * [0 0 1 0] * [0 3 0 0] @@ -85,8 +85,8 @@ const char 
*test_YT_kron_I_larger(void) { int m = 3, k = 2, n = 3; - /* J is 6x4 CSC */ - CSC_Matrix *J = new_csc_matrix(6, 4, 8); + /* J is 6x4 CSC_matrix */ + CSC_matrix *J = new_csc_matrix(6, 4, 8); int Jp[5] = {0, 2, 4, 6, 8}; int Ji[8] = {0, 3, 2, 4, 1, 5, 0, 3}; double Jx[8] = {1.0, 2.0, 3.0, 1.0, 1.0, 4.0, 2.0, 1.0}; @@ -97,9 +97,9 @@ const char *test_YT_kron_I_larger(void) /* Y col-major */ double Y[6] = {1.0, 3.0, 0.5, 1.0, 2.0, 0.5}; - CSR_Matrix *C = YT_kron_I_alloc(m, k, n, J); + CSR_matrix *C = YT_kron_I_alloc(m, k, n, J); - /* Expected CSR (from scipy) */ + /* Expected CSR_matrix (from scipy) */ int exp_p[10] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18}; int exp_i[18] = {0, 3, 1, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 1, 2, 1, 2}; double exp_x[18] = {7.0, 5.0, 3.0, 1.0, 3.0, 12.0, 2.5, 2.0, 1.0, @@ -127,7 +127,7 @@ const char *test_YT_kron_I_larger(void) * [1 3] * [2 4] * - * J (kn=4 x p=3, CSC): + * J (kn=4 x p=3, CSC_matrix): * [1 0 2] * [0 1 0] * [3 0 0] @@ -143,8 +143,8 @@ const char *test_I_kron_X(void) { int m = 2, k = 2, n = 2; - /* J is 4x3 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 3, 5); + /* J is 4x3 CSC_matrix */ + CSC_matrix *J = new_csc_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -155,9 +155,9 @@ const char *test_I_kron_X(void) /* X col-major */ double X[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = I_kron_X_alloc(m, k, n, J); + CSR_matrix *C = I_kron_X_alloc(m, k, n, J); - /* Expected CSR */ + /* Expected CSR_matrix */ int exp_p[5] = {0, 3, 6, 8, 10}; int exp_i[10] = {0, 1, 2, 0, 1, 2, 0, 2, 0, 2}; double exp_x[10] = {1.0, 3.0, 2.0, 2.0, 4.0, 4.0, 3.0, 3.0, 6.0, 4.0}; @@ -182,7 +182,7 @@ const char *test_I_kron_X(void) * [2.0 1.0] * [3.0 0.5] * - * J (kn=4 x p=4, CSC): + * J (kn=4 x p=4, CSC_matrix): * [1 0 0 2] * [0 3 1 0] * [0 0 4 0] @@ -194,8 +194,8 @@ const char *test_I_kron_X_larger(void) { int m = 3, k = 2, n = 2; - /* J is 4x4 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 4, 7); + /* J 
is 4x4 CSC_matrix */ + CSC_matrix *J = new_csc_matrix(4, 4, 7); int Jp[5] = {0, 2, 3, 5, 7}; int Ji[7] = {0, 3, 1, 1, 2, 0, 3}; double Jx[7] = {1.0, 2.0, 3.0, 1.0, 4.0, 2.0, 1.0}; @@ -206,9 +206,9 @@ const char *test_I_kron_X_larger(void) /* X col-major */ double X[6] = {1.0, 2.0, 3.0, 0.5, 1.0, 0.5}; - CSR_Matrix *C = I_kron_X_alloc(m, k, n, J); + CSR_matrix *C = I_kron_X_alloc(m, k, n, J); - /* Expected CSR */ + /* Expected CSR_matrix */ int exp_p[7] = {0, 4, 8, 12, 15, 18, 21}; int exp_i[21] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 3, 0, 2, 3, 0, 2, 3}; double exp_x[21] = {1.0, 1.5, 0.5, 2.0, 2.0, 3.0, 1.0, 4.0, 3.0, 1.5, 0.5, diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index 087ef43..197a386 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -4,6 +4,7 @@ #include "minunit.h" #include "test_helpers.h" #include "utils/dense_matrix.h" +#include "utils/sparse_matrix.h" #include #include @@ -13,7 +14,7 @@ const char *test_dense_matrix_mult_vec(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_dense_matrix(2, 2, data); double x[] = {1.0, 2.0}; double y[2] = {0.0, 0.0}; @@ -33,7 +34,7 @@ const char *test_dense_matrix_mult_vec(void) const char *test_dense_matrix_mult_vec_blocks(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_dense_matrix(2, 2, data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y[4] = {0}; @@ -51,8 +52,8 @@ const char *test_dense_matrix_mult_vec_blocks(void) A = [1 2 3; 4 5 6] (2x3), x = [1; 2; 3], p = 1 */ const char *test_sparse_vs_dense_mult_vec(void) { - /* Build CSR for A = [1 2 3; 4 5 6] */ - CSR_Matrix *csr = new_csr_matrix(2, 3, 6); + /* Build CSR_matrix for A = [1 2 3; 4 5 6] */ + CSR_matrix *csr = new_csr_matrix(2, 3, 6); int Ap[3] = {0, 3, 6}; int Ai[6] = {0, 1, 2, 0, 1, 2}; double Ax[6] = {1, 2, 3, 4, 5, 6}; @@ -62,8 +63,8 @@ const char *test_sparse_vs_dense_mult_vec(void) 
double dense_data[] = {1, 2, 3, 4, 5, 6}; - Matrix *sparse = new_sparse_matrix(csr); - Matrix *dense = new_dense_matrix(2, 3, dense_data); + matrix *sparse = new_sparse_matrix(csr); + matrix *dense = new_dense_matrix(2, 3, dense_data); double x[] = {1.0, 2.0, 3.0}; double y_sparse[2] = {0}; @@ -83,14 +84,14 @@ const char *test_sparse_vs_dense_mult_vec(void) const char *test_dense_matrix_trans(void) { double data[] = {1, 2, 3, 4, 5, 6}; /* 2x3 */ - Matrix *A = new_dense_matrix(2, 3, data); - Matrix *AT = dense_matrix_trans((const Dense_Matrix *) A); + matrix *A = new_dense_matrix(2, 3, data); + matrix *AT = dense_matrix_trans((const dense_matrix *) A); mu_assert("transpose m", AT->m == 3); mu_assert("transpose n", AT->n == 2); /* AT should be [1 4; 2 5; 3 6] stored row-major */ - Dense_Matrix *dm = (Dense_Matrix *) AT; + dense_matrix *dm = (dense_matrix *) AT; double AT_expected[6] = {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}; mu_assert("AT vals incorrect", cmp_double_array(dm->x, AT_expected, 6)); @@ -103,7 +104,7 @@ const char *test_dense_matrix_trans(void) A = [1 2; 3 4], x = [1; 2; 3; 4], p = 2 */ const char *test_sparse_vs_dense_mult_vec_blocks(void) { - CSR_Matrix *csr = new_csr_matrix(2, 2, 4); + CSR_matrix *csr = new_csr_matrix(2, 2, 4); int Ap[3] = {0, 2, 4}; int Ai[4] = {0, 1, 0, 1}; double Ax[4] = {1, 2, 3, 4}; @@ -113,8 +114,8 @@ const char *test_sparse_vs_dense_mult_vec_blocks(void) double dense_data[] = {1, 2, 3, 4}; - Matrix *sparse = new_sparse_matrix(csr); - Matrix *dense = new_dense_matrix(2, 2, dense_data); + matrix *sparse = new_sparse_matrix(csr); + matrix *dense = new_dense_matrix(2, 2, dense_data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y_sparse[4] = {0}; diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h index 7528484..01edb80 100644 --- a/tests/utils/test_matrix_BTA.h +++ b/tests/utils/test_matrix_BTA.h @@ -3,9 +3,10 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" 
#include "utils/matrix_BTA.h" #include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include #include @@ -20,20 +21,20 @@ const char *test_BTDA_matrices_pd_pd(void) double XB[4] = {5.0, 6.0, 7.0, 8.0}; double d[2] = {2.0, -1.5}; - Matrix *A_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); - Matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + matrix *A_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); /* Wrapper path. */ - Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + matrix *C_m = BTA_matrices_alloc(A_m, B_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path on independent operands. */ - Matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); - Matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); - Matrix *C2 = - permuted_dense_BTA_alloc((Permuted_Dense *) A2, (Permuted_Dense *) B2); - BTDA_pd_pd_fill_values((Permuted_Dense *) A2, d, (Permuted_Dense *) B2, - (Permuted_Dense *) C2); + matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + matrix *C2 = + permuted_dense_BTA_alloc((permuted_dense *) A2, (permuted_dense *) B2); + BTDA_pd_pd_fill_values((permuted_dense *) A2, d, (permuted_dense *) B2, + (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -46,12 +47,12 @@ const char *test_BTDA_matrices_pd_pd(void) return 0; } -/* Wrapper dispatch sanity: (CSR, PD). Compare against direct +/* Wrapper dispatch sanity: (CSR_matrix, PD). Compare against direct BTDA_csr_pd_fill_values. 
*/ const char *test_BTDA_matrices_csr_pd(void) { - /* A: 4x5 CSR */ - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + /* A: 4x5 CSR_matrix */ + CSR_matrix *A = new_csr_matrix(4, 5, 5); A->p[0] = 0; A->p[1] = 2; A->p[2] = 3; @@ -61,22 +62,22 @@ const char *test_BTDA_matrices_csr_pd(void) double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; memcpy(A->i, Ai, sizeof Ai); memcpy(A->x, Ax, sizeof Ax); - Matrix *A_m = new_sparse_matrix(A); + matrix *A_m = new_sparse_matrix(A); /* B: 4x4 PD, row_perm = [1, 3], col_perm = [0, 2]. */ int row_perm_B[2] = {1, 3}; int col_perm_B[2] = {0, 2}; double XB[4] = {10.0, 20.0, 30.0, 40.0}; - Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); double d[4] = {1.0, -2.0, 0.5, 3.0}; /* Wrapper path. */ - Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + matrix *C_m = BTA_matrices_alloc(A_m, B_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path. */ - CSR_Matrix *A2 = new_csr_matrix(4, 5, 5); + CSR_matrix *A2 = new_csr_matrix(4, 5, 5); A2->p[0] = 0; A2->p[1] = 2; A2->p[2] = 3; @@ -84,10 +85,10 @@ const char *test_BTDA_matrices_csr_pd(void) A2->p[4] = 5; memcpy(A2->i, Ai, sizeof Ai); memcpy(A2->x, Ax, sizeof Ax); - Matrix *B2_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - Permuted_Dense *B2 = (Permuted_Dense *) B2_m; - Matrix *C2 = BTA_csr_pd_alloc(A2, B2); - BTDA_csr_pd_fill_values(A2, d, B2, (Permuted_Dense *) C2); + matrix *B2_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B2 = (permuted_dense *) B2_m; + matrix *C2 = BTA_csr_pd_alloc(A2, B2); + BTDA_csr_pd_fill_values(A2, d, B2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -100,7 +101,7 @@ const char *test_BTDA_matrices_csr_pd(void) return 0; } -/* Wrapper dispatch sanity: (PD, CSR). Compare against direct +/* Wrapper dispatch sanity: (PD, CSR_matrix). 
Compare against direct BTDA_pd_csr_fill_values. */ const char *test_BTDA_matrices_pd_csr(void) { @@ -108,10 +109,10 @@ const char *test_BTDA_matrices_pd_csr(void) int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; double XA[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - /* B: 4x4 CSR. */ - CSR_Matrix *B = new_csr_matrix(4, 4, 5); + /* B: 4x4 CSR_matrix. */ + CSR_matrix *B = new_csr_matrix(4, 4, 5); B->p[0] = 0; B->p[1] = 2; B->p[2] = 3; @@ -121,18 +122,18 @@ const char *test_BTDA_matrices_pd_csr(void) double Bx[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; memcpy(B->i, Bi, sizeof Bi); memcpy(B->x, Bx, sizeof Bx); - Matrix *B_m = new_sparse_matrix(B); + matrix *B_m = new_sparse_matrix(B); double d[4] = {1.0, -2.0, 0.5, 3.0}; /* Wrapper path. */ - Matrix *C_m = BTA_matrices_alloc(A_m, B_m); + matrix *C_m = BTA_matrices_alloc(A_m, B_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path. 
*/ - Matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - Permuted_Dense *A2 = (Permuted_Dense *) A2_m; - CSR_Matrix *B2 = new_csr_matrix(4, 4, 5); + matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A2 = (permuted_dense *) A2_m; + CSR_matrix *B2 = new_csr_matrix(4, 4, 5); B2->p[0] = 0; B2->p[1] = 2; B2->p[2] = 3; @@ -140,8 +141,8 @@ const char *test_BTDA_matrices_pd_csr(void) B2->p[4] = 5; memcpy(B2->i, Bi, sizeof Bi); memcpy(B2->x, Bx, sizeof Bx); - Matrix *C2 = BTA_pd_csr_alloc(A2, B2); - BTDA_pd_csr_fill_values(A2, d, B2, (Permuted_Dense *) C2); + matrix *C2 = BTA_pd_csr_alloc(A2, B2); + BTDA_pd_csr_fill_values(A2, d, B2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 81b63d6..24ff605 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -3,7 +3,7 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/permuted_dense.h" #include #include @@ -22,9 +22,9 @@ const char *test_permuted_dense_to_csr_basic(void) int col_perm[2] = {0, 3}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - CSR_Matrix *C = M->to_csr(M); + CSR_matrix *C = M->to_csr(M); int Cp_expected[6] = {0, 0, 2, 4, 4, 6}; int Ci_expected[6] = {0, 3, 0, 3, 0, 3}; double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; @@ -40,13 +40,13 @@ const char *test_permuted_dense_to_csr_basic(void) return 0; } -/* Empty dense block (dense_m = dense_n = 0): result is an m x n CSR with +/* Empty dense block (m0 = n0 = 0): result is an m x n CSR_matrix with no nonzeros. 
*/ const char *test_permuted_dense_to_csr_empty(void) { - Matrix *M = new_permuted_dense(4, 5, 0, 0, NULL, NULL, NULL); + matrix *M = new_permuted_dense(4, 5, 0, 0, NULL, NULL, NULL); - CSR_Matrix *C = M->to_csr(M); + CSR_matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 0, 0, 0, 0}; mu_assert("nnz", C->nnz == 0); mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); @@ -56,16 +56,16 @@ const char *test_permuted_dense_to_csr_empty(void) } /* Full dense (row_perm = [0..m), col_perm = [0..n)): result is the dense - matrix in CSR. */ + matrix in CSR_matrix. */ const char *test_permuted_dense_to_csr_full(void) { int row_perm[2] = {0, 1}; int col_perm[3] = {0, 1, 2}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, X); + matrix *M = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, X); - CSR_Matrix *C = M->to_csr(M); + CSR_matrix *C = M->to_csr(M); int Cp_expected[3] = {0, 3, 6}; int Ci_expected[6] = {0, 1, 2, 0, 1, 2}; double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; @@ -86,9 +86,9 @@ const char *test_permuted_dense_to_csr_single_row(void) int col_perm[2] = {1, 4}; double X[2] = {7.0, 9.0}; - Matrix *M = new_permuted_dense(4, 5, 1, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(4, 5, 1, 2, row_perm, col_perm, X); - CSR_Matrix *C = M->to_csr(M); + CSR_matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 0, 0, 2, 2}; int Ci_expected[2] = {1, 4}; double Cx_expected[2] = {7.0, 9.0}; @@ -108,9 +108,9 @@ const char *test_permuted_dense_to_csr_single_col(void) int col_perm[1] = {2}; double X[3] = {1.0, 2.0, 3.0}; - Matrix *M = new_permuted_dense(4, 4, 3, 1, row_perm, col_perm, X); + matrix *M = new_permuted_dense(4, 4, 3, 1, row_perm, col_perm, X); - CSR_Matrix *C = M->to_csr(M); + CSR_matrix *C = M->to_csr(M); int Cp_expected[5] = {0, 1, 1, 2, 3}; int Ci_expected[3] = {2, 2, 2}; double Cx_expected[3] = {1.0, 2.0, 3.0}; @@ -123,7 +123,8 @@ const char *test_permuted_dense_to_csr_single_col(void) 
return 0; } -/* DA_fill_values: compare against CSR DA_fill_values on the equivalent CSR. +/* DA_fill_values: compare against CSR_matrix DA_fill_values on the equivalent + CSR_matrix. PD is the 5x6 matrix from the basic to_csr test, with d a length-5 global-row diagonal including a negative and zero entry. */ @@ -134,19 +135,19 @@ const char *test_permuted_dense_DA_fill_values(void) double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Matrix *M_out = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, NULL); - Permuted_Dense *pd = (Permuted_Dense *) M; - Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + matrix *M_out = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, NULL); + permuted_dense *pd = (permuted_dense *) M; + permuted_dense *pd_out = (permuted_dense *) M_out; permuted_dense_DA_fill_values(d, pd, pd_out); - /* Ground truth: build CSR of self, run DA_fill_values, compare. */ - CSR_Matrix *csr = M->to_csr(M); - CSR_Matrix *csr_expected = new_csr_copy_sparsity(csr); + /* Ground truth: build CSR_matrix of self, run DA_fill_values, compare. 
*/ + CSR_matrix *csr = M->to_csr(M); + CSR_matrix *csr_expected = new_csr_copy_sparsity(csr); DA_fill_values(d, csr, csr_expected); - CSR_Matrix *csr_out = M_out->to_csr(M_out); + CSR_matrix *csr_out = M_out->to_csr(M_out); mu_assert("x", cmp_double_array(csr_out->x, csr_expected->x, csr->nnz)); free_csr_matrix(csr_expected); @@ -164,17 +165,17 @@ const char *test_permuted_dense_ATA_alloc(void) int col_perm[2] = {0, 3}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; - Matrix *M_ata = permuted_dense_ATA_alloc(pd); - Permuted_Dense *pd_ata = (Permuted_Dense *) M_ata; + matrix *M_ata = permuted_dense_ATA_alloc(pd); + permuted_dense *pd_ata = (permuted_dense *) M_ata; int perm_expected[2] = {0, 3}; mu_assert("m", M_ata->m == 6); mu_assert("n", M_ata->n == 6); - mu_assert("dense_m", pd_ata->dense_m == 2); - mu_assert("dense_n", pd_ata->dense_n == 2); + mu_assert("m0", pd_ata->m0 == 2); + mu_assert("n0", pd_ata->n0 == 2); mu_assert("row_perm", cmp_int_array(pd_ata->row_perm, perm_expected, 2)); mu_assert("col_perm", cmp_int_array(pd_ata->col_perm, perm_expected, 2)); @@ -193,11 +194,11 @@ const char *test_permuted_dense_ATDA_fill_values(void) double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; - Matrix *M_out = permuted_dense_ATA_alloc(pd); - Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + matrix *M_out = permuted_dense_ATA_alloc(pd); + permuted_dense *pd_out = (permuted_dense *) M_out; permuted_dense_ATDA_fill_values(pd, d, pd_out); double X_expected[4] = {61.0, 72.0, 72.0, 84.0}; @@ 
-208,11 +209,11 @@ const char *test_permuted_dense_ATDA_fill_values(void) return 0; } -/* PD x CSC: J is 6x4. col 0 empty; col 1 has rows {0,3} (vals 10, 20); +/* PD x CSC_matrix: J is 6x4. col 0 empty; col 1 has rows {0,3} (vals 10, 20); col 2 has row {2} (val 30, but row 2 not in col_perm_self = {0,3} so col 2 is INACTIVE); col 3 has row {3} (val 40). Active cols: {1, 3}. - Expected: dense_m=3, dense_n=2, row_perm={1,2,4}, col_perm={1,3}. + Expected: m0=3, n0=2, row_perm={1,2,4}, col_perm={1,3}. Values: out.X[:,0] = 10*[1,3,5] + 20*[2,4,6] = [50,110,170], out.X[:,1] = 40*[2,4,6] = [80,160,240]. */ const char *test_permuted_dense_times_csc(void) @@ -220,10 +221,10 @@ const char *test_permuted_dense_times_csc(void) int row_perm[3] = {1, 2, 4}; int col_perm[2] = {0, 3}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; - CSC_Matrix *J = new_csc_matrix(6, 4, 4); + CSC_matrix *J = new_csc_matrix(6, 4, 4); int Jp[5] = {0, 0, 2, 3, 4}; int Ji[4] = {0, 3, 2, 3}; double Jx[4] = {10.0, 20.0, 30.0, 40.0}; @@ -231,8 +232,8 @@ const char *test_permuted_dense_times_csc(void) memcpy(J->i, Ji, 4 * sizeof(int)); memcpy(J->x, Jx, 4 * sizeof(double)); - Matrix *M_out = permuted_dense_times_csc_alloc(pd, J); - Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + permuted_dense *pd_out = (permuted_dense *) M_out; permuted_dense_times_csc_fill_values(pd, J, pd_out); int row_perm_expected[3] = {1, 2, 4}; @@ -241,8 +242,8 @@ const char *test_permuted_dense_times_csc(void) mu_assert("m", M_out->m == 5); mu_assert("n", M_out->n == 4); - mu_assert("dense_m", pd_out->dense_m == 3); - mu_assert("dense_n", pd_out->dense_n == 2); + mu_assert("m0", pd_out->m0 == 3); + mu_assert("n0", pd_out->n0 == 2); 
mu_assert("row_perm", cmp_int_array(pd_out->row_perm, row_perm_expected, 3)); mu_assert("col_perm", cmp_int_array(pd_out->col_perm, col_perm_expected, 2)); mu_assert("X", cmp_double_array(pd_out->X, X_expected, 6)); @@ -253,18 +254,18 @@ const char *test_permuted_dense_times_csc(void) return 0; } -/* PD x CSC edge case: every column of J has its only nonzero outside - col_perm_self, so col_perm_out is empty (dense_n = 0). */ +/* PD x CSC_matrix edge case: every column of J has its only nonzero outside + col_perm_self, so col_perm_out is empty (n0 = 0). */ const char *test_permuted_dense_times_csc_no_active(void) { int row_perm[3] = {1, 2, 4}; int col_perm[2] = {0, 3}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; /* J: col 0 has row {1}, col 1 has row {5}. Neither in col_perm_self. 
*/ - CSC_Matrix *J = new_csc_matrix(6, 2, 2); + CSC_matrix *J = new_csc_matrix(6, 2, 2); int Jp[3] = {0, 1, 2}; int Ji[2] = {1, 5}; double Jx[2] = {100.0, 200.0}; @@ -272,14 +273,14 @@ const char *test_permuted_dense_times_csc_no_active(void) memcpy(J->i, Ji, 2 * sizeof(int)); memcpy(J->x, Jx, 2 * sizeof(double)); - Matrix *M_out = permuted_dense_times_csc_alloc(pd, J); - Permuted_Dense *pd_out = (Permuted_Dense *) M_out; + matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + permuted_dense *pd_out = (permuted_dense *) M_out; permuted_dense_times_csc_fill_values(pd, J, pd_out); mu_assert("m", M_out->m == 5); mu_assert("n", M_out->n == 2); - mu_assert("dense_m", pd_out->dense_m == 3); - mu_assert("dense_n", pd_out->dense_n == 0); + mu_assert("m0", pd_out->m0 == 3); + mu_assert("n0", pd_out->n0 == 0); free_matrix(M); free_matrix(M_out); @@ -287,7 +288,7 @@ const char *test_permuted_dense_times_csc_no_active(void) return 0; } -/* to_csr vtable method: lazy CSR view. First call allocates pd->csr_cache; +/* to_csr vtable method: lazy CSR_matrix view. First call allocates pd->csr_cache; subsequent calls refresh values to reflect the current pd->X. 
*/ const char *test_permuted_dense_to_csr_lazy(void) { @@ -295,12 +296,12 @@ const char *test_permuted_dense_to_csr_lazy(void) int col_perm[2] = {0, 3}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; mu_assert("csr_cache initially NULL", pd->csr_cache == NULL); - CSR_Matrix *csr = M->to_csr(M); + CSR_matrix *csr = M->to_csr(M); mu_assert("csr_cache populated", pd->csr_cache != NULL); mu_assert("returns the cache", csr == pd->csr_cache); @@ -324,8 +325,8 @@ const char *test_permuted_dense_col_inv(void) int col_perm[2] = {0, 3}; double X[2] = {0.0, 0.0}; - Matrix *M = new_permuted_dense(1, 6, 1, 2, row_perm, col_perm, X); - Permuted_Dense *pd = (Permuted_Dense *) M; + matrix *M = new_permuted_dense(1, 6, 1, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; int expected[6] = {0, -1, -1, 1, -1, -1}; mu_assert("col_inv", cmp_int_array(pd->col_inv, expected, 6)); @@ -343,7 +344,7 @@ const char *test_permuted_dense_index(void) int row_perm[3] = {1, 3, 4}; int col_perm[2] = {0, 2}; double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - Matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); /* Index by [0, 3, 1, 5, 4]: - position 0 -> source row 0 (not in row_perm, zero) @@ -352,14 +353,14 @@ const char *test_permuted_dense_index(void) - position 3 -> source row 5 (not in row_perm, zero) - position 4 -> source row 4 (in row_perm at ii=2, dense) */ int indices[5] = {0, 3, 1, 5, 4}; - Matrix *out = M->index_alloc(M, indices, 5); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->index_alloc(M, indices, 5); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 5); mu_assert("out n", out->n == 4); - mu_assert("out 
nnz", out->nnz == 6); /* dense_m=3 * dense_n=2 */ - mu_assert("dense_m", out_pd->dense_m == 3); - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("out nnz", out->nnz == 6); /* m0=3 * n0=2 */ + mu_assert("m0", out_pd->m0 == 3); + mu_assert("n0", out_pd->n0 == 2); int expected_row_perm[3] = {1, 2, 4}; mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 3)); @@ -387,16 +388,16 @@ const char *test_permuted_dense_promote(void) int row_perm[1] = {0}; int col_perm[2] = {1, 3}; double X[2] = {7.0, 9.0}; - Matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); - Matrix *out = M->promote_alloc(M, 4); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->promote_alloc(M, 4); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 4); mu_assert("out n", out->n == 5); - mu_assert("out nnz", out->nnz == 8); /* dense_m=4 * dense_n=2 */ - mu_assert("dense_m", out_pd->dense_m == 4); - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("out nnz", out->nnz == 8); /* m0=4 * n0=2 */ + mu_assert("m0", out_pd->m0 == 4); + mu_assert("n0", out_pd->n0 == 2); int expected_row_perm[4] = {0, 1, 2, 3}; mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 4)); @@ -420,16 +421,16 @@ const char *test_permuted_dense_broadcast_scalar(void) int row_perm[1] = {0}; int col_perm[2] = {1, 3}; double X[2] = {7.0, 9.0}; - Matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); int d1 = 2, d2 = 3; /* out shape (2, 3), m = 6 */ - Matrix *out = M->broadcast_alloc(M, BROADCAST_SCALAR, d1, d2); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->broadcast_alloc(M, BROADCAST_SCALAR, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 6); mu_assert("out n", out->n == 5); - mu_assert("dense_m", out_pd->dense_m 
== 6); - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("m0", out_pd->m0 == 6); + mu_assert("n0", out_pd->n0 == 2); int expected_rp[6] = {0, 1, 2, 3, 4, 5}; mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 6)); @@ -445,7 +446,7 @@ const char *test_permuted_dense_broadcast_scalar(void) /* PD broadcast_alloc / broadcast_fill_values, ROW variant. (1, d2) input has Jacobian of shape (d2, n_vars). Source PD: m=d2=3, row_perm={0, 2} (rows 0 and 2 dense), col_perm={1, 4}, single dense row - per dense_m. Output (d1, d2) = (2, 3): each child row replicated d1=2 + per m0. Output (d1, d2) = (2, 3): each child row replicated d1=2 times. */ const char *test_permuted_dense_broadcast_row(void) { @@ -453,15 +454,15 @@ const char *test_permuted_dense_broadcast_row(void) int col_perm[2] = {1, 4}; double X[4] = {1.0, 2.0, /* row corresponding to child row 0 */ 3.0, 4.0}; /* row corresponding to child row 2 */ - Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); int d1 = 2, d2 = 3; /* output (2, 3), out m = 6 */ - Matrix *out = M->broadcast_alloc(M, BROADCAST_ROW, d1, d2); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->broadcast_alloc(M, BROADCAST_ROW, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 6); - mu_assert("dense_m", out_pd->dense_m == 4); /* d1 * 2 */ - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("m0", out_pd->m0 == 4); /* d1 * 2 */ + mu_assert("n0", out_pd->n0 == 2); /* row_perm = {child_row_perm[0]*d1, +1, child_row_perm[1]*d1, +1} = {0, 1, 4, 5} */ int expected_rp[4] = {0, 1, 4, 5}; @@ -486,15 +487,15 @@ const char *test_permuted_dense_broadcast_col(void) int row_perm[2] = {0, 2}; int col_perm[2] = {1, 4}; double X[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); int d1 = 
3, d2 = 2; - Matrix *out = M->broadcast_alloc(M, BROADCAST_COL, d1, d2); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->broadcast_alloc(M, BROADCAST_COL, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 6); - mu_assert("dense_m", out_pd->dense_m == 4); /* d2 * 2 */ - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("m0", out_pd->m0 == 4); /* d2 * 2 */ + mu_assert("n0", out_pd->n0 == 2); /* row_perm = {0+0, 0+2, 3+0, 3+2} = {0, 2, 3, 5} */ int expected_rp[4] = {0, 2, 3, 5}; mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 4)); @@ -510,22 +511,22 @@ const char *test_permuted_dense_broadcast_col(void) } /* PD diag_vec_alloc / diag_vec_fill_values. - Source PD shape (3, 6) with dense_m=2 (rows 0 and 2) -> output PD shape + Source PD shape (3, 6) with m0=2 (rows 0 and 2) -> output PD shape (9, 6) with the same 2 dense rows mapped to positions {0, 8} = {0*4, 2*4}. */ const char *test_permuted_dense_diag_vec(void) { int row_perm[2] = {0, 2}; int col_perm[2] = {1, 4}; double X[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); - Matrix *out = M->diag_vec_alloc(M); - Permuted_Dense *out_pd = (Permuted_Dense *) out; + matrix *out = M->diag_vec_alloc(M); + permuted_dense *out_pd = (permuted_dense *) out; mu_assert("out m", out->m == 9); mu_assert("out n", out->n == 6); - mu_assert("dense_m", out_pd->dense_m == 2); - mu_assert("dense_n", out_pd->dense_n == 2); + mu_assert("m0", out_pd->m0 == 2); + mu_assert("n0", out_pd->n0 == 2); /* row_perm = {0*(n+1), 2*(n+1)} = {0, 8} */ int expected_rp[2] = {0, 8}; mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 2)); @@ -546,18 +547,18 @@ const char *test_permuted_dense_diag_vec(void) /* Scatter a PD into a dense m x n_global buffer (row-major), zero-filled. Buffer is allocated by the caller. 
*/ -static void scatter_pd_to_dense(const Permuted_Dense *pd, int n_global, +static void scatter_pd_to_dense(const permuted_dense *pd, int n_global, double *dense) { int m = pd->base.m; memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); - for (int ii = 0; ii < pd->dense_m; ii++) + for (int ii = 0; ii < pd->m0; ii++) { int row = pd->row_perm[ii]; - for (int jj = 0; jj < pd->dense_n; jj++) + for (int jj = 0; jj < pd->n0; jj++) { int col = pd->col_perm[jj]; - dense[row * n_global + col] = pd->X[ii * pd->dense_n + jj]; + dense[row * n_global + col] = pd->X[ii * pd->n0 + jj]; } } } @@ -573,22 +574,22 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) /* X_A is (2, 2), X_B is (2, 2), both row-major. */ double XA[4] = {1.0, 2.0, 3.0, 4.0}; /* rows: [1,2], [3,4] */ double XB[4] = {5.0, 6.0, 7.0, 8.0}; /* rows: [5,6], [7,8] */ - Matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_A, XA); - Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_B, XB); - Permuted_Dense *A = (Permuted_Dense *) A_m; - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; - Matrix *C_m = permuted_dense_BTA_alloc(A, B); - Permuted_Dense *C = (Permuted_Dense *) C_m; + matrix *C_m = permuted_dense_BTA_alloc(A, B); + permuted_dense *C = (permuted_dense *) C_m; mu_assert("out m", C_m->m == 4); /* B.n */ mu_assert("out n", C_m->n == 4); /* A.n */ - mu_assert("dense_m", C->dense_m == 2); - mu_assert("dense_n", C->dense_n == 2); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); mu_assert("row_perm", cmp_int_array(C->row_perm, col_perm_B, 2)); mu_assert("col_perm", cmp_int_array(C->col_perm, col_perm_A, 2)); - permuted_dense_BTA_fill_values(A, B, C); + BTA_pd_pd_fill_values(A, B, C); /* Reference: X_B^T X_A. 
With X_B = [[5,6],[7,8]], X_A = [[1,2],[3,4]]: X_B^T = [[5,7],[6,8]]. X_B^T X_A = [[5*1+7*3, 5*2+7*4], [6*1+8*3, 6*2+8*4]] @@ -602,6 +603,43 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) return 0; } +/* BTA with empty row intersection: row_perm_A = [0, 2], row_perm_B = [1, 3]. + permuted_dense_BTA_alloc should return an empty C (nnz = 0); the fill + kernels should short-circuit without crashing. */ +const char *test_permuted_dense_BTA_empty_overlap(void) +{ + int row_perm_A[2] = {0, 2}; + int row_perm_B[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + double XB[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm_A, col_perm_A, XA); + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; + + matrix *C_m = permuted_dense_BTA_alloc(A, B); + permuted_dense *C = (permuted_dense *) C_m; + + mu_assert("out m", C_m->m == 4); /* B.n */ + mu_assert("out n", C_m->n == 4); /* A.n */ + mu_assert("m0", C->m0 == 0); + mu_assert("n0", C->n0 == 0); + mu_assert("nnz", C_m->nnz == 0); + + /* fill kernels should be safe no-ops on empty C. */ + BTA_pd_pd_fill_values(A, B, C); + double d[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_pd_pd_fill_values(A, d, B, C); + BTDA_pd_pd_fill_values(A, NULL, B, C); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + /* BTA with partial overlap: row_perm_A = [1, 3, 5], row_perm_B = [3, 5, 7]. Intersection = {3, 5}. 
*/ const char *test_permuted_dense_BTA_partial_overlap(void) @@ -619,14 +657,14 @@ const char *test_permuted_dense_BTA_partial_overlap(void) double XB[6] = {10.0, 20.0, /* row 3 (in A at pos 1) */ 30.0, 40.0, /* row 5 (in A at pos 2) */ 50.0, 60.0}; /* row 7 (NOT in A) */ - Matrix *A_m = new_permuted_dense(8, 4, 3, 2, row_perm_A, col_perm_A, XA); - Matrix *B_m = new_permuted_dense(8, 4, 3, 2, row_perm_B, col_perm_B, XB); - Permuted_Dense *A = (Permuted_Dense *) A_m; - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *A_m = new_permuted_dense(8, 4, 3, 2, row_perm_A, col_perm_A, XA); + matrix *B_m = new_permuted_dense(8, 4, 3, 2, row_perm_B, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; - Matrix *C_m = permuted_dense_BTA_alloc(A, B); - Permuted_Dense *C = (Permuted_Dense *) C_m; - permuted_dense_BTA_fill_values(A, B, C); + matrix *C_m = permuted_dense_BTA_alloc(A, B); + permuted_dense *C = (permuted_dense *) C_m; + BTA_pd_pd_fill_values(A, B, C); /* Reference: scatter A, B to dense 8x4, compute B^T A, compare block at (col_perm_B, col_perm_A). */ @@ -681,19 +719,19 @@ const char *test_permuted_dense_BTDA_decomposition(void) double XB[6] = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; double w[3] = {2.0, -1.0, 3.0}; - Matrix *A_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_A, XA); - Matrix *B_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_B, XB); - Permuted_Dense *A = (Permuted_Dense *) A_m; - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *A_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; /* tmp has the same sparsity as A. 
*/ - Matrix *tmp_m = A_m->copy_sparsity(A_m); - Permuted_Dense *tmp = (Permuted_Dense *) tmp_m; + matrix *tmp_m = A_m->copy_sparsity(A_m); + permuted_dense *tmp = (permuted_dense *) tmp_m; permuted_dense_DA_fill_values(w, A, tmp); - Matrix *C_m = permuted_dense_BTA_alloc(tmp, B); - Permuted_Dense *C = (Permuted_Dense *) C_m; - permuted_dense_BTA_fill_values(tmp, B, C); + matrix *C_m = permuted_dense_BTA_alloc(tmp, B); + permuted_dense *C = (permuted_dense *) C_m; + BTA_pd_pd_fill_values(tmp, B, C); /* Reference: dense B_d^T diag(w) A_d, extract (col_perm_B, col_perm_A) block. */ double *A_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); @@ -734,9 +772,9 @@ const char *test_permuted_dense_BTDA_decomposition(void) return 0; } -/* Scatter a CSR matrix into a dense m x n_global buffer (row-major). +/* Scatter a CSR_matrix matrix into a dense m x n_global buffer (row-major). Caller allocates and zero-fills. */ -static void scatter_csr_to_dense(const CSR_Matrix *A_csr, int n_global, +static void scatter_csr_to_dense(const CSR_matrix *A_csr, int n_global, double *dense) { int m = A_csr->m; @@ -751,17 +789,17 @@ static void scatter_csr_to_dense(const CSR_Matrix *A_csr, int n_global, } } -/* BTA(CSR A, PD B): basic correctness against a dense reference. - A is (4, 5) CSR with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], +/* BTA(CSR_matrix A, PD B): basic correctness against a dense reference. + A is (4, 5) CSR_matrix with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). 
*/ const char *test_BTA_csr_pd_basic(void) { - /* CSR A: m=4, n=5, with nonzeros: + /* CSR_matrix A: m=4, n=5, with nonzeros: row 0: cols {1, 4} row 1: cols {0, 2} row 2: cols {2} row 3: cols {1, 4} */ - CSR_Matrix *A = new_csr_matrix(4, 5, 7); + CSR_matrix *A = new_csr_matrix(4, 5, 7); A->p[0] = 0; A->p[1] = 2; A->p[2] = 4; @@ -777,19 +815,19 @@ const char *test_BTA_csr_pd_basic(void) int row_perm_B[2] = {1, 3}; int col_perm_B[2] = {0, 2}; double XB[4] = {10.0, 20.0, 30.0, 40.0}; - Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; - Matrix *out_m = BTA_csr_pd_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_csr_pd_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; /* Expected col_active: union of A's columns in rows 1 and 3 = {0, 2} ∪ {1, 4} = {0, 1, 2, 4}, size 4. */ int expected_col_perm[4] = {0, 1, 2, 4}; mu_assert("out m", out_m->m == 4); /* B.n */ mu_assert("out n", out_m->n == 5); /* A.n */ - mu_assert("dense_m", out->dense_m == 2); - mu_assert("dense_n", out->dense_n == 4); + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 4); mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 4)); @@ -835,13 +873,13 @@ const char *test_BTA_csr_pd_basic(void) return 0; } -/* BTA(CSR A, PD B) where A is a leaf-variable Jacobian (identity-in-block). +/* BTA(CSR_matrix A, PD B) where A is a leaf-variable Jacobian (identity-in-block). A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = X_B^T. 
*/ const char *test_BTA_csr_pd_leaf_variable(void) { - CSR_Matrix *A = new_csr_matrix(4, 8, 4); + CSR_matrix *A = new_csr_matrix(4, 8, 4); for (int k = 0; k < 4; k++) { A->p[k] = k; @@ -853,15 +891,15 @@ const char *test_BTA_csr_pd_leaf_variable(void) int row_perm_B[2] = {1, 3}; int col_perm_B[2] = {0, 2}; double XB[4] = {10.0, 20.0, 30.0, 40.0}; /* row-major (2, 2) */ - Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; - Matrix *out_m = BTA_csr_pd_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_csr_pd_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; int expected_col_perm[2] = {5, 7}; - mu_assert("dense_m", out->dense_m == 2); - mu_assert("dense_n", out->dense_n == 2); + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 2)); @@ -877,12 +915,12 @@ const char *test_BTA_csr_pd_leaf_variable(void) return 0; } -/* BTA(CSR A, PD B) where A has no entries in any row of row_perm_B. - Output dense block should have dense_n = 0. */ +/* BTA(CSR_matrix A, PD B) where A has no entries in any row of row_perm_B. + Output dense block should have n0 = 0. */ const char *test_BTA_csr_pd_no_overlap(void) { /* A: rows 0 and 2 have entries; rows 1 and 3 (row_perm_B) are empty. 
*/ - CSR_Matrix *A = new_csr_matrix(4, 5, 3); + CSR_matrix *A = new_csr_matrix(4, 5, 3); A->p[0] = 0; A->p[1] = 2; A->p[2] = 2; @@ -896,14 +934,14 @@ const char *test_BTA_csr_pd_no_overlap(void) int row_perm_B[2] = {1, 3}; /* rows that ARE empty in A */ int col_perm_B[2] = {0, 2}; double XB[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - Permuted_Dense *B = (Permuted_Dense *) B_m; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; - Matrix *out_m = BTA_csr_pd_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_csr_pd_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; - mu_assert("dense_m", out->dense_m == 2); - mu_assert("dense_n", out->dense_n == 0); + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 0); /* Fill should be a no-op (0-sized dense block). */ BTA_csr_pd_fill_values(A, B, out); @@ -914,9 +952,9 @@ const char *test_BTA_csr_pd_no_overlap(void) return 0; } -/* BTA(PD A, CSR B): basic correctness against a dense reference. +/* BTA(PD A, CSR_matrix B): basic correctness against a dense reference. A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). - B is (4, 4) CSR with arbitrary sparsity. */ + B is (4, 4) CSR_matrix with arbitrary sparsity. */ const char *test_BTA_pd_csr_basic(void) { /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. @@ -924,15 +962,15 @@ const char *test_BTA_pd_csr_basic(void) int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; double XA[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - Permuted_Dense *A = (Permuted_Dense *) A_m; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; - /* CSR B: m=4, n=4. + /* CSR_matrix B: m=4, n=4. 
row 0: cols {1, 3} row 1: cols {0, 2} row 2: cols {2} row 3: cols {0, 3} */ - CSR_Matrix *B = new_csr_matrix(4, 4, 7); + CSR_matrix *B = new_csr_matrix(4, 4, 7); B->p[0] = 0; B->p[1] = 2; B->p[2] = 4; @@ -943,16 +981,16 @@ const char *test_BTA_pd_csr_basic(void) memcpy(B->i, Bi, sizeof Bi); memcpy(B->x, Bx, sizeof Bx); - Matrix *out_m = BTA_pd_csr_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_pd_csr_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; /* row_active = union of B's cols in rows 1 and 3 = {0, 2} ∪ {0, 3} = {0, 2, 3}, size 3. */ int expected_row_perm[3] = {0, 2, 3}; mu_assert("out m", out_m->m == 4); /* B.n */ mu_assert("out n", out_m->n == 5); /* A.n */ - mu_assert("dense_m", out->dense_m == 3); - mu_assert("dense_n", out->dense_n == 2); + mu_assert("m0", out->m0 == 3); + mu_assert("n0", out->n0 == 2); mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); @@ -997,7 +1035,7 @@ const char *test_BTA_pd_csr_basic(void) return 0; } -/* BTA(PD A, CSR B) where B is a leaf-variable Jacobian (identity-in-block). +/* BTA(PD A, CSR_matrix B) where B is a leaf-variable Jacobian (identity-in-block). B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. 
*/ const char *test_BTA_pd_csr_leaf_variable(void) @@ -1005,10 +1043,10 @@ const char *test_BTA_pd_csr_leaf_variable(void) int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; double XA[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - Permuted_Dense *A = (Permuted_Dense *) A_m; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; - CSR_Matrix *B = new_csr_matrix(4, 8, 4); + CSR_matrix *B = new_csr_matrix(4, 8, 4); for (int k = 0; k < 4; k++) { B->p[k] = k; @@ -1017,12 +1055,12 @@ const char *test_BTA_pd_csr_leaf_variable(void) } B->p[4] = 4; - Matrix *out_m = BTA_pd_csr_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_pd_csr_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; int expected_row_perm[2] = {5, 7}; - mu_assert("dense_m", out->dense_m == 2); - mu_assert("dense_n", out->dense_n == 2); + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); @@ -1037,18 +1075,18 @@ const char *test_BTA_pd_csr_leaf_variable(void) return 0; } -/* BTA(PD A, CSR B) where B has no entries in any row of row_perm_A. - Output dense block should have dense_m = 0. */ +/* BTA(PD A, CSR_matrix B) where B has no entries in any row of row_perm_A. + Output dense block should have m0 = 0. */ const char *test_BTA_pd_csr_no_overlap(void) { int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; double XA[4] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - Permuted_Dense *A = (Permuted_Dense *) A_m; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. 
*/ - CSR_Matrix *B = new_csr_matrix(4, 4, 3); + CSR_matrix *B = new_csr_matrix(4, 4, 3); B->p[0] = 0; B->p[1] = 2; B->p[2] = 2; @@ -1059,11 +1097,11 @@ const char *test_BTA_pd_csr_no_overlap(void) memcpy(B->i, Bi, sizeof Bi); memcpy(B->x, Bx, sizeof Bx); - Matrix *out_m = BTA_pd_csr_alloc(A, B); - Permuted_Dense *out = (Permuted_Dense *) out_m; + matrix *out_m = BTA_pd_csr_alloc(A, B); + permuted_dense *out = (permuted_dense *) out_m; - mu_assert("dense_m", out->dense_m == 0); - mu_assert("dense_n", out->dense_n == 2); + mu_assert("m0", out->m0 == 0); + mu_assert("n0", out->n0 == 2); /* Fill should be a no-op (0-sized dense block on the row axis). */ BTA_pd_csr_fill_values(A, B, out); diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index c6d6859..bded27f 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -53,8 +53,8 @@ const char *test_wsum_hess_left_matmul(void) expr *x = new_variable(3, 1, 0, 3); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format */ + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -98,7 +98,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_Matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_csr_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; @@ -107,7 +107,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = 
{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -159,8 +159,8 @@ const char *test_wsum_hess_left_matmul_matrix(void) expr *x = new_variable(3, 2, 0, 6); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR format */ + CSR_matrix *A = new_csr_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index dc843cc..959d37f 100644 --- a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -23,8 +23,8 @@ const char *test_wsum_hess_right_matmul(void) expr *x = new_variable(2, 2, 0, 4); - /* Create sparse matrix A in CSR format (2x3) */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + /* Create sparse matrix A in CSR format (2x3) */ + CSR_matrix *A = new_csr_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -72,8 +72,8 @@ const char *test_wsum_hess_right_matmul_vector(void) expr *x = new_variable(1, 3, 0, 3); - /* Create sparse matrix A in CSR format (3x2) */ - CSR_Matrix *A = new_csr_matrix(3, 2, 4); + /* Create sparse matrix A in CSR format (3x2) */ + CSR_matrix *A = new_csr_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index cc1b69c..06fcf4c 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -16,7 +16,7 @@ const char *test_wsum_hess_sum_exp_linear(void) double Ax[6] = {1, 1, 2, 3, 1, -1}; int Ai[6] = {0, 1, 0, 1, 0, 1}; int Ap[4] = {0, 2, 4, 6}; - CSR_Matrix *A = new_csr_matrix(3, 2, 6); + CSR_matrix *A = new_csr_matrix(3, 2, 6); memcpy(A->x, Ax, 6 * sizeof(double)); memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 4 *
sizeof(int)); diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index d1b8484..bdb9c34 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -41,14 +41,14 @@ const char *test_wsum_hess_multiply_1(void) const char *test_wsum_hess_multiply_sparse_random(void) { /* Test with larger random sparse matrices - * A: 5x10 CSR matrix - * B: 5x10 CSR matrix + * A: 5x10 CSR matrix + * B: 5x10 CSR matrix * x: 10-dimensional variable with var_id = 0, n_vars = 10 * Expected Hessian: 10x10 sparse matrix */ /* Create A matrix (5x10) */ - CSR_Matrix *A = new_csr_matrix(5, 10, 10); + CSR_matrix *A = new_csr_matrix(5, 10, 10); double Ax[10] = {-1.44165273, -1.13687223, 0.55892257, 0.24912193, 0.84959744, -0.23998915, 0.5913356, -1.21627912, -0.50379166, 0.41531801}; int Ai[10] = {1, 2, 4, 8, 2, 3, 8, 9, 1, 2}; @@ -58,7 +58,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) memcpy(A->p, Ap, 6 * sizeof(int)); /* Create B matrix (5x10) */ - CSR_Matrix *B = new_csr_matrix(5, 10, 10); + CSR_matrix *B = new_csr_matrix(5, 10, 10); double Bx[10] = {1.27549062, 0.04194731, -0.4356034, 0.405574, 1.34670487, -0.57738638, 0.9411464, -0.31563179, 1.90831766, -0.89802958}; int Bi[10] = {0, 3, 5, 7, 0, 5, 0, 3, 7, 9}; @@ -85,7 +85,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) double w[5] = {0.50646339, 0.44756224, 0.67295241, 0.16424956, 0.03031469}; mult_node->eval_wsum_hess(mult_node, w); - /* Expected Hessian in CSR format (10x10) */ + /* Expected Hessian in CSR format (10x10) */ int expected_p[11] = {0, 6, 9, 13, 18, 19, 20, 20, 22, 25, 29}; int expected_i[29] = {1, 2, 3, 4, 8, 9, 0, 7, 9, 0, 3, 7, 9, 0, 2, 3, 8, 9, 0, 8, 1, 2, 0, 3, 5, 0, 1, 2, 3}; @@ -128,8 +128,8 @@ const char *test_wsum_hess_multiply_linear_ops(void) * [ 75. 18.
76.]] */ - /* Create CSR matrix A */ - CSR_Matrix *A = new_csr_matrix(4, 3, 6); + /* Create CSR matrix A */ + CSR_matrix *A = new_csr_matrix(4, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 1, 0, 2, 1}; int Ap[5] = {0, 2, 3, 5, 6}; memcpy(A->x, Ax, 6 * sizeof(double)); memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 5 * sizeof(int)); - /* Create CSR matrix B */ - CSR_Matrix *B = new_csr_matrix(4, 3, 8); + /* Create CSR matrix B */ + CSR_matrix *B = new_csr_matrix(4, 3, 8); double Bx[8] = {1.0, 4.0, 2.0, 7.0, 3.0, 2.0, 4.0, -1.0}; int Bi[8] = {0, 2, 1, 2, 0, 2, 1, 2}; int Bp[5] = {0, 2, 4, 6, 8}; @@ -167,7 +167,7 @@ const char *test_wsum_hess_multiply_linear_ops(void) mult_node->eval_wsum_hess(mult_node, w); /* Check sparsity pattern and values */ - /* Expected CSR format: + /* Expected CSR format: * indptr: [0, 2, 4, 7] * indices: [0, 2, 1, 2, 0, 1, 2] * data: [74.0, 75.0, 216.0, 18.0, 75.0, 18.0, 76.0] diff --git a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h index c5c1a60..4b9bd15 100644 --- a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h +++ b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h @@ -123,8 +123,8 @@ const char *test_wsum_hess_Ax_Bx_multiply(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[2] = {1.33, 2.1}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *x = new_variable(2, 1, 1, 4); expr *Ax = new_left_matmul(NULL, x, A); expr *Bx = new_left_matmul(NULL, x, B); @@ -159,8 +159,8 @@ const char *test_wsum_hess_AX_BX_multiply(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[4] = {1.1, 2.2, 3.3, 4.4}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); +
CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 4); expr *AX = new_left_matmul(NULL, X, A); expr *BX = new_left_matmul(NULL, X, B); @@ -180,8 +180,8 @@ const char *test_wsum_hess_multiply_deep_composite(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[4] = {1.1, 2.2, 3.3, 4.4}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 8); expr *Y = new_variable(2, 2, 0, 8); expr *AX = new_left_matmul(NULL, X, A); @@ -205,10 +205,10 @@ const char *test_wsum_hess_quad_form_Ax(void) double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double w = 1.0; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -234,10 +234,10 @@ const char *test_wsum_hess_quad_form_sin_Ax(void) double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double w = 2.0; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -303,8 +303,8 @@ const char *test_wsum_hess_matmul_Ax_By(void) double u_vals[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; double w[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - CSR_Matrix *A = new_csr_random(3, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(3, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(2, 2, 0, 10); expr *Y = new_variable(3, 2, 4, 10); @@ -327,8 +327,8 @@ const char 
*test_wsum_hess_matmul_sin_Ax_cos_Bx(void) double u_vals[6] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0}; double w[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = new_csr_random(2, 3, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(2, 3, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(3, 2, 0, 6); expr *AX = new_left_matmul(NULL, X, A); /* 2x2 */ @@ -368,7 +368,7 @@ const char *test_wsum_hess_quad_form_exp(void) double w = 3.0; /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_csr_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index cd2a9d6..89ca601 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -55,7 +55,7 @@ const char *test_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_Matrix *A_csr = new_csr_matrix(3, 5, 15); + CSR_matrix *A_csr = new_csr_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); diff --git a/tests/wsum_hess/other/test_prod_axis_one.h b/tests/wsum_hess/other/test_prod_axis_one.h index 3ba1326..21b810d 100644 --- a/tests/wsum_hess/other/test_prod_axis_one.h +++ b/tests/wsum_hess/other/test_prod_axis_one.h @@ -47,7 +47,7 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): [8, 4] (excludes col 2) */ 8.0, 4.0}; - /* Row pointers (monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR_matrix format) */ int expected_p[9] = {0, 0, 2, 4, 6, 8, 10, 12, 12}; /* Column indices 
(each row of the matrix interacts with its own columns, @@ -141,7 +141,7 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) expected_x[16] = 18.0; expected_x[17] = 9.0; - /* Row pointers (monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR format) */ int expected_p[11] = {0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18}; /* Column indices (each row of the matrix interacts with its own columns, @@ -210,7 +210,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) memset(expected_x, 0, sizeof(expected_x)); /* For a 5x3 matrix with var_id=1, each row has 2 nnz (d2-1): - * CSR row pointers: p[i] = (i-1)*2 for i in [1,15] + * CSR row pointers: p[i] = (i-1)*2 for i in [1,15] * Var 1 (matrix [0,0]): p[1]=0 * Var 2 (matrix [1,0]): p[2]=2 * Var 3 (matrix [2,0]): p[3]=4 @@ -293,7 +293,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) expected_x[28] = 25.0; /* 75/(3*1) */ expected_x[29] = 5.0; /* 75/(3*5) */ - /* Row pointers (monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR format) */ int expected_p[17] = {0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; diff --git a/tests/wsum_hess/other/test_prod_axis_zero.h b/tests/wsum_hess/other/test_prod_axis_zero.h index 4e5a8f4..e057e47 100644 --- a/tests/wsum_hess/other/test_prod_axis_zero.h +++ b/tests/wsum_hess/other/test_prod_axis_zero.h @@ -75,7 +75,7 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) { /* x is 5x3 variable, global index 1, total 16 vars * x = [1, 1, 1, 1, 1, 2, 0, 3, 4, 5, 1, 0, 0, 2, 3] (column-major) - * Matrix (column-major): + * Matrix (column-major): * [1, 2, 1] * [1, 0, 0] * [1, 3, 0] @@ -196,7 +196,7 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) /* Test with a column that has exactly 1 zero * x is 2x2 variable, global index 1, total 5 vars * x = [1.0, 1.0, 2.0, 0.0] (column-major) - * Matrix (column-major): + *
matrix (column-major): * [1, 2] * [1, 0] * diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index b97e682..444be3a 100644 --- a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -17,7 +17,7 @@ const char *test_wsum_hess_quad_form(void) * [0 3 4 1] * [0 0 1 6] */ - CSR_Matrix *Q = new_csr_matrix(4, 4, 10); + CSR_matrix *Q = new_csr_matrix(4, 4, 10); double Qx[10] = {1.0, 2.0, 2.0, 5.0, 3.0, 3.0, 4.0, 1.0, 1.0, 6.0}; int Qi[10] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; int Qp[5] = {0, 2, 5, 8, 10}; From c70018344389a211d7aad5f05923a3a85b10912f Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 10:50:33 +0200 Subject: [PATCH 12/31] more cache friendly BA_pd_csc_fill_values --- include/utils/permuted_dense.h | 16 ++--- src/utils/matrix_BTA.c | 5 +- src/utils/permuted_dense.c | 110 ++++++++++++++++++++---------- tests/utils/test_matrix_BTA.h | 3 +- tests/utils/test_permuted_dense.h | 18 ++--- 5 files changed, 94 insertions(+), 58 deletions(-) diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 0157da0..8cdf363 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -59,7 +59,7 @@ typedef struct permuted_dense /* Mutable int scratch. Currently only used to hold the row-intersection index arrays idx_A / idx_B in BTA_pd_pd_fill_values and the slow path of BTDA_pd_pd_fill_values; allocated by - permuted_dense_BTA_alloc for those outputs (NULL on PDs from other + BTA_pd_pd_alloc for those outputs (NULL on PDs from other allocators). Fill kernels fall back to a per-call SP_MALLOC if iwork_size is too small. */ int *iwork; @@ -97,7 +97,7 @@ void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, /* Fill sparsity of C = BT @ A where A and B are both permuted_dense. 
(If B and A have no overlapping rows, then C is empty) */ -matrix *permuted_dense_BTA_alloc(const permuted_dense *A, const permuted_dense *B); +matrix *BTA_pd_pd_alloc(const permuted_dense *A, const permuted_dense *B); /* Fill values of C = BT @ A where A and B are both permuted dense. */ void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, @@ -107,13 +107,13 @@ void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, and B is PD. Output is PD with row_perm = B->col_perm and col_perm = the sorted union of columns appearing in A's rows at positions row_perm_B. Dense block size = (B->n0, |col_active|). Values uninitialized. */ -matrix *BTA_csr_pd_alloc(const CSR_matrix *A_csr, const permuted_dense *B); +matrix *BTA_csr_pd_alloc(const CSR_matrix *A, const permuted_dense *B); /* Fill C->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at positions row_perm_B, columns restricted to C's col_perm, scattered to a dense buffer. C must have the structure produced by - BTA_csr_pd_alloc(A_csr, B). */ -void BTA_csr_pd_fill_values(const CSR_matrix *A_csr, const permuted_dense *B, + BTA_csr_pd_alloc(A, B). */ +void BTA_csr_pd_fill_values(const CSR_matrix *A, const permuted_dense *B, permuted_dense *C); /* Allocate a new permuted_dense for C = B^T @ A where A is PD and B is @@ -142,12 +142,12 @@ void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, (A->base.m, J->n) with row_perm = A->row_perm and col_perm equal to the sorted list of columns of J that have at least one structural nonzero in some row in A->col_perm. Values are uninitialized. */ -matrix *permuted_dense_times_csc_alloc(const permuted_dense *A, const CSC_matrix *J); +matrix *BA_pd_csc_alloc(const permuted_dense *A, const CSC_matrix *J); /* Fill C.X[ii, jj] = sum_kk A.X[ii, kk] * J[col_perm_A[kk], col_perm_C[jj]]. C must have the structure produced by - permuted_dense_times_csc_alloc(A, J). 
*/ -void permuted_dense_times_csc_fill_values(const permuted_dense *A, + BA_pd_csc_alloc(A, J). */ +void BA_pd_csc_fill_values(const permuted_dense *A, const CSC_matrix *J, permuted_dense *C); #endif /* PERMUTED_DENSE_H */ diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index 51261a2..af764b3 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -23,7 +23,7 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B) if (pd_A && pd_B) { - return permuted_dense_BTA_alloc(pd_A, pd_B); + return BTA_pd_pd_alloc(pd_A, pd_B); } if (pd_B) { @@ -38,7 +38,8 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B) return BTA_pd_csr_alloc(pd_A, B_csr); } - /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. */ + /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. + */ sparse_matrix *sm_A = (sparse_matrix *) A; sparse_matrix *sm_B = (sparse_matrix *) B; A->refresh_csc_values(A); diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 65c9e04..faa2ad3 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -476,7 +476,7 @@ void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, A->dwork, n0, 0.0, C->X, n0); } -matrix *permuted_dense_BTA_alloc(const permuted_dense *A, const permuted_dense *B) +matrix *BTA_pd_pd_alloc(const permuted_dense *A, const permuted_dense *B) { /* if A and B have no overlapping rows, then C = BT @ A is empty */ if (!has_overlap(A->row_perm, A->m0, B->row_perm, B->m0, 0)) @@ -616,19 +616,22 @@ void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, free_matrix(&DA->base); } -matrix *BTA_csr_pd_alloc(const CSR_matrix *A_csr, const permuted_dense *B) +matrix *BTA_csr_pd_alloc(const CSR_matrix *A, const permuted_dense *B) { + /* Cij != 0 only if i is in B's column permutation and column j of A + overlaps with column i of B. */ + /* Gather the union of columns appearing in A's rows at positions - row_perm_B. 
Use a bitmap of size A_csr->n for O(nnz) collection. */ - int p = A_csr->n; + row_perm_B. Use a bitmap of size A->n for O(nnz) collection. */ + int p = A->n; char *seen = (char *) SP_CALLOC(p, sizeof(char)); int s_A = 0; for (int kk = 0; kk < B->m0; kk++) { int row = B->row_perm[kk]; - for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + for (int e = A->p[row]; e < A->p[row + 1]; e++) { - int j = A_csr->i[e]; + int j = A->i[e]; if (!seen[j]) { seen[j] = 1; @@ -881,49 +884,82 @@ void BTDA_pd_csr_fill_values(const permuted_dense *A, const double *d, B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); } -matrix *permuted_dense_times_csc_alloc(const permuted_dense *A, const CSC_matrix *J) +/* Return true if any of the 'len' integers in 'indices' exist in the set + marked by 'inv' (inv[k] != -1 iff k is in the set). */ +static inline bool idxs_hits_set(const int *idxs, int len, const int *inv) { - /* Active columns: those with at least one structural nonzero in a row - of col_perm_A. col_inv[r] != -1 iff r is in col_perm_A. */ - iVec *col_perm_C = iVec_new(8); - for (int j = 0; j < J->n; j++) + for (int ii = 0; ii < len; ii++) { - for (int e = J->p[j]; e < J->p[j + 1]; e++) + if (inv[idxs[ii]] != -1) { - if (A->col_inv[J->i[e]] != -1) - { - iVec_append(col_perm_C, j); - break; - } + return true; } } + return false; +} + +/* Inner product of a sparse vector (vals[0..len) at positions idxs[0..len)) + with a dense vector, where inv maps each idxs value to a position in + 'dense' (inv[k] == -1 means skip that entry). Returns + Σ_e vals[e] * dense[inv[idxs[e]]] + over e where inv[idxs[e]] != -1. 
*/ +static inline double sparse_dot_dense(const double *vals, const int *idxs, int len, + const int *inv, const double *dense) +{ + double sum = 0.0; + for (int e = 0; e < len; e++) + { + int kk = inv[idxs[e]]; + if (kk == -1) continue; + sum += vals[e] * dense[kk]; + } + return sum; +} - matrix *C = new_permuted_dense(A->base.m, J->n, A->m0, col_perm_C->len, - A->row_perm, col_perm_C->data, NULL); +matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) +{ + /* Cij != 0 if row i of B overlaps with column j of A. So we loop through + the columns of A. For each column of A, we check if it has any nonzeros in rows + that are in B's col_perm. If yes, column j of C will have a nonzero block + corresponding to the rows of B */ + iVec *col_perm_C = iVec_new(10); + for (int j = 0; j < A->n; j++) + { + int start = A->p[j]; + int len = A->p[j + 1] - start; + if (idxs_hits_set(A->i + start, len, B->col_inv)) + { + iVec_append(col_perm_C, j); + } + } + + matrix *C = new_permuted_dense(B->base.m, A->n, B->m0, col_perm_C->len, + B->row_perm, col_perm_C->data, NULL); iVec_free(col_perm_C); return C; } -void permuted_dense_times_csc_fill_values(const permuted_dense *A, - const CSC_matrix *J, permuted_dense *C) +void BA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, + permuted_dense *C) { - int m0 = A->m0; - int n0_A = A->n0; - int n0_C = C->n0; - - /* Each entry (r, val) of J in active columns with r in col_perm_A - contributes val * A.X[:, kk] to C.X[:, jj], where kk = col_inv[r] - and jj is the position of the column in col_perm_C. Columns of X - and C.X are accessed via row-major strides. 
*/ - memset(C->X, 0, m0 * n0_C * sizeof(double)); - for (int jj = 0; jj < n0_C; jj++) - { - int col = C->col_perm[jj]; - for (int e = J->p[col]; e < J->p[col + 1]; e++) + /* C[i, j] = bi^T @ ajj, where bi is the ith row of Bs dense block and ajj is + the jjth column of A's sparse block (column jj = C->col_perm[j]) */ + + /* row i of C */ + for (int i = 0; i < C->m0; i++) + { + double *ci = C->X + i * C->n0; + + /* col j of C */ + for (int j = 0; j < C->n0; j++) { - int kk = A->col_inv[J->i[e]]; - if (kk == -1) continue; - cblas_daxpy(m0, J->x[e], A->X + kk, n0_A, C->X + jj, n0_C); + + /* we compute entry C[i, j] */ + int jj = C->col_perm[j]; + int start = A->p[jj]; + int len = A->p[jj + 1] - start; + ci[j] = sparse_dot_dense(A->x + start, A->i + start, len, B->col_inv, + B->X + i * B->n0); } } } diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h index 01edb80..2c99cb0 100644 --- a/tests/utils/test_matrix_BTA.h +++ b/tests/utils/test_matrix_BTA.h @@ -31,8 +31,7 @@ const char *test_BTDA_matrices_pd_pd(void) /* Direct primitive path on independent operands. 
*/ matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); - matrix *C2 = - permuted_dense_BTA_alloc((permuted_dense *) A2, (permuted_dense *) B2); + matrix *C2 = BTA_pd_pd_alloc((permuted_dense *) A2, (permuted_dense *) B2); BTDA_pd_pd_fill_values((permuted_dense *) A2, d, (permuted_dense *) B2, (permuted_dense *) C2); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 24ff605..eef17a5 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -232,9 +232,9 @@ const char *test_permuted_dense_times_csc(void) memcpy(J->i, Ji, 4 * sizeof(int)); memcpy(J->x, Jx, 4 * sizeof(double)); - matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + matrix *M_out = BA_pd_csc_alloc(pd, J); permuted_dense *pd_out = (permuted_dense *) M_out; - permuted_dense_times_csc_fill_values(pd, J, pd_out); + BA_pd_csc_fill_values(pd, J, pd_out); int row_perm_expected[3] = {1, 2, 4}; int col_perm_expected[2] = {1, 3}; @@ -273,9 +273,9 @@ const char *test_permuted_dense_times_csc_no_active(void) memcpy(J->i, Ji, 2 * sizeof(int)); memcpy(J->x, Jx, 2 * sizeof(double)); - matrix *M_out = permuted_dense_times_csc_alloc(pd, J); + matrix *M_out = BA_pd_csc_alloc(pd, J); permuted_dense *pd_out = (permuted_dense *) M_out; - permuted_dense_times_csc_fill_values(pd, J, pd_out); + BA_pd_csc_fill_values(pd, J, pd_out); mu_assert("m", M_out->m == 5); mu_assert("n", M_out->n == 2); @@ -579,7 +579,7 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = permuted_dense_BTA_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(A, B); permuted_dense *C = (permuted_dense *) C_m; mu_assert("out m", C_m->m == 4); /* B.n */ @@ -604,7 +604,7 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) } /* BTA with empty row intersection: row_perm_A = [0, 2], 
row_perm_B = [1, 3]. - permuted_dense_BTA_alloc should return an empty C (nnz = 0); the fill + BTA_pd_pd_alloc should return an empty C (nnz = 0); the fill kernels should short-circuit without crashing. */ const char *test_permuted_dense_BTA_empty_overlap(void) { @@ -619,7 +619,7 @@ const char *test_permuted_dense_BTA_empty_overlap(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = permuted_dense_BTA_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(A, B); permuted_dense *C = (permuted_dense *) C_m; mu_assert("out m", C_m->m == 4); /* B.n */ @@ -662,7 +662,7 @@ const char *test_permuted_dense_BTA_partial_overlap(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = permuted_dense_BTA_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(A, B); permuted_dense *C = (permuted_dense *) C_m; BTA_pd_pd_fill_values(A, B, C); @@ -729,7 +729,7 @@ const char *test_permuted_dense_BTDA_decomposition(void) permuted_dense *tmp = (permuted_dense *) tmp_m; permuted_dense_DA_fill_values(w, A, tmp); - matrix *C_m = permuted_dense_BTA_alloc(tmp, B); + matrix *C_m = BTA_pd_pd_alloc(tmp, B); permuted_dense *C = (permuted_dense *) C_m; BTA_pd_pd_fill_values(tmp, B, C); From 2592efd19f734bfc89c60fa40127937e172f0c7c Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 11:11:36 +0200 Subject: [PATCH 13/31] edits --- src/utils/permuted_dense.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index faa2ad3..b6d31dd 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -561,8 +561,10 @@ void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, return; } - /* find intersection of row permutations. 
We use C->iwork as the storage - for idx_A | idx_B (back-to-back) and grow it in place if too small */ + // ----------------------------------------------------------------------- + // find intersection of row permutations. We use C->iwork as the storage + // for idx_A | idx_B (back-to-back) and grow it in place if too small + // ----------------------------------------------------------------------- int max_s = MIN(A->m0, B->m0); size_t needed = 2 * (size_t) max_s; if (C->iwork_size < needed) @@ -577,9 +579,11 @@ void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, idx_B); assert(s > 0); - /* Gather the matching rows into A->dwork and B->dwork (space is sufficient - since A->dwork has at least space for A's full block, and we only need part - of it. Same comment applies to B->dwork). */ + // ------------------------------------------------------------------------ + // Gather the matching rows into A->dwork and B->dwork (space is sufficient + // since A->dwork has at least space for A's full block, and we only need + // part of it. Same comment applies to B->dwork). + // ------------------------------------------------------------------------ for (int k = 0; k < s; k++) { memcpy(A->dwork + k * A->n0, A->X + idx_A[k] * A->n0, @@ -900,9 +904,7 @@ static inline bool idxs_hits_set(const int *idxs, int len, const int *inv) /* Inner product of a sparse vector (vals[0..len) at positions idxs[0..len)) with a dense vector, where inv maps each idxs value to a position in - 'dense' (inv[k] == -1 means skip that entry). Returns - Σ_e vals[e] * dense[inv[idxs[e]]] - over e where inv[idxs[e]] != -1. */ + 'dense' (inv[k] == -1 means skip that entry). 
*/ static inline double sparse_dot_dense(const double *vals, const int *idxs, int len, const int *inv, const double *dense) { @@ -910,7 +912,10 @@ static inline double sparse_dot_dense(const double *vals, const int *idxs, int l for (int e = 0; e < len; e++) { int kk = inv[idxs[e]]; - if (kk == -1) continue; + if (kk == -1) + { + continue; + } sum += vals[e] * dense[kk]; } return sum; @@ -919,9 +924,9 @@ static inline double sparse_dot_dense(const double *vals, const int *idxs, int l matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) { /* Cij != 0 if row i of B overlaps with column j of A. So we loop through - the columns of A. For each column of A, we check if it has any nonzeros in rows - that are in B's col_perm. If yes, column j of C will have a nonzero block - corresponding to the rows of B */ + the columns of A. For each column of A, we check if it has any nonzeros in + rows that are in B's col_perm. If yes, column j of C will have a nonzero + block corresponding to the rows of B */ iVec *col_perm_C = iVec_new(10); for (int j = 0; j < A->n; j++) { @@ -954,10 +959,10 @@ void BA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, for (int j = 0; j < C->n0; j++) { - /* we compute entry C[i, j] */ int jj = C->col_perm[j]; int start = A->p[jj]; int len = A->p[jj + 1] - start; + /* we compute entry C[i, j] */ ci[j] = sparse_dot_dense(A->x + start, A->i + start, len, B->col_inv, B->X + i * B->n0); } From 648657e5f214e4cba5beae547e2b05edd67e26a8 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 12:48:39 +0200 Subject: [PATCH 14/31] better permuted dense times CSC function --- include/old-code/old_permuted_dense.h | 55 ++++ include/utils/CSC_matrix.h | 10 +- include/utils/CSR_matrix.h | 12 +- include/utils/permuted_dense.h | 75 +++-- include/utils/sparse_matrix.h | 2 +- src/atoms/affine/convolve.c | 6 +- src/atoms/affine/hstack.c | 12 +- src/atoms/affine/left_matmul.c | 4 +- src/atoms/affine/right_matmul.c | 2 +- 
src/atoms/affine/sum.c | 2 +- src/atoms/affine/trace.c | 2 +- src/atoms/affine/transpose.c | 2 +- src/atoms/affine/variable.c | 2 +- src/atoms/bivariate_full_dom/matmul.c | 22 +- src/atoms/bivariate_full_dom/multiply.c | 4 +- .../bivariate_restricted_dom/quad_over_lin.c | 6 +- src/atoms/bivariate_restricted_dom/rel_entr.c | 4 +- .../rel_entr_scalar_vector.c | 4 +- .../rel_entr_vector_scalar.c | 4 +- src/atoms/elementwise_full_dom/common.c | 4 +- src/atoms/elementwise_restricted_dom/common.c | 4 +- src/atoms/other/prod.c | 4 +- src/atoms/other/prod_axis_one.c | 4 +- src/atoms/other/prod_axis_zero.c | 4 +- src/atoms/other/quad_form.c | 14 +- src/expr.c | 2 +- src/old-code/linear_op.c | 6 +- src/old-code/old_CSR.c | 6 +- src/old-code/old_permuted_dense.c | 155 +++++++++++ src/problem.c | 8 +- src/utils/CSC_matrix.c | 14 +- src/utils/CSR_matrix.c | 14 +- src/utils/CSR_sum.c | 2 +- src/utils/linalg_dense_sparse_matmuls.c | 6 +- src/utils/linalg_sparse_matmuls.c | 4 +- src/utils/matrix_BTA.c | 20 +- src/utils/permuted_dense.c | 236 +++++----------- src/utils/sparse_matrix.c | 14 +- tests/all_tests.c | 14 +- tests/forward_pass/affine/test_linear_op.h | 4 +- .../jacobian_tests/affine/test_left_matmul.h | 16 +- .../jacobian_tests/affine/test_right_matmul.h | 8 +- tests/jacobian_tests/affine/test_transpose.h | 4 +- .../test_elementwise_mult.h | 12 +- .../test_quad_over_lin.h | 12 +- .../composite/test_chain_rule_jacobian.h | 26 +- .../composite/test_composite_exp.h | 12 +- tests/jacobian_tests/other/test_quad_form.h | 12 +- tests/numerical_diff/test_numerical_diff.h | 8 +- tests/old-code/test_old_permuted_dense.h | 182 ++++++++++++ tests/profiling/profile_BTA_pd_csr_vs_csc.h | 133 +++++++++ tests/profiling/profile_left_matmul.h | 4 +- tests/profiling/profile_log_reg.h | 4 +- tests/profiling/profile_trimmed_log_reg.h | 20 +- tests/test_helpers.c | 2 +- tests/utils/test_coo_matrix.h | 12 +- tests/utils/test_csc_matrix.h | 34 +-- tests/utils/test_csr_csc_conversion.h | 26 +- 
tests/utils/test_csr_matrix.h | 84 +++--- tests/utils/test_linalg_sparse_matmuls.h | 66 ++--- .../test_linalg_utils_matmul_chain_rule.h | 24 +- tests/utils/test_matrix.h | 4 +- tests/utils/test_matrix_BTA.h | 29 +- tests/utils/test_permuted_dense.h | 260 ++++++------------ tests/wsum_hess/affine/test_left_matmul.h | 16 +- tests/wsum_hess/affine/test_right_matmul.h | 8 +- tests/wsum_hess/affine/test_sum.h | 4 +- .../bivariate_full_dom/test_multiply.h | 16 +- .../composite/test_chain_rule_wsum_hess.h | 36 +-- .../elementwise_restricted_dom/test_log.h | 4 +- tests/wsum_hess/other/test_quad_form.h | 4 +- 71 files changed, 1100 insertions(+), 751 deletions(-) create mode 100644 include/old-code/old_permuted_dense.h create mode 100644 src/old-code/old_permuted_dense.c create mode 100644 tests/old-code/test_old_permuted_dense.h create mode 100644 tests/profiling/profile_BTA_pd_csr_vs_csc.h diff --git a/include/old-code/old_permuted_dense.h b/include/old-code/old_permuted_dense.h new file mode 100644 index 0000000..a2823da --- /dev/null +++ b/include/old-code/old_permuted_dense.h @@ -0,0 +1,55 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef OLD_PERMUTED_DENSE_H +#define OLD_PERMUTED_DENSE_H + +#include "utils/CSR_matrix.h" +#include "utils/permuted_dense.h" + +/* Legacy CSR-based (PD, Sparse) BTA / BTDA kernels. 
+ + Mathematically equivalent to BTA_pd_csc_alloc / BTDA_pd_csc_fill_values + in src/utils/permuted_dense.c — they all compute C = B^T (diag(d)) A + for B PD and A sparse. The matrix_BTA dispatcher used to choose between + the CSR-here and CSC-in-utils variants; after a benchmark on + trimmed_log_reg-shaped workloads we committed to CSC and moved these + kernels out of production paths. + + Kept here as a reference implementation, as cross-comparison fodder for + tests (test_BTA_pd_csc_matches_csr), and as the CSR side of the + profile_BTA_pd_csr_vs_csc microbenchmark. */ + +/* Allocate a new permuted_dense for C = B^T A where B is PD and A is + CSR-sparse. Output is PD with row_perm = B->col_perm and col_perm = the + sorted union of columns appearing in A's rows at positions row_perm_B. + Dense block size = (B->n0, |col_active|). Values uninitialized. */ +matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A); + +/* Fill C->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at + positions row_perm_B, columns restricted to C's col_perm, scattered to a + dense buffer. C must have the structure produced by BTA_pd_csr_alloc. */ +void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A, + permuted_dense *C); + +/* BTDA variant: C->X = X_B^T diag(d) A_sub_dense. d may be NULL (treated + as identity scaling). C must have the structure produced by + BTA_pd_csr_alloc. */ +void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, + const CSR_matrix *A, permuted_dense *C); + +#endif /* OLD_PERMUTED_DENSE_H */ diff --git a/include/utils/CSC_matrix.h b/include/utils/CSC_matrix.h index 3d8beed..34d3155 100644 --- a/include/utils/CSC_matrix.h +++ b/include/utils/CSC_matrix.h @@ -15,8 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef CSC_MATRIX_H -#define CSC_MATRIX_H +#ifndef CSC_matrix_H +#define CSC_matrix_H #include "CSR_matrix.h" @@ -41,8 +41,8 @@ typedef struct CSC_matrix } CSC_matrix; /* constructor and destructor */ -CSC_matrix *new_csc_matrix(int m, int n, int nnz); -void free_csc_matrix(CSC_matrix *matrix); +CSC_matrix *new_CSC_matrix(int m, int n, int nnz); +void free_CSC_matrix(CSC_matrix *matrix); /* Fill sparsity of C = A^T D A for diagonal D */ CSR_matrix *ATA_alloc(const CSC_matrix *A); @@ -77,4 +77,4 @@ void csr_to_csc_fill_values(const CSR_matrix *A, CSC_matrix *C, int *iwork); CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork); void csc_to_csr_fill_values(const CSC_matrix *A, CSR_matrix *C, int *iwork); -#endif /* CSC_MATRIX_H */ +#endif /* CSC_matrix_H */ diff --git a/include/utils/CSR_matrix.h b/include/utils/CSR_matrix.h index 32a03d2..a4880c9 100644 --- a/include/utils/CSR_matrix.h +++ b/include/utils/CSR_matrix.h @@ -15,8 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef CSR_MATRIX_H -#define CSR_MATRIX_H +#ifndef CSR_matrix_H +#define CSR_matrix_H #include /* CSR_matrix (Compressed Sparse Row) matrix Format @@ -40,11 +40,11 @@ typedef struct CSR_matrix } CSR_matrix; /* constructors and destructors */ -CSR_matrix *new_csr_matrix(int m, int n, int nnz); +CSR_matrix *new_CSR_matrix(int m, int n, int nnz); CSR_matrix *new_csr(const CSR_matrix *A); CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A); -void free_csr_matrix(CSR_matrix *matrix); -void copy_csr_matrix(const CSR_matrix *A, CSR_matrix *C); +void free_CSR_matrix(CSR_matrix *matrix); +void copy_CSR_matrix(const CSR_matrix *A, CSR_matrix *C); /* transpose functionality (iwork must be of size A->n) */ CSR_matrix *transpose(const CSR_matrix *A, int *iwork); @@ -70,4 +70,4 @@ double csr_get_value(const CSR_matrix *A, int row, int col); only upper triangle. 
C must be pre-allocated with sufficient nnz */ void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C); -#endif /* CSR_MATRIX_H */ +#endif /* CSR_matrix_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 8cdf363..1ebd0e4 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -88,66 +88,63 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, permuted_dense *C); -/* Fill sparsity of C = AT @ A */ +/* Allocate new permuted dense for C = AT @ A */ matrix *permuted_dense_ATA_alloc(const permuted_dense *A); /* Fill values of C = AT @ diag(d) @ A */ void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, permuted_dense *C); -/* Fill sparsity of C = BT @ A where A and B are both permuted_dense. +/* Allocate new permuted dense forC = BT @ A where A and B are both permuted_dense. (If B and A have no overlapping rows, then C is empty) */ -matrix *BTA_pd_pd_alloc(const permuted_dense *A, const permuted_dense *B); +matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); /* Fill values of C = BT @ A where A and B are both permuted dense. */ -void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, +void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, permuted_dense *C); -/* Allocate a new permuted_dense for C = B^T @ A where A is Sparse (CSR_matrix) - and B is PD. Output is PD with row_perm = B->col_perm and col_perm = the - sorted union of columns appearing in A's rows at positions row_perm_B. - Dense block size = (B->n0, |col_active|). Values uninitialized. */ -matrix *BTA_csr_pd_alloc(const CSR_matrix *A, const permuted_dense *B); - -/* Fill C->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at - positions row_perm_B, columns restricted to C's col_perm, scattered - to a dense buffer. 
C must have the structure produced by - BTA_csr_pd_alloc(A, B). */ -void BTA_csr_pd_fill_values(const CSR_matrix *A, const permuted_dense *B, - permuted_dense *C); +/* Allocate new permuted dense for C = B @ A where B is PD and A is CSC */ +matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); + +/* Fill values of C = B @ A where B is value buffer to permuted dense and A is CSC. + + The raw-buffer signature for B lets callers pass a transposed dense block + (e.g. (diag(d) B)^T stored in B->dwork) without needing to build a transposed + permuted dense. */ +void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, + const CSC_matrix *A, permuted_dense *C); + +// ------------------- OK SO FAR -/* Allocate a new permuted_dense for C = B^T @ A where A is PD and B is - Sparse (CSR_matrix). Output is PD with row_perm = the sorted union of columns +/* Allocate a new permuted_dense for C = B^T @ A where B is Sparse (CSR_matrix) + and A is PD. Output is PD with row_perm = the sorted union of columns appearing in B's rows at positions row_perm_A, and col_perm = col_perm_A. Dense block size = (|row_active|, n0_A). Values uninitialized. */ -matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr); +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A); /* Fill C->X = B_sub^T @ X_A, where B_sub is B's rows at positions row_perm_A, columns restricted to C's row_perm, scattered to a dense - buffer. C must have the structure produced by BTA_pd_csr_alloc(A, B). */ -void BTA_pd_csr_fill_values(const permuted_dense *A, const CSR_matrix *B_csr, + buffer. C must have the structure produced by BTA_csr_pd_alloc(B, A). */ +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, permuted_dense *C); /* BTDA variants — fold a diagonal d into the BTA computation. Each fills C->X = B^T diag(d) A (d may be NULL for plain B^T A). C must have the structure produced by the corresponding BTA *_alloc function. 
*/ -void BTDA_csr_pd_fill_values(const CSR_matrix *A_csr, const double *d, - const permuted_dense *B, permuted_dense *C); -void BTDA_pd_csr_fill_values(const permuted_dense *A, const double *d, - const CSR_matrix *B_csr, permuted_dense *C); -void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, - const permuted_dense *B, permuted_dense *C); - -/* Allocate a new permuted_dense for C = A @ J. C has global shape - (A->base.m, J->n) with row_perm = A->row_perm and col_perm equal - to the sorted list of columns of J that have at least one structural - nonzero in some row in A->col_perm. Values are uninitialized. */ -matrix *BA_pd_csc_alloc(const permuted_dense *A, const CSC_matrix *J); - -/* Fill C.X[ii, jj] = sum_kk A.X[ii, kk] * J[col_perm_A[kk], - col_perm_C[jj]]. C must have the structure produced by - BA_pd_csc_alloc(A, J). */ -void BA_pd_csc_fill_values(const permuted_dense *A, - const CSC_matrix *J, permuted_dense *C); +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C); +void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, + const permuted_dense *A, permuted_dense *C); + +/* CSC-based (PD, Sparse) BTA / BTDA kernels — production path. Alloc is a + single pass over A's CSC columns; the fill kernel transposes B's dense + block into B->dwork (folding d in) and delegates to + BA_pd_csc_fill_values. d MUST be non-NULL — production callers always + supply chain-rule weights; for plain B^T A pass d = {1, 1, …, 1}. + See include/old-code/old_permuted_dense.h for the legacy CSR equivalents + kept as reference implementations. 
*/ +matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); +void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, + const CSC_matrix *A, permuted_dense *C); #endif /* PERMUTED_DENSE_H */ diff --git a/include/utils/sparse_matrix.h b/include/utils/sparse_matrix.h index 7ee97aa..0a8ca4c 100644 --- a/include/utils/sparse_matrix.h +++ b/include/utils/sparse_matrix.h @@ -36,7 +36,7 @@ typedef struct sparse_matrix matrix *new_sparse_matrix(CSR_matrix *A); /* Convenience: allocate a sparse_matrix of shape (m, n) with capacity for - nnz entries. Equivalent to new_sparse_matrix(new_csr_matrix(m, n, nnz)). + nnz entries. Equivalent to new_sparse_matrix(new_CSR_matrix(m, n, nnz)). Sparsity pattern and values are uninitialized. */ matrix *new_sparse_matrix_alloc(int m, int n, int nnz); diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 27ffa54..9cdfeb7 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -77,7 +77,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(child); /* Build convolution matrix of size (m+n-1) x n with m*n nonzeros */ - cnode->T = new_csr_matrix(m + n - 1, n, m * n); + cnode->T = new_CSR_matrix(m + n - 1, n, m * n); conv_matrix_fill_sparsity(cnode->T, m, n); conv_matrix_fill_values(cnode->T, a); @@ -142,8 +142,8 @@ static bool is_affine(const expr *node) static void free_type_data(expr *node) { convolve_expr *cnode = (convolve_expr *) node; - free_csr_matrix(cnode->T); - free_csc_matrix(cnode->Jchild_CSC); + free_CSR_matrix(cnode->T); + free_CSC_matrix(cnode->Jchild_CSC); free_expr(cnode->param_source); } diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index 419b23f..9ee3650 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -57,7 +57,7 @@ static void jacobian_init_impl(expr *node) nnz += hnode->args[i]->jacobian->nnz; } - CSR_matrix *A = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *A = 
new_CSR_matrix(node->size, node->n_vars, nnz); /* precompute sparsity pattern of this node's jacobian */ int row_offset = 0; @@ -112,15 +112,15 @@ static void wsum_hess_init_impl(expr *node) /* worst-case scenario the nnz of node->wsum_hess is the sum of children's nnz */ - CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); - hnode->CSR_work = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); + hnode->CSR_work = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ H->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { matrix *child_hess = hnode->args[i]->wsum_hess; - copy_csr_matrix(H, hnode->CSR_work); + copy_CSR_matrix(H, hnode->CSR_work); sum_csr_alloc(hnode->CSR_work, child_hess->to_csr(child_hess), H); } node->wsum_hess = new_sparse_matrix(H); @@ -137,7 +137,7 @@ static void wsum_hess_eval(expr *node, const double *w) { expr *child = hnode->args[i]; child->eval_wsum_hess(child, w + row_offset); - copy_csr_matrix(H, hnode->CSR_work); + copy_CSR_matrix(H, hnode->CSR_work); sum_csr_fill_values(hnode->CSR_work, child->wsum_hess->to_csr(child->wsum_hess), H); row_offset += child->size; } @@ -166,7 +166,7 @@ static void free_type_data(expr *node) hnode->args[i] = NULL; } - free_csr_matrix(hnode->CSR_work); + free_CSR_matrix(hnode->CSR_work); hnode->CSR_work = NULL; free(hnode->args); hnode->args = NULL; diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index e799a42..860ce88 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -95,8 +95,8 @@ static void free_type_data(expr *node) left_matmul_expr *lnode = (left_matmul_expr *) node; free_matrix(lnode->A); free_matrix(lnode->AT); - free_csc_matrix(lnode->Jchild_CSC); - free_csc_matrix(lnode->J_CSC); + free_CSC_matrix(lnode->Jchild_CSC); + free_CSC_matrix(lnode->J_CSC); free(lnode->csc_to_csr_work); if (lnode->param_source != NULL) { diff --git 
a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c index a892235..90f031b 100644 --- a/src/atoms/affine/right_matmul.c +++ b/src/atoms/affine/right_matmul.c @@ -74,7 +74,7 @@ expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A) expr *node = new_transpose(left_matmul); - free_csr_matrix(AT); + free_CSR_matrix(AT); free(work_transpose); return node; } diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index ce3343f..4945fa0 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -90,7 +90,7 @@ static void jacobian_init_impl(expr *node) CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); /* we never have to store more than the child's nnz */ - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, Jx->nnz); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, Jx->nnz); node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int)); snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int)); diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index c1ebdb6..04dd351 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -65,7 +65,7 @@ static void jacobian_init_impl(expr *node) total_nnz += A->p[row + 1] - A->p[row]; } - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, total_nnz); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, total_nnz); // --------------------------------------------------------------- // fill sparsity pattern and idx_map diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 999af08..bcf372c 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -47,7 +47,7 @@ static void jacobian_init_impl(expr *node) expr *child = node->left; jacobian_init(child); CSR_matrix *Jc = child->jacobian->to_csr(child->jacobian); - CSR_matrix *J = new_csr_matrix(node->size, node->n_vars, Jc->nnz); + CSR_matrix *J = new_CSR_matrix(node->size, node->n_vars, Jc->nnz); /* fill sparsity */ int d1 = node->d1; diff --git 
a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 1f6887d..fe010e7 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -28,7 +28,7 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 746ba23..e3ef874 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -45,7 +45,7 @@ static CSR_matrix *build_cross_hessian_sparsity(int m, int k, int n) { int total_nnz = m * k * n; - CSR_matrix *B = new_csr_matrix(m * k, k * n, total_nnz); + CSR_matrix *B = new_CSR_matrix(m * k, k * n, total_nnz); int idx = 0; for (int j = 0; j < k; j++) @@ -103,15 +103,15 @@ static void free_matmul_data(expr *node) { matmul_expr *mnode = (matmul_expr *) node; /* Jacobian workspace */ - free_csr_matrix(mnode->term1_CSR); - free_csr_matrix(mnode->term2_CSR); + free_CSR_matrix(mnode->term1_CSR); + free_CSR_matrix(mnode->term2_CSR); /* Hessian workspace */ - free_csr_matrix(mnode->B); - free_csr_matrix(mnode->BJg); - free_csc_matrix(mnode->BJg_CSC); + free_CSR_matrix(mnode->B); + free_CSR_matrix(mnode->BJg); + free_CSC_matrix(mnode->BJg_CSC); free(mnode->BJg_csc_work); - free_csr_matrix(mnode->C); - free_csr_matrix(mnode->CT); + free_CSR_matrix(mnode->C); + free_CSR_matrix(mnode->CT); free(mnode->idx_map_C); free(mnode->idx_map_CT); free(mnode->idx_map_Hf); @@ -140,7 +140,7 @@ static void jacobian_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int nnz = m * n * 2 * k; - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, nnz); int nnz_idx = 0; for (int i = 0; i < node->size; i++) @@ -237,7 
+237,7 @@ static void jacobian_init_chain_rule(expr *node) mnode->term1_CSR = YT_kron_I_alloc(m, k, n, f->work->jacobian_csc); mnode->term2_CSR = I_kron_X_alloc(m, k, n, g->work->jacobian_csc); int max_nnz = mnode->term1_CSR->nnz + mnode->term2_CSR->nnz; - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, max_nnz); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, max_nnz); sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, jac); node->jacobian = new_sparse_matrix(jac); } @@ -278,7 +278,7 @@ static void wsum_hess_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int total_nnz = 2 * m * k * n; - CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, total_nnz); int nnz = 0; int *Hi = hess->i; int *Hp = hess->p; diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index a2e2b63..277a060 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -84,7 +84,7 @@ static void wsum_hess_init_impl(expr *node) { assert(y->var_id != NOT_A_VARIABLE); CSR_matrix *hess = - new_csr_matrix(node->n_vars, node->n_vars, 2 * node->size); + new_CSR_matrix(node->n_vars, node->n_vars, 2 * node->size); int i, var1_id, var2_id; @@ -274,7 +274,7 @@ static void free_type_data(expr *node) { elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; free_matrix(mul_node->cross_C); - free_csr_matrix(mul_node->CSR_work2); + free_CSR_matrix(mul_node->CSR_work2); free(mul_node->idx_map_C); free(mul_node->idx_map_CT); free(mul_node->idx_map_Hx); diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index 07b08df..12be5a1 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) /* if left node is a variable */ if 
(x->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size + 1); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size + 1); jac->p[0] = 0; jac->p[1] = x->size + 1; @@ -91,7 +91,7 @@ static void jacobian_init_impl(expr *node) node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); int nonzero_cols = count_nonzero_cols(Jx, col_nz); - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, nonzero_cols + 1); /* precompute column indices */ jac->nnz = 0; @@ -189,7 +189,7 @@ static void wsum_hess_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); node->wsum_hess = new_sparse_matrix(H); /* if x has lower idx than y*/ diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index 0c7ea2a..ecd4520 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -46,7 +46,7 @@ static void forward_vector_args(expr *node, const double *u) static void jacobian_init_vectors_args(expr *node) { - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); expr *x = node->left; expr *y = node->right; @@ -102,7 +102,7 @@ static void eval_jacobian_vector_args(expr *node) static void wsum_hess_init_vector_args(expr *node) { - CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 4 * node->size); expr *x = node->left; expr *y = node->right; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c 
b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index 6dbc5b4..a356289 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -51,7 +51,7 @@ static void jacobian_init_scalar_vector(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { @@ -106,7 +106,7 @@ static void wsum_hess_init_scalar_vector(expr *node) int var_id_y = y->var_id; CSR_matrix *H = - new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); + new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index 99cfe08..f2b7928 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -51,7 +51,7 @@ static void jacobian_init_vector_scalar(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { @@ -106,7 +106,7 @@ static void wsum_hess_init_vector_scalar(expr *node) int var_id_y = y->var_id; CSR_matrix *H = - new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); + new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 94740b1..352f784 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ 
-33,7 +33,7 @@ void jacobian_init_elementwise(expr *node) /* if the variable is a child */ if (child->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; @@ -82,7 +82,7 @@ void wsum_hess_init_elementwise(expr *node) /* if the variable is a child */ if (id != NOT_A_VARIABLE) { - CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index 2ce3af0..5c88733 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -24,7 +24,7 @@ void jacobian_init_restricted(expr *node) { expr *child = node->left; - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { jac->p[j] = j; @@ -40,7 +40,7 @@ void wsum_hess_init_restricted(expr *node) int id = child->var_id; int i; - CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 4a380c2..aeb7226 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -72,7 +72,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size); jac->p[0] = 0; jac->p[1] = x->size; for (int j = 0; j < x->size; j++) @@ -134,7 +134,7 @@ static void wsum_hess_init_impl(expr *node) /* allocate 
n_vars x n_vars CSR_matrix matrix with dense block */ int block_size = x->size; int nnz = block_size * block_size; - CSR_matrix *hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the dense block */ for (int i = 0; i < block_size; i++) diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index 49234df..2e0e971 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -82,7 +82,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d2 nnzs) */ for (int row = 0; row < x->d1; row++) @@ -163,7 +163,7 @@ static void wsum_hess_init_impl(expr *node) /* each row i has d2-1 non-zero entries, with column indices corresponding to the columns in that row (except the diagonal element). 
*/ int nnz = x->d1 * x->d2 * (x->d2 - 1); - CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ int nnz_per_row = x->d2 - 1; diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index a5eebae..55bedd2 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -77,7 +77,7 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d1 nnzs) */ for (int row = 0; row < x->d2; row++) @@ -156,7 +156,7 @@ static void wsum_hess_init_impl(expr *node) { /* Hessian has block diagonal structure: d2 blocks of size d1 x d1 */ int nnz = x->d2 * x->d1 * x->d1; - CSR_matrix *H = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the variable's rows (block diagonal) */ for (int i = 0; i < x->size; i++) diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 3c4cac0..d49c16b 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -52,7 +52,7 @@ static void jacobian_init_impl(expr *node) if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size); jac->p[0] = 0; jac->p[1] = x->size; @@ -71,7 +71,7 @@ static void jacobian_init_impl(expr *node) /* allocate the right number of nnz */ int nnz = count_nonzero_cols_csc(J_csc); - CSR_matrix *jac = new_csr_matrix(1, node->n_vars, nnz); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, nnz); jac->p[0] = 0; jac->p[1] = nnz; @@ -131,7 +131,7 @@ static void wsum_hess_init_impl(expr *node) if (x->var_id != NOT_A_VARIABLE) { - CSR_matrix *H 
= new_csr_matrix(node->n_vars, node->n_vars, Q->nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, Q->nnz); /* set global row pointers */ memcpy(H->p + x->var_id, Q->p, (x->size + 1) * sizeof(int)); @@ -234,11 +234,11 @@ static void eval_wsum_hess(expr *node, const double *w) static void free_type_data(expr *node) { quad_form_expr *qnode = (quad_form_expr *) node; - free_csr_matrix(qnode->Q); + free_CSR_matrix(qnode->Q); qnode->Q = NULL; if (qnode->QJf != NULL) { - free_csc_matrix(qnode->QJf); + free_CSC_matrix(qnode->QJf); qnode->QJf = NULL; } } @@ -262,8 +262,8 @@ expr *new_quad_form(expr *left, CSR_matrix *Q) expr_retain(left); /* Set type-specific field */ - qnode->Q = new_csr_matrix(Q->m, Q->n, Q->nnz); - copy_csr_matrix(Q, qnode->Q); + qnode->Q = new_CSR_matrix(Q->m, Q->n, Q->nnz); + copy_CSR_matrix(Q, qnode->Q); /* dwork stores the result of Q @ f(x) in the forward pass */ node->work->dwork = (double *) SP_MALLOC(left->size * sizeof(double)); diff --git a/src/expr.c b/src/expr.c index 8aa199e..b99b266 100644 --- a/src/expr.c +++ b/src/expr.c @@ -84,7 +84,7 @@ void free_expr(expr *node) { free(node->work->dwork); free(node->work->iwork); - free_csc_matrix(node->work->jacobian_csc); + free_CSC_matrix(node->work->jacobian_csc); free(node->work->csc_work); free(node->work->local_jac_diag); free_matrix(node->work->hess_term1); diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c index 65b67b5..8637ee6 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -76,7 +76,7 @@ static void wsum_hess_init_impl(expr *node) { /* Linear operator Hessian is always zero */ node->wsum_hess = - new_sparse_matrix(new_csr_matrix(node->n_vars, node->n_vars, 0)); + new_sparse_matrix(new_CSR_matrix(node->n_vars, node->n_vars, 0)); } static void eval_wsum_hess(expr *node, const double *w) @@ -99,8 +99,8 @@ expr *new_linear(expr *u, const CSR_matrix *A, const double *b) expr_retain(u); /* Store A directly as the jacobian (linear op jacobian 
is constant) */ - CSR_matrix *jac = new_csr_matrix(A->m, A->n, A->nnz); - copy_csr_matrix(A, jac); + CSR_matrix *jac = new_CSR_matrix(A->m, A->n, A->nnz); + copy_CSR_matrix(A, jac); node->jacobian = new_sparse_matrix(jac); /* Initialize offset (copy b if provided, otherwise NULL) */ diff --git a/src/old-code/old_CSR.c b/src/old-code/old_CSR.c index 420e825..e4137ab 100644 --- a/src/old-code/old_CSR.c +++ b/src/old-code/old_CSR.c @@ -28,7 +28,7 @@ CSR_matrix *block_diag_repeat_csr(const CSR_matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_CSR_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int block = 0; block < p; block++) @@ -63,7 +63,7 @@ CSR_matrix *kron_identity_csr(const CSR_matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_CSR_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int row_block = 0; row_block < m; row_block++) @@ -159,7 +159,7 @@ void Ax_csr_wo_offset(const CSR_matrix *A, const double *x, double *y) void diag_csr_mult(const double *d, const CSR_matrix *A, CSR_matrix *C) { - copy_csr_matrix(A, C); + copy_CSR_matrix(A, C); for (int row = 0; row < C->m; row++) { diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c new file mode 100644 index 0000000..4b6c7b7 --- /dev/null +++ b/src/old-code/old_permuted_dense.c @@ -0,0 +1,155 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "old-code/old_permuted_dense.h" + +#include "utils/cblas_wrapper.h" +#include "utils/tracked_alloc.h" +#include +#include + +matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A) +{ + /* Cij != 0 only if i is in B's column permutation and column j of A + overlaps with column i of B. */ + + /* Gather the union of columns appearing in A's rows at positions + row_perm_B. Use a bitmap of size A->n for O(nnz) collection. */ + int p = A->n; + char *seen = (char *) SP_CALLOC(p, sizeof(char)); + int s_A = 0; + for (int kk = 0; kk < B->m0; kk++) + { + int row = B->row_perm[kk]; + for (int e = A->p[row]; e < A->p[row + 1]; e++) + { + int j = A->i[e]; + if (!seen[j]) + { + seen[j] = 1; + s_A++; + } + } + } + + int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int)); + int idx = 0; + for (int j = 0; j < p; j++) + { + if (seen[j]) + { + col_active[idx++] = j; + } + } + + matrix *C = + new_permuted_dense(B->base.n, p, B->n0, s_A, B->col_perm, col_active, NULL); + free(col_active); + free(seen); + + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = B->n0 * + s_A) to fit the gather buffer A_sub_dense used by BTA_csr_pd / + BTDA_pd_csr_fill_values: shape (B->m0, s_A) row-major. The dgemm + reads it as (B->m0, s_A), so size B->m0 * s_A doubles suffices. 
*/ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = B->m0 * s_A; + if (gather_size > C_pd->dwork_size) + { + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + } + return C; +} + +void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A_csr, + permuted_dense *C) +{ + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; + + if (s_A == 0 || m0 == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use C->col_inv (pre-built by new_permuted_dense) as col_inv_out and + C->dwork as A_sub_dense; both are owned by C. dwork is sized at alloc + time to cover m0 * s_A; only that prefix is touched. */ + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = B->row_perm[kk]; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = C->col_inv[j]; + /* jj should always be valid (we built col_perm from these entries), + but guard against asymmetry between alloc and fill calls. */ + if (jj >= 0) + { + A_sub_dense[kk * s_A + jj] = A_csr->x[e]; + } + } + } + + /* C->X = X_B^T @ A_sub_dense */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); +} + +/* BTDA variant of BTA_csr_pd: C->X = X_B^T diag(d) A_sub_dense. Folds d + into the scatter step. */ +void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, + const CSR_matrix *A_csr, permuted_dense *C) +{ + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; + + if (s_A == 0 || m0 == 0) + { + return; + } + + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = B->row_perm[kk]; + double dk = d ? 
d[row] : 1.0; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = C->col_inv[j]; + if (jj >= 0) + { + A_sub_dense[kk * s_A + jj] = dk * A_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); +} diff --git a/src/problem.c b/src/problem.c index a6ea358..e675ecd 100644 --- a/src/problem.c +++ b/src/problem.c @@ -190,7 +190,7 @@ void problem_init_jacobian(problem *prob) } } - prob->jacobian = new_csr_matrix(prob->total_constraint_size, prob->n_vars, nnz); + prob->jacobian = new_CSR_matrix(prob->total_constraint_size, prob->n_vars, nnz); /* set sparsity pattern of jacobian */ CSR_matrix *H = prob->jacobian; @@ -234,7 +234,7 @@ void problem_init_hessian(problem *prob) nnz += prob->constraints[i]->wsum_hess->nnz; } - prob->lagrange_hessian = new_csr_matrix(prob->n_vars, prob->n_vars, nnz); + prob->lagrange_hessian = new_CSR_matrix(prob->n_vars, prob->n_vars, nnz); memset(prob->lagrange_hessian->x, 0, nnz * sizeof(double)); /* affine shortcut */ prob->stats.nnz_hessian = nnz; prob->hess_idx_map = (int *) SP_MALLOC(nnz * sizeof(int)); @@ -345,8 +345,8 @@ void free_problem(problem *prob) /* Free allocated arrays */ free(prob->constraint_values); free(prob->gradient_values); - free_csr_matrix(prob->jacobian); - free_csr_matrix(prob->lagrange_hessian); + free_CSR_matrix(prob->jacobian); + free_CSR_matrix(prob->lagrange_hessian); free_coo_matrix(prob->jacobian_coo); free_coo_matrix(prob->lagrange_hessian_coo); free(prob->hess_idx_map); diff --git a/src/utils/CSC_matrix.c b/src/utils/CSC_matrix.c index 09a3aca..08eec98 100644 --- a/src/utils/CSC_matrix.c +++ b/src/utils/CSC_matrix.c @@ -22,7 +22,7 @@ #include #include -CSC_matrix *new_csc_matrix(int m, int n, int nnz) +CSC_matrix *new_CSC_matrix(int m, int n, int nnz) { CSC_matrix *matrix = (CSC_matrix *) SP_MALLOC(sizeof(CSC_matrix)); if (!matrix) return NULL; @@ -47,7 +47,7 @@ CSC_matrix 
*new_csc_matrix(int m, int n, int nnz) return matrix; } -void free_csc_matrix(CSC_matrix *matrix) +void free_CSC_matrix(CSC_matrix *matrix) { if (matrix) { @@ -102,7 +102,7 @@ CSR_matrix *ATA_alloc(const CSC_matrix *A) } /* Allocate C and symmetrize it */ - CSR_matrix *C = new_csr_matrix(n, n, nnz); + CSR_matrix *C = new_CSR_matrix(n, n, nnz); symmetrize_csr(Cp, Ci->data, n, C); /* free workspace */ @@ -205,7 +205,7 @@ void ATDA_fill_values(const CSC_matrix *A, const double *d, CSR_matrix *C) CSC_matrix *csr_to_csc_alloc(const CSR_matrix *A, int *iwork) { - CSC_matrix *C = new_csc_matrix(A->m, A->n, A->nnz); + CSC_matrix *C = new_CSC_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -268,7 +268,7 @@ void csr_to_csc_fill_values(const CSR_matrix *A, CSC_matrix *C, int *iwork) CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork) { - CSR_matrix *C = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *C = new_CSR_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -378,7 +378,7 @@ CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B) } /* Allocate C */ - CSR_matrix *C = new_csr_matrix(p, n, nnz); + CSR_matrix *C = new_CSR_matrix(p, n, nnz); memcpy(C->p, Cp, (p + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); @@ -525,7 +525,7 @@ CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A) /* allocate C and copy the computed structure */ int total_nnz = Cp[n]; - CSC_matrix *C = new_csc_matrix(m, n, total_nnz); + CSC_matrix *C = new_CSC_matrix(m, n, total_nnz); memcpy(C->p, Cp, (n + 1) * sizeof(int)); memcpy(C->i, Ci->data, total_nnz * sizeof(int)); diff --git a/src/utils/CSR_matrix.c b/src/utils/CSR_matrix.c index 63847ab..1123068 100644 --- a/src/utils/CSR_matrix.c +++ b/src/utils/CSR_matrix.c @@ -25,7 +25,7 @@ #include #include -CSR_matrix *new_csr_matrix(int m, int n, int nnz) +CSR_matrix *new_CSR_matrix(int m, int n, int nnz) { CSR_matrix *matrix = (CSR_matrix *) SP_MALLOC(sizeof(CSR_matrix)); matrix->p = (int 
*) SP_CALLOC(m + 1, sizeof(int)); @@ -39,7 +39,7 @@ CSR_matrix *new_csr_matrix(int m, int n, int nnz) CSR_matrix *new_csr(const CSR_matrix *A) { - CSR_matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_CSR_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); memcpy(copy->x, A->x, A->nnz * sizeof(double)); @@ -48,13 +48,13 @@ CSR_matrix *new_csr(const CSR_matrix *A) CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A) { - CSR_matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_CSR_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); return copy; } -void free_csr_matrix(CSR_matrix *matrix) +void free_CSR_matrix(CSR_matrix *matrix) { if (matrix) { @@ -65,7 +65,7 @@ void free_csr_matrix(CSR_matrix *matrix) } } -void copy_csr_matrix(const CSR_matrix *A, CSR_matrix *C) +void copy_CSR_matrix(const CSR_matrix *A, CSR_matrix *C) { C->m = A->m; C->n = A->n; @@ -135,7 +135,7 @@ void DA_fill_values(const double *d, const CSR_matrix *A, CSR_matrix *C) CSR_matrix *transpose(const CSR_matrix *A, int *iwork) { - CSR_matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_CSR_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; @@ -181,7 +181,7 @@ CSR_matrix *transpose(const CSR_matrix *A, int *iwork) CSR_matrix *AT_alloc(const CSR_matrix *A, int *iwork) { /* Allocate A^T and compute sparsity pattern without filling values */ - CSR_matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_CSR_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c index 9d8ba79..286fd38 100644 --- a/src/utils/CSR_sum.c +++ b/src/utils/CSR_sum.c @@ -370,7 +370,7 @@ CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, int nnz_ub = A->nnz + B->nnz + C->nnz + D->nnz; /* allocate output and index maps */ - 
CSR_matrix *out = new_csr_matrix(m, n, nnz_ub); + CSR_matrix *out = new_CSR_matrix(m, n, nnz_ub); for (int k = 0; k < 4; k++) { idx_maps[k] = (int *) SP_MALLOC(inputs[k]->nnz * sizeof(int)); diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index 1695f2d..f1e3aaa 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ b/src/utils/linalg_dense_sparse_matmuls.c @@ -82,7 +82,7 @@ CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p) Cp[j + 1] = Ci->len; } - CSC_matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_CSC_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -198,7 +198,7 @@ CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J) // --------------------------------------------------------------- // replicate sparsity pattern across blocks // --------------------------------------------------------------- - CSR_matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_CSR_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { @@ -287,7 +287,7 @@ CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J) /* Step 2: replicate each block's pattern for all m rows * within that block. 
*/ - CSR_matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_CSR_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c index b8fe39c..7720f70 100644 --- a/src/utils/linalg_sparse_matmuls.c +++ b/src/utils/linalg_sparse_matmuls.c @@ -157,7 +157,7 @@ CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, Cp[j + 1] = Ci->len; } - CSC_matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_CSC_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -286,7 +286,7 @@ CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B) Cp[i + 1] = nnz; } - CSR_matrix *C = new_csr_matrix(m, p, nnz); + CSR_matrix *C = new_CSR_matrix(m, p, nnz); memcpy(C->p, Cp, (m + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); free(Cp); diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index af764b3..8bdca67 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -23,19 +23,20 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B) if (pd_A && pd_B) { - return BTA_pd_pd_alloc(pd_A, pd_B); + return BTA_pd_pd_alloc(pd_B, pd_A); } if (pd_B) { - /* A is Sparse, B is PD */ - CSR_matrix *A_csr = A->to_csr(A); - return BTA_csr_pd_alloc(A_csr, pd_B); + /* A is Sparse, B is PD — CSC kernel (see permuted_dense.{h,c}). */ + sparse_matrix *sm_A = (sparse_matrix *) A; + A->refresh_csc_values(A); + return BTA_pd_csc_alloc(pd_B, sm_A->csc_cache); } if (pd_A) { /* A is PD, B is Sparse */ CSR_matrix *B_csr = B->to_csr(B); - return BTA_pd_csr_alloc(pd_A, B_csr); + return BTA_csr_pd_alloc(B_csr, pd_A); } /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. 
@@ -55,19 +56,20 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) if (pd_A && pd_B) { - BTDA_pd_pd_fill_values(pd_A, d, pd_B, (permuted_dense *) C); + BTDA_pd_pd_fill_values(pd_B, d, pd_A, (permuted_dense *) C); return; } if (pd_B) { - CSR_matrix *A_csr = A->to_csr(A); - BTDA_csr_pd_fill_values(A_csr, d, pd_B, (permuted_dense *) C); + sparse_matrix *sm_A = (sparse_matrix *) A; + A->refresh_csc_values(A); + BTDA_pd_csc_fill_values(pd_B, d, sm_A->csc_cache, (permuted_dense *) C); return; } if (pd_A) { CSR_matrix *B_csr = B->to_csr(B); - BTDA_pd_csr_fill_values(pd_A, d, B_csr, (permuted_dense *) C); + BTDA_csr_pd_fill_values(B_csr, d, pd_A, (permuted_dense *) C); return; } diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index b6d31dd..3e7d293 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -21,6 +21,7 @@ #include "utils/tracked_alloc.h" #include "utils/utils.h" #include +#include #include #include @@ -32,12 +33,12 @@ static void permuted_dense_free(matrix *self) free(pd->col_inv); free(pd->row_inv); /* csr_cache->x aliases pd->X (set in permuted_dense_to_csr_alloc); NULL it - so free_csr_matrix doesn't double-free the shared buffer. */ + so free_CSR_matrix doesn't double-free the shared buffer. */ if (pd->csr_cache != NULL) { pd->csr_cache->x = NULL; } - free_csr_matrix(pd->csr_cache); + free_CSR_matrix(pd->csr_cache); free(pd->X); free(pd->dwork); free(pd->iwork); @@ -359,7 +360,7 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->X = (double *) SP_MALLOC(sz * sizeof(double)); pd->base.x = pd->X; /* `dwork` sized for the Y-buffer role (Y = diag(d_perm) X) used by ATDA / - BTDA_pd_pd. BTA_csr_pd_alloc / BTA_pd_csr_alloc upgrade this to a + BTDA_pd_pd. BTA_pd_csr_alloc / BTA_csr_pd_alloc upgrade this to a larger gather buffer when their output PD will instead play that role. 
*/ pd->dwork_size = sz; pd->dwork = (double *) SP_MALLOC(pd->dwork_size * sizeof(double)); @@ -406,12 +407,12 @@ static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A) int m0 = A->m0; int n0 = A->n0; int m = A->base.m; - CSR_matrix *C = new_csr_matrix(m, A->base.n, m0 * n0); + CSR_matrix *C = new_CSR_matrix(m, A->base.n, m0 * n0); /* Alias C->x to A->X: the dense block layout already matches what the CSR_matrix view's value array would hold, so values are always live with no memcpy needed. The PD owns the buffer; permuted_dense_free nulls - C->x before free_csr_matrix to avoid double-free. */ + C->x before free_CSR_matrix to avoid double-free. */ free(C->x); C->x = A->X; @@ -476,7 +477,7 @@ void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, A->dwork, n0, 0.0, C->X, n0); } -matrix *BTA_pd_pd_alloc(const permuted_dense *A, const permuted_dense *B) +matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) { /* if A and B have no overlapping rows, then C = BT @ A is empty */ if (!has_overlap(A->row_perm, A->m0, B->row_perm, B->m0, 0)) @@ -544,7 +545,7 @@ static inline int sorted_intersect_indices(const int *a, int a_len, const int *b return s; } -void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, +void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, permuted_dense *C) { /* C may be empty if there is no overlap in row permutations */ @@ -597,8 +598,8 @@ void BTA_pd_pd_fill_values(const permuted_dense *A, const permuted_dense *B, B->dwork, B->n0, A->dwork, A->n0, 0.0, C->X, A->n0); } -void BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, - const permuted_dense *B, permuted_dense *C) +void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, + const permuted_dense *A, permuted_dense *C) { /* C may be empty if there is no overlap in row permutations of A and B */ if (C->base.nnz == 0) @@ -609,118 +610,18 @@ void 
BTDA_pd_pd_fill_values(const permuted_dense *A, const double *d, /* d == NULL means plain BT @ A */ if (d == NULL) { - BTA_pd_pd_fill_values(A, B, C); + BTA_pd_pd_fill_values(B, A, C); return; } /* C = BT @ (DA) */ permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); permuted_dense_DA_fill_values(d, A, DA); - BTA_pd_pd_fill_values(DA, B, C); + BTA_pd_pd_fill_values(B, DA, C); free_matrix(&DA->base); } -matrix *BTA_csr_pd_alloc(const CSR_matrix *A, const permuted_dense *B) -{ - /* Cij != 0 only if i is in B's column permutation and column j of A - overlaps with column i of B. */ - - /* Gather the union of columns appearing in A's rows at positions - row_perm_B. Use a bitmap of size A->n for O(nnz) collection. */ - int p = A->n; - char *seen = (char *) SP_CALLOC(p, sizeof(char)); - int s_A = 0; - for (int kk = 0; kk < B->m0; kk++) - { - int row = B->row_perm[kk]; - for (int e = A->p[row]; e < A->p[row + 1]; e++) - { - int j = A->i[e]; - if (!seen[j]) - { - seen[j] = 1; - s_A++; - } - } - } - - int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int)); - int idx = 0; - for (int j = 0; j < p; j++) - { - if (seen[j]) - { - col_active[idx++] = j; - } - } - - matrix *C = - new_permuted_dense(B->base.n, p, B->n0, s_A, B->col_perm, col_active, NULL); - free(col_active); - free(seen); - - /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = B->n0 * - s_A) to fit the gather buffer A_sub_dense used by BTA_csr_pd / - BTDA_csr_pd_fill_values: shape (B->m0, s_A) row-major. The dgemm - reads it as (B->m0, s_A), so size B->m0 * s_A doubles suffices. 
*/ - permuted_dense *C_pd = (permuted_dense *) C; - size_t gather_size = B->m0 * s_A; - if (gather_size > C_pd->dwork_size) - { - free(C_pd->dwork); - C_pd->dwork_size = gather_size; - C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); - } - return C; -} - -/* Note: when A_csr is a leaf-variable Jacobian (each row has a single entry - at column var_id + k, value 1), A_sub_dense is a permuted identity and - the dgemm reduces to X_C = X_B^T — a pure transpose with no multiplication - needed. A fast path can detect this and skip the dgemm; deferred until a - workload shows the savings matter. */ -void BTA_csr_pd_fill_values(const CSR_matrix *A_csr, const permuted_dense *B, - permuted_dense *C) -{ - int m0 = B->m0; - int dn_B = B->n0; - int s_A = C->n0; - - if (s_A == 0 || m0 == 0) - { - /* Output dense block is empty; nothing to fill. */ - return; - } - - /* Use C->col_inv (pre-built by new_permuted_dense) as col_inv_out and - C->dwork as A_sub_dense; both are owned by C. dwork is sized at alloc - time to cover m0 * s_A; only that prefix is touched. */ - double *A_sub_dense = C->dwork; - size_t used = m0 * s_A; - memset(A_sub_dense, 0, used * sizeof(double)); - - for (int kk = 0; kk < m0; kk++) - { - int row = B->row_perm[kk]; - for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) - { - int j = A_csr->i[e]; - int jj = C->col_inv[j]; - /* jj should always be valid (we built col_perm from these entries), - but guard against asymmetry between alloc and fill calls. */ - if (jj >= 0) - { - A_sub_dense[kk * s_A + jj] = A_csr->x[e]; - } - } - } - - /* C->X = X_B^T @ A_sub_dense */ - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, - dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); -} - -matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr) +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) { /* Gather the union of columns appearing in B's rows at positions row_perm_A. 
Bitmap of size B_csr->n for O(nnz) collection. */ @@ -757,8 +658,8 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr) free(seen); /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = r_B * - A->n0) to fit the gather buffer B_sub_dense used by BTA_pd_csr / - BTDA_pd_csr_fill_values: shape (A->m0, r_B) row-major. */ + A->n0) to fit the gather buffer B_sub_dense used by BTA_csr_pd / + BTDA_csr_pd_fill_values: shape (A->m0, r_B) row-major. */ permuted_dense *C_pd = (permuted_dense *) C; size_t gather_size = A->m0 * r_B; if (gather_size > C_pd->dwork_size) @@ -775,7 +676,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *A, const CSR_matrix *B_csr) the dgemm reduces to X_C = X_A — a pure copy with no multiplication needed. A fast path can detect this and skip the dgemm; deferred until a workload shows the savings matter. */ -void BTA_pd_csr_fill_values(const permuted_dense *A, const CSR_matrix *B_csr, +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, permuted_dense *C) { int m0 = A->m0; @@ -814,47 +715,10 @@ void BTA_pd_csr_fill_values(const permuted_dense *A, const CSR_matrix *B_csr, B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); } -/* BTDA variant of BTA_csr_pd: C->X = X_B^T diag(d) A_sub_dense. Folds d - into the scatter step. */ -void BTDA_csr_pd_fill_values(const CSR_matrix *A_csr, const double *d, - const permuted_dense *B, permuted_dense *C) -{ - int m0 = B->m0; - int dn_B = B->n0; - int s_A = C->n0; - - if (s_A == 0 || m0 == 0) - { - return; - } - - double *A_sub_dense = C->dwork; - size_t used = m0 * s_A; - memset(A_sub_dense, 0, used * sizeof(double)); - - for (int kk = 0; kk < m0; kk++) - { - int row = B->row_perm[kk]; - double dk = d ? 
d[row] : 1.0; - for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) - { - int j = A_csr->i[e]; - int jj = C->col_inv[j]; - if (jj >= 0) - { - A_sub_dense[kk * s_A + jj] = dk * A_csr->x[e]; - } - } - } - - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, - dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); -} - -/* BTDA variant of BTA_pd_csr: C->X = B_sub_dense^T diag(d) X_A. Folds d +/* BTDA variant of BTA_csr_pd: C->X = B_sub_dense^T diag(d) X_A. Folds d into the scatter step. */ -void BTDA_pd_csr_fill_values(const permuted_dense *A, const double *d, - const CSR_matrix *B_csr, permuted_dense *C) +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C) { int m0 = A->m0; int dn_A = A->n0; @@ -944,11 +808,13 @@ matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) return C; } -void BA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, - permuted_dense *C) +void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, + const CSC_matrix *A, permuted_dense *C) { - /* C[i, j] = bi^T @ ajj, where bi is the ith row of Bs dense block and ajj is - the jjth column of A's sparse block (column jj = C->col_perm[j]) */ + /* C[i, j] = bi^T @ ajj, where bi is the ith row of B_X (length n0_B, + row stride n0_B) and ajj is the jjth column of A's sparse block + (column jj = C->col_perm[j]). inv maps A's row indices to positions + in B_X (entries with inv[r] == -1 are skipped). */ /* row i of C */ for (int i = 0; i < C->m0; i++) @@ -963,8 +829,56 @@ void BA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, int start = A->p[jj]; int len = A->p[jj + 1] - start; /* we compute entry C[i, j] */ - ci[j] = sparse_dot_dense(A->x + start, A->i + start, len, B->col_inv, - B->X + i * B->n0); + ci[j] = + sparse_dot_dense(A->x + start, A->i + start, len, inv, B + i * n0_B); } } } + +/* C = B^T @ A where A is Sparse (CSC) and B is PD. 
Same output structure as + BTA_pd_csr_alloc — built directly by scanning A's CSC columns with + idxs_hits_set against row_perm_B. */ +matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) +{ + /* col_active[jj] is set of columns j of A whose nonzero pattern hits + at least one row in row_perm_B. col_inv built against row_perm_B + via B->row_inv. */ + iVec *col_active = iVec_new(8); + for (int j = 0; j < A->n; j++) + { + int start = A->p[j]; + int len = A->p[j + 1] - start; + if (idxs_hits_set(A->i + start, len, B->row_inv)) + { + iVec_append(col_active, j); + } + } + + /* Same shape and permutations as BTA_pd_csr_alloc: + shape (B->base.n, A->n), m0 = B->n0, n0 = |col_active|, + row_perm = B->col_perm, col_perm = col_active. */ + matrix *C = new_permuted_dense(B->base.n, A->n, B->n0, col_active->len, + B->col_perm, col_active->data, NULL); + iVec_free(col_active); + return C; +} + +/* C = B^T diag(d) A = (diag (d) B)^T A */ +void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, + const CSC_matrix *A, permuted_dense *C) +{ + int m0 = B->m0; + int n0 = B->n0; + + /* conpute B->dwork = (diag(d) B)^T */ + for (int kk = 0; kk < m0; kk++) + { + double dk = d[B->row_perm[kk]]; + for (int ii = 0; ii < n0; ii++) + { + B->dwork[ii * m0 + kk] = dk * B->X[kk * n0 + ii]; + } + } + + BA_pd_csc_fill_values(B->dwork, m0, B->row_inv, A, C); +} diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 6a83bc1..229b45c 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -49,8 +49,8 @@ static void sparse_block_left_mult_values(const matrix *self, const CSC_matrix * static void sparse_free(matrix *self) { sparse_matrix *sm = (sparse_matrix *) self; - free_csr_matrix(sm->csr); - free_csc_matrix(sm->csc_cache); + free_CSR_matrix(sm->csr); + free_CSC_matrix(sm->csc_cache); free(sm->csc_iwork); free(sm); } @@ -110,7 +110,7 @@ static struct permuted_dense *sparse_as_permuted_dense(matrix *self) static matrix 
*sparse_index_alloc(matrix *self, const int *indices, int n_idxs) { CSR_matrix *Jx = ((sparse_matrix *) self)->csr; - CSR_matrix *J = new_csr_matrix(n_idxs, self->n, Jx->nnz); + CSR_matrix *J = new_CSR_matrix(n_idxs, self->n, Jx->nnz); J->p[0] = 0; for (int i = 0; i < n_idxs; i++) @@ -140,7 +140,7 @@ static matrix *sparse_promote_alloc(matrix *self, int size) { CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int row_nnz = Jx->nnz; - CSR_matrix *J = new_csr_matrix(size, self->n, size * row_nnz); + CSR_matrix *J = new_CSR_matrix(size, self->n, size * row_nnz); for (int row = 0; row < size; row++) { @@ -181,7 +181,7 @@ static matrix *sparse_broadcast_alloc(matrix *self, broadcast_type type, int d1, total_nnz = Jx->nnz * out_m; } - CSR_matrix *J = new_csr_matrix(out_m, self->n, total_nnz); + CSR_matrix *J = new_CSR_matrix(out_m, self->n, total_nnz); if (type == BROADCAST_ROW) { @@ -255,7 +255,7 @@ static matrix *sparse_diag_vec_alloc(matrix *self) CSR_matrix *Jx = ((sparse_matrix *) self)->csr; int n = self->m; int out_m = n * n; - CSR_matrix *J = new_csr_matrix(out_m, self->n, Jx->nnz); + CSR_matrix *J = new_CSR_matrix(out_m, self->n, Jx->nnz); int nnz = 0; int next_diag = 0; @@ -334,7 +334,7 @@ matrix *new_sparse_matrix(CSR_matrix *A) matrix *new_sparse_matrix_alloc(int m, int n, int nnz) { - return new_sparse_matrix(new_csr_matrix(m, n, nnz)); + return new_sparse_matrix(new_CSR_matrix(m, n, nnz)); } matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork) diff --git a/tests/all_tests.c b/tests/all_tests.c index f6cd7d8..218fc34 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -60,13 +60,14 @@ #include "problem/test_problem.h" #include "utils/test_cblas.h" #include "utils/test_coo_matrix.h" -#include "utils/test_csc_matrix.h" +#include "utils/test_CSC_matrix.h" #include "utils/test_csr_csc_conversion.h" -#include "utils/test_csr_matrix.h" +#include "utils/test_CSR_matrix.h" #include "utils/test_linalg_sparse_matmuls.h" #include 
"utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" #include "utils/test_matrix_BTA.h" +#include "old-code/test_old_permuted_dense.h" #include "utils/test_permuted_dense.h" #include "wsum_hess/affine/test_broadcast.h" #include "wsum_hess/affine/test_convolve.h" @@ -104,6 +105,7 @@ #endif /* PROFILE_ONLY */ #ifdef PROFILE_ONLY +#include "profiling/profile_BTA_pd_csr_vs_csc.h" #include "profiling/profile_left_matmul.h" #include "profiling/profile_log_reg.h" #include "profiling/profile_trimmed_log_reg.h" @@ -382,12 +384,13 @@ int main(void) mu_run_test(test_permuted_dense_BTA_empty_overlap, tests_run); mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); - mu_run_test(test_BTA_csr_pd_basic, tests_run); - mu_run_test(test_BTA_csr_pd_leaf_variable, tests_run); - mu_run_test(test_BTA_csr_pd_no_overlap, tests_run); + mu_run_test(test_BTA_pd_csc_matches_csr, tests_run); mu_run_test(test_BTA_pd_csr_basic, tests_run); mu_run_test(test_BTA_pd_csr_leaf_variable, tests_run); mu_run_test(test_BTA_pd_csr_no_overlap, tests_run); + mu_run_test(test_BTA_csr_pd_basic, tests_run); + mu_run_test(test_BTA_csr_pd_leaf_variable, tests_run); + mu_run_test(test_BTA_csr_pd_no_overlap, tests_run); mu_run_test(test_BTDA_matrices_pd_pd, tests_run); mu_run_test(test_BTDA_matrices_csr_pd, tests_run); mu_run_test(test_BTDA_matrices_pd_csr, tests_run); @@ -438,6 +441,7 @@ int main(void) mu_run_test(profile_left_matmul, tests_run); mu_run_test(profile_log_reg, tests_run); mu_run_test(profile_trimmed_log_reg, tests_run); + mu_run_test(profile_BTA_pd_csr_vs_csc, tests_run); #endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/forward_pass/affine/test_linear_op.h b/tests/forward_pass/affine/test_linear_op.h index d235cd7..8a8f9f8 100644 --- a/tests/forward_pass/affine/test_linear_op.h +++ b/tests/forward_pass/affine/test_linear_op.h @@ -18,7 +18,7 @@ const 
char *test_linear_op(void) double Ax[7] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[7] = {2, 3, 2, 4, 2, 3, 4}; int Ap[4] = {0, 2, 4, 7}; - CSR_matrix *A = new_csr_matrix(3, 6, 7); + CSR_matrix *A = new_CSR_matrix(3, 6, 7); memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); @@ -31,6 +31,6 @@ const char *test_linear_op(void) double expected[3] = {8, 7, 26}; mu_assert("fail", cmp_double_array(linear_node->value, expected, 3)); free_expr(linear_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index 85ee558..be70b25 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -33,7 +33,7 @@ const char *test_jacobian_left_matmul_log(void) expr *x = new_variable(3, 1, 0, 3); /* Create sparse matrix A in CSR_matrix format */ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -65,7 +65,7 @@ const char *test_jacobian_left_matmul_log(void) mu_assert("sparsity fail", cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 4, 7)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } @@ -77,7 +77,7 @@ const char *test_jacobian_left_matmul_log_matrix(void) expr *x = new_variable(3, 2, 0, 6); /* Create sparse matrix A in CSR_matrix format (4x3) */ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -104,7 +104,7 @@ const char *test_jacobian_left_matmul_log_matrix(void) mu_assert("sparsity fail", cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 8, 14)); - free_csr_matrix(A); + free_CSR_matrix(A); 
free_expr(A_log_x); return 0; } @@ -116,7 +116,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_CSR_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -125,7 +125,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix */ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1, 2, 3, 4, 5, 6, 7}; @@ -140,8 +140,8 @@ const char *test_jacobian_left_matmul_exp_composite(void) mu_assert("check_jacobian failed", check_jacobian_num(A_exp_Bx, x_vals, NUMERICAL_DIFF_DEFAULT_H)); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); free_expr(A_exp_Bx); return 0; } diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index 0925857..4259172 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -18,7 +18,7 @@ const char *test_jacobian_right_matmul_log(void) expr *x = new_variable(2, 2, 0, 4); /* Create sparse matrix A in CSR_matrix format (2x3) */ - CSR_matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -51,7 +51,7 @@ const char *test_jacobian_right_matmul_log(void) mu_assert("sparsity fail", cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 6, 8)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } @@ -67,7 +67,7 @@ const char *test_jacobian_right_matmul_log_vector(void) expr *x = new_variable(1, 3, 0, 3); /* Create sparse matrix A in CSR_matrix format (3x2) */ - 
CSR_matrix *A = new_csr_matrix(3, 2, 4); + CSR_matrix *A = new_CSR_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -96,7 +96,7 @@ const char *test_jacobian_right_matmul_log_vector(void) mu_assert("sparsity fail", cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 2, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index 2d16837..2871a78 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -11,7 +11,7 @@ const char *test_jacobian_transpose(void) { // A = [1 2; 3 4] - CSR_matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_CSR_matrix(2, 2, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 1, 0, 1}; double A_x[4] = {1, 2, 3, 4}; @@ -37,7 +37,7 @@ const char *test_jacobian_transpose(void) mu_assert("sparsity fail", cmp_sparsity(transpose_AX->jacobian, expected_p, expected_i, 4, 8)); free_expr(transpose_AX); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index 35304c5..08d50e4 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -64,7 +64,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_CSR_matrix(3, 10, 9); double Ax_vals[9] = {1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; @@ -78,7 +78,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 0 0 0 0 0 1 -2 1] */ - CSR_matrix *B = new_csr_matrix(3, 10, 9); + CSR_matrix *B = new_CSR_matrix(3, 10, 9); double Bx_vals[9] = {1.0, 3.0, 1.0, 
1.0, 4.0, 1.0, -2.0, 1.0}; int Bi[9] = {7, 8, 7, 8, 9, 7, 8, 9}; int Bp[4] = {0, 2, 5, 8}; @@ -113,8 +113,8 @@ const char *test_jacobian_elementwise_mult_3(void) mu_assert("vals fail", cmp_values(node->jacobian, vals, 16)); mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 16)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -128,7 +128,7 @@ const char *test_jacobian_elementwise_mult_4(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_CSR_matrix(3, 10, 9); double Ax_vals[9] = {1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; @@ -157,6 +157,6 @@ const char *test_jacobian_elementwise_mult_4(void) mu_assert("vals fail", cmp_values(node->jacobian, vals, 8)); mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 8)); free_expr(node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index 12accf1..1a44469 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -63,7 +63,7 @@ const char *test_quad_over_lin3(void) // A = [0 0 1 2 3 0 0 0 // 0 0 4 5 6 0 0] - CSR_matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A = new_CSR_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -90,7 +90,7 @@ const char *test_quad_over_lin3(void) mu_assert("sparsity fail", cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } @@ -103,7 +103,7 @@ const char *test_quad_over_lin4(void) // 0 0 0 0 0 4 5 6 // - CSR_matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A 
= new_CSR_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {5, 6, 7, 5, 6, 7}; int Ap[3] = {0, 3, 6}; @@ -130,7 +130,7 @@ const char *test_quad_over_lin4(void) mu_assert("sparsity fail", cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } @@ -143,7 +143,7 @@ const char *test_quad_over_lin5(void) // 0 2 0 0 0 4 5 6 // - CSR_matrix *A = new_csr_matrix(2, 8, 9); + CSR_matrix *A = new_CSR_matrix(2, 8, 9); double Ax[9] = {1, 3, 1.0, 2.0, 3.0, 2, 4.0, 5.0, 6.0}; int Ai[9] = {0, 3, 5, 6, 7, 1, 5, 6, 7}; int Ap[3] = {0, 5, 9}; @@ -170,7 +170,7 @@ const char *test_quad_over_lin5(void) mu_assert("sparsity fail", cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 7)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h index 49eb8b3..1fd6a65 100644 --- a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h +++ b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h @@ -90,8 +90,8 @@ const char *test_jacobian_Ax_Bx_multiply(void) check_jacobian_num(multiply, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -110,8 +110,8 @@ const char *test_jacobian_AX_BX_multiply(void) check_jacobian_num(multiply, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -123,7 +123,7 @@ const char *test_jacobian_quad_form_Ax(void) CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -140,8 +140,8 @@ const 
char *test_jacobian_quad_form_Ax(void) check_jacobian_num(node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -151,7 +151,7 @@ const char *test_jacobian_quad_form_exp(void) double u_vals[3] = {0.5, 1.0, 1.5}; /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -167,7 +167,7 @@ const char *test_jacobian_quad_form_exp(void) check_jacobian_num(node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } @@ -225,8 +225,8 @@ const char *test_jacobian_matmul_Ax_By(void) check_jacobian_num(Z, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -249,8 +249,8 @@ const char *test_jacobian_matmul_sin_Ax_cos_Bx(void) check_jacobian_num(Z, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index b83d32b..73a2e95 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -11,7 +11,7 @@ const char *test_jacobian_composite_exp(void) { double u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_CSR_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -36,7 +36,7 @@ const char *test_jacobian_composite_exp(void) mu_assert("vals fail", cmp_values(exp_node->jacobian, vals, 6)); mu_assert("sparsity fail", cmp_sparsity(exp_node->jacobian, rows, cols, 2, 6)); free_expr(exp_node); - 
free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -45,7 +45,7 @@ const char *test_jacobian_composite_exp_add(void) { double u_vals[7] = {0, 0, 1, 1, 1, 2, 2}; - CSR_matrix *A = new_csr_matrix(3, 7, 9); + CSR_matrix *A = new_CSR_matrix(3, 7, 9); double Ax[9] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; int Ai[9] = {2, 3, 4, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 3, 6, 9}; @@ -53,7 +53,7 @@ const char *test_jacobian_composite_exp_add(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_matrix *B = new_csr_matrix(3, 7, 6); + CSR_matrix *B = new_CSR_matrix(3, 7, 6); double Bx[6] = {1, 1, 2, 2, 3, 3}; int Bi[6] = {5, 6, 5, 6, 5, 6}; int Bp[4] = {0, 2, 4, 6}; @@ -73,7 +73,7 @@ const char *test_jacobian_composite_exp_add(void) check_jacobian_num(sum, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(sum); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } diff --git a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index 1a224fe..22d33c0 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -14,7 +14,7 @@ const char *test_quad_form(void) // Q = [1 2 0; 2 3 0; 0 0 4] double u_vals[5] = {0, 0, 1, 2, 3}; expr *x = new_variable(3, 1, 2, 5); - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -35,7 +35,7 @@ const char *test_quad_form(void) mu_assert("sparsity fail", cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } @@ -51,7 +51,7 @@ src/other/quad_form.c. 
const char *test_quad_form2(void) // 1 0 0 2 0 1] double u_vals[6] = {1, 2, 3, 4, 5, 6}; expr *u = new_variable(6, 1, 0, 6); -CSR_matrix *Q = new_csr_matrix(3, 3, 5); +CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -59,7 +59,7 @@ memcpy(Q->x, Qx, 5 * sizeof(double)); memcpy(Q->i, Qi, 5 * sizeof(int)); memcpy(Q->p, Qp, 4 * sizeof(int)); -CSR_matrix *A = new_csr_matrix(3, 6, 10); +CSR_matrix *A = new_CSR_matrix(3, 6, 10); double Ax[10] = {1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6, 1.0, 2.0, 1.0}; int Ai[10] = {0, 2, 3, 4, 2, 3, 4, 0, 3, 5}; int Ap[4] = {0, 4, 7, 10}; @@ -82,8 +82,8 @@ mu_assert("sparsity fail", cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 5)); free_expr(node); free_expr(Au); -free_csr_matrix(Q); -free_csr_matrix(A); +free_CSR_matrix(Q); +free_CSR_matrix(A); return 0; } */ diff --git a/tests/numerical_diff/test_numerical_diff.h b/tests/numerical_diff/test_numerical_diff.h index e98616d..a43b718 100644 --- a/tests/numerical_diff/test_numerical_diff.h +++ b/tests/numerical_diff/test_numerical_diff.h @@ -9,7 +9,7 @@ const char *test_check_jacobian_composite_exp(void) { double u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_CSR_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -25,7 +25,7 @@ const char *test_check_jacobian_composite_exp(void) check_jacobian_num(exp_node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(exp_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -36,7 +36,7 @@ const char *test_check_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_matrix *A_csr = new_csr_matrix(3, 5, 15); + CSR_matrix *A_csr = new_CSR_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, 
Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); @@ -49,6 +49,6 @@ const char *test_check_wsum_hess_exp_composite(void) check_wsum_hess(exp_node, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(exp_node); - free_csr_matrix(A_csr); + free_CSR_matrix(A_csr); return 0; } diff --git a/tests/old-code/test_old_permuted_dense.h b/tests/old-code/test_old_permuted_dense.h new file mode 100644 index 0000000..c986c4b --- /dev/null +++ b/tests/old-code/test_old_permuted_dense.h @@ -0,0 +1,182 @@ +#ifndef TEST_OLD_PERMUTED_DENSE_H +#define TEST_OLD_PERMUTED_DENSE_H + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSR_matrix.h" +#include "utils/permuted_dense.h" +#include +#include + +/* Direct unit tests for the legacy CSR-pd BTA kernels in old-code. They no + longer sit on a production path (matrix_BTA dispatcher hard-wires the + CSC variants), but the kernels remain as reference implementations and + as the CSR side of the cross-comparison test in test_permuted_dense.h. */ + +const char *test_BTA_pd_csr_basic(void) +{ + /* CSR_matrix A: m=4, n=5, with nonzeros: + row 0: cols {1, 4} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {1, 4} */ + CSR_matrix *A = new_CSR_matrix(4, 5, 7); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 4; + A->p[3] = 5; + A->p[4] = 7; + int Ai[7] = {1, 4, 0, 2, 2, 1, 4}; + double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, + * 40]]. 
*/ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + /* Expected col_active: union of A's columns in rows 1 and 3 + = {0, 2} ∪ {1, 4} = {0, 1, 2, 4}, size 4. */ + int expected_col_perm[4] = {0, 1, 2, 4}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 4); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 4)); + + BTA_pd_csr_fill_values(B, A, out); + + /* Reference: scatter A and B to dense 4x{5,4}, compute B^T A, extract + block at (col_perm_B × out->col_perm). Scatter inlined locally to + avoid coupling to the static helpers in tests/utils/test_permuted_dense.h. 
*/ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + for (int i = 0; i < A->m; i++) + for (int e = A->p[i]; e < A->p[i + 1]; e++) + A_d[i * 5 + A->i[e]] = A->x[e]; + for (int kk = 0; kk < B->m0; kk++) + for (int jj = 0; jj < B->n0; jj++) + B_d[B->row_perm[kk] * 4 + B->col_perm[jj]] = B->X[kk * B->n0 + jj]; + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[8]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 4; jj++) + { + expected_X[ii * 4 + jj] = + C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 8)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +/* BTA(CSR_matrix A, PD B) where A is a leaf-variable Jacobian (identity-in-block). + A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = + X_B^T. 
*/ +const char *test_BTA_pd_csr_leaf_variable(void) +{ + CSR_matrix *A = new_CSR_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + A->p[k] = k; + A->i[k] = 4 + k; + A->x[k] = 1.0; + } + A->p[4] = 4; + + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; /* row-major (2, 2) */ + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + int expected_col_perm[2] = {5, 7}; + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 2)); + + BTA_pd_csr_fill_values(B, A, out); + + /* X_C should be X_B^T = [[10, 30], [20, 40]] row-major. */ + double expected_X[4] = {10.0, 30.0, 20.0, 40.0}; + mu_assert("values", cmp_double_array(out->X, expected_X, 4)); + + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +/* BTA(CSR_matrix A, PD B) where A has no entries in any row of row_perm_B. + Output dense block should have n0 = 0. */ +const char *test_BTA_pd_csr_no_overlap(void) +{ + /* A: rows 0 and 2 have entries; rows 1 and 3 (row_perm_B) are empty. 
*/ + CSR_matrix *A = new_CSR_matrix(4, 5, 3); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 2; + A->p[3] = 3; + A->p[4] = 3; + int Ai[3] = {1, 4, 2}; + double Ax[3] = {1.0, 2.0, 3.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + int row_perm_B[2] = {1, 3}; /* rows that ARE empty in A */ + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 0); + + /* Fill should be a no-op (0-sized dense block). */ + BTA_pd_csr_fill_values(B, A, out); + + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +#endif /* TEST_OLD_PERMUTED_DENSE_H */ diff --git a/tests/profiling/profile_BTA_pd_csr_vs_csc.h b/tests/profiling/profile_BTA_pd_csr_vs_csc.h new file mode 100644 index 0000000..db7bfab --- /dev/null +++ b/tests/profiling/profile_BTA_pd_csr_vs_csc.h @@ -0,0 +1,133 @@ +#ifndef PROFILE_BTA_PD_CSR_VS_CSC_H +#define PROFILE_BTA_PD_CSR_VS_CSC_H + +#include +#include +#include +#include + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/Timer.h" +#include "utils/permuted_dense.h" +#include "utils/utils.h" + +/* Microbenchmark: compare BTA_csr_pd vs BTA_csc_pd on trimmed_log_reg-shaped + (m=2000, n0_B=785) inputs at two A densities. Output is one fill timing + pair per row of the table. */ +static void run_bench_one_density(int m, int n0_B, int n_A, int nnz_per_row, + int N_ITERS, const char *label) +{ + /* B: PD with full m × n0_B dense block (row_perm = 0..m-1, col_perm = + 0..n0_B-1). Values arbitrary. 
*/ + int *row_perm_B = (int *) malloc(m * sizeof(int)); + int *col_perm_B = (int *) malloc(n0_B * sizeof(int)); + double *XB = (double *) malloc(m * n0_B * sizeof(double)); + for (int i = 0; i < m; i++) row_perm_B[i] = i; + for (int j = 0; j < n0_B; j++) col_perm_B[j] = j; + for (int k = 0; k < m * n0_B; k++) XB[k] = (double) (k % 37) * 0.013 + 0.1; + + /* B's global shape: (m, n_B_global). Pick n_B_global = n0_B (no padding). */ + matrix *B_m = new_permuted_dense(m, n0_B, m, n0_B, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + /* A: (m × n_A) CSR with `nnz_per_row` evenly-spaced nonzeros per row. */ + int total_nnz = m * nnz_per_row; + CSR_matrix *A_csr = new_CSR_matrix(m, n_A, total_nnz); + for (int row = 0; row <= m; row++) A_csr->p[row] = row * nnz_per_row; + srand(42); + for (int row = 0; row < m; row++) + { + /* Pick nnz_per_row distinct columns by sorted random sampling. */ + int *cols = (int *) malloc(nnz_per_row * sizeof(int)); + int picked = 0; + while (picked < nnz_per_row) + { + int c = rand() % n_A; + int dup = 0; + for (int k = 0; k < picked; k++) + if (cols[k] == c) { dup = 1; break; } + if (!dup) cols[picked++] = c; + } + /* Insertion sort to keep CSR column-index invariant. */ + for (int a = 1; a < nnz_per_row; a++) + { + int v = cols[a]; int b = a - 1; + while (b >= 0 && cols[b] > v) { cols[b + 1] = cols[b]; b--; } + cols[b + 1] = v; + } + for (int k = 0; k < nnz_per_row; k++) + { + int e = A_csr->p[row] + k; + A_csr->i[e] = cols[k]; + A_csr->x[e] = (double) ((row * 31 + cols[k]) % 53) * 0.027 + 0.05; + } + free(cols); + } + + /* CSC view of A. */ + int *iwork = (int *) malloc(MAX(m, n_A) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + /* Allocate outputs once for each variant. 
*/ + matrix *C_csr_m = BTA_pd_csr_alloc(B, A_csr); + permuted_dense *C_csr = (permuted_dense *) C_csr_m; + matrix *C_csc_m = BTA_pd_csc_alloc(B, A_csc); + permuted_dense *C_csc = (permuted_dense *) C_csc_m; + + /* d for BTDA: all ones, so C = B^T diag(d) A = B^T A. */ + double *d_ones = (double *) malloc(m * sizeof(double)); + for (int i = 0; i < m; i++) d_ones[i] = 1.0; + + /* Warm-up + time CSR fill. */ + Timer t1; + BTDA_pd_csr_fill_values(B, d_ones, A_csr, C_csr); + clock_gettime(CLOCK_MONOTONIC, &t1.start); + for (int it = 0; it < N_ITERS; it++) + BTDA_pd_csr_fill_values(B, d_ones, A_csr, C_csr); + clock_gettime(CLOCK_MONOTONIC, &t1.end); + double t_csr_ms = GET_ELAPSED_SECONDS(t1) * 1000.0 / N_ITERS; + + /* Warm-up + time CSC fill. */ + Timer t2; + BTDA_pd_csc_fill_values(B, d_ones, A_csc, C_csc); + clock_gettime(CLOCK_MONOTONIC, &t2.start); + for (int it = 0; it < N_ITERS; it++) + BTDA_pd_csc_fill_values(B, d_ones, A_csc, C_csc); + clock_gettime(CLOCK_MONOTONIC, &t2.end); + double t_csc_ms = GET_ELAPSED_SECONDS(t2) * 1000.0 / N_ITERS; + + printf(" %-22s CSR = %7.3f ms CSC = %7.3f ms ratio CSR/CSC = %.2fx\n", + label, t_csr_ms, t_csc_ms, t_csr_ms / t_csc_ms); + + free_matrix(C_csr_m); + free_matrix(C_csc_m); + free_matrix(B_m); + free_CSR_matrix(A_csr); + free_CSC_matrix(A_csc); + free(iwork); + free(row_perm_B); + free(col_perm_B); + free(XB); + free(d_ones); +} + +const char *profile_BTA_pd_csr_vs_csc(void) +{ + int m = 2000; + int n0_B = 785; + int n_A = 2000; + int N_ITERS = 50; + + printf("\nBTA pd × sparse fill benchmark (m=%d, n0_B=%d, n_A=%d, %d iters):\n", + m, n0_B, n_A, N_ITERS); + run_bench_one_density(m, n0_B, n_A, 1, N_ITERS, "leaf-var (1 nnz/row):"); + run_bench_one_density(m, n0_B, n_A, 50, N_ITERS, "dense-ish (50 nnz/row):"); + return 0; +} + +#endif /* PROFILE_BTA_PD_CSR_VS_CSC_H */ diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index df6d96b..639b6cf 100644 --- 
a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -16,7 +16,7 @@ const char *profile_left_matmul(void) /* A @ X where A is 50 x 50 dense stored in CSR_matrix and X is 50 x 50 variable */ int n = 100; expr *X = new_variable(n, n, 0, n * n); - CSR_matrix *A = new_csr_matrix(n, n, n * n); + CSR_matrix *A = new_CSR_matrix(n, n, n * n); for (int i = 0; i < n * n; i++) { A->x[i] = 1.0; /* dense matrix of all ones */ @@ -56,7 +56,7 @@ const char *profile_left_matmul(void) GET_ELAPSED_SECONDS(timer)); free(x_vals); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(AX); return 0; } diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index e89cf0c..ccf8e63 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -81,7 +81,7 @@ const char *profile_log_reg(void) /* CSR_matrix scaffolding for the row-sum step (PD owns the cached CSR_matrix view). */ CSR_matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); - CSR_matrix *Jobj_csr = new_csr_matrix(1, n, n); + CSR_matrix *Jobj_csr = new_CSR_matrix(1, n, n); int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); int *idx_map = (int *) malloc((size_t) m * n * sizeof(int)); sum_all_rows_csr_alloc(Jlog_csr, Jobj_csr, iwork, idx_map); @@ -161,7 +161,7 @@ const char *profile_log_reg(void) free(w_ones); free(iwork); free(idx_map); - free_csr_matrix(Jobj_csr); + free_CSR_matrix(Jobj_csr); /* Jlog_csr is owned by Jlog_M's cache; released by free_matrix below. 
*/ free_matrix(H_pd_M); free_matrix(Jlog_M); diff --git a/tests/profiling/profile_trimmed_log_reg.h b/tests/profiling/profile_trimmed_log_reg.h index d010d00..5f7f2f6 100644 --- a/tests/profiling/profile_trimmed_log_reg.h +++ b/tests/profiling/profile_trimmed_log_reg.h @@ -17,13 +17,14 @@ theta (n x 1), w (m x 1) : variables (n_vars = n + m) A (m x n) : dense constant - y (m x 1) : constant in {-1, +1}, wrapped as PARAM_FIXED + y (m x 1) : constant in {-1, +1}, wrapped as PARAM_FIXED Forward pass is excluded from timing. */ const char *profile_trimmed_log_reg(void) { int m = 2000; int n = 785; + int N_HESS_ITERS = 10; int n_vars = n + m; /* ---- Random inputs ---- */ @@ -63,31 +64,32 @@ const char *profile_trimmed_log_reg(void) /* Forward (untimed). */ obj->forward(obj, u); - /* ---- Time eval_jacobian and eval_wsum_hess ---- */ double w_one = 1.0; Timer t_jac, t_hess; clock_gettime(CLOCK_MONOTONIC, &t_jac.start); obj->eval_jacobian(obj); clock_gettime(CLOCK_MONOTONIC, &t_jac.end); + obj->eval_wsum_hess(obj, &w_one); /* warm-up */ clock_gettime(CLOCK_MONOTONIC, &t_hess.start); - obj->eval_wsum_hess(obj, &w_one); + for (int it = 0; it < N_HESS_ITERS; it++) + { + obj->eval_wsum_hess(obj, &w_one); + } clock_gettime(CLOCK_MONOTONIC, &t_hess.end); double sec_jac = GET_ELAPSED_SECONDS(t_jac); - double sec_hess = GET_ELAPSED_SECONDS(t_hess); + double sec_hess = GET_ELAPSED_SECONDS(t_hess) / N_HESS_ITERS; printf("\n"); - printf(" Jacobian Hessian Total\n"); - printf(" trimmed_log_reg: %10.6fs %10.6fs %10.6fs\n", sec_jac, sec_hess, - sec_jac + sec_hess); + printf("trimmed_log_reg (m=%d, n=%d):\n", m, n); + printf(" jacobian = %10.6fs hessian = %10.6fs (avg over %d)\n", + sec_jac, sec_hess, N_HESS_ITERS); - /* ---- Cleanup ---- */ free_expr(obj); free(A_data); free(y_data); free(u); - return 0; } diff --git a/tests/test_helpers.c b/tests/test_helpers.c index 43b3707..7d5244a 100644 --- a/tests/test_helpers.c +++ b/tests/test_helpers.c @@ -114,7 +114,7 @@ CSR_matrix 
*new_csr_random(int m, int n, double density) } tmp_p[m] = nnz; - CSR_matrix *A = new_csr_matrix(m, n, nnz); + CSR_matrix *A = new_CSR_matrix(m, n, nnz); memcpy(A->p, tmp_p, ((size_t) m + 1) * sizeof(int)); memcpy(A->i, tmp_i, (size_t) nnz * sizeof(int)); memcpy(A->x, tmp_x, (size_t) nnz * sizeof(double)); diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_coo_matrix.h index eb48a3d..73cbbf3 100644 --- a/tests/utils/test_coo_matrix.h +++ b/tests/utils/test_coo_matrix.h @@ -13,7 +13,7 @@ const char *test_csr_to_coo(void) * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_CSR_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -36,7 +36,7 @@ const char *test_csr_to_coo(void) mu_assert("vals incorrect", cmp_double_array(coo->x, expected_x, 6)); free_coo_matrix(coo); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -48,7 +48,7 @@ const char *test_csr_to_coo_lower_triangular(void) * [2 5 6] * [3 6 9] */ - CSR_matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_CSR_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; @@ -74,14 +74,14 @@ const char *test_csr_to_coo_lower_triangular(void) cmp_int_array(coo->value_map, expected_map, 6)); free_coo_matrix(coo); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } const char *test_refresh_lower_triangular_coo(void) { - CSR_matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_CSR_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; @@ -98,7 +98,7 @@ const char *test_refresh_lower_triangular_coo(void) mu_assert("refresh vals incorrect", cmp_double_array(coo->x, expected_x, 6)); free_coo_matrix(coo); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/utils/test_csc_matrix.h 
b/tests/utils/test_csc_matrix.h index 9b6841a..19d0e36 100644 --- a/tests/utils/test_csc_matrix.h +++ b/tests/utils/test_csc_matrix.h @@ -21,7 +21,7 @@ */ const char *test_ATA_alloc_simple(void) { - CSC_matrix *A = new_csc_matrix(4, 3, 6); + CSC_matrix *A = new_CSC_matrix(4, 3, 6); int Ap[4] = {0, 2, 3, 6}; int Ai[5] = {0, 2, 1, 2, 1}; memcpy(A->p, Ap, 4 * sizeof(int)); @@ -36,8 +36,8 @@ const char *test_ATA_alloc_simple(void) mu_assert("i incorrect", cmp_int_array(C->i, expected_i, C->nnz)); mu_assert("nnz incorrect", C->nnz == 5); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } @@ -58,7 +58,7 @@ const char *test_ATA_alloc_simple(void) const char *test_ATA_alloc_diagonal_like(void) { /* Create A in CSC_matrix format (3 rows, 4 cols, 4 nonzeros) */ - CSC_matrix *A = new_csc_matrix(3, 4, 4); + CSC_matrix *A = new_CSC_matrix(3, 4, 4); int Ap[5] = {0, 1, 2, 3, 4}; int Ai[4] = {0, 1, 2, 0}; memcpy(A->p, Ap, 5 * sizeof(int)); @@ -72,8 +72,8 @@ const char *test_ATA_alloc_diagonal_like(void) mu_assert("i incorrect", cmp_int_array(C->i, expected_i, C->nnz)); mu_assert("nnz incorrect", C->nnz == 6); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } @@ -81,7 +81,7 @@ const char *test_ATA_alloc_diagonal_like(void) const char *test_ATA_alloc_random(void) { /* Create A in CSC_matrix format */ - CSC_matrix *A = new_csc_matrix(10, 15, 15); + CSC_matrix *A = new_CSC_matrix(10, 15, 15); int Ap[16] = {0, 1, 1, 1, 1, 4, 5, 6, 7, 8, 9, 11, 11, 11, 13, 15}; int Ai[15] = {5, 0, 6, 9, 0, 5, 1, 3, 6, 0, 6, 3, 6, 6, 8}; double Ax[15] = {7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3, 6, 5}; @@ -109,8 +109,8 @@ const char *test_ATA_alloc_random(void) 288., 144., 128., 90., 144., 182., 108., 288., 180., 288., 108., 241.}; mu_assert("x incorrect", cmp_double_array(C->x, Cx_correct, C->nnz)); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } @@ -120,7 
+120,7 @@ const char *test_ATA_alloc_random2(void) /* Create A in CSC_matrix format */ int m = 15; int n = 10; - CSC_matrix *A = new_csc_matrix(m, n, 15); + CSC_matrix *A = new_CSC_matrix(m, n, 15); int Ap[11] = {0, 2, 4, 6, 6, 9, 12, 12, 14, 14, 15}; int Ai[15] = {9, 12, 3, 4, 1, 6, 4, 8, 13, 1, 3, 7, 5, 13, 6}; double Ax[15] = {0.99, 0.9, 0.51, 0.64, 0.39, 0.29, 0.26, 0.91, @@ -147,8 +147,8 @@ const char *test_ATA_alloc_random2(void) 0.032857, 0.116699}; mu_assert("x incorrect", cmp_double_array(C->x, Cx_correct, C->nnz)); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } @@ -162,7 +162,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) */ int m = 4; int n = 3; - CSC_matrix *A = new_csc_matrix(m, n, 6); + CSC_matrix *A = new_CSC_matrix(m, n, 6); int Ap_A[4] = {0, 2, 4, 6}; int Ai_A[6] = {0, 2, 1, 3, 0, 2}; double Ax_A[6] = {1.0, 4.0, 3.0, 6.0, 2.0, 5.0}; @@ -177,7 +177,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) * [0.0 4.0] */ int p = 2; - CSC_matrix *B = new_csc_matrix(m, p, 4); + CSC_matrix *B = new_CSC_matrix(m, p, 4); int Bp[3] = {0, 2, 4}; int Bi[4] = {0, 2, 1, 3}; double Bx[4] = {1.0, 3.0, 2.0, 4.0}; @@ -203,9 +203,9 @@ const char *test_BTA_alloc_and_BTDA_fill(void) double expected_x[3] = {37.0, 47.0, 108.0}; mu_assert("C values incorrect", cmp_double_array(C->x, expected_x, 3)); - free_csr_matrix(C); - free_csc_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + free_CSC_matrix(A); + free_CSC_matrix(B); return 0; } diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h index 0a00121..bb8189c 100644 --- a/tests/utils/test_csr_csc_conversion.h +++ b/tests/utils/test_csr_csc_conversion.h @@ -17,7 +17,7 @@ const char *test_csr_to_csc_split(void) * [0.0 2.0 0.0 0.0 0.0] * [0.0 0.0 0.0 4.0 0.0] */ - CSR_matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 2, 1, 3}; int Ap[5] = 
{0, 2, 3, 4, 5}; @@ -47,8 +47,8 @@ const char *test_csr_to_csc_split(void) mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); free(iwork); - free_csr_matrix(A); - free_csc_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(C); return 0; } @@ -62,7 +62,7 @@ const char *test_csc_to_csr_sparsity(void) * [0.0 4.0 0.0 0.0 0.0] * [0.0 0.0 0.0 5.0 0.0] */ - CSC_matrix *A = new_csc_matrix(4, 5, 5); + CSC_matrix *A = new_CSC_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -91,8 +91,8 @@ const char *test_csc_to_csr_sparsity(void) mu_assert("C nnz incorrect", C->nnz == 5); free(iwork); - free_csc_matrix(A); - free_csr_matrix(C); + free_CSC_matrix(A); + free_CSR_matrix(C); return 0; } @@ -101,7 +101,7 @@ const char *test_csc_to_csr_sparsity(void) const char *test_csc_to_csr_values(void) { /* Create a 4x5 CSC_matrix matrix A */ - CSC_matrix *A = new_csc_matrix(4, 5, 5); + CSC_matrix *A = new_CSC_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -124,8 +124,8 @@ const char *test_csc_to_csr_values(void) mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); free(iwork); - free_csc_matrix(A); - free_csr_matrix(C); + free_CSC_matrix(A); + free_CSR_matrix(C); return 0; } @@ -138,7 +138,7 @@ const char *test_csr_csc_csr_roundtrip(void) * [0.0 4.0 5.0 0.0] * [6.0 0.0 7.0 8.0] */ - CSR_matrix *A = new_csr_matrix(3, 4, 8); + CSR_matrix *A = new_CSR_matrix(3, 4, 8); double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3}; int Ap[4] = {0, 3, 5, 8}; @@ -163,9 +163,9 @@ const char *test_csr_csc_csr_roundtrip(void) free(iwork_csc); free(iwork_csr); - free_csr_matrix(A); - free_csc_matrix(B); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(B); + free_CSR_matrix(C); return 0; } diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h index 
cb6231e..db2e427 100644 --- a/tests/utils/test_csr_matrix.h +++ b/tests/utils/test_csr_matrix.h @@ -17,7 +17,7 @@ const char *test_diag_csr_mult(void) * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_CSR_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -31,7 +31,7 @@ const char *test_diag_csr_mult(void) * [0.0 9.0 12.0] * [2.5 0.0 3.0] */ - CSR_matrix *C = new_csr_matrix(3, 3, 6); + CSR_matrix *C = new_CSR_matrix(3, 3, 6); diag_csr_mult(d, A, C); double Ax_correct[6] = {2.0, 4.0, 9.0, 12.0, 2.5, 3.0}; @@ -42,8 +42,8 @@ const char *test_diag_csr_mult(void) mu_assert("cols incorrect", cmp_int_array(C->i, Ai_correct, 6)); mu_assert("rows incorrect", cmp_int_array(C->p, Ap_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); return 0; } @@ -55,7 +55,7 @@ const char *test_diag_csr_mult(void) */ const char *test_csr_sum(void) { - CSR_matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_CSR_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[5] = {0, 2, 1, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -63,7 +63,7 @@ const char *test_csr_sum(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_CSR_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -71,7 +71,7 @@ const char *test_csr_sum(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_matrix *C = new_csr_matrix(3, 3, 9); + CSR_matrix *C = new_CSR_matrix(3, 3, 9); sum_csr_matrices(A, B, C); double Cx_correct[9] = {1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 5.0}; @@ -83,9 +83,9 @@ const char *test_csr_sum(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 9)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - 
free_csr_matrix(B); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(B); + free_CSR_matrix(C); return 0; } @@ -97,7 +97,7 @@ const char *test_csr_sum(void) */ const char *test_csr_sum2(void) { - CSR_matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_CSR_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[5] = {0, 2, 2, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -105,7 +105,7 @@ const char *test_csr_sum2(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_CSR_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -113,7 +113,7 @@ const char *test_csr_sum2(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_matrix *C = new_csr_matrix(3, 3, 8); + CSR_matrix *C = new_CSR_matrix(3, 3, 8); sum_csr_matrices(A, B, C); double Cx_correct[8] = {1, 1, 2, 2, 6, 4, 6, 5}; @@ -125,16 +125,16 @@ const char *test_csr_sum2(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 8)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(B); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(B); + free_CSR_matrix(C); return 0; } const char *test_transpose(void) { - CSR_matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -151,8 +151,8 @@ const char *test_transpose(void) mu_assert("AT cols incorrect", cmp_int_array(AT->i, ATi_correct, 5)); mu_assert("AT rows incorrect", cmp_int_array(AT->p, ATp_correct, 6)); - free_csr_matrix(A); - free_csr_matrix(AT); + free_CSR_matrix(A); + free_CSR_matrix(AT); return 0; } @@ -167,7 +167,7 @@ A = [1 0 0 0 1 */ const char *test_csr_vecmat_values_sparse(void) { - CSR_matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double 
Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -177,7 +177,7 @@ const char *test_csr_vecmat_values_sparse(void) double z[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_matrix *C = new_csr_matrix(1, 3, 3); + CSR_matrix *C = new_CSR_matrix(1, 3, 3); double Cx[3] = {0.0, 0.0, 0.0}; int Ci[3] = {0, 1, 4}; int Cp[2] = {0, 3}; @@ -196,9 +196,9 @@ const char *test_csr_vecmat_values_sparse(void) mu_assert("C nnz incorrect", C->nnz == 3); mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 3)); - free_csr_matrix(A); - free_csr_matrix(AT); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(AT); + free_CSR_matrix(C); return 0; } @@ -212,14 +212,14 @@ const char *test_sum_all_rows_csr(void) * Sum all rows should give: * [6.0 5.0 10.0 7.0] */ - CSR_matrix *A = new_csr_matrix(3, 4, 7); + CSR_matrix *A = new_CSR_matrix(3, 4, 7); double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; int Ai[7] = {0, 1, 1, 2, 0, 2, 3}; int Ap[4] = {0, 2, 4, 7}; memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_matrix *C = new_csr_matrix(1, 4, 4); + CSR_matrix *C = new_CSR_matrix(1, 4, 4); int_double_pair *pairs = new_int_double_pair_array(7); sum_all_rows_csr(A, C, pairs); double Cx_correct[4] = {6.0, 5.0, 10.0, 7.0}; @@ -231,8 +231,8 @@ const char *test_sum_all_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 4)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 2)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; @@ -260,7 +260,7 @@ const char *test_sum_block_of_rows_csr(void) * * Result C should be 3x4 matrix with the sums above */ - CSR_matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_CSR_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -289,7 +289,7 @@ const char *test_sum_block_of_rows_csr(void) memcpy(A->p, 
Ap, 10 * sizeof(int)); /* Allocate C for 3 blocks and enough space for all nonzeros */ - CSR_matrix *C = new_csr_matrix(3, 4, 12); + CSR_matrix *C = new_CSR_matrix(3, 4, 12); int_double_pair *pairs = new_int_double_pair_array(18); sum_block_of_rows_csr(A, C, pairs, 3); @@ -310,8 +310,8 @@ const char *test_sum_block_of_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 12)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; @@ -338,7 +338,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) row 1: sum of rows 1, 4, 7 = [1 4 6 0] row 2: sum of rows 2, 5, 8 = [3 2 4 11] */ - CSR_matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_CSR_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -367,7 +367,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) memcpy(A->p, Ap, 10 * sizeof(int)); /* Allocate C for 3 rows (row_spacing=3) and enough space for all nonzeros */ - CSR_matrix *C = new_csr_matrix(3, 4, 10); + CSR_matrix *C = new_CSR_matrix(3, 4, 10); int_double_pair *pairs = new_int_double_pair_array(18); sum_evenly_spaced_rows_csr(A, C, pairs, 3); @@ -388,8 +388,8 @@ const char *test_sum_evenly_spaced_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 10)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; @@ -401,7 +401,7 @@ const char *test_AT_alloc_and_fill(void) * [0.0 3.0 0.0 4.0] * [5.0 0.0 6.0 0.0] */ - CSR_matrix *A = new_csr_matrix(3, 4, 6); + CSR_matrix *A = new_CSR_matrix(3, 4, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 1, 3, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -432,8 +432,8 @@ const char *test_AT_alloc_and_fill(void) mu_assert("AT cols 
incorrect", cmp_int_array(AT->i, ATi_correct, 6)); mu_assert("AT rows incorrect", cmp_int_array(AT->p, ATp_correct, 5)); - free_csr_matrix(A); - free_csr_matrix(AT); + free_CSR_matrix(A); + free_CSR_matrix(AT); free(iwork); return 0; @@ -454,7 +454,7 @@ const char *test_kron_identity_csr(void) * [3 0 | 0 0 | 4 0] * [0 3 | 0 0 | 0 4] */ - CSR_matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 2, 0, 2}; int Ap[3] = {0, 2, 4}; @@ -480,8 +480,8 @@ const char *test_kron_identity_csr(void) mu_assert("cols incorrect", cmp_int_array(result->i, expected_i, 8)); mu_assert("rows incorrect", cmp_int_array(result->p, expected_p, 5)); - free_csr_matrix(A); - free_csr_matrix(result); + free_CSR_matrix(A); + free_CSR_matrix(result); return 0; } diff --git a/tests/utils/test_linalg_sparse_matmuls.h b/tests/utils/test_linalg_sparse_matmuls.h index 6e9753f..95c1c27 100644 --- a/tests/utils/test_linalg_sparse_matmuls.h +++ b/tests/utils/test_linalg_sparse_matmuls.h @@ -16,7 +16,7 @@ const char *test_block_left_multiply_single_block(void) * [1.0 0.0 0.0] * [0.0 1.0 1.0] */ - CSR_matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_CSR_matrix(2, 3, 3); double Ax[3] = {1.0, 1.0, 1.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 1, 3}; @@ -29,7 +29,7 @@ const char *test_block_left_multiply_single_block(void) * [1.0 0.0] * [0.0 1.0] */ - CSC_matrix *J = new_csc_matrix(3, 2, 3); + CSC_matrix *J = new_CSC_matrix(3, 2, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 1, 2}; int Jp[3] = {0, 2, 3}; @@ -54,9 +54,9 @@ const char *test_block_left_multiply_single_block(void) mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p, 3)); mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i, 3)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } @@ -67,7 +67,7 @@ const char 
*test_block_left_multiply_two_blocks(void) * [1.0 0.0] * [0.0 1.0] */ - CSR_matrix *A = new_csr_matrix(2, 2, 2); + CSR_matrix *A = new_CSR_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -88,7 +88,7 @@ const char *test_block_left_multiply_two_blocks(void) * [0.0 1.0 0.0] * [0.0 0.0 1.0] */ - CSC_matrix *J = new_csc_matrix(4, 3, 3); + CSC_matrix *J = new_CSC_matrix(4, 3, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 2, 3}; int Jp[4] = {0, 1, 2, 3}; @@ -121,9 +121,9 @@ const char *test_block_left_multiply_two_blocks(void) mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i2, 3)); mu_assert("C values incorrect", cmp_double_array(C->x, expected_x2, 3)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } @@ -131,7 +131,7 @@ const char *test_block_left_multiply_two_blocks(void) const char *test_block_left_multiply_zero_column(void) { /* A is 2x2 CSR_matrix (identity) */ - CSR_matrix *A = new_csr_matrix(2, 2, 2); + CSR_matrix *A = new_CSR_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -143,7 +143,7 @@ const char *test_block_left_multiply_zero_column(void) * [1.0 0.0] * [0.0 0.0] */ - CSC_matrix *J = new_csc_matrix(2, 2, 1); + CSC_matrix *J = new_CSC_matrix(2, 2, 1); double Jx[1] = {1.0}; int Ji[1] = {0}; int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ @@ -160,9 +160,9 @@ const char *test_block_left_multiply_zero_column(void) mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p3, 3)); mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i3, 1)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } @@ -174,7 +174,7 @@ const char *test_csr_csc_matmul_alloc_basic(void) * [0.0 1.0] * [1.0 1.0] */ - CSR_matrix *A = new_csr_matrix(3, 
2, 4); + CSR_matrix *A = new_CSR_matrix(3, 2, 4); double Ax[4] = {1.0, 1.0, 1.0, 1.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[4] = {0, 1, 2, 4}; @@ -186,7 +186,7 @@ const char *test_csr_csc_matmul_alloc_basic(void) * [1.0 0.0 1.0] * [0.0 1.0 1.0] */ - CSC_matrix *B = new_csc_matrix(2, 3, 4); + CSC_matrix *B = new_CSC_matrix(2, 3, 4); double Bx[4] = {1.0, 1.0, 1.0, 1.0}; int Bi[4] = {0, 1, 0, 1}; int Bp[4] = {0, 1, 2, 4}; @@ -208,9 +208,9 @@ const char *test_csr_csc_matmul_alloc_basic(void) mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p4, 4)); mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i4, 7)); - free_csr_matrix(C); - free_csr_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(B); return NULL; } @@ -221,7 +221,7 @@ const char *test_csr_csc_matmul_alloc_sparse(void) * [1.0 0.0 0.0] * [0.0 0.0 1.0] */ - CSR_matrix *A = new_csr_matrix(2, 3, 2); + CSR_matrix *A = new_CSR_matrix(2, 3, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 2}; int Ap[3] = {0, 1, 2}; @@ -234,7 +234,7 @@ const char *test_csr_csc_matmul_alloc_sparse(void) * [0.0 0.0] * [0.0 1.0] */ - CSC_matrix *B = new_csc_matrix(3, 2, 2); + CSC_matrix *B = new_CSC_matrix(3, 2, 2); double Bx[2] = {1.0, 1.0}; int Bi[2] = {0, 2}; int Bp[3] = {0, 1, 2}; @@ -255,9 +255,9 @@ const char *test_csr_csc_matmul_alloc_sparse(void) mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p5, 3)); mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i5, 2)); - free_csr_matrix(C); - free_csr_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(B); return NULL; } @@ -268,7 +268,7 @@ const char *test_block_left_multiply_vec_single_block(void) * [1.0 0.0 2.0] * [0.0 3.0 0.0] */ - CSR_matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_CSR_matrix(2, 3, 3); double Ax[3] = {1.0, 3.0, 2.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 2, 3}; @@ -289,7 +289,7 @@ const char 
*test_block_left_multiply_vec_single_block(void) double expected_y[2] = {7.0, 6.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 2)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } @@ -300,7 +300,7 @@ const char *test_block_left_multiply_vec_two_blocks(void) * [1.0 2.0 0.0] * [0.0 3.0 4.0] */ - CSR_matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 1, 2}; int Ap[3] = {0, 2, 4}; @@ -323,7 +323,7 @@ const char *test_block_left_multiply_vec_two_blocks(void) double expected_y[4] = {5.0, 18.0, 14.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } @@ -335,7 +335,7 @@ const char *test_block_left_multiply_vec_sparse(void) * [0.0 0.0 3.0 0.0] * [0.0 0.0 0.0 4.0] */ - CSR_matrix *A = new_csr_matrix(3, 4, 3); + CSR_matrix *A = new_CSR_matrix(3, 4, 3); double Ax[3] = {2.0, 3.0, 4.0}; int Ai[3] = {0, 2, 3}; int Ap[4] = {0, 1, 2, 3}; @@ -358,7 +358,7 @@ const char *test_block_left_multiply_vec_sparse(void) double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } @@ -369,7 +369,7 @@ const char *test_block_left_multiply_vec_three_blocks(void) * [1.0 2.0] * [3.0 4.0] */ - CSR_matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_CSR_matrix(2, 2, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[3] = {0, 2, 4}; @@ -393,6 +393,6 @@ const char *test_block_left_multiply_vec_three_blocks(void) double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } diff --git a/tests/utils/test_linalg_utils_matmul_chain_rule.h b/tests/utils/test_linalg_utils_matmul_chain_rule.h index 8da9d05..86d7e3f 100644 --- 
a/tests/utils/test_linalg_utils_matmul_chain_rule.h +++ b/tests/utils/test_linalg_utils_matmul_chain_rule.h @@ -34,7 +34,7 @@ const char *test_YT_kron_I(void) int m = 2, k = 2, n = 2; /* J is 4x3 CSC_matrix */ - CSC_matrix *J = new_csc_matrix(4, 3, 5); + CSC_matrix *J = new_CSC_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -60,8 +60,8 @@ const char *test_YT_kron_I(void) YT_kron_I_fill_values(m, k, n, Y, J, C); mu_assert("C values", cmp_double_array(C->x, exp_x, 8)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -86,7 +86,7 @@ const char *test_YT_kron_I_larger(void) int m = 3, k = 2, n = 3; /* J is 6x4 CSC_matrix */ - CSC_matrix *J = new_csc_matrix(6, 4, 8); + CSC_matrix *J = new_CSC_matrix(6, 4, 8); int Jp[5] = {0, 2, 4, 6, 8}; int Ji[8] = {0, 3, 2, 4, 1, 5, 0, 3}; double Jx[8] = {1.0, 2.0, 3.0, 1.0, 1.0, 4.0, 2.0, 1.0}; @@ -113,8 +113,8 @@ const char *test_YT_kron_I_larger(void) YT_kron_I_fill_values(m, k, n, Y, J, C); mu_assert("C2 values", cmp_double_array(C->x, exp_x, 18)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -144,7 +144,7 @@ const char *test_I_kron_X(void) int m = 2, k = 2, n = 2; /* J is 4x3 CSC_matrix */ - CSC_matrix *J = new_csc_matrix(4, 3, 5); + CSC_matrix *J = new_CSC_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -170,8 +170,8 @@ const char *test_I_kron_X(void) I_kron_X_fill_values(m, k, n, X, J, C); mu_assert("C values", cmp_double_array(C->x, exp_x, 10)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -195,7 +195,7 @@ const char *test_I_kron_X_larger(void) int m = 3, k = 2, n = 2; /* J is 4x4 CSC_matrix */ - CSC_matrix *J = new_csc_matrix(4, 4, 7); + CSC_matrix *J = new_CSC_matrix(4, 4, 7); int Jp[5] = {0, 2, 3, 5, 7}; int 
Ji[7] = {0, 3, 1, 1, 2, 0, 3}; double Jx[7] = {1.0, 2.0, 3.0, 1.0, 4.0, 2.0, 1.0}; @@ -222,7 +222,7 @@ const char *test_I_kron_X_larger(void) I_kron_X_fill_values(m, k, n, X, J, C); mu_assert("C2 values", cmp_double_array(C->x, exp_x, 21)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index 197a386..d0ccfe5 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -53,7 +53,7 @@ const char *test_dense_matrix_mult_vec_blocks(void) const char *test_sparse_vs_dense_mult_vec(void) { /* Build CSR_matrix for A = [1 2 3; 4 5 6] */ - CSR_matrix *csr = new_csr_matrix(2, 3, 6); + CSR_matrix *csr = new_CSR_matrix(2, 3, 6); int Ap[3] = {0, 3, 6}; int Ai[6] = {0, 1, 2, 0, 1, 2}; double Ax[6] = {1, 2, 3, 4, 5, 6}; @@ -104,7 +104,7 @@ const char *test_dense_matrix_trans(void) A = [1 2; 3 4], x = [1; 2; 3; 4], p = 2 */ const char *test_sparse_vs_dense_mult_vec_blocks(void) { - CSR_matrix *csr = new_csr_matrix(2, 2, 4); + CSR_matrix *csr = new_CSR_matrix(2, 2, 4); int Ap[3] = {0, 2, 4}; int Ai[4] = {0, 1, 0, 1}; double Ax[4] = {1, 2, 3, 4}; diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h index 2c99cb0..73a436d 100644 --- a/tests/utils/test_matrix_BTA.h +++ b/tests/utils/test_matrix_BTA.h @@ -2,6 +2,7 @@ #define TEST_MATRIX_BTA_H #include "minunit.h" +#include "old-code/old_permuted_dense.h" #include "test_helpers.h" #include "utils/CSR_matrix.h" #include "utils/matrix_BTA.h" @@ -31,8 +32,8 @@ const char *test_BTDA_matrices_pd_pd(void) /* Direct primitive path on independent operands. 
*/ matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); - matrix *C2 = BTA_pd_pd_alloc((permuted_dense *) A2, (permuted_dense *) B2); - BTDA_pd_pd_fill_values((permuted_dense *) A2, d, (permuted_dense *) B2, + matrix *C2 = BTA_pd_pd_alloc((permuted_dense *) B2, (permuted_dense *) A2); + BTDA_pd_pd_fill_values((permuted_dense *) B2, d, (permuted_dense *) A2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -47,11 +48,11 @@ const char *test_BTDA_matrices_pd_pd(void) } /* Wrapper dispatch sanity: (CSR_matrix, PD). Compare against direct - BTDA_csr_pd_fill_values. */ + BTDA_pd_csr_fill_values. */ const char *test_BTDA_matrices_csr_pd(void) { /* A: 4x5 CSR_matrix */ - CSR_matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); A->p[0] = 0; A->p[1] = 2; A->p[2] = 3; @@ -76,7 +77,7 @@ const char *test_BTDA_matrices_csr_pd(void) BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path. */ - CSR_matrix *A2 = new_csr_matrix(4, 5, 5); + CSR_matrix *A2 = new_CSR_matrix(4, 5, 5); A2->p[0] = 0; A2->p[1] = 2; A2->p[2] = 3; @@ -86,8 +87,8 @@ const char *test_BTDA_matrices_csr_pd(void) memcpy(A2->x, Ax, sizeof Ax); matrix *B2_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); permuted_dense *B2 = (permuted_dense *) B2_m; - matrix *C2 = BTA_csr_pd_alloc(A2, B2); - BTDA_csr_pd_fill_values(A2, d, B2, (permuted_dense *) C2); + matrix *C2 = BTA_pd_csr_alloc(B2, A2); + BTDA_pd_csr_fill_values(B2, d, A2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -96,12 +97,12 @@ const char *test_BTDA_matrices_csr_pd(void) free_matrix(A_m); free_matrix(C2); free_matrix(B2_m); - free_csr_matrix(A2); + free_CSR_matrix(A2); return 0; } /* Wrapper dispatch sanity: (PD, CSR_matrix). Compare against direct - BTDA_pd_csr_fill_values. */ + BTDA_csr_pd_fill_values. 
*/ const char *test_BTDA_matrices_pd_csr(void) { /* A: 4x5 PD, row_perm = [1, 3], col_perm = [0, 2]. */ @@ -111,7 +112,7 @@ const char *test_BTDA_matrices_pd_csr(void) matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); /* B: 4x4 CSR_matrix. */ - CSR_matrix *B = new_csr_matrix(4, 4, 5); + CSR_matrix *B = new_CSR_matrix(4, 4, 5); B->p[0] = 0; B->p[1] = 2; B->p[2] = 3; @@ -132,7 +133,7 @@ const char *test_BTDA_matrices_pd_csr(void) /* Direct primitive path. */ matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); permuted_dense *A2 = (permuted_dense *) A2_m; - CSR_matrix *B2 = new_csr_matrix(4, 4, 5); + CSR_matrix *B2 = new_CSR_matrix(4, 4, 5); B2->p[0] = 0; B2->p[1] = 2; B2->p[2] = 3; @@ -140,8 +141,8 @@ const char *test_BTDA_matrices_pd_csr(void) B2->p[4] = 5; memcpy(B2->i, Bi, sizeof Bi); memcpy(B2->x, Bx, sizeof Bx); - matrix *C2 = BTA_pd_csr_alloc(A2, B2); - BTDA_pd_csr_fill_values(A2, d, B2, (permuted_dense *) C2); + matrix *C2 = BTA_csr_pd_alloc(B2, A2); + BTDA_csr_pd_fill_values(B2, d, A2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -149,7 +150,7 @@ const char *test_BTDA_matrices_pd_csr(void) free_matrix(B_m); free_matrix(A_m); free_matrix(C2); - free_csr_matrix(B2); + free_CSR_matrix(B2); free_matrix(A2_m); return 0; } diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index eef17a5..bad8ffd 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -2,9 +2,11 @@ #define TEST_PERMUTED_DENSE_H #include "minunit.h" +#include "old-code/old_permuted_dense.h" #include "test_helpers.h" #include "utils/CSC_matrix.h" #include "utils/permuted_dense.h" +#include "utils/utils.h" #include #include @@ -150,7 +152,7 @@ const char *test_permuted_dense_DA_fill_values(void) CSR_matrix *csr_out = M_out->to_csr(M_out); mu_assert("x", cmp_double_array(csr_out->x, csr_expected->x, csr->nnz)); - free_csr_matrix(csr_expected); + 
free_CSR_matrix(csr_expected); free_matrix(M); free_matrix(M_out); return 0; @@ -224,7 +226,7 @@ const char *test_permuted_dense_times_csc(void) matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); permuted_dense *pd = (permuted_dense *) M; - CSC_matrix *J = new_csc_matrix(6, 4, 4); + CSC_matrix *J = new_CSC_matrix(6, 4, 4); int Jp[5] = {0, 0, 2, 3, 4}; int Ji[4] = {0, 3, 2, 3}; double Jx[4] = {10.0, 20.0, 30.0, 40.0}; @@ -234,7 +236,7 @@ const char *test_permuted_dense_times_csc(void) matrix *M_out = BA_pd_csc_alloc(pd, J); permuted_dense *pd_out = (permuted_dense *) M_out; - BA_pd_csc_fill_values(pd, J, pd_out); + BA_pd_csc_fill_values(pd->X, pd->n0, pd->col_inv, J, pd_out); int row_perm_expected[3] = {1, 2, 4}; int col_perm_expected[2] = {1, 3}; @@ -250,7 +252,7 @@ const char *test_permuted_dense_times_csc(void) free_matrix(M); free_matrix(M_out); - free_csc_matrix(J); + free_CSC_matrix(J); return 0; } @@ -265,7 +267,7 @@ const char *test_permuted_dense_times_csc_no_active(void) permuted_dense *pd = (permuted_dense *) M; /* J: col 0 has row {1}, col 1 has row {5}. Neither in col_perm_self. 
*/ - CSC_matrix *J = new_csc_matrix(6, 2, 2); + CSC_matrix *J = new_CSC_matrix(6, 2, 2); int Jp[3] = {0, 1, 2}; int Ji[2] = {1, 5}; double Jx[2] = {100.0, 200.0}; @@ -275,7 +277,7 @@ const char *test_permuted_dense_times_csc_no_active(void) matrix *M_out = BA_pd_csc_alloc(pd, J); permuted_dense *pd_out = (permuted_dense *) M_out; - BA_pd_csc_fill_values(pd, J, pd_out); + BA_pd_csc_fill_values(pd->X, pd->n0, pd->col_inv, J, pd_out); mu_assert("m", M_out->m == 5); mu_assert("n", M_out->n == 2); @@ -284,7 +286,7 @@ const char *test_permuted_dense_times_csc_no_active(void) free_matrix(M); free_matrix(M_out); - free_csc_matrix(J); + free_CSC_matrix(J); return 0; } @@ -579,7 +581,7 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = BTA_pd_pd_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(B, A); permuted_dense *C = (permuted_dense *) C_m; mu_assert("out m", C_m->m == 4); /* B.n */ @@ -589,7 +591,7 @@ const char *test_permuted_dense_BTA_matching_row_perm(void) mu_assert("row_perm", cmp_int_array(C->row_perm, col_perm_B, 2)); mu_assert("col_perm", cmp_int_array(C->col_perm, col_perm_A, 2)); - BTA_pd_pd_fill_values(A, B, C); + BTA_pd_pd_fill_values(B, A, C); /* Reference: X_B^T X_A. With X_B = [[5,6],[7,8]], X_A = [[1,2],[3,4]]: X_B^T = [[5,7],[6,8]]. X_B^T X_A = [[5*1+7*3, 5*2+7*4], [6*1+8*3, 6*2+8*4]] @@ -619,7 +621,7 @@ const char *test_permuted_dense_BTA_empty_overlap(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = BTA_pd_pd_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(B, A); permuted_dense *C = (permuted_dense *) C_m; mu_assert("out m", C_m->m == 4); /* B.n */ @@ -629,10 +631,10 @@ const char *test_permuted_dense_BTA_empty_overlap(void) mu_assert("nnz", C_m->nnz == 0); /* fill kernels should be safe no-ops on empty C. 
*/ - BTA_pd_pd_fill_values(A, B, C); + BTA_pd_pd_fill_values(B, A, C); double d[4] = {1.0, 1.0, 1.0, 1.0}; - BTDA_pd_pd_fill_values(A, d, B, C); - BTDA_pd_pd_fill_values(A, NULL, B, C); + BTDA_pd_pd_fill_values(B, d, A, C); + BTDA_pd_pd_fill_values(B, NULL, A, C); free_matrix(C_m); free_matrix(B_m); @@ -662,9 +664,9 @@ const char *test_permuted_dense_BTA_partial_overlap(void) permuted_dense *A = (permuted_dense *) A_m; permuted_dense *B = (permuted_dense *) B_m; - matrix *C_m = BTA_pd_pd_alloc(A, B); + matrix *C_m = BTA_pd_pd_alloc(B, A); permuted_dense *C = (permuted_dense *) C_m; - BTA_pd_pd_fill_values(A, B, C); + BTA_pd_pd_fill_values(B, A, C); /* Reference: scatter A, B to dense 8x4, compute B^T A, compare block at (col_perm_B, col_perm_A). */ @@ -729,9 +731,9 @@ const char *test_permuted_dense_BTDA_decomposition(void) permuted_dense *tmp = (permuted_dense *) tmp_m; permuted_dense_DA_fill_values(w, A, tmp); - matrix *C_m = BTA_pd_pd_alloc(tmp, B); + matrix *C_m = BTA_pd_pd_alloc(B, tmp); permuted_dense *C = (permuted_dense *) C_m; - BTA_pd_pd_fill_values(tmp, B, C); + BTA_pd_pd_fill_values(B, tmp, C); /* Reference: dense B_d^T diag(w) A_d, extract (col_perm_B, col_perm_A) block. */ double *A_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); @@ -792,170 +794,72 @@ static void scatter_csr_to_dense(const CSR_matrix *A_csr, int n_global, /* BTA(CSR_matrix A, PD B): basic correctness against a dense reference. A is (4, 5) CSR_matrix with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). */ -const char *test_BTA_csr_pd_basic(void) +/* BTA_pd_csc_alloc + BTDA_pd_csc_fill_values should match the legacy + CSR-pd kernels in old-code on both alloc structure and BTDA values. + Uses a d with negative + zero entries to exercise sign / drop paths. 
*/ +const char *test_BTA_pd_csc_matches_csr(void) { - /* CSR_matrix A: m=4, n=5, with nonzeros: - row 0: cols {1, 4} - row 1: cols {0, 2} - row 2: cols {2} - row 3: cols {1, 4} */ - CSR_matrix *A = new_csr_matrix(4, 5, 7); - A->p[0] = 0; - A->p[1] = 2; - A->p[2] = 4; - A->p[3] = 5; - A->p[4] = 7; + /* Same A and B as test_BTA_pd_csr_basic. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 5, 7); + A_csr->p[0] = 0; + A_csr->p[1] = 2; + A_csr->p[2] = 4; + A_csr->p[3] = 5; + A_csr->p[4] = 7; int Ai[7] = {1, 4, 0, 2, 2, 1, 4}; double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; - memcpy(A->i, Ai, sizeof Ai); - memcpy(A->x, Ax, sizeof Ax); - - /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, - * 40]]. */ - int row_perm_B[2] = {1, 3}; - int col_perm_B[2] = {0, 2}; - double XB[4] = {10.0, 20.0, 30.0, 40.0}; - matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - permuted_dense *B = (permuted_dense *) B_m; - - matrix *out_m = BTA_csr_pd_alloc(A, B); - permuted_dense *out = (permuted_dense *) out_m; - - /* Expected col_active: union of A's columns in rows 1 and 3 - = {0, 2} ∪ {1, 4} = {0, 1, 2, 4}, size 4. */ - int expected_col_perm[4] = {0, 1, 2, 4}; - mu_assert("out m", out_m->m == 4); /* B.n */ - mu_assert("out n", out_m->n == 5); /* A.n */ - mu_assert("m0", out->m0 == 2); - mu_assert("n0", out->n0 == 4); - mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); - mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 4)); - - BTA_csr_pd_fill_values(A, B, out); - - /* Reference: scatter A and B to dense 4x{5,4}, compute B^T A, extract - block at (col_perm_B × out->col_perm). 
*/ - double *A_d = (double *) calloc(4 * 5, sizeof(double)); - double *B_d = (double *) calloc(4 * 4, sizeof(double)); - scatter_csr_to_dense(A, 5, A_d); - scatter_pd_to_dense(B, 4, B_d); - - double C_ref[4 * 5]; - memset(C_ref, 0, sizeof C_ref); - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < 5; j++) - { - double s = 0.0; - for (int k = 0; k < 4; k++) - { - s += B_d[k * 4 + i] * A_d[k * 5 + j]; - } - C_ref[i * 5 + j] = s; - } - } - double expected_X[8]; - for (int ii = 0; ii < 2; ii++) - { - for (int jj = 0; jj < 4; jj++) - { - expected_X[ii * 4 + jj] = - C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; - } - } - mu_assert("values", cmp_double_array(out->X, expected_X, 8)); + memcpy(A_csr->i, Ai, sizeof Ai); + memcpy(A_csr->x, Ax, sizeof Ax); - free(A_d); - free(B_d); - free_matrix(out_m); - free_matrix(B_m); - free_csr_matrix(A); - return 0; -} - -/* BTA(CSR_matrix A, PD B) where A is a leaf-variable Jacobian (identity-in-block). - A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). - Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = - X_B^T. 
*/ -const char *test_BTA_csr_pd_leaf_variable(void) -{ - CSR_matrix *A = new_csr_matrix(4, 8, 4); - for (int k = 0; k < 4; k++) - { - A->p[k] = k; - A->i[k] = 4 + k; - A->x[k] = 1.0; - } - A->p[4] = 4; + int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); int row_perm_B[2] = {1, 3}; int col_perm_B[2] = {0, 2}; - double XB[4] = {10.0, 20.0, 30.0, 40.0}; /* row-major (2, 2) */ - matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); - permuted_dense *B = (permuted_dense *) B_m; - - matrix *out_m = BTA_csr_pd_alloc(A, B); - permuted_dense *out = (permuted_dense *) out_m; - - int expected_col_perm[2] = {5, 7}; - mu_assert("m0", out->m0 == 2); - mu_assert("n0", out->n0 == 2); - mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); - mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 2)); - - BTA_csr_pd_fill_values(A, B, out); - - /* X_C should be X_B^T = [[10, 30], [20, 40]] row-major. */ - double expected_X[4] = {10.0, 30.0, 20.0, 40.0}; - mu_assert("values", cmp_double_array(out->X, expected_X, 4)); - - free_matrix(out_m); - free_matrix(B_m); - free_csr_matrix(A); - return 0; -} - -/* BTA(CSR_matrix A, PD B) where A has no entries in any row of row_perm_B. - Output dense block should have n0 = 0. */ -const char *test_BTA_csr_pd_no_overlap(void) -{ - /* A: rows 0 and 2 have entries; rows 1 and 3 (row_perm_B) are empty. 
*/ - CSR_matrix *A = new_csr_matrix(4, 5, 3); - A->p[0] = 0; - A->p[1] = 2; - A->p[2] = 2; - A->p[3] = 3; - A->p[4] = 3; - int Ai[3] = {1, 4, 2}; - double Ax[3] = {1.0, 2.0, 3.0}; - memcpy(A->i, Ai, sizeof Ai); - memcpy(A->x, Ax, sizeof Ax); - - int row_perm_B[2] = {1, 3}; /* rows that ARE empty in A */ - int col_perm_B[2] = {0, 2}; - double XB[4] = {1.0, 2.0, 3.0, 4.0}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); permuted_dense *B = (permuted_dense *) B_m; - matrix *out_m = BTA_csr_pd_alloc(A, B); - permuted_dense *out = (permuted_dense *) out_m; - - mu_assert("m0", out->m0 == 2); - mu_assert("n0", out->n0 == 0); - - /* Fill should be a no-op (0-sized dense block). */ - BTA_csr_pd_fill_values(A, B, out); - - free_matrix(out_m); + double d[4] = {1.5, -2.0, 0.0, 3.5}; + + /* CSR variant (baseline, from old-code). */ + matrix *C_csr_m = BTA_pd_csr_alloc(B, A_csr); + permuted_dense *C_csr = (permuted_dense *) C_csr_m; + BTDA_pd_csr_fill_values(B, d, A_csr, C_csr); + + /* CSC variant (under test). */ + matrix *C_csc_m = BTA_pd_csc_alloc(B, A_csc); + permuted_dense *C_csc = (permuted_dense *) C_csc_m; + BTDA_pd_csc_fill_values(B, d, A_csc, C_csc); + + /* Structural equality. */ + mu_assert("m matches", C_csc_m->m == C_csr_m->m); + mu_assert("n matches", C_csc_m->n == C_csr_m->n); + mu_assert("m0 matches", C_csc->m0 == C_csr->m0); + mu_assert("n0 matches", C_csc->n0 == C_csr->n0); + mu_assert("row_perm matches", + cmp_int_array(C_csc->row_perm, C_csr->row_perm, C_csr->m0)); + mu_assert("col_perm matches", + cmp_int_array(C_csc->col_perm, C_csr->col_perm, C_csr->n0)); + + /* Value equality (tolerance-based; dot ordering differs vs dgemm). 
*/ + mu_assert("BTDA values match", + cmp_double_array(C_csc->X, C_csr->X, C_csr->m0 * C_csr->n0)); + + free_matrix(C_csr_m); + free_matrix(C_csc_m); free_matrix(B_m); - free_csr_matrix(A); + free_CSC_matrix(A_csc); + free_CSR_matrix(A_csr); + free(iwork); return 0; } /* BTA(PD A, CSR_matrix B): basic correctness against a dense reference. A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). B is (4, 4) CSR_matrix with arbitrary sparsity. */ -const char *test_BTA_pd_csr_basic(void) +const char *test_BTA_csr_pd_basic(void) { /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. X = [[1, 2], [3, 4]] (2 x 2 row-major). */ @@ -970,7 +874,7 @@ const char *test_BTA_pd_csr_basic(void) row 1: cols {0, 2} row 2: cols {2} row 3: cols {0, 3} */ - CSR_matrix *B = new_csr_matrix(4, 4, 7); + CSR_matrix *B = new_CSR_matrix(4, 4, 7); B->p[0] = 0; B->p[1] = 2; B->p[2] = 4; @@ -981,7 +885,7 @@ const char *test_BTA_pd_csr_basic(void) memcpy(B->i, Bi, sizeof Bi); memcpy(B->x, Bx, sizeof Bx); - matrix *out_m = BTA_pd_csr_alloc(A, B); + matrix *out_m = BTA_csr_pd_alloc(B, A); permuted_dense *out = (permuted_dense *) out_m; /* row_active = union of B's cols in rows 1 and 3 @@ -994,7 +898,7 @@ const char *test_BTA_pd_csr_basic(void) mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); - BTA_pd_csr_fill_values(A, B, out); + BTA_csr_pd_fill_values(B, A, out); /* Reference: dense B^T A, extract block at (row_active × col_perm_A). */ double *A_d = (double *) calloc(4 * 5, sizeof(double)); @@ -1030,7 +934,7 @@ const char *test_BTA_pd_csr_basic(void) free(A_d); free(B_d); free_matrix(out_m); - free_csr_matrix(B); + free_CSR_matrix(B); free_matrix(A_m); return 0; } @@ -1038,7 +942,7 @@ const char *test_BTA_pd_csr_basic(void) /* BTA(PD A, CSR_matrix B) where B is a leaf-variable Jacobian (identity-in-block). 
B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. */ -const char *test_BTA_pd_csr_leaf_variable(void) +const char *test_BTA_csr_pd_leaf_variable(void) { int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; @@ -1046,7 +950,7 @@ const char *test_BTA_pd_csr_leaf_variable(void) matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); permuted_dense *A = (permuted_dense *) A_m; - CSR_matrix *B = new_csr_matrix(4, 8, 4); + CSR_matrix *B = new_CSR_matrix(4, 8, 4); for (int k = 0; k < 4; k++) { B->p[k] = k; @@ -1055,7 +959,7 @@ const char *test_BTA_pd_csr_leaf_variable(void) } B->p[4] = 4; - matrix *out_m = BTA_pd_csr_alloc(A, B); + matrix *out_m = BTA_csr_pd_alloc(B, A); permuted_dense *out = (permuted_dense *) out_m; int expected_row_perm[2] = {5, 7}; @@ -1064,20 +968,20 @@ const char *test_BTA_pd_csr_leaf_variable(void) mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); - BTA_pd_csr_fill_values(A, B, out); + BTA_csr_pd_fill_values(B, A, out); /* X_C should equal X_A. */ mu_assert("values", cmp_double_array(out->X, XA, 4)); free_matrix(out_m); - free_csr_matrix(B); + free_CSR_matrix(B); free_matrix(A_m); return 0; } /* BTA(PD A, CSR_matrix B) where B has no entries in any row of row_perm_A. Output dense block should have m0 = 0. */ -const char *test_BTA_pd_csr_no_overlap(void) +const char *test_BTA_csr_pd_no_overlap(void) { int row_perm_A[2] = {1, 3}; int col_perm_A[2] = {0, 2}; @@ -1086,7 +990,7 @@ const char *test_BTA_pd_csr_no_overlap(void) permuted_dense *A = (permuted_dense *) A_m; /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. 
*/ - CSR_matrix *B = new_csr_matrix(4, 4, 3); + CSR_matrix *B = new_CSR_matrix(4, 4, 3); B->p[0] = 0; B->p[1] = 2; B->p[2] = 2; @@ -1097,17 +1001,17 @@ const char *test_BTA_pd_csr_no_overlap(void) memcpy(B->i, Bi, sizeof Bi); memcpy(B->x, Bx, sizeof Bx); - matrix *out_m = BTA_pd_csr_alloc(A, B); + matrix *out_m = BTA_csr_pd_alloc(B, A); permuted_dense *out = (permuted_dense *) out_m; mu_assert("m0", out->m0 == 0); mu_assert("n0", out->n0 == 2); /* Fill should be a no-op (0-sized dense block on the row axis). */ - BTA_pd_csr_fill_values(A, B, out); + BTA_csr_pd_fill_values(B, A, out); free_matrix(out_m); - free_csr_matrix(B); + free_CSR_matrix(B); free_matrix(A_m); return 0; } diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index bded27f..0013efb 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -54,7 +54,7 @@ const char *test_wsum_hess_left_matmul(void) expr *x = new_variable(3, 1, 0, 3); /* Create sparse matrix A in CSR_matrix format */ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -84,7 +84,7 @@ const char *test_wsum_hess_left_matmul(void) mu_assert("sparsity fail", cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 3, 3)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } @@ -98,7 +98,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_CSR_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; @@ -107,7 +107,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix 
*/ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -122,8 +122,8 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) mu_assert("check_wsum_hess failed", check_wsum_hess(A_exp_Bx, x_vals, w, NUMERICAL_DIFF_DEFAULT_H)); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); free_expr(A_exp_Bx); return 0; } @@ -160,7 +160,7 @@ const char *test_wsum_hess_left_matmul_matrix(void) expr *x = new_variable(3, 2, 0, 6); /* Create sparse matrix A in CSR_matrix format */ - CSR_matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -192,7 +192,7 @@ const char *test_wsum_hess_left_matmul_matrix(void) mu_assert("sparsity fail", cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 6, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index 959d37f..661e8e0 100644 --- a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -24,7 +24,7 @@ const char *test_wsum_hess_right_matmul(void) expr *x = new_variable(2, 2, 0, 4); /* Create sparse matrix A in CSR_matrix format (2x3) */ - CSR_matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -54,7 +54,7 @@ const char *test_wsum_hess_right_matmul(void) mu_assert("sparsity fail", cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 4, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } @@ -73,7 +73,7 @@ const char *test_wsum_hess_right_matmul_vector(void) expr *x = 
new_variable(1, 3, 0, 3); /* Create sparse matrix A in CSR_matrix format (3x2) */ - CSR_matrix *A = new_csr_matrix(3, 2, 4); + CSR_matrix *A = new_CSR_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -102,7 +102,7 @@ const char *test_wsum_hess_right_matmul_vector(void) mu_assert("sparsity fail", cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 3, 3)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index 06fcf4c..2c739c9 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -16,7 +16,7 @@ const char *test_wsum_hess_sum_exp_linear(void) double Ax[6] = {1, 1, 2, 3, 1, -1}; int Ai[6] = {0, 1, 0, 1, 0, 1}; int Ap[4] = {0, 2, 4, 6}; - CSR_matrix *A = new_csr_matrix(3, 2, 6); + CSR_matrix *A = new_CSR_matrix(3, 2, 6); memcpy(A->x, Ax, 6 * sizeof(double)); memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); @@ -32,7 +32,7 @@ const char *test_wsum_hess_sum_exp_linear(void) check_wsum_hess(sum_node, x_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(sum_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index bdb9c34..a7c01fc 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -48,7 +48,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) */ /* Create A matrix (5x10) */ - CSR_matrix *A = new_csr_matrix(5, 10, 10); + CSR_matrix *A = new_CSR_matrix(5, 10, 10); double Ax[10] = {-1.44165273, -1.13687223, 0.55892257, 0.24912193, 0.84959744, -0.23998915, 0.5913356, -1.21627912, -0.50379166, 0.41531801}; int Ai[10] = {1, 2, 4, 8, 2, 3, 8, 9, 1, 2}; @@ -58,7 +58,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) memcpy(A->p, Ap, 6 * 
sizeof(int)); /* Create B matrix (5x10) */ - CSR_matrix *B = new_csr_matrix(5, 10, 10); + CSR_matrix *B = new_CSR_matrix(5, 10, 10); double Bx[10] = {1.27549062, 0.04194731, -0.4356034, 0.405574, 1.34670487, -0.57738638, 0.9411464, -0.31563179, 1.90831766, -0.89802958}; int Bi[10] = {0, 3, 5, 7, 0, 5, 0, 3, 7, 9}; @@ -102,8 +102,8 @@ const char *test_wsum_hess_multiply_sparse_random(void) /* Cleanup */ free_expr(mult_node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -129,7 +129,7 @@ const char *test_wsum_hess_multiply_linear_ops(void) */ /* Create CSR_matrix matrix A */ - CSR_matrix *A = new_csr_matrix(4, 3, 6); + CSR_matrix *A = new_CSR_matrix(4, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 1, 0, 2, 1}; int Ap[5] = {0, 2, 3, 5, 6}; @@ -138,7 +138,7 @@ const char *test_wsum_hess_multiply_linear_ops(void) memcpy(A->p, Ap, 5 * sizeof(int)); /* Create CSR_matrix matrix B */ - CSR_matrix *B = new_csr_matrix(4, 3, 8); + CSR_matrix *B = new_CSR_matrix(4, 3, 8); double Bx[8] = {1.0, 4.0, 2.0, 7.0, 3.0, 2.0, 4.0, -1.0}; int Bi[8] = {0, 2, 1, 2, 0, 2, 1, 2}; int Bp[5] = {0, 2, 4, 6, 8}; @@ -182,8 +182,8 @@ const char *test_wsum_hess_multiply_linear_ops(void) /* Cleanup */ free_expr(mult_node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } diff --git a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h index 4b9bd15..6fbff2d 100644 --- a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h +++ b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h @@ -134,8 +134,8 @@ const char *test_wsum_hess_Ax_Bx_multiply(void) check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -170,8 +170,8 @@ const char *test_wsum_hess_AX_BX_multiply(void) 
check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -195,8 +195,8 @@ const char *test_wsum_hess_multiply_deep_composite(void) check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -208,7 +208,7 @@ const char *test_wsum_hess_quad_form_Ax(void) CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -224,8 +224,8 @@ const char *test_wsum_hess_quad_form_Ax(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -237,7 +237,7 @@ const char *test_wsum_hess_quad_form_sin_Ax(void) CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -254,8 +254,8 @@ const char *test_wsum_hess_quad_form_sin_Ax(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -316,8 +316,8 @@ const char *test_wsum_hess_matmul_Ax_By(void) check_wsum_hess(Z, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -341,8 +341,8 @@ const char *test_wsum_hess_matmul_sin_Ax_cos_Bx(void) check_wsum_hess(Z, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); 
free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -368,7 +368,7 @@ const char *test_wsum_hess_quad_form_exp(void) double w = 3.0; /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -384,6 +384,6 @@ const char *test_wsum_hess_quad_form_exp(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index 89ca601..c8acff5 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -55,7 +55,7 @@ const char *test_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_matrix *A_csr = new_csr_matrix(3, 5, 15); + CSR_matrix *A_csr = new_CSR_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); @@ -67,7 +67,7 @@ const char *test_wsum_hess_exp_composite(void) mu_assert("check_wsum_hess failed", check_wsum_hess(exp_node, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); - free_csr_matrix(A_csr); + free_CSR_matrix(A_csr); free_expr(exp_node); return 0; diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index 444be3a..bd62d5a 100644 --- a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -17,7 +17,7 @@ const char *test_wsum_hess_quad_form(void) * [0 3 4 1] * [0 0 1 6] */ - CSR_matrix *Q = new_csr_matrix(4, 4, 10); + CSR_matrix *Q = new_CSR_matrix(4, 4, 10); double Qx[10] = {1.0, 2.0, 
2.0, 5.0, 3.0, 3.0, 4.0, 1.0, 1.0, 6.0}; int Qi[10] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; int Qp[5] = {0, 2, 5, 8, 10}; @@ -43,6 +43,6 @@ const char *test_wsum_hess_quad_form(void) mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } From a409985e5388c6cec6423e73a38bf2f49d6665b0 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 13:09:13 +0200 Subject: [PATCH 15/31] better infrastructure permuted dense --- include/old-code/old_permuted_dense.h | 8 + include/utils/permuted_dense.h | 38 +++-- src/old-code/old_permuted_dense.c | 43 ++++++ src/utils/permuted_dense.c | 57 +------- tests/all_tests.c | 4 +- tests/old-code/test_old_permuted_dense.h | 169 ++++++++++++++++++++++ tests/utils/test_permuted_dense.h | 177 ----------------------- 7 files changed, 243 insertions(+), 253 deletions(-) diff --git a/include/old-code/old_permuted_dense.h b/include/old-code/old_permuted_dense.h index a2823da..ea67d70 100644 --- a/include/old-code/old_permuted_dense.h +++ b/include/old-code/old_permuted_dense.h @@ -52,4 +52,12 @@ void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A, void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, const CSR_matrix *A, permuted_dense *C); +/* Legacy no-d BTA fill for the production CSR-pd kernel (B=CSR, A=PD). + Production path always supplies chain-rule weights via + BTDA_csr_pd_fill_values (in src/utils/permuted_dense.c); the no-d variant + is kept here for the direct unit tests in tests/old-code. C must have the + structure produced by BTA_csr_pd_alloc (declared in utils/permuted_dense.h). 
*/ +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, + permuted_dense *C); + #endif /* OLD_PERMUTED_DENSE_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 1ebd0e4..ab33d0c 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -103,7 +103,13 @@ matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, permuted_dense *C); -/* Allocate new permuted dense for C = B @ A where B is PD and A is CSC */ +/* Fill values of C = BT @ diag(d) @ A where A and B are both permuted dense. */ +void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, + const permuted_dense *A, permuted_dense *C); + +/* Allocate new permuted dense for C = B @ A where B is PD and A is CSC. + This function is currently never used in production, but we keep it + here because it is simple and might be useful in the future. */ matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); /* Fill values of C = B @ A where B is value buffer to permuted dense and A is CSC. @@ -114,6 +120,13 @@ matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, const CSC_matrix *A, permuted_dense *C); +/* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */ +matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); + +/* Fill values of C = B^T @ diag(d) @ A where B is PD and A is CSC */ +void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, + const CSC_matrix *A, permuted_dense *C); + // ------------------- OK SO FAR /* Allocate a new permuted_dense for C = B^T @ A where B is Sparse (CSR_matrix) @@ -122,29 +135,12 @@ void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, Dense block size = (|row_active|, n0_A). Values uninitialized. 
*/ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A); -/* Fill C->X = B_sub^T @ X_A, where B_sub is B's rows at positions - row_perm_A, columns restricted to C's row_perm, scattered to a dense - buffer. C must have the structure produced by BTA_csr_pd_alloc(B, A). */ -void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, - permuted_dense *C); - /* BTDA variants — fold a diagonal d into the BTA computation. Each fills C->X = B^T diag(d) A (d may be NULL for plain B^T A). C must have the - structure produced by the corresponding BTA *_alloc function. */ + structure produced by the corresponding BTA *_alloc function. + The no-d BTA_csr_pd_fill_values variant is unused in production; the + legacy version lives in include/old-code/old_permuted_dense.h. */ void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, const permuted_dense *A, permuted_dense *C); -void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, - const permuted_dense *A, permuted_dense *C); - -/* CSC-based (PD, Sparse) BTA / BTDA kernels — production path. Alloc is a - single pass over A's CSC columns; the fill kernel transposes B's dense - block into B->dwork (folding d in) and delegates to - BA_pd_csc_fill_values. d MUST be non-NULL — production callers always - supply chain-rule weights; for plain B^T A pass d = {1, 1, …, 1}. - See include/old-code/old_permuted_dense.h for the legacy CSR equivalents - kept as reference implementations. 
*/ -matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); -void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, - const CSC_matrix *A, permuted_dense *C); #endif /* PERMUTED_DENSE_H */ diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c index 4b6c7b7..3517f64 100644 --- a/src/old-code/old_permuted_dense.c +++ b/src/old-code/old_permuted_dense.c @@ -153,3 +153,46 @@ void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); } + +/* No-d BTA fill for the production CSR-pd kernel (B=CSR, A=PD). Moved out + of src/utils/permuted_dense.c because production always supplies + chain-rule weights through BTDA_csr_pd_fill_values; kept here for the + direct unit tests in tests/old-code/test_old_permuted_dense.h. */ +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, + permuted_dense *C) +{ + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; + + if (r_B == 0 || m0 == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use C->row_inv (pre-built by new_permuted_dense) as row_inv_out and + C->dwork as B_sub_dense; both are owned by C. dwork is sized at alloc + time to cover m0 * r_B; only that prefix is touched. 
*/ + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = C->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[kk * r_B + ii] = B_csr->x[e]; + } + } + } + + /* C->X = B_sub_dense^T @ X_A */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); +} diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 3e7d293..3bae390 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -671,50 +671,6 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) return C; } -/* Note: when B_csr is a leaf-variable Jacobian (each row has a single entry - at column var_id + k, value 1), B_sub_dense is an identity matrix and - the dgemm reduces to X_C = X_A — a pure copy with no multiplication - needed. A fast path can detect this and skip the dgemm; deferred until a - workload shows the savings matter. */ -void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, - permuted_dense *C) -{ - int m0 = A->m0; - int dn_A = A->n0; - int r_B = C->m0; - - if (r_B == 0 || m0 == 0) - { - /* Output dense block is empty; nothing to fill. */ - return; - } - - /* Use C->row_inv (pre-built by new_permuted_dense) as row_inv_out and - C->dwork as B_sub_dense; both are owned by C. dwork is sized at alloc - time to cover m0 * r_B; only that prefix is touched. 
*/ - double *B_sub_dense = C->dwork; - size_t used = m0 * r_B; - memset(B_sub_dense, 0, used * sizeof(double)); - - for (int kk = 0; kk < m0; kk++) - { - int row = A->row_perm[kk]; - for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) - { - int i = B_csr->i[e]; - int ii = C->row_inv[i]; - if (ii >= 0) - { - B_sub_dense[kk * r_B + ii] = B_csr->x[e]; - } - } - } - - /* C->X = B_sub_dense^T @ X_A */ - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, - B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); -} - /* BTDA variant of BTA_csr_pd: C->X = B_sub_dense^T diag(d) X_A. Folds d into the scatter step. */ void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, @@ -835,14 +791,12 @@ void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, } } -/* C = B^T @ A where A is Sparse (CSC) and B is PD. Same output structure as - BTA_pd_csr_alloc — built directly by scanning A's CSC columns with - idxs_hits_set against row_perm_B. */ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) { - /* col_active[jj] is set of columns j of A whose nonzero pattern hits - at least one row in row_perm_B. col_inv built against row_perm_B - via B->row_inv. */ + /* Cij != 0 if column i of B overlaps with column j of A. So we loop + through the columns of A. For each column of A, we check if it has any + nonzeros in rows that are in B's row_perm. If yes, column j of C will + have a nonzero block corresponding to the columns of B */ iVec *col_active = iVec_new(8); for (int j = 0; j < A->n; j++) { @@ -854,9 +808,6 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) } } - /* Same shape and permutations as BTA_pd_csr_alloc: - shape (B->base.n, A->n), m0 = B->n0, n0 = |col_active|, - row_perm = B->col_perm, col_perm = col_active. 
*/ matrix *C = new_permuted_dense(B->base.n, A->n, B->n0, col_active->len, B->col_perm, col_active->data, NULL); iVec_free(col_active); diff --git a/tests/all_tests.c b/tests/all_tests.c index 218fc34..c49cc11 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -60,9 +60,9 @@ #include "problem/test_problem.h" #include "utils/test_cblas.h" #include "utils/test_coo_matrix.h" -#include "utils/test_CSC_matrix.h" +#include "utils/test_csc_matrix.h" #include "utils/test_csr_csc_conversion.h" -#include "utils/test_CSR_matrix.h" +#include "utils/test_csr_matrix.h" #include "utils/test_linalg_sparse_matmuls.h" #include "utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" diff --git a/tests/old-code/test_old_permuted_dense.h b/tests/old-code/test_old_permuted_dense.h index c986c4b..e4ad18f 100644 --- a/tests/old-code/test_old_permuted_dense.h +++ b/tests/old-code/test_old_permuted_dense.h @@ -179,4 +179,173 @@ const char *test_BTA_pd_csr_no_overlap(void) return 0; } +/* Tests for the production CSR-pd kernel pair (B=CSR, A=PD). The BTA fill + variant lives here in old-code because production only calls the BTDA + path; the alloc is still in src/utils/permuted_dense.c. */ + +/* BTA(CSR_matrix B, PD A): basic correctness against a dense reference. + A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). + B is (4, 4) CSR_matrix with arbitrary sparsity. */ +const char *test_BTA_csr_pd_basic(void) +{ + /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. + X = [[1, 2], [3, 4]] (2 x 2 row-major). */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + /* CSR_matrix B: m=4, n=4. 
+ row 0: cols {1, 3} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {0, 3} */ + CSR_matrix *B = new_CSR_matrix(4, 4, 7); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 4; + B->p[3] = 5; + B->p[4] = 7; + int Bi[7] = {1, 3, 0, 2, 2, 0, 3}; + double Bx[7] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + /* row_active = union of B's cols in rows 1 and 3 + = {0, 2} ∪ {0, 3} = {0, 2, 3}, size 3. */ + int expected_row_perm[3] = {0, 2, 3}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("m0", out->m0 == 3); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_csr_pd_fill_values(B, A, out); + + /* Reference: dense B^T A, extract block at (row_active × col_perm_A). + Scatter inlined locally to avoid coupling to static helpers. 
*/ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + for (int kk = 0; kk < A->m0; kk++) + for (int jj = 0; jj < A->n0; jj++) + A_d[A->row_perm[kk] * 5 + A->col_perm[jj]] = A->X[kk * A->n0 + jj]; + for (int i = 0; i < B->m; i++) + for (int e = B->p[i]; e < B->p[i + 1]; e++) + B_d[i * 4 + B->i[e]] = B->x[e]; + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[6]; + for (int ii = 0; ii < 3; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected_X[ii * 2 + jj] = + C_ref[expected_row_perm[ii] * 5 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 6)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(CSR_matrix B, PD A) where B is a leaf-variable Jacobian (identity-in-block). + B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. 
*/ +const char *test_BTA_csr_pd_leaf_variable(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + CSR_matrix *B = new_CSR_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + B->p[k] = k; + B->i[k] = 4 + k; + B->x[k] = 1.0; + } + B->p[4] = 4; + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + int expected_row_perm[2] = {5, 7}; + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_csr_pd_fill_values(B, A, out); + + /* X_C should equal X_A. */ + mu_assert("values", cmp_double_array(out->X, XA, 4)); + + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(CSR_matrix B, PD A) where B has no entries in any row of row_perm_A. + Output dense block should have m0 = 0. */ +const char *test_BTA_csr_pd_no_overlap(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. */ + CSR_matrix *B = new_CSR_matrix(4, 4, 3); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 2; + B->p[3] = 3; + B->p[4] = 3; + int Bi[3] = {0, 1, 2}; + double Bx[3] = {1.0, 2.0, 3.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + mu_assert("m0", out->m0 == 0); + mu_assert("n0", out->n0 == 2); + + /* Fill should be a no-op (0-sized dense block on the row axis). 
*/ + BTA_csr_pd_fill_values(B, A, out); + + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + #endif /* TEST_OLD_PERMUTED_DENSE_H */ diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index bad8ffd..a668990 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -774,23 +774,6 @@ const char *test_permuted_dense_BTDA_decomposition(void) return 0; } -/* Scatter a CSR_matrix matrix into a dense m x n_global buffer (row-major). - Caller allocates and zero-fills. */ -static void scatter_csr_to_dense(const CSR_matrix *A_csr, int n_global, - double *dense) -{ - int m = A_csr->m; - memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); - for (int i = 0; i < m; i++) - { - for (int e = A_csr->p[i]; e < A_csr->p[i + 1]; e++) - { - int j = A_csr->i[e]; - dense[i * n_global + j] = A_csr->x[e]; - } - } -} - /* BTA(CSR_matrix A, PD B): basic correctness against a dense reference. A is (4, 5) CSR_matrix with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). */ @@ -856,164 +839,4 @@ const char *test_BTA_pd_csc_matches_csr(void) return 0; } -/* BTA(PD A, CSR_matrix B): basic correctness against a dense reference. - A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). - B is (4, 4) CSR_matrix with arbitrary sparsity. */ -const char *test_BTA_csr_pd_basic(void) -{ - /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. - X = [[1, 2], [3, 4]] (2 x 2 row-major). */ - int row_perm_A[2] = {1, 3}; - int col_perm_A[2] = {0, 2}; - double XA[4] = {1.0, 2.0, 3.0, 4.0}; - matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - permuted_dense *A = (permuted_dense *) A_m; - - /* CSR_matrix B: m=4, n=4. 
- row 0: cols {1, 3} - row 1: cols {0, 2} - row 2: cols {2} - row 3: cols {0, 3} */ - CSR_matrix *B = new_CSR_matrix(4, 4, 7); - B->p[0] = 0; - B->p[1] = 2; - B->p[2] = 4; - B->p[3] = 5; - B->p[4] = 7; - int Bi[7] = {1, 3, 0, 2, 2, 0, 3}; - double Bx[7] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0}; - memcpy(B->i, Bi, sizeof Bi); - memcpy(B->x, Bx, sizeof Bx); - - matrix *out_m = BTA_csr_pd_alloc(B, A); - permuted_dense *out = (permuted_dense *) out_m; - - /* row_active = union of B's cols in rows 1 and 3 - = {0, 2} ∪ {0, 3} = {0, 2, 3}, size 3. */ - int expected_row_perm[3] = {0, 2, 3}; - mu_assert("out m", out_m->m == 4); /* B.n */ - mu_assert("out n", out_m->n == 5); /* A.n */ - mu_assert("m0", out->m0 == 3); - mu_assert("n0", out->n0 == 2); - mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); - mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); - - BTA_csr_pd_fill_values(B, A, out); - - /* Reference: dense B^T A, extract block at (row_active × col_perm_A). */ - double *A_d = (double *) calloc(4 * 5, sizeof(double)); - double *B_d = (double *) calloc(4 * 4, sizeof(double)); - scatter_pd_to_dense(A, 5, A_d); - scatter_csr_to_dense(B, 4, B_d); - - double C_ref[4 * 5]; - memset(C_ref, 0, sizeof C_ref); - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < 5; j++) - { - double s = 0.0; - for (int k = 0; k < 4; k++) - { - s += B_d[k * 4 + i] * A_d[k * 5 + j]; - } - C_ref[i * 5 + j] = s; - } - } - double expected_X[6]; - for (int ii = 0; ii < 3; ii++) - { - for (int jj = 0; jj < 2; jj++) - { - expected_X[ii * 2 + jj] = - C_ref[expected_row_perm[ii] * 5 + col_perm_A[jj]]; - } - } - mu_assert("values", cmp_double_array(out->X, expected_X, 6)); - - free(A_d); - free(B_d); - free_matrix(out_m); - free_CSR_matrix(B); - free_matrix(A_m); - return 0; -} - -/* BTA(PD A, CSR_matrix B) where B is a leaf-variable Jacobian (identity-in-block). - B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). 
- Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. */ -const char *test_BTA_csr_pd_leaf_variable(void) -{ - int row_perm_A[2] = {1, 3}; - int col_perm_A[2] = {0, 2}; - double XA[4] = {1.0, 2.0, 3.0, 4.0}; - matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - permuted_dense *A = (permuted_dense *) A_m; - - CSR_matrix *B = new_CSR_matrix(4, 8, 4); - for (int k = 0; k < 4; k++) - { - B->p[k] = k; - B->i[k] = 4 + k; - B->x[k] = 1.0; - } - B->p[4] = 4; - - matrix *out_m = BTA_csr_pd_alloc(B, A); - permuted_dense *out = (permuted_dense *) out_m; - - int expected_row_perm[2] = {5, 7}; - mu_assert("m0", out->m0 == 2); - mu_assert("n0", out->n0 == 2); - mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); - mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); - - BTA_csr_pd_fill_values(B, A, out); - - /* X_C should equal X_A. */ - mu_assert("values", cmp_double_array(out->X, XA, 4)); - - free_matrix(out_m); - free_CSR_matrix(B); - free_matrix(A_m); - return 0; -} - -/* BTA(PD A, CSR_matrix B) where B has no entries in any row of row_perm_A. - Output dense block should have m0 = 0. */ -const char *test_BTA_csr_pd_no_overlap(void) -{ - int row_perm_A[2] = {1, 3}; - int col_perm_A[2] = {0, 2}; - double XA[4] = {1.0, 2.0, 3.0, 4.0}; - matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); - permuted_dense *A = (permuted_dense *) A_m; - - /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. */ - CSR_matrix *B = new_CSR_matrix(4, 4, 3); - B->p[0] = 0; - B->p[1] = 2; - B->p[2] = 2; - B->p[3] = 3; - B->p[4] = 3; - int Bi[3] = {0, 1, 2}; - double Bx[3] = {1.0, 2.0, 3.0}; - memcpy(B->i, Bi, sizeof Bi); - memcpy(B->x, Bx, sizeof Bx); - - matrix *out_m = BTA_csr_pd_alloc(B, A); - permuted_dense *out = (permuted_dense *) out_m; - - mu_assert("m0", out->m0 == 0); - mu_assert("n0", out->n0 == 2); - - /* Fill should be a no-op (0-sized dense block on the row axis). 
*/ - BTA_csr_pd_fill_values(B, A, out); - - free_matrix(out_m); - free_CSR_matrix(B); - free_matrix(A_m); - return 0; -} - #endif /* TEST_PERMUTED_DENSE_H */ From e132919d83c605e5f5afb8bfbd588ac44877dbd3 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 15:40:40 +0200 Subject: [PATCH 16/31] get rid of unnecessary allocation --- include/old-code/old_permuted_dense.h | 23 ++- include/utils/permuted_dense.h | 22 +-- src/old-code/old_permuted_dense.c | 96 +++++++++++- src/utils/matrix_BTA.c | 12 +- src/utils/permuted_dense.c | 208 +++++++++++++++----------- tests/utils/test_matrix_BTA.h | 35 +++-- 6 files changed, 268 insertions(+), 128 deletions(-) diff --git a/include/old-code/old_permuted_dense.h b/include/old-code/old_permuted_dense.h index ea67d70..38ac5cf 100644 --- a/include/old-code/old_permuted_dense.h +++ b/include/old-code/old_permuted_dense.h @@ -52,12 +52,25 @@ void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A, void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, const CSR_matrix *A, permuted_dense *C); -/* Legacy no-d BTA fill for the production CSR-pd kernel (B=CSR, A=PD). - Production path always supplies chain-rule weights via - BTDA_csr_pd_fill_values (in src/utils/permuted_dense.c); the no-d variant - is kept here for the direct unit tests in tests/old-code. C must have the - structure produced by BTA_csr_pd_alloc (declared in utils/permuted_dense.h). */ +/* Legacy CSR-pd kernels (B=CSR, A=PD), formerly in src/utils/permuted_dense.c. + Production now dispatches the (PD A, sparse B) branch through CSC-pd + kernels (BTA_csc_pd_alloc / BTDA_csc_pd_fill_values in utils/permuted_dense.h), + so these CSR variants live here as reference implementations and as + targets for the direct unit tests in tests/old-code. */ + +/* Allocate a new permuted_dense for C = B^T A where B is CSR-sparse and A + is PD. 
Output is PD with row_perm = the sorted union of columns appearing + in B's rows at positions row_perm_A, and col_perm = A->col_perm. */ +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A); + +/* No-d BTA fill. C must have the structure produced by BTA_csr_pd_alloc. */ void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, permuted_dense *C); +/* BTDA variant: C->X = B_sub_dense^T diag(d) X_A. d may be NULL (treated + as identity scaling). C must have the structure produced by + BTA_csr_pd_alloc. */ +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C); + #endif /* OLD_PERMUTED_DENSE_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index ab33d0c..6606d55 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -49,8 +49,9 @@ typedef struct permuted_dense operate on this PD. Two non-overlapping roles (a given fill call uses at most one): - Y-buffer: holds diag(d_perm) X for ATDA / BTDA_pd_pd (size m0*n0). - - gather: holds densified CSR rows for BTA/BTDA_csr_pd / - _pd_csr (size depends on the input PD's dimensions). + - transpose: holds (diag(d) X)^T for the BA_pd_csc-based BTDA + kernels (BTDA_pd_csc and, transitively, BTDA_csc_pd + via its delegate). Size m0*n0 doubles. Sized at alloc time for the largest role this PD could play. Functions taking a const permuted_dense * may still mutate `dwork`. */ double *dwork; @@ -129,18 +130,11 @@ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, // ------------------- OK SO FAR -/* Allocate a new permuted_dense for C = B^T @ A where B is Sparse (CSR_matrix) - and A is PD. Output is PD with row_perm = the sorted union of columns - appearing in B's rows at positions row_perm_A, and col_perm = col_perm_A. - Dense block size = (|row_active|, n0_A). Values uninitialized. 
*/ -matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A); - -/* BTDA variants — fold a diagonal d into the BTA computation. Each fills - C->X = B^T diag(d) A (d may be NULL for plain B^T A). C must have the - structure produced by the corresponding BTA *_alloc function. - The no-d BTA_csr_pd_fill_values variant is unused in production; the - legacy version lives in include/old-code/old_permuted_dense.h. */ -void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, +/* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. */ +matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A); + +/* Fill values of C = B^T @ diag(d) @ A where B is CSC and A is PD */ +void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, const permuted_dense *A, permuted_dense *C); #endif /* PERMUTED_DENSE_H */ diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c index 3517f64..453bdef 100644 --- a/src/old-code/old_permuted_dense.c +++ b/src/old-code/old_permuted_dense.c @@ -154,10 +154,61 @@ void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); } -/* No-d BTA fill for the production CSR-pd kernel (B=CSR, A=PD). Moved out - of src/utils/permuted_dense.c because production always supplies - chain-rule weights through BTDA_csr_pd_fill_values; kept here for the - direct unit tests in tests/old-code/test_old_permuted_dense.h. */ +/* Legacy CSR-pd kernels (B=CSR, A=PD), formerly in src/utils/permuted_dense.c. + Production now goes through BTA_csc_pd_alloc / BTDA_csc_pd_fill_values; + these are kept here for reference + direct unit tests. */ + +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) +{ + /* Gather the union of columns appearing in B's rows at positions + row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. 
*/ + int q = B_csr->n; + char *seen = (char *) SP_CALLOC(q, sizeof(char)); + int r_B = 0; + for (int kk = 0; kk < A->m0; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + if (!seen[i]) + { + seen[i] = 1; + r_B++; + } + } + } + + int *row_active = (int *) SP_MALLOC((r_B > 0 ? r_B : 1) * sizeof(int)); + int idx = 0; + for (int i = 0; i < q; i++) + { + if (seen[i]) + { + row_active[idx++] = i; + } + } + + matrix *C = + new_permuted_dense(q, A->base.n, r_B, A->n0, row_active, A->col_perm, NULL); + free(row_active); + free(seen); + + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = r_B * + A->n0) to fit the gather buffer B_sub_dense used by BTA_csr_pd / + BTDA_csr_pd_fill_values: shape (A->m0, r_B) row-major. */ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = A->m0 * r_B; + if (gather_size > C_pd->dwork_size) + { + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + } + return C; +} + +/* No-d BTA fill for the legacy CSR-pd kernel. */ void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, permuted_dense *C) { @@ -196,3 +247,40 @@ void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); } + +/* BTDA variant: C->X = B_sub_dense^T diag(d) X_A. Folds d into the scatter + step. d may be NULL (treated as identity). 
*/ +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C) +{ + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; + + if (r_B == 0 || m0 == 0) + { + return; + } + + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = A->row_perm[kk]; + double dk = d ? d[row] : 1.0; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = C->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[kk * r_B + ii] = dk * B_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); +} diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index 8bdca67..fa8fec8 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -34,9 +34,10 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B) } if (pd_A) { - /* A is PD, B is Sparse */ - CSR_matrix *B_csr = B->to_csr(B); - return BTA_csr_pd_alloc(B_csr, pd_A); + /* A is PD, B is Sparse — CSC kernel (see permuted_dense.{h,c}). */ + sparse_matrix *sm_B = (sparse_matrix *) B; + B->refresh_csc_values(B); + return BTA_csc_pd_alloc(sm_B->csc_cache, pd_A); } /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. 
@@ -68,8 +69,9 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) } if (pd_A) { - CSR_matrix *B_csr = B->to_csr(B); - BTDA_csr_pd_fill_values(B_csr, d, pd_A, (permuted_dense *) C); + sparse_matrix *sm_B = (sparse_matrix *) B; + B->refresh_csc_values(B); + BTDA_csc_pd_fill_values(sm_B->csc_cache, d, pd_A, (permuted_dense *) C); return; } diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 3bae390..8599938 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -360,7 +360,8 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->X = (double *) SP_MALLOC(sz * sizeof(double)); pd->base.x = pd->X; /* `dwork` sized for the Y-buffer role (Y = diag(d_perm) X) used by ATDA / - BTDA_pd_pd. BTA_pd_csr_alloc / BTA_csr_pd_alloc upgrade this to a + BTDA_pd_pd, and for the (diag(d) X)^T transpose in BTDA_pd_csc. The + legacy old-code BTA_pd_csr_alloc / BTA_csr_pd_alloc upgrade this to a larger gather buffer when their output PD will instead play that role. */ pd->dwork_size = sz; pd->dwork = (double *) SP_MALLOC(pd->dwork_size * sizeof(double)); @@ -621,92 +622,9 @@ void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, free_matrix(&DA->base); } -matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) -{ - /* Gather the union of columns appearing in B's rows at positions - row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. */ - int q = B_csr->n; - char *seen = (char *) SP_CALLOC(q, sizeof(char)); - int r_B = 0; - for (int kk = 0; kk < A->m0; kk++) - { - int row = A->row_perm[kk]; - for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) - { - int i = B_csr->i[e]; - if (!seen[i]) - { - seen[i] = 1; - r_B++; - } - } - } - - int *row_active = (int *) SP_MALLOC((r_B > 0 ? 
r_B : 1) * sizeof(int)); - int idx = 0; - for (int i = 0; i < q; i++) - { - if (seen[i]) - { - row_active[idx++] = i; - } - } - - matrix *C = - new_permuted_dense(q, A->base.n, r_B, A->n0, row_active, A->col_perm, NULL); - free(row_active); - free(seen); - - /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = r_B * - A->n0) to fit the gather buffer B_sub_dense used by BTA_csr_pd / - BTDA_csr_pd_fill_values: shape (A->m0, r_B) row-major. */ - permuted_dense *C_pd = (permuted_dense *) C; - size_t gather_size = A->m0 * r_B; - if (gather_size > C_pd->dwork_size) - { - free(C_pd->dwork); - C_pd->dwork_size = gather_size; - C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); - } - return C; -} - -/* BTDA variant of BTA_csr_pd: C->X = B_sub_dense^T diag(d) X_A. Folds d - into the scatter step. */ -void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, - const permuted_dense *A, permuted_dense *C) -{ - int m0 = A->m0; - int dn_A = A->n0; - int r_B = C->m0; - - if (r_B == 0 || m0 == 0) - { - return; - } - - double *B_sub_dense = C->dwork; - size_t used = m0 * r_B; - memset(B_sub_dense, 0, used * sizeof(double)); - - for (int kk = 0; kk < m0; kk++) - { - int row = A->row_perm[kk]; - double dk = d ? d[row] : 1.0; - for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) - { - int i = B_csr->i[e]; - int ii = C->row_inv[i]; - if (ii >= 0) - { - B_sub_dense[kk * r_B + ii] = dk * B_csr->x[e]; - } - } - } - - cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, - B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); -} +/* The CSR-flavored kernels for (B=Sparse, A=PD) live in src/old-code; the + production path uses BTA_csc_pd_alloc / BTDA_csc_pd_fill_values defined + further below, which delegate to BTA_pd_csc via the (A^T B)^T identity. */ /* Return true if any of the 'len' integers in 'indices' exist in the set marked by 'inv' (inv[k] != -1 iff k is in the set). 
*/ @@ -818,6 +736,12 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, const CSC_matrix *A, permuted_dense *C) { + /* C may be empty */ + if (C->base.nnz == 0) + { + return; + } + int m0 = B->m0; int n0 = B->n0; @@ -833,3 +757,113 @@ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, BA_pd_csc_fill_values(B->dwork, m0, B->row_inv, A, C); } + +matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A) +{ + /* Cij != 0 if column i of B overlaps with row j of A. So we loop through the + columns of B. For each column of B, we check if it has any nonzeros in rows + that are in A->row_perm. If yes, column i of C will have a nonzero block + corresponding to the columns of A */ + + iVec *row_active = iVec_new(10); + for (int i = 0; i < B->n; i++) + { + int start = B->p[i]; + int len = B->p[i + 1] - start; + if (idxs_hits_set(B->i + start, len, A->row_inv)) + { + iVec_append(row_active, i); + } + } + + matrix *C = new_permuted_dense(B->n, A->base.n, row_active->len, A->n0, + row_active->data, A->col_perm, NULL); + iVec_free(row_active); + return C; +} + +/* Internal helper for BTDA_csc_pd_fill_values: C = B^T @ A where B is CSC + and the right operand A is supplied as a transposed-layout raw buffer + (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense + block). Transposed-output sibling of BA_pd_csc_fill_values. */ +static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, + int m0_A, const int *inv, permuted_dense *C) +{ + /* C[i_C, j_C] = dot(col C->row_perm[i_C] of B, row j_C of A_T). 
*/ + for (int i_C = 0; i_C < C->m0; i_C++) + { + int B_col = C->row_perm[i_C]; + int start = B->p[B_col]; + int len = B->p[B_col + 1] - start; + double *ci = C->X + i_C * C->n0; + for (int j_C = 0; j_C < C->n0; j_C++) + { + ci[j_C] = sparse_dot_dense(B->x + start, B->i + start, len, inv, + A_T + j_C * m0_A); + } + } +} + +/* C = B^T diag(d) A. Folds diag(d) into A's dense block (writing + (diag(d_perm) X_A)^T into A->dwork) and delegates to BTA_csc_pd_fill_values. + Mirrors how BTDA_pd_csc_fill_values wraps BA_pd_csc_fill_values. */ +void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, + const permuted_dense *A, permuted_dense *C) +{ + if (C->base.nnz == 0) + { + return; + } + + int m0_A = A->m0; + int n0_A = A->n0; + + /* A->dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A). + Column j of (diag(d) X_A) lives contiguously in dwork as row j — + which is exactly the layout BTA_csc_pd_fill_values wants. */ + for (int kk = 0; kk < m0_A; kk++) + { + double dk = d[A->row_perm[kk]]; + for (int jj = 0; jj < n0_A; jj++) + { + A->dwork[jj * m0_A + kk] = dk * A->X[kk * n0_A + jj]; + } + } + + BTA_csc_pd_fill_values(B, A->dwork, m0_A, A->row_inv, C); +} + +/* Original transpose-via-Cprime implementation of BTDA_csc_pd_fill_values. + No longer linked; preserved here as in-file reference for the math + identity C = (A^T diag(d) B)^T and the BA_pd_csc_fill_values delegation. */ +#if defined(__GNUC__) || defined(__clang__) +__attribute__((unused)) +#endif +static void +BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, const double *d, + const permuted_dense *A, + permuted_dense *C) +{ + if (C->base.nnz == 0) + { + return; + } + + /* Cprime has shape (A->n0, |row_active|) — i.e. C transposed. */ + matrix *Cprime_m = BTA_pd_csc_alloc(A, B); + permuted_dense *Cprime = (permuted_dense *) Cprime_m; + BTDA_pd_csc_fill_values(A, d, B, Cprime); + + /* C->X = Cprime->X^T. Cprime has dims (C->n0, C->m0). 
*/ + int m0 = C->m0; + int n0 = C->n0; + for (int i = 0; i < m0; i++) + { + for (int j = 0; j < n0; j++) + { + C->X[i * n0 + j] = Cprime->X[j * m0 + i]; + } + } + + free_matrix(Cprime_m); +} diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h index 73a436d..c74d079 100644 --- a/tests/utils/test_matrix_BTA.h +++ b/tests/utils/test_matrix_BTA.h @@ -4,10 +4,12 @@ #include "minunit.h" #include "old-code/old_permuted_dense.h" #include "test_helpers.h" +#include "utils/CSC_matrix.h" #include "utils/CSR_matrix.h" #include "utils/matrix_BTA.h" #include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" +#include "utils/utils.h" #include #include @@ -102,7 +104,7 @@ const char *test_BTDA_matrices_csr_pd(void) } /* Wrapper dispatch sanity: (PD, CSR_matrix). Compare against direct - BTDA_csr_pd_fill_values. */ + BTDA_csc_pd_fill_values. */ const char *test_BTDA_matrices_pd_csr(void) { /* A: 4x5 PD, row_perm = [1, 3], col_perm = [0, 2]. */ @@ -130,19 +132,24 @@ const char *test_BTDA_matrices_pd_csr(void) matrix *C_m = BTA_matrices_alloc(A_m, B_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); - /* Direct primitive path. */ + /* Direct primitive path: production now dispatches the (PD, Sparse) + branch through CSC-pd kernels. Build a CSC view of B and call + BTA_csc_pd_alloc + BTDA_csc_pd_fill_values to match. 
*/ matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); permuted_dense *A2 = (permuted_dense *) A2_m; - CSR_matrix *B2 = new_CSR_matrix(4, 4, 5); - B2->p[0] = 0; - B2->p[1] = 2; - B2->p[2] = 3; - B2->p[3] = 4; - B2->p[4] = 5; - memcpy(B2->i, Bi, sizeof Bi); - memcpy(B2->x, Bx, sizeof Bx); - matrix *C2 = BTA_csr_pd_alloc(B2, A2); - BTDA_csr_pd_fill_values(B2, d, A2, (permuted_dense *) C2); + CSR_matrix *B2_csr = new_CSR_matrix(4, 4, 5); + B2_csr->p[0] = 0; + B2_csr->p[1] = 2; + B2_csr->p[2] = 3; + B2_csr->p[3] = 4; + B2_csr->p[4] = 5; + memcpy(B2_csr->i, Bi, sizeof Bi); + memcpy(B2_csr->x, Bx, sizeof Bx); + int *iwork = (int *) malloc(MAX(B2_csr->m, B2_csr->n) * sizeof(int)); + CSC_matrix *B2_csc = csr_to_csc_alloc(B2_csr, iwork); + csr_to_csc_fill_values(B2_csr, B2_csc, iwork); + matrix *C2 = BTA_csc_pd_alloc(B2_csc, A2); + BTDA_csc_pd_fill_values(B2_csc, d, A2, (permuted_dense *) C2); mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); @@ -150,7 +157,9 @@ const char *test_BTDA_matrices_pd_csr(void) free_matrix(B_m); free_matrix(A_m); free_matrix(C2); - free_CSR_matrix(B2); + free_CSC_matrix(B2_csc); + free_CSR_matrix(B2_csr); + free(iwork); free_matrix(A2_m); return 0; } From ed9de269968895c81b1f85ff2365484ba635b78e Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 15:43:30 +0200 Subject: [PATCH 17/31] change name of some functions --- include/utils/permuted_dense.h | 11 ++++------- src/utils/permuted_dense.c | 22 +++++++++------------- tests/all_tests.c | 8 ++++---- tests/profiling/profile_log_reg.h | 13 +++++++------ tests/utils/test_permuted_dense.h | 16 ++++++++-------- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 6606d55..c678797 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -86,15 +86,14 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, free the returned 
CSR_matrix — it's released by free_matrix on the PD. */ /* Fill values of C = diag(d) @ A where len(d) = number of (global) rows of A */ -void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, - permuted_dense *C); +void DA_pd_fill_values(const double *d, const permuted_dense *A, permuted_dense *C); /* Allocate new permuted dense for C = AT @ A */ -matrix *permuted_dense_ATA_alloc(const permuted_dense *A); +matrix *ATA_pd_alloc(const permuted_dense *A); /* Fill values of C = AT @ diag(d) @ A */ -void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, - permuted_dense *C); +void ATDA_pd_fill_values(const permuted_dense *A, const double *d, + permuted_dense *C); /* Allocate new permuted dense forC = BT @ A where A and B are both permuted_dense. (If B and A have no overlapping rows, then C is empty) */ @@ -128,8 +127,6 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, const CSC_matrix *A, permuted_dense *C); -// ------------------- OK SO FAR - /* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. 
*/ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A); diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 8599938..39e87c5 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -63,20 +63,18 @@ static matrix *permuted_dense_vtable_copy_sparsity(const matrix *self) static void permuted_dense_vtable_DA_fill_values(const double *d, const matrix *self, matrix *out) { - permuted_dense_DA_fill_values(d, (const permuted_dense *) self, - (permuted_dense *) out); + DA_pd_fill_values(d, (const permuted_dense *) self, (permuted_dense *) out); } static matrix *permuted_dense_vtable_ATA_alloc(matrix *self) { - return permuted_dense_ATA_alloc((const permuted_dense *) self); + return ATA_pd_alloc((const permuted_dense *) self); } static void permuted_dense_vtable_ATDA_fill_values(const matrix *self, const double *d, matrix *out) { - permuted_dense_ATDA_fill_values((const permuted_dense *) self, d, - (permuted_dense *) out); + ATDA_pd_fill_values((const permuted_dense *) self, d, (permuted_dense *) out); } /* Forward decl; definition lower in the file. 
*/ @@ -438,8 +436,7 @@ static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A) return C; } -void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, - permuted_dense *C) +void DA_pd_fill_values(const double *d, const permuted_dense *A, permuted_dense *C) { int m0 = A->m0; int n0 = A->n0; @@ -450,7 +447,7 @@ void permuted_dense_DA_fill_values(const double *d, const permuted_dense *A, } } -matrix *permuted_dense_ATA_alloc(const permuted_dense *A) +matrix *ATA_pd_alloc(const permuted_dense *A) { int n = A->base.n; /* C = AT @ A has a dense block of size n0 x n0, with row and column index @@ -460,8 +457,7 @@ matrix *permuted_dense_ATA_alloc(const permuted_dense *A) return new_permuted_dense(n, n, A->n0, A->n0, A->col_perm, A->col_perm, NULL); } -void permuted_dense_ATDA_fill_values(const permuted_dense *A, const double *d, - permuted_dense *C) +void ATDA_pd_fill_values(const permuted_dense *A, const double *d, permuted_dense *C) { int m0 = A->m0; int n0 = A->n0; @@ -617,7 +613,7 @@ void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, /* C = BT @ (DA) */ permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); - permuted_dense_DA_fill_values(d, A, DA); + DA_pd_fill_values(d, A, DA); BTA_pd_pd_fill_values(B, DA, C); free_matrix(&DA->base); } @@ -786,8 +782,8 @@ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A) and the right operand A is supplied as a transposed-layout raw buffer (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense block). Transposed-output sibling of BA_pd_csc_fill_values. */ -static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, - int m0_A, const int *inv, permuted_dense *C) +static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m0_A, + const int *inv, permuted_dense *C) { /* C[i_C, j_C] = dot(col C->row_perm[i_C] of B, row j_C of A_T). 
*/ for (int i_C = 0; i_C < C->m0; i_C++) diff --git a/tests/all_tests.c b/tests/all_tests.c index c49cc11..3ee926c 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -55,6 +55,7 @@ #include "jacobian_tests/other/test_prod_axis_zero.h" #include "jacobian_tests/other/test_quad_form.h" #include "numerical_diff/test_numerical_diff.h" +#include "old-code/test_old_permuted_dense.h" #include "problem/test_param_broadcast.h" #include "problem/test_param_prob.h" #include "problem/test_problem.h" @@ -67,7 +68,6 @@ #include "utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" #include "utils/test_matrix_BTA.h" -#include "old-code/test_old_permuted_dense.h" #include "utils/test_permuted_dense.h" #include "wsum_hess/affine/test_broadcast.h" #include "wsum_hess/affine/test_convolve.h" @@ -367,9 +367,9 @@ int main(void) mu_run_test(test_permuted_dense_to_csr_full, tests_run); mu_run_test(test_permuted_dense_to_csr_single_row, tests_run); mu_run_test(test_permuted_dense_to_csr_single_col, tests_run); - mu_run_test(test_permuted_dense_DA_fill_values, tests_run); - mu_run_test(test_permuted_dense_ATA_alloc, tests_run); - mu_run_test(test_permuted_dense_ATDA_fill_values, tests_run); + mu_run_test(test_DA_pd_fill_values, tests_run); + mu_run_test(test_ATA_pd_alloc, tests_run); + mu_run_test(test_ATDA_pd_fill_values, tests_run); mu_run_test(test_permuted_dense_times_csc, tests_run); mu_run_test(test_permuted_dense_times_csc_no_active, tests_run); mu_run_test(test_permuted_dense_to_csr_lazy, tests_run); diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h index ccf8e63..31bc075 100644 --- a/tests/profiling/profile_log_reg.h +++ b/tests/profiling/profile_log_reg.h @@ -73,13 +73,14 @@ const char *profile_log_reg(void) permuted_dense *A_pd = (permuted_dense *) A_pd_M; matrix *Jlog_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, NULL); permuted_dense *Jlog_pd = (permuted_dense *) Jlog_M; - matrix *H_pd_M = 
permuted_dense_ATA_alloc(A_pd); + matrix *H_pd_M = ATA_pd_alloc(A_pd); permuted_dense *H_pd = (permuted_dense *) H_pd_M; free(full_rows); free(full_cols); - /* CSR_matrix scaffolding for the row-sum step (PD owns the cached CSR_matrix view). */ + /* CSR_matrix scaffolding for the row-sum step (PD owns the cached CSR_matrix + * view). */ CSR_matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); CSR_matrix *Jobj_csr = new_CSR_matrix(1, n, n); int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); @@ -97,21 +98,21 @@ const char *profile_log_reg(void) dwork) as sigmas read by local_wsum_hess. */ clock_gettime(CLOCK_MONOTONIC, &t_b_jac.start); log_obj->local_jacobian(log_obj, log_obj->work->dwork); - permuted_dense_DA_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); + DA_pd_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); accumulator(Jlog_csr->x, Jlog_csr->nnz, idx_map, Jobj_csr->x); clock_gettime(CLOCK_MONOTONIC, &t_b_jac.end); clock_gettime(CLOCK_MONOTONIC, &t_b_hess.start); log_obj->local_wsum_hess(log_obj, d2, w_ones); - permuted_dense_ATDA_fill_values(A_pd, d2, H_pd); + ATDA_pd_fill_values(A_pd, d2, H_pd); clock_gettime(CLOCK_MONOTONIC, &t_b_hess.end); double sec_b_jac = GET_ELAPSED_SECONDS(t_b_jac); double sec_b_hess = GET_ELAPSED_SECONDS(t_b_hess); printf("\n"); printf(" Jacobian Hessian Total\n"); - printf(" Path A (engine CSR_matrix/CSC_matrix): %10.6fs %10.6fs %10.6fs\n", sec_a_jac, - sec_a_hess, sec_a_jac + sec_a_hess); + printf(" Path A (engine CSR_matrix/CSC_matrix): %10.6fs %10.6fs %10.6fs\n", + sec_a_jac, sec_a_hess, sec_a_jac + sec_a_hess); printf(" Path B (permuted_dense): %10.6fs %10.6fs %10.6fs\n", sec_b_jac, sec_b_hess, sec_b_jac + sec_b_hess); printf(" Speedup (A / B): %10.2fx %10.2fx %10.2fx\n", diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index a668990..f186999 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -130,7 
+130,7 @@ const char *test_permuted_dense_to_csr_single_col(void) PD is the 5x6 matrix from the basic to_csr test, with d a length-5 global-row diagonal including a negative and zero entry. */ -const char *test_permuted_dense_DA_fill_values(void) +const char *test_DA_pd_fill_values(void) { int row_perm[3] = {1, 2, 4}; int col_perm[2] = {0, 3}; @@ -142,7 +142,7 @@ const char *test_permuted_dense_DA_fill_values(void) permuted_dense *pd = (permuted_dense *) M; permuted_dense *pd_out = (permuted_dense *) M_out; - permuted_dense_DA_fill_values(d, pd, pd_out); + DA_pd_fill_values(d, pd, pd_out); /* Ground truth: build CSR_matrix of self, run DA_fill_values, compare. */ CSR_matrix *csr = M->to_csr(M); @@ -161,7 +161,7 @@ const char *test_permuted_dense_DA_fill_values(void) /* ATA_alloc: structure-only check. Output is 6x6 with a 2x2 dense block at perms {0, 3} (= self.col_perm on both sides). Values are uninitialized here; ATDA_fill_values is the value-producing op. */ -const char *test_permuted_dense_ATA_alloc(void) +const char *test_ATA_pd_alloc(void) { int row_perm[3] = {1, 2, 4}; int col_perm[2] = {0, 3}; @@ -170,7 +170,7 @@ const char *test_permuted_dense_ATA_alloc(void) matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); permuted_dense *pd = (permuted_dense *) M; - matrix *M_ata = permuted_dense_ATA_alloc(pd); + matrix *M_ata = ATA_pd_alloc(pd); permuted_dense *pd_ata = (permuted_dense *) M_ata; int perm_expected[2] = {0, 3}; @@ -189,7 +189,7 @@ const char *test_permuted_dense_ATA_alloc(void) /* ATDA: same 5x6 PD, d with negative + zero entries to catch sign bugs. Hand-computed: d_perm = [-1.5, 0, 2.5], Y = diag(d_perm) X gives [[-1.5,-3],[0,0],[12.5,15]], and X^T Y = [[61,72],[72,84]]. 
*/ -const char *test_permuted_dense_ATDA_fill_values(void) +const char *test_ATDA_pd_fill_values(void) { int row_perm[3] = {1, 2, 4}; int col_perm[2] = {0, 3}; @@ -199,9 +199,9 @@ const char *test_permuted_dense_ATDA_fill_values(void) matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); permuted_dense *pd = (permuted_dense *) M; - matrix *M_out = permuted_dense_ATA_alloc(pd); + matrix *M_out = ATA_pd_alloc(pd); permuted_dense *pd_out = (permuted_dense *) M_out; - permuted_dense_ATDA_fill_values(pd, d, pd_out); + ATDA_pd_fill_values(pd, d, pd_out); double X_expected[4] = {61.0, 72.0, 72.0, 84.0}; mu_assert("X", cmp_double_array(pd_out->X, X_expected, 4)); @@ -729,7 +729,7 @@ const char *test_permuted_dense_BTDA_decomposition(void) /* tmp has the same sparsity as A. */ matrix *tmp_m = A_m->copy_sparsity(A_m); permuted_dense *tmp = (permuted_dense *) tmp_m; - permuted_dense_DA_fill_values(w, A, tmp); + DA_pd_fill_values(w, A, tmp); matrix *C_m = BTA_pd_pd_alloc(B, tmp); permuted_dense *C = (permuted_dense *) C_m; From 9e853b1b6845162e5ae4bcd7359ea987bc1a33d1 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 15:46:20 +0200 Subject: [PATCH 18/31] run formatter --- include/subexpr.h | 8 ++++---- include/utils/CSR_sum.h | 18 +++++++++--------- include/utils/matrix.h | 8 ++++---- include/utils/permuted_dense.h | 2 +- include/utils/sparse_matrix.h | 4 ++-- tests/old-code/test_old_permuted_dense.h | 6 ++---- tests/profiling/profile_BTA_pd_csr_vs_csc.h | 17 +++++++++++++---- tests/profiling/profile_left_matmul.h | 3 ++- tests/profiling/profile_trimmed_log_reg.h | 4 ++-- tests/utils/test_csr_matrix.h | 3 ++- 10 files changed, 41 insertions(+), 32 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index c2a3662..e66eb3b 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -133,10 +133,10 @@ typedef struct left_matmul_expr void (*refresh_param_values)(struct left_matmul_expr *); /* When true, jacobian_init_impl produces a 
permuted_dense node->jacobian - directly (skipping the CSC_matrix mirror machinery). Set by new_left_matmul_dense - when A is a constant dense matrix, child is a leaf variable, and - n_blocks == 1 — in that case the Jacobian is exactly A placed in the - variable's column slot, which is naturally a full-dense permuted_dense. */ + directly (skipping the CSC_matrix mirror machinery). Set by + new_left_matmul_dense when A is a constant dense matrix, child is a leaf + variable, and n_blocks == 1 — in that case the Jacobian is exactly A placed in + the variable's column slot, which is naturally a full-dense permuted_dense. */ bool produce_pd_jacobian; } left_matmul_expr; diff --git a/include/utils/CSR_sum.h b/include/utils/CSR_sum.h index 97c4718..62f49d9 100644 --- a/include/utils/CSR_sum.h +++ b/include/utils/CSR_sum.h @@ -35,15 +35,15 @@ void sum_scaled_csr_matrices_fill_values(const CSR_matrix *A, const CSR_matrix * const double *d2); /* The following five functions are used for summing either more than two CSR_matrix - matrices or rows of CSR_matrix matrices. To implement the filling of values efficiently, - we compute an idx_map when we fill the sparsity pattern of the output matrix, - which maps each nonzero entry in the input matrix to its position in the output - matrix. This allows us to fill the values with a single pass of the output matrix - through the input matrices, without needing to search for the position of each - entry in the output matrix. So each idx_map should have size equal to the number - of nonzeros in the corresponding input matrix, and idx_map[j] should give the - index in the output matrix of the entry (in the value array of the output matrix) - corresponding to the j-th nonzero in the input matrix. + matrices or rows of CSR_matrix matrices. 
To implement the filling of values + efficiently, we compute an idx_map when we fill the sparsity pattern of the output + matrix, which maps each nonzero entry in the input matrix to its position in the + output matrix. This allows us to fill the values with a single pass of the output + matrix through the input matrices, without needing to search for the position of + each entry in the output matrix. So each idx_map should have size equal to the + number of nonzeros in the corresponding input matrix, and idx_map[j] should give + the index in the output matrix of the entry (in the value array of the output + matrix) corresponding to the j-th nonzero in the input matrix. Output matrix C, input matrix A, iwork->size = max(A->n, A->nnz) for the first four functions. The last function allocates the output matrix and returns it. */ diff --git a/include/utils/matrix.h b/include/utils/matrix.h index a20980a..34f7157 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -84,8 +84,8 @@ typedef struct matrix polymorphism). copy_sparsity returns a matrix of same shape and type as self; DA_fill_values writes diag(d) * self into out; ATA_alloc allocates a matrix with sparsity of self^T * self; ATDA_fill_values fills out with - self^T * diag(d) * self; to_csr returns a CSR_matrix view of self (constant-time - for sparse_matrix, lazily built/refreshed for other types). */ + self^T * diag(d) * self; to_csr returns a CSR_matrix view of self + (constant-time for sparse_matrix, lazily built/refreshed for other types). */ struct matrix *(*copy_sparsity)(const struct matrix *self); void (*DA_fill_values)(const double *d, const struct matrix *self, struct matrix *out); @@ -133,8 +133,8 @@ typedef struct matrix struct matrix *(*diag_vec_alloc)(struct matrix *self); void (*diag_vec_fill_values)(struct matrix *self, struct matrix *out); - /* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / ATDA - calls reflect the current values. 
Atoms whose child Jacobian is affine + /* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / + ATDA calls reflect the current values. Atoms whose child Jacobian is affine can skip this on iterations after the first; non-affine children must call it before every chain-rule call. No-op for types that don't have a cache (e.g. permuted_dense). */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index c678797..129b8c2 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -32,7 +32,7 @@ strictly increasing order; the constructor asserts this. */ typedef struct permuted_dense { - matrix base; /* base.m, base.n = global ambient dimensions */ + matrix base; int m0; /* rows of dense block (= len(row_perm)) */ int n0; /* cols of dense block (= len(col_perm)) */ int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ diff --git a/include/utils/sparse_matrix.h b/include/utils/sparse_matrix.h index 0a8ca4c..fbecdf1 100644 --- a/include/utils/sparse_matrix.h +++ b/include/utils/sparse_matrix.h @@ -20,8 +20,8 @@ #include "matrix.h" -/* Sparse matrix wrapping CSR_matrix. csc_cache is a lazily-built CSC_matrix mirror used by - the chain-rule ATA / ATDA paths; it's allocated on first need and refilled +/* Sparse matrix wrapping CSR_matrix. csc_cache is a lazily-built CSC_matrix mirror + used by the chain-rule ATA / ATDA paths; it's allocated on first need and refilled by refresh_csc_values. csc_iwork is the workspace for csr_to_csc. 
*/ typedef struct sparse_matrix { diff --git a/tests/old-code/test_old_permuted_dense.h b/tests/old-code/test_old_permuted_dense.h index e4ad18f..335e5ff 100644 --- a/tests/old-code/test_old_permuted_dense.h +++ b/tests/old-code/test_old_permuted_dense.h @@ -61,8 +61,7 @@ const char *test_BTA_pd_csr_basic(void) double *A_d = (double *) calloc(4 * 5, sizeof(double)); double *B_d = (double *) calloc(4 * 4, sizeof(double)); for (int i = 0; i < A->m; i++) - for (int e = A->p[i]; e < A->p[i + 1]; e++) - A_d[i * 5 + A->i[e]] = A->x[e]; + for (int e = A->p[i]; e < A->p[i + 1]; e++) A_d[i * 5 + A->i[e]] = A->x[e]; for (int kk = 0; kk < B->m0; kk++) for (int jj = 0; jj < B->n0; jj++) B_d[B->row_perm[kk] * 4 + B->col_perm[jj]] = B->X[kk * B->n0 + jj]; @@ -235,8 +234,7 @@ const char *test_BTA_csr_pd_basic(void) for (int jj = 0; jj < A->n0; jj++) A_d[A->row_perm[kk] * 5 + A->col_perm[jj]] = A->X[kk * A->n0 + jj]; for (int i = 0; i < B->m; i++) - for (int e = B->p[i]; e < B->p[i + 1]; e++) - B_d[i * 4 + B->i[e]] = B->x[e]; + for (int e = B->p[i]; e < B->p[i + 1]; e++) B_d[i * 4 + B->i[e]] = B->x[e]; double C_ref[4 * 5]; memset(C_ref, 0, sizeof C_ref); diff --git a/tests/profiling/profile_BTA_pd_csr_vs_csc.h b/tests/profiling/profile_BTA_pd_csr_vs_csc.h index db7bfab..df1cc46 100644 --- a/tests/profiling/profile_BTA_pd_csr_vs_csc.h +++ b/tests/profiling/profile_BTA_pd_csr_vs_csc.h @@ -49,14 +49,23 @@ static void run_bench_one_density(int m, int n0_B, int n_A, int nnz_per_row, int c = rand() % n_A; int dup = 0; for (int k = 0; k < picked; k++) - if (cols[k] == c) { dup = 1; break; } + if (cols[k] == c) + { + dup = 1; + break; + } if (!dup) cols[picked++] = c; } /* Insertion sort to keep CSR column-index invariant. 
*/ for (int a = 1; a < nnz_per_row; a++) { - int v = cols[a]; int b = a - 1; - while (b >= 0 && cols[b] > v) { cols[b + 1] = cols[b]; b--; } + int v = cols[a]; + int b = a - 1; + while (b >= 0 && cols[b] > v) + { + cols[b + 1] = cols[b]; + b--; + } cols[b + 1] = v; } for (int k = 0; k < nnz_per_row; k++) @@ -125,7 +134,7 @@ const char *profile_BTA_pd_csr_vs_csc(void) printf("\nBTA pd × sparse fill benchmark (m=%d, n0_B=%d, n_A=%d, %d iters):\n", m, n0_B, n_A, N_ITERS); - run_bench_one_density(m, n0_B, n_A, 1, N_ITERS, "leaf-var (1 nnz/row):"); + run_bench_one_density(m, n0_B, n_A, 1, N_ITERS, "leaf-var (1 nnz/row):"); run_bench_one_density(m, n0_B, n_A, 50, N_ITERS, "dense-ish (50 nnz/row):"); return 0; } diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index 639b6cf..db6b832 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -13,7 +13,8 @@ const char *profile_left_matmul(void) { - /* A @ X where A is 50 x 50 dense stored in CSR_matrix and X is 50 x 50 variable */ + /* A @ X where A is 50 x 50 dense stored in CSR_matrix and X is 50 x 50 variable + */ int n = 100; expr *X = new_variable(n, n, 0, n * n); CSR_matrix *A = new_CSR_matrix(n, n, n * n); diff --git a/tests/profiling/profile_trimmed_log_reg.h b/tests/profiling/profile_trimmed_log_reg.h index 5f7f2f6..3fe2ae6 100644 --- a/tests/profiling/profile_trimmed_log_reg.h +++ b/tests/profiling/profile_trimmed_log_reg.h @@ -83,8 +83,8 @@ const char *profile_trimmed_log_reg(void) printf("\n"); printf("trimmed_log_reg (m=%d, n=%d):\n", m, n); - printf(" jacobian = %10.6fs hessian = %10.6fs (avg over %d)\n", - sec_jac, sec_hess, N_HESS_ITERS); + printf(" jacobian = %10.6fs hessian = %10.6fs (avg over %d)\n", sec_jac, + sec_hess, N_HESS_ITERS); free_expr(obj); free(A_data); diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h index db2e427..33c11a0 100644 --- a/tests/utils/test_csr_matrix.h +++ 
b/tests/utils/test_csr_matrix.h @@ -318,7 +318,8 @@ const char *test_sum_block_of_rows_csr(void) } const char *test_sum_evenly_spaced_rows_csr(void) { - /* Create a 9x4 CSR_matrix matrix A (same as test_sum_block_of_rows_csr) and sum evenly + /* Create a 9x4 CSR_matrix matrix A (same as test_sum_block_of_rows_csr) and sum + evenly * spaced rows With row_spacing=3: From 78d2860fad3c0ba227ae625084f517469f5fd679 Mon Sep 17 00:00:00 2001 From: dance858 Date: Tue, 12 May 2026 17:11:07 +0200 Subject: [PATCH 19/31] permuted-dense-preserving transpose implementation via index polymorphism --- src/atoms/affine/transpose.c | 40 +++++----------- tests/all_tests.c | 1 + tests/jacobian_tests/affine/test_transpose.h | 49 ++++++++++++++++++++ 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index bcf372c..ab083b4 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -16,9 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" -#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" -#include #include #include @@ -46,45 +44,31 @@ static void jacobian_init_impl(expr *node) { expr *child = node->left; jacobian_init(child); - CSR_matrix *Jc = child->jacobian->to_csr(child->jacobian); - CSR_matrix *J = new_CSR_matrix(node->size, node->n_vars, Jc->nnz); - /* fill sparsity */ + int n_out = node->size; int d1 = node->d1; int d2 = node->d2; - int nnz = 0; - J->p[0] = 0; - /* 'k' is the old row that gets swapped to 'row'*/ - int k, len; - for (int row = 0; row < J->m; ++row) + /* The transpose's Jacobian is a row permutation of the child's: + J_node[r, :] = J_child[k(r), :] where k(r) = (r/d1) + (r%d1)*d2. 
*/ + int *indices = (int *) SP_MALLOC(n_out * sizeof(int)); + for (int r = 0; r < n_out; r++) { - k = (row / d1) + (row % d1) * d2; - len = Jc->p[k + 1] - Jc->p[k]; - memcpy(J->i + nnz, Jc->i + Jc->p[k], len * sizeof(int)); - nnz += len; - J->p[row + 1] = nnz; + indices[r] = (r / d1) + (r % d1) * d2; } - node->jacobian = new_sparse_matrix(J); + node->jacobian = child->jacobian->index_alloc(child->jacobian, indices, n_out); + + /* save indices for eval_jacobian */ + node->work->iwork = indices; } static void eval_jacobian(expr *node) { expr *child = node->left; child->eval_jacobian(child); - CSR_matrix *Jc = child->jacobian->to_csr(child->jacobian); - - int d1 = node->d1; - int d2 = node->d2; - int nnz = 0; - for (int row = 0; row < node->jacobian->m; ++row) - { - int k = (row / d1) + (row % d1) * d2; - int len = Jc->p[k + 1] - Jc->p[k]; - memcpy(node->jacobian->x + nnz, Jc->x + Jc->p[k], len * sizeof(double)); - nnz += len; - } + child->jacobian->index_fill_values(child->jacobian, node->work->iwork, + node->size, node->jacobian); } static void wsum_hess_init_impl(expr *node) diff --git a/tests/all_tests.c b/tests/all_tests.c index 3ee926c..c8c3e90 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -230,6 +230,7 @@ int main(void) mu_run_test(test_jacobian_convolve, tests_run); mu_run_test(test_jacobian_convolve_composite, tests_run); mu_run_test(test_jacobian_transpose, tests_run); + mu_run_test(test_jacobian_transpose_pd_preserved, tests_run); mu_run_test(test_diag_mat_jacobian_variable, tests_run); mu_run_test(test_diag_mat_jacobian_of_log, tests_run); mu_run_test(test_upper_tri_jacobian_variable, tests_run); diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index 2871a78..6317782 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -5,6 +5,7 @@ #include "atoms/affine.h" #include "minunit.h" #include "test_helpers.h" +#include 
"utils/permuted_dense.h" #include #include @@ -41,4 +42,52 @@ const char *test_jacobian_transpose(void) return 0; } +/* When the child of transpose has a PD Jacobian, the output should also be PD + with the same col_perm and a permuted row_perm. Setup: + u : 2x1 column variable, n_vars = 2. + AU = left_matmul_dense(A, u) with A a 6x2 dense matrix => AU is 6x1. + PD Jacobian: global (6, 2), m0=6, n0=2, row_perm=[0..5], col_perm=[0,1]. + R = reshape(AU, 3, 2). copy_sparsity preserves PD. + T = transpose(R) with d1=2, d2=3. k(r) = (r/2) + (r%2)*3 = [0,3,1,4,2,5]. + All r are active, so output row_perm stays [0..5] and the dense + block X is row-permuted: X_out[i, :] = X_c[k(i), :]. */ +const char *test_jacobian_transpose_pd_preserved(void) +{ + double A_data[12] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + expr *u = new_variable(2, 1, 0, 2); + expr *AU = new_left_matmul_dense(NULL, u, 6, 2, A_data); + expr *R = new_reshape(AU, 3, 2); + expr *T = new_transpose(R); + + double u_vals[2] = {0.5, -1.5}; + T->forward(T, u_vals); + jacobian_init(T); + T->eval_jacobian(T); + + /* Structural: output Jacobian must be a PD. */ + permuted_dense *pd_T = T->jacobian->as_permuted_dense(T->jacobian); + mu_assert("transpose Jacobian should be PD", pd_T != NULL); + mu_assert("global m", T->jacobian->m == 6); + mu_assert("global n", T->jacobian->n == 2); + mu_assert("m0", pd_T->m0 == 6); + mu_assert("n0", pd_T->n0 == 2); + int expected_row_perm[6] = {0, 1, 2, 3, 4, 5}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd_T->row_perm, expected_row_perm, 6)); + mu_assert("col_perm", cmp_int_array(pd_T->col_perm, expected_col_perm, 2)); + + /* Numerical: X_out rows = A rows permuted by k(r) = [0,3,1,4,2,5]. 
*/ + double expected_X[12] = {1.0, 2.0, /* row 0 from A row 0 */ + 7.0, 8.0, /* row 1 from A row 3 */ + 3.0, 4.0, /* row 2 from A row 1 */ + 9.0, 10.0, /* row 3 from A row 4 */ + 5.0, 6.0, /* row 4 from A row 2 */ + 11.0, 12.0}; /* row 5 from A row 5 */ + mu_assert("X values", cmp_double_array(pd_T->X, expected_X, 12)); + + free_expr(T); + return 0; +} + #endif // TEST_TRANSPOSE_H From ce3e33c636c303466e78fdafa9b5b3c2023d93a8 Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 10:38:54 +0200 Subject: [PATCH 20/31] clean up multiply --- include/atoms/affine.h | 1 - include/problem.h | 6 +-- include/subexpr.h | 12 ++--- include/utils/COO_Matrix.h | 27 +++++----- include/utils/dense_matrix.h | 2 + include/utils/matrix.h | 20 ++++--- include/utils/matrix_BTA.h | 18 +++---- include/utils/matrix_sum.h | 15 ++---- include/utils/sparse_matrix.h | 10 ++++ src/atoms/affine/hstack.c | 7 ++- src/atoms/affine/sum.c | 1 + src/atoms/affine/trace.c | 4 +- src/atoms/affine/transpose.c | 1 - src/atoms/bivariate_full_dom/multiply.c | 57 ++++++++++---------- src/problem.c | 8 +-- src/utils/COO_Matrix.c | 14 ++--- src/utils/dense_matrix.c | 7 --- src/utils/matrix_BTA.c | 46 ++++++---------- src/utils/permuted_dense.c | 42 ++++++++++----- src/utils/sparse_matrix.c | 29 +++++++--- tests/all_tests.c | 2 +- tests/forward_pass/affine/test_add.h | 1 + tests/jacobian_tests/affine/test_transpose.h | 16 +++--- tests/utils/test_coo_matrix.h | 14 ++--- tests/utils/test_matrix_BTA.h | 10 +++- tests/utils/test_permuted_dense.h | 1 - 26 files changed, 200 insertions(+), 171 deletions(-) diff --git a/include/atoms/affine.h b/include/atoms/affine.h index 3fd9118..49b4637 100644 --- a/include/atoms/affine.h +++ b/include/atoms/affine.h @@ -19,7 +19,6 @@ #define AFFINE_H #include "expr.h" -#include "subexpr.h" #include "utils/CSR_matrix.h" expr *new_add(expr *left, expr *right); diff --git a/include/problem.h b/include/problem.h index 1c2a77d..530907c 100644 --- a/include/problem.h +++ 
b/include/problem.h @@ -19,7 +19,7 @@ #define PROBLEM_H #include "expr.h" -#include "utils/COO_Matrix.h" +#include "utils/COO_matrix.h" #include "utils/CSR_matrix.h" #include "utils/Timer.h" #include @@ -62,8 +62,8 @@ typedef struct problem CSR_matrix *jacobian; CSR_matrix *lagrange_hessian; int *hess_idx_map; /* maps all wsum_hess nnz to lagrange_hessian */ - COO_Matrix *jacobian_coo; - COO_Matrix *lagrange_hessian_coo; /* lower triangular part stored in COO */ + COO_matrix *jacobian_coo; + COO_matrix *lagrange_hessian_coo; /* lower triangular part stored in COO */ /* for the affine shortcut we keep track of the first time the jacobian and * hessian are called */ diff --git a/include/subexpr.h b/include/subexpr.h index e66eb3b..1676690 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -109,12 +109,12 @@ typedef struct hstack_expr typedef struct elementwise_mult_expr { expr base; - matrix *cross_C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ - CSR_matrix *CSR_work2; /* CT = C^T (always CSR_matrix) */ - int *idx_map_C; /* C[j] -> wsum_hess pos */ - int *idx_map_CT; /* CT[j] -> wsum_hess pos */ - int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ - int *idx_map_Hy; /* y->wsum_hess[j] -> pos */ + matrix *C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ + matrix *CT; /* CT = C^T; same concrete type as C */ + int *idx_map_C; /* C[j] -> wsum_hess pos */ + int *idx_map_CT; /* CT[j] -> wsum_hess pos */ + int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ + int *idx_map_Hy; /* y->wsum_hess[j] -> pos */ } elementwise_mult_expr; /* Left matrix multiplication: y = A * f(x) where f(x) is an expression. Note that diff --git a/include/utils/COO_Matrix.h b/include/utils/COO_Matrix.h index 549bb1a..64b57b4 100644 --- a/include/utils/COO_Matrix.h +++ b/include/utils/COO_Matrix.h @@ -15,8 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef COO_MATRIX_H -#define COO_MATRIX_H +#ifndef COO_matrix_H +#define COO_matrix_H #include "CSR_matrix.h" @@ -32,7 +32,7 @@ * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct COO_Matrix +typedef struct COO_matrix { int *rows; int *cols; @@ -41,20 +41,19 @@ typedef struct COO_Matrix int m; int n; int nnz; -} COO_Matrix; +} COO_matrix; -/* Construct a COO matrix from a CSR_matrix matrix */ -COO_Matrix *new_coo_matrix(const CSR_matrix *A); +/* COO from CSR */ +COO_matrix *new_COO_matrix(const CSR_matrix *A); -/* Construct a COO matrix containing only the lower-triangular - * entries (col <= row) of a symmetric CSR_matrix matrix. Populates - * value_map so that refresh_lower_triangular_coo can update - * values without recomputing structure. */ -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A); +/* Construct COO containing only the lower-triangular entries (col <= row) of a + symmetric CSR. Populates value_map so that refresh_lower_triangular_coo can + update values without recomputing structure. */ +COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A); /* Refresh COO values from a new CSR_matrix value array using value_map */ -void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals); +void refresh_lower_triangular_coo(COO_matrix *coo, const double *vals); -void free_coo_matrix(COO_Matrix *matrix); +void free_COO_matrix(COO_matrix *matrix); -#endif /* COO_MATRIX_H */ +#endif /* COO_matrix_H */ diff --git a/include/utils/dense_matrix.h b/include/utils/dense_matrix.h index 3de84b1..9b85a58 100644 --- a/include/utils/dense_matrix.h +++ b/include/utils/dense_matrix.h @@ -20,6 +20,8 @@ #include "matrix.h" +/* TODO: do we need this matrix class? 
*/ + /* Dense matrix (row-major) */ typedef struct dense_matrix { diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 34f7157..c24916c 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -20,10 +20,7 @@ #include "CSC_matrix.h" #include "CSR_matrix.h" - -/* Forward declaration; full definition in permuted_dense.h. Used by the - as_permuted_dense vtable getter. */ -struct permuted_dense; +#include /* Broadcast shape used by the broadcast atom and its vtable methods. */ typedef enum @@ -71,6 +68,11 @@ typedef struct matrix int m, n, nnz; /* shape and nnz*/ double *x; /* non-owning pointer to the value buffer */ + /* True iff self is a permuted_dense; lets bivariate dispatchers route to + type-specialized kernels without a vtable call. Set by the concrete + constructor (false by default via CALLOC). */ + bool is_permuted_dense; + /* Operators for the left-multiply matrix in left_matmul. */ void (*block_left_mult_vec)(const struct matrix *self, const double *x, double *y, int p); @@ -94,10 +96,12 @@ typedef struct matrix struct matrix *out); CSR_matrix *(*to_csr)(struct matrix *self); - /* Returns self downcast to permuted_dense if self is PD-backed, NULL - otherwise. Used by bivariate dispatchers to route to type-specialized - kernels. */ - struct permuted_dense *(*as_permuted_dense)(struct matrix *self); + /* Transpose: returns a matrix of shape (self->n, self->m), same concrete + type as self. transpose_alloc sets up sparsity; transpose_fill_values + fills values into out, which must have been produced by a prior + transpose_alloc on a matrix with the same sparsity as self. */ + struct matrix *(*transpose_alloc)(const struct matrix *self); + void (*transpose_fill_values)(const struct matrix *self, struct matrix *out); /* Row-selection / indexing: returns a new matrix that selects rows indices[0..n_idxs) of self. Output shape is (n_idxs, self->n). 
The diff --git a/include/utils/matrix_BTA.h b/include/utils/matrix_BTA.h index bbf68ab..115d3d5 100644 --- a/include/utils/matrix_BTA.h +++ b/include/utils/matrix_BTA.h @@ -14,19 +14,19 @@ #include "matrix.h" -/* Polymorphic dispatchers for C = B^T A and C = B^T diag(d) A. The output +/* Polymorphic dispatchers for C = BT @ A and C = BT @ diag(d) @ A. The output type depends on the input types: (PD, PD) → PD, (Sparse, PD) → PD, - (PD, Sparse) → PD, (Sparse, Sparse) → Sparse. Dispatched via - as_permuted_dense() on both operands. */ + (PD, Sparse) → PD, (Sparse, Sparse) → Sparse. (Here PD = permuted_dense.) -/* Allocate sparsity for C = B^T A. */ + Contract: neither function touches sparse_matrix internals. The caller must, + before calling either function, ensure each Sparse operand's csc_cache + exists (sparse_matrix_ensure_csc_cache). Before BTDA_matrices_fill_values + the caller must also refresh the cache values (refresh_csc_values). */ + +/* Allocate sparsity for C = BT @ A. */ matrix *BTA_matrices_alloc(matrix *A, matrix *B); -/* Fill out->x = B^T diag(d) A (d may be NULL for plain B^T A). out must - have the structure produced by BTA_matrices_alloc(A, B). For the - (Sparse, Sparse) path, the caller must ensure both operands' csc_caches - are fresh (via refresh_csc_values) before calling; the dispatcher does - not refresh. */ +/* Fill values of C = BT @ diag(d) @ A. */ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C); #endif /* MATRIX_BTA_H */ diff --git a/include/utils/matrix_sum.h b/include/utils/matrix_sum.h index 31e5a56..95c6fcf 100644 --- a/include/utils/matrix_sum.h +++ b/include/utils/matrix_sum.h @@ -20,21 +20,14 @@ #include "matrix.h" -/* Polymorphic wrappers over CSR_sum. A, B, and C must all be sparse_matrix- - backed for now; the union sparsity of A+B is general sparse, so a - permuted_dense output is not supported. 
- - sum_matrices_alloc fills C's sparsity pattern and re-syncs C's base.nnz - from the underlying CSR_matrix (sum_csr_alloc may shrink nnz below the - over-allocated max). */ +/* Polymorphic wrappers for allocating C = A + B. Right now we always + convert to CSR matrices internally for the sum. */ void sum_matrices_alloc(matrix *A, matrix *B, matrix *C); -/* Fills C's values; assumes C already has the union sparsity pattern of - A and B (typically produced by sum_matrices_alloc). */ +/* Fill values of C = A + B. Uses CSR matrices internally. */ void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C); -/* Fills C's values for C = diag(d1) * A + diag(d2) * B; assumes C already - has the union sparsity pattern of A and B. */ +/* Fill values of C = diag(d1) * A + diag(d2) * B. Uses CSR matrices internally. */ void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, const double *d1, const double *d2); diff --git a/include/utils/sparse_matrix.h b/include/utils/sparse_matrix.h index fbecdf1..210daa1 100644 --- a/include/utils/sparse_matrix.h +++ b/include/utils/sparse_matrix.h @@ -29,6 +29,10 @@ typedef struct sparse_matrix CSR_matrix *csr; CSC_matrix *csc_cache; int *csc_iwork; + int *transpose_iwork; /* sized csr->n; allocated by sparse_transpose_alloc + on the output sm and reused by + sparse_transpose_fill_values. NULL when this + sm wasn't produced by transpose_alloc. */ } sparse_matrix; /* Constructor. Takes ownership of A; the caller must not free A separately @@ -43,4 +47,10 @@ matrix *new_sparse_matrix_alloc(int m, int n, int nnz); /* Transpose helper */ matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork); +/* Build the CSC_matrix cache structure if absent. Idempotent; structure-only, + values are NOT filled (use refresh_csc_values for that). Exposed so the + bivariate dispatchers in matrix_BTA can prepare sparsity without touching + uninitialized values during the init phase. 
*/ +void sparse_matrix_ensure_csc_cache(sparse_matrix *sm); + #endif /* SPARSE_MATRIX_H */ diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index 9ee3650..4bf416e 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" #include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" @@ -94,7 +95,8 @@ static void eval_jacobian(expr *node) expr *child = hnode->args[i]; child->eval_jacobian(child); /* copy values */ - memcpy(node->jacobian->x + node->jacobian->nnz, child->jacobian->x, child->jacobian->nnz * sizeof(double)); + memcpy(node->jacobian->x + node->jacobian->nnz, child->jacobian->x, + child->jacobian->nnz * sizeof(double)); node->jacobian->nnz += child->jacobian->nnz; } } @@ -138,7 +140,8 @@ static void wsum_hess_eval(expr *node, const double *w) expr *child = hnode->args[i]; child->eval_wsum_hess(child, w + row_offset); copy_CSR_matrix(H, hnode->CSR_work); - sum_csr_fill_values(hnode->CSR_work, child->wsum_hess->to_csr(child->wsum_hess), H); + sum_csr_fill_values(hnode->CSR_work, + child->wsum_hess->to_csr(child->wsum_hess), H); row_offset += child->size; } } diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 4945fa0..74f720b 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" #include "utils/mini_numpy.h" diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index 04dd351..846a45b 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" #include "utils/sparse_matrix.h" @@ -94,7 +95,8 @@ static void eval_jacobian(expr *node) /* local jacobian */ memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, node->jacobian->x, x->d1 + 1); + accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, + node->jacobian->x, x->d1 + 1); } /* Placeholders for Hessian-related functions */ diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index ab083b4..7fd3720 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -87,7 +87,6 @@ static void eval_wsum_hess(expr *node, const double *w) { int d2 = node->d2; int d1 = node->d1; - // TODO: meaybe more efficient to do this with memcpy first /* evaluate hessian of child at Kw */ for (int i = 0; i < d2; ++i) diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 277a060..7ae7841 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -17,6 +17,7 @@ */ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" #include "utils/matrix_BTA.h" #include "utils/matrix_sum.h" @@ -150,33 +151,34 @@ static void wsum_hess_init_impl(expr *node) node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } - /* CSC_matrix scaffolding is still needed for the (Sparse, Sparse) fast path - through BTA_matrices_* / BTDA_matrices_* — those route through - sparse_matrix's csc_cache. For PD operands, refresh_csc_values is - a no-op so the call is harmless. */ - jacobian_csc_init(x); - jacobian_csc_init(y); + /* For sparse matrices we need the CSC cache to be valid for the + BTA_matrices_alloc / BTDA_matrices_fill_values calls below. 
*/ + if (!x->jacobian->is_permuted_dense) + { + sparse_matrix_ensure_csc_cache((sparse_matrix *) x->jacobian); + } + if (!y->jacobian->is_permuted_dense) + { + sparse_matrix_ensure_csc_cache((sparse_matrix *) y->jacobian); + } + + /* compute sparsity of C and prepare CT */ + matrix *C = BTA_matrices_alloc(x->jacobian, y->jacobian); + matrix *CT = C->transpose_alloc(C); /* initialize wsum_hessians of children */ wsum_hess_init(x); wsum_hess_init(y); elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - - /* compute sparsity of C polymorphically (Sparse, PD-CSR_matrix, CSR_matrix-PD, PD-PD). */ - mul_node->cross_C = BTA_matrices_alloc(x->jacobian, y->jacobian); - - /* CT structure is always CSR_matrix (via AT on C's CSR_matrix view). */ - CSR_matrix *C_csr = mul_node->cross_C->to_csr(mul_node->cross_C); - node->work->iwork = (int *) SP_MALLOC(C_csr->m * sizeof(int)); - CSR_matrix *CT = AT_alloc(C_csr, node->work->iwork); - mul_node->CSR_work2 = CT; + mul_node->C = C; + mul_node->CT = CT; /* compute sparsity pattern of H = C + C^T + term2 + term3 (we also fill index maps telling us where to accumulate each element of each matrix in the sum) */ int *maps[4]; - CSR_matrix *hess = sum_4_csr_alloc(C_csr, CT, + CSR_matrix *hess = sum_4_csr_alloc(C->to_csr(C), CT->to_csr(CT), x->wsum_hess->to_csr(x->wsum_hess), y->wsum_hess->to_csr(y->wsum_hess), maps); node->wsum_hess = new_sparse_matrix(hess); @@ -204,9 +206,9 @@ static void eval_wsum_hess(expr *node, const double *w) bool is_x_affine = x->is_affine(x); bool is_y_affine = y->is_affine(y); // ---------------------------------------------------------------------- - // Refresh each operand's CSC_matrix cache as needed for the (Sparse, Sparse) - // dispatch path. For PD operands, refresh_csc_values is a no-op. The - // jacobian_csc_filled flag preserves the affine optimization: we only + // Refresh each operand's CSC_matrix cache as needed for the (Sparse, + // Sparse) dispatch path. 
For PD operands, refresh_csc_values is a no-op. + // The jacobian_csc_filled flag preserves the affine optimization: we only // refresh on the first eval for affine children. // ---------------------------------------------------------------------- if (!x->work->jacobian_csc_filled) @@ -230,10 +232,8 @@ static void eval_wsum_hess(expr *node, const double *w) // compute C and CT // --------------------------------------------------------------- elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - CSR_matrix *CT = mul_node->CSR_work2; - BTDA_matrices_fill_values(x->jacobian, w, y->jacobian, mul_node->cross_C); - AT_fill_values(mul_node->cross_C->to_csr(mul_node->cross_C), CT, - node->work->iwork); + BTDA_matrices_fill_values(x->jacobian, w, y->jacobian, mul_node->C); + mul_node->C->transpose_fill_values(mul_node->C, mul_node->CT); // --------------------------------------------------------------- // compute term2 and term 3 @@ -260,9 +260,10 @@ static void eval_wsum_hess(expr *node, const double *w) // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(mul_node->cross_C->x, mul_node->cross_C->nnz, - mul_node->idx_map_C, node->wsum_hess->x); - accumulator(CT->x, CT->nnz, mul_node->idx_map_CT, node->wsum_hess->x); + accumulator(mul_node->C->x, mul_node->C->nnz, mul_node->idx_map_C, + node->wsum_hess->x); + accumulator(mul_node->CT->x, mul_node->CT->nnz, mul_node->idx_map_CT, + node->wsum_hess->x); accumulator(x->wsum_hess->x, x->wsum_hess->nnz, mul_node->idx_map_Hx, node->wsum_hess->x); accumulator(y->wsum_hess->x, y->wsum_hess->nnz, mul_node->idx_map_Hy, @@ -273,8 +274,8 @@ static void eval_wsum_hess(expr *node, const double *w) static void free_type_data(expr *node) { elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - free_matrix(mul_node->cross_C); - free_CSR_matrix(mul_node->CSR_work2); + 
free_matrix(mul_node->C); + free_matrix(mul_node->CT); free(mul_node->idx_map_C); free(mul_node->idx_map_CT); free(mul_node->idx_map_Hx); diff --git a/src/problem.c b/src/problem.c index e675ecd..e462fa4 100644 --- a/src/problem.c +++ b/src/problem.c @@ -251,7 +251,7 @@ void problem_init_jacobian_coo(problem *prob) problem_init_jacobian(prob); Timer timer; clock_gettime(CLOCK_MONOTONIC, &timer.start); - prob->jacobian_coo = new_coo_matrix(prob->jacobian); + prob->jacobian_coo = new_COO_matrix(prob->jacobian); clock_gettime(CLOCK_MONOTONIC, &timer.end); prob->stats.time_init_derivatives += GET_ELAPSED_SECONDS(timer); } @@ -262,7 +262,7 @@ void problem_init_hessian_coo_lower_triangular(problem *prob) Timer timer; clock_gettime(CLOCK_MONOTONIC, &timer.start); prob->lagrange_hessian_coo = - new_coo_matrix_lower_triangular(prob->lagrange_hessian); + new_COO_matrix_lower_triangular(prob->lagrange_hessian); clock_gettime(CLOCK_MONOTONIC, &timer.end); prob->stats.time_init_derivatives += GET_ELAPSED_SECONDS(timer); } @@ -347,8 +347,8 @@ void free_problem(problem *prob) free(prob->gradient_values); free_CSR_matrix(prob->jacobian); free_CSR_matrix(prob->lagrange_hessian); - free_coo_matrix(prob->jacobian_coo); - free_coo_matrix(prob->lagrange_hessian_coo); + free_COO_matrix(prob->jacobian_coo); + free_COO_matrix(prob->lagrange_hessian_coo); free(prob->hess_idx_map); /* Release expression references (decrements refcount) */ diff --git a/src/utils/COO_Matrix.c b/src/utils/COO_Matrix.c index 9b67291..8602c09 100644 --- a/src/utils/COO_Matrix.c +++ b/src/utils/COO_Matrix.c @@ -15,14 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/COO_Matrix.h" +#include "utils/COO_matrix.h" #include "utils/tracked_alloc.h" #include #include -COO_Matrix *new_coo_matrix(const CSR_matrix *A) +COO_matrix *new_COO_matrix(const CSR_matrix *A) { - COO_Matrix *coo = (COO_Matrix *) SP_MALLOC(sizeof(COO_Matrix)); + COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = A->nnz; @@ -45,7 +45,7 @@ COO_Matrix *new_coo_matrix(const CSR_matrix *A) return coo; } -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A) +COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A) { /* Pass 1: count lower-triangular entries (col <= row) */ int count = 0; @@ -60,7 +60,7 @@ COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A) } } - COO_Matrix *coo = (COO_Matrix *) SP_MALLOC(sizeof(COO_Matrix)); + COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = count; @@ -89,7 +89,7 @@ COO_Matrix *new_coo_matrix_lower_triangular(const CSR_matrix *A) return coo; } -void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals) +void refresh_lower_triangular_coo(COO_matrix *coo, const double *vals) { for (int i = 0; i < coo->nnz; i++) { @@ -97,7 +97,7 @@ void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals) } } -void free_coo_matrix(COO_Matrix *matrix) +void free_COO_matrix(COO_matrix *matrix) { if (matrix) { diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c index cb090f2..390f3e9 100644 --- a/src/utils/dense_matrix.c +++ b/src/utils/dense_matrix.c @@ -22,12 +22,6 @@ #include #include -static struct permuted_dense *dense_as_permuted_dense(matrix *self) -{ - (void) self; - return NULL; -} - static void dense_block_left_mult_vec(const matrix *A, const double *x, double *y, int p) { @@ -63,7 +57,6 @@ matrix *new_dense_matrix(int m, int n, const double *data) dm->base.n = n; dm->base.nnz = m * n; dm->base.block_left_mult_vec = dense_block_left_mult_vec; - 
dm->base.as_permuted_dense = dense_as_permuted_dense; dm->base.block_left_mult_sparsity = I_kron_A_alloc; dm->base.block_left_mult_values = I_kron_A_fill_values; dm->base.free_fn = dense_free; diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index fa8fec8..924fe86 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -18,64 +18,52 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B) { - permuted_dense *pd_A = A->as_permuted_dense(A); - permuted_dense *pd_B = B->as_permuted_dense(B); - - if (pd_A && pd_B) + if (A->is_permuted_dense && B->is_permuted_dense) { - return BTA_pd_pd_alloc(pd_B, pd_A); + return BTA_pd_pd_alloc((permuted_dense *) B, (permuted_dense *) A); } - if (pd_B) + if (B->is_permuted_dense) { - /* A is Sparse, B is PD — CSC kernel (see permuted_dense.{h,c}). */ sparse_matrix *sm_A = (sparse_matrix *) A; - A->refresh_csc_values(A); - return BTA_pd_csc_alloc(pd_B, sm_A->csc_cache); + return BTA_pd_csc_alloc((permuted_dense *) B, sm_A->csc_cache); } - if (pd_A) + if (A->is_permuted_dense) { - /* A is PD, B is Sparse — CSC kernel (see permuted_dense.{h,c}). */ sparse_matrix *sm_B = (sparse_matrix *) B; - B->refresh_csc_values(B); - return BTA_csc_pd_alloc(sm_B->csc_cache, pd_A); + return BTA_csc_pd_alloc(sm_B->csc_cache, (permuted_dense *) A); } - /* Both Sparse: delegate to CSC_matrix BTA. Caller must ensure caches are fresh. 
- */ + /* both sparse */ sparse_matrix *sm_A = (sparse_matrix *) A; sparse_matrix *sm_B = (sparse_matrix *) B; - A->refresh_csc_values(A); - B->refresh_csc_values(B); CSR_matrix *C_csr = BTA_alloc(sm_A->csc_cache, sm_B->csc_cache); return new_sparse_matrix(C_csr); } void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) { - permuted_dense *pd_A = A->as_permuted_dense(A); - permuted_dense *pd_B = B->as_permuted_dense(B); - - if (pd_A && pd_B) + if (A->is_permuted_dense && B->is_permuted_dense) { - BTDA_pd_pd_fill_values(pd_B, d, pd_A, (permuted_dense *) C); + BTDA_pd_pd_fill_values((permuted_dense *) B, d, (permuted_dense *) A, + (permuted_dense *) C); return; } - if (pd_B) + if (B->is_permuted_dense) { sparse_matrix *sm_A = (sparse_matrix *) A; - A->refresh_csc_values(A); - BTDA_pd_csc_fill_values(pd_B, d, sm_A->csc_cache, (permuted_dense *) C); + BTDA_pd_csc_fill_values((permuted_dense *) B, d, sm_A->csc_cache, + (permuted_dense *) C); return; } - if (pd_A) + if (A->is_permuted_dense) { sparse_matrix *sm_B = (sparse_matrix *) B; - B->refresh_csc_values(B); - BTDA_csc_pd_fill_values(sm_B->csc_cache, d, pd_A, (permuted_dense *) C); + BTDA_csc_pd_fill_values(sm_B->csc_cache, d, (permuted_dense *) A, + (permuted_dense *) C); return; } - /* Both Sparse: delegate to CSC_matrix BTDA. */ + /* both sparse */ sparse_matrix *sm_A = (sparse_matrix *) A; sparse_matrix *sm_B = (sparse_matrix *) B; sparse_matrix *sm_C = (sparse_matrix *) C; diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 39e87c5..1cf1055 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -83,11 +83,6 @@ static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A); /* Lazy CSR_matrix view: allocate structure on first call, then return the cache. The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so values are always live without a per-call refresh. 
*/ -static struct permuted_dense *permuted_dense_as_permuted_dense(matrix *self) -{ - return (permuted_dense *) self; -} - static CSR_matrix *permuted_dense_to_csr(matrix *self) { permuted_dense *pd = (permuted_dense *) self; @@ -98,6 +93,32 @@ static CSR_matrix *permuted_dense_to_csr(matrix *self) return pd->csr_cache; } +static matrix *permuted_dense_vtable_transpose_alloc(const matrix *self) +{ + const permuted_dense *pd = (const permuted_dense *) self; + /* Swap (m, n), (m0, n0), and (row_perm, col_perm). The constructor + asserts strict increase of both perms, which holds by construction. */ + return new_permuted_dense(pd->base.n, pd->base.m, pd->n0, pd->m0, + pd->col_perm, pd->row_perm, NULL); +} + +static void permuted_dense_vtable_transpose_fill_values(const matrix *self, + matrix *out) +{ + const permuted_dense *pd_in = (const permuted_dense *) self; + permuted_dense *pd_out = (permuted_dense *) out; + int m0 = pd_in->m0; + int n0 = pd_in->n0; + /* pd_out has shape (n0, m0); transpose pd_in->X into pd_out->X. 
*/ + for (int ii = 0; ii < m0; ii++) + { + for (int jj = 0; jj < n0; jj++) + { + pd_out->X[jj * m0 + ii] = pd_in->X[ii * n0 + jj]; + } + } +} + static matrix *permuted_dense_vtable_index_alloc(matrix *self, const int *indices, int n_idxs) { @@ -337,7 +358,9 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; pd->base.ATDA_fill_values = permuted_dense_vtable_ATDA_fill_values; pd->base.to_csr = permuted_dense_to_csr; - pd->base.as_permuted_dense = permuted_dense_as_permuted_dense; + pd->base.transpose_alloc = permuted_dense_vtable_transpose_alloc; + pd->base.transpose_fill_values = permuted_dense_vtable_transpose_fill_values; + pd->base.is_permuted_dense = true; pd->base.index_alloc = permuted_dense_vtable_index_alloc; pd->base.index_fill_values = permuted_dense_vtable_index_fill_values; pd->base.promote_alloc = permuted_dense_vtable_promote_alloc; @@ -604,13 +627,6 @@ void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, return; } - /* d == NULL means plain BT @ A */ - if (d == NULL) - { - BTA_pd_pd_fill_values(B, A, C); - return; - } - /* C = BT @ (DA) */ permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); DA_pd_fill_values(d, A, DA); diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 229b45c..211ac31 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -52,6 +52,7 @@ static void sparse_free(matrix *self) free_CSR_matrix(sm->csr); free_CSC_matrix(sm->csc_cache); free(sm->csc_iwork); + free(sm->transpose_iwork); free(sm); } @@ -60,8 +61,8 @@ matrix *new_sparse_matrix(CSR_matrix *A); /* Build the CSC_matrix cache structure if absent. Values are NOT filled here; caller must call refresh_csc_values before consuming. ATA_alloc only needs structure, - so it's safe to call after build_csc_structure alone. 
*/ -static void build_csc_structure_if_absent(sparse_matrix *sm) + so it's safe to call without a subsequent refresh. */ +void sparse_matrix_ensure_csc_cache(sparse_matrix *sm) { if (sm->csc_cache != NULL) return; sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); @@ -84,7 +85,7 @@ static void sparse_DA_fill_values(const double *d, const matrix *self, matrix *o static matrix *sparse_ATA_alloc(matrix *self) { sparse_matrix *sm = (sparse_matrix *) self; - build_csc_structure_if_absent(sm); + sparse_matrix_ensure_csc_cache(sm); return new_sparse_matrix(ATA_alloc(sm->csc_cache)); } @@ -101,10 +102,21 @@ static CSR_matrix *sparse_to_csr(matrix *self) return ((sparse_matrix *) self)->csr; } -static struct permuted_dense *sparse_as_permuted_dense(matrix *self) +static matrix *sparse_transpose_alloc(const matrix *self) { - (void) self; - return NULL; + const sparse_matrix *sm = (const sparse_matrix *) self; + int *iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + CSR_matrix *AT = AT_alloc(sm->csr, iwork); + sparse_matrix *out = (sparse_matrix *) new_sparse_matrix(AT); + out->transpose_iwork = iwork; + return &out->base; +} + +static void sparse_transpose_fill_values(const matrix *self, matrix *out) +{ + const sparse_matrix *sm_in = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; + AT_fill_values(sm_in->csr, sm_out->csr, sm_out->transpose_iwork); } static matrix *sparse_index_alloc(matrix *self, const int *indices, int n_idxs) @@ -293,7 +305,7 @@ static void sparse_diag_vec_fill_values(matrix *self, matrix *out) static void sparse_refresh_csc_values(matrix *self) { sparse_matrix *sm = (sparse_matrix *) self; - build_csc_structure_if_absent(sm); + sparse_matrix_ensure_csc_cache(sm); csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); } @@ -307,7 +319,8 @@ static void wire_vtable(sparse_matrix *sm) sm->base.ATA_alloc = sparse_ATA_alloc; sm->base.ATDA_fill_values = sparse_ATDA_fill_values; sm->base.to_csr = 
sparse_to_csr; - sm->base.as_permuted_dense = sparse_as_permuted_dense; + sm->base.transpose_alloc = sparse_transpose_alloc; + sm->base.transpose_fill_values = sparse_transpose_fill_values; sm->base.index_alloc = sparse_index_alloc; sm->base.index_fill_values = sparse_index_fill_values; sm->base.promote_alloc = sparse_promote_alloc; diff --git a/tests/all_tests.c b/tests/all_tests.c index c8c3e90..a98ed60 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -60,7 +60,7 @@ #include "problem/test_param_prob.h" #include "problem/test_problem.h" #include "utils/test_cblas.h" -#include "utils/test_coo_matrix.h" +#include "utils/test_COO_matrix.h" #include "utils/test_csc_matrix.h" #include "utils/test_csr_csc_conversion.h" #include "utils/test_csr_matrix.h" diff --git a/tests/forward_pass/affine/test_add.h b/tests/forward_pass/affine/test_add.h index 1f8a61a..4b14ae3 100644 --- a/tests/forward_pass/affine/test_add.h +++ b/tests/forward_pass/affine/test_add.h @@ -5,6 +5,7 @@ #include "atoms/affine.h" #include "expr.h" #include "minunit.h" +#include "subexpr.h" #include "test_helpers.h" const char *test_addition(void) diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index 6317782..3302545 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -53,7 +53,7 @@ const char *test_jacobian_transpose(void) block X is row-permuted: X_out[i, :] = X_c[k(i), :]. */ const char *test_jacobian_transpose_pd_preserved(void) { - double A_data[12] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, + double A_data[12] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; expr *u = new_variable(2, 1, 0, 2); expr *AU = new_left_matmul_dense(NULL, u, 6, 2, A_data); @@ -66,8 +66,8 @@ const char *test_jacobian_transpose_pd_preserved(void) T->eval_jacobian(T); /* Structural: output Jacobian must be a PD. 
*/ - permuted_dense *pd_T = T->jacobian->as_permuted_dense(T->jacobian); - mu_assert("transpose Jacobian should be PD", pd_T != NULL); + mu_assert("transpose Jacobian should be PD", T->jacobian->is_permuted_dense); + permuted_dense *pd_T = (permuted_dense *) T->jacobian; mu_assert("global m", T->jacobian->m == 6); mu_assert("global n", T->jacobian->n == 2); mu_assert("m0", pd_T->m0 == 6); @@ -78,11 +78,11 @@ const char *test_jacobian_transpose_pd_preserved(void) mu_assert("col_perm", cmp_int_array(pd_T->col_perm, expected_col_perm, 2)); /* Numerical: X_out rows = A rows permuted by k(r) = [0,3,1,4,2,5]. */ - double expected_X[12] = {1.0, 2.0, /* row 0 from A row 0 */ - 7.0, 8.0, /* row 1 from A row 3 */ - 3.0, 4.0, /* row 2 from A row 1 */ - 9.0, 10.0, /* row 3 from A row 4 */ - 5.0, 6.0, /* row 4 from A row 2 */ + double expected_X[12] = {1.0, 2.0, /* row 0 from A row 0 */ + 7.0, 8.0, /* row 1 from A row 3 */ + 3.0, 4.0, /* row 2 from A row 1 */ + 9.0, 10.0, /* row 3 from A row 4 */ + 5.0, 6.0, /* row 4 from A row 2 */ 11.0, 12.0}; /* row 5 from A row 5 */ mu_assert("X values", cmp_double_array(pd_T->X, expected_X, 12)); diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_coo_matrix.h index 73cbbf3..523b94a 100644 --- a/tests/utils/test_coo_matrix.h +++ b/tests/utils/test_coo_matrix.h @@ -4,7 +4,7 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/COO_Matrix.h" +#include "utils/COO_matrix.h" const char *test_csr_to_coo(void) { @@ -21,7 +21,7 @@ const char *test_csr_to_coo(void) memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - COO_Matrix *coo = new_coo_matrix(A); + COO_matrix *coo = new_COO_matrix(A); mu_assert("m incorrect", coo->m == 3); mu_assert("n incorrect", coo->n == 3); @@ -35,7 +35,7 @@ const char *test_csr_to_coo(void) mu_assert("cols incorrect", cmp_int_array(coo->cols, expected_cols, 6)); mu_assert("vals incorrect", cmp_double_array(coo->x, expected_x, 6)); - free_coo_matrix(coo); + free_COO_matrix(coo); 
free_CSR_matrix(A); return 0; @@ -56,7 +56,7 @@ const char *test_csr_to_coo_lower_triangular(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->x, Ax, 9 * sizeof(double)); - COO_Matrix *coo = new_coo_matrix_lower_triangular(A); + COO_matrix *coo = new_COO_matrix_lower_triangular(A); mu_assert("ltri m incorrect", coo->m == 3); mu_assert("ltri n incorrect", coo->n == 3); @@ -73,7 +73,7 @@ const char *test_csr_to_coo_lower_triangular(void) mu_assert("ltri value_map incorrect", cmp_int_array(coo->value_map, expected_map, 6)); - free_coo_matrix(coo); + free_COO_matrix(coo); free_CSR_matrix(A); return 0; @@ -89,7 +89,7 @@ const char *test_refresh_lower_triangular_coo(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->x, Ax, 9 * sizeof(double)); - COO_Matrix *coo = new_coo_matrix_lower_triangular(A); + COO_matrix *coo = new_COO_matrix_lower_triangular(A); double vals2[9] = {10, 20, 30, 20, 50, 60, 30, 60, 90}; refresh_lower_triangular_coo(coo, vals2); @@ -97,7 +97,7 @@ const char *test_refresh_lower_triangular_coo(void) double expected_x[6] = {10, 20, 50, 30, 60, 90}; mu_assert("refresh vals incorrect", cmp_double_array(coo->x, expected_x, 6)); - free_coo_matrix(coo); + free_COO_matrix(coo); free_CSR_matrix(A); return 0; diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h index c74d079..feb5fe9 100644 --- a/tests/utils/test_matrix_BTA.h +++ b/tests/utils/test_matrix_BTA.h @@ -74,8 +74,11 @@ const char *test_BTDA_matrices_csr_pd(void) double d[4] = {1.0, -2.0, 0.5, 3.0}; - /* Wrapper path. */ + /* Wrapper path. Dispatchers don't touch sparse_matrix internals — caller + owns csc_cache structure and values. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) A_m); matrix *C_m = BTA_matrices_alloc(A_m, B_m); + A_m->refresh_csc_values(A_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path. */ @@ -128,8 +131,11 @@ const char *test_BTDA_matrices_pd_csr(void) double d[4] = {1.0, -2.0, 0.5, 3.0}; - /* Wrapper path. 
*/ + /* Wrapper path. Dispatchers don't touch sparse_matrix internals — caller + owns csc_cache structure and values. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) B_m); matrix *C_m = BTA_matrices_alloc(A_m, B_m); + B_m->refresh_csc_values(B_m); BTDA_matrices_fill_values(A_m, d, B_m, C_m); /* Direct primitive path: production now dispatches the (PD, Sparse) diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index f186999..5edf1ab 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -634,7 +634,6 @@ const char *test_permuted_dense_BTA_empty_overlap(void) BTA_pd_pd_fill_values(B, A, C); double d[4] = {1.0, 1.0, 1.0, 1.0}; BTDA_pd_pd_fill_values(B, d, A, C); - BTDA_pd_pd_fill_values(B, NULL, A, C); free_matrix(C_m); free_matrix(B_m); From 06a9b0e7e19bcead7fd4e389ea31217a3f35794a Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 11:06:26 +0200 Subject: [PATCH 21/31] clean up matrix.h --- include/utils/matrix.h | 225 ++++++++++++++++++++--------------------- 1 file changed, 111 insertions(+), 114 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index c24916c..17a84ad 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -30,123 +30,120 @@ typedef enum BROADCAST_SCALAR /* (1, 1) -> (m, n) */ } broadcast_type; -/* We implement three different types of matrices. - - 1. 'sparse_matrix' represents a generic CSR_matrix matrix. - 2. 'permuted_dense' represents a matrix that only consists of a dense block - (potentially after permuting columns). - 3. 'blkdiag_dense' represents a block diagonal matrix with a constant dense - block. - - Each of these types implements its own functionality for common matrix operations - such as DA_fill_values etc. The return type of most of these operations are the - same as the type of the input. For example, DA_fill_values for permuted_dense - fills the values of a new permuted_dense object. 
- - 2, 'permuted_dense': - * DA_fill_values just scales the rows. It does not affect the permutation - indices. - * ATA_alloc - * ATDA_fill_values - * to_csr_sparsity - * to_csr_values - * - - 1. sparse_matrix: generic CSR_matrix matrix. - 2. permuted_dense: - - -*/ - -/* Base matrix type with function pointers for polymorphic dispatch. There are two - types of matrices: 'sparse_matrix' and 'permuted_dense'. Each type implements the - same set of operations, but with different algorithms. The following operations - are implemented: TODO -*/ -typedef struct matrix -{ - int m, n, nnz; /* shape and nnz*/ - double *x; /* non-owning pointer to the value buffer */ +/* Polymorphic matrix base. Concrete types embed `matrix` as their first + member and implement the vtable slots below. Currently implemented: + 1. sparse_matrix — generic CSR_matrix-backed matrix. + 2. permuted_dense — matrix whose nonzeros lie in a single dense block + located at chosen rows and columns of the global + index space. + A third type is potentially planned. 
*/ + +typedef struct matrix matrix; + +/* y = kron(I_p, A) @ x */ +typedef void (*matrix_block_left_mult_vec_fn)(const matrix *A, const double *x, + double *y, int p); + +/* Allocate sparsity of C = kron(I_p, A) @ J */ +typedef CSC_matrix *(*matrix_block_left_mult_sparsity_fn)(const matrix *A, + const CSC_matrix *J, + int p); + +/* Fill values of C = kron(I_p, A) @ J */ +typedef void (*matrix_block_left_mult_values_fn)(const matrix *A, + const CSC_matrix *J, CSC_matrix *C); + +/* Allocate a new matrix with the same sparsity as A */ +typedef matrix *(*matrix_copy_sparsity_fn)(const matrix *A); + +/* Fill values of C = diag(d) @ A */ +typedef void (*matrix_DA_fill_values_fn)(const double *d, const matrix *A, + matrix *C); + +/* Allocate C = AT @ A */ +typedef matrix *(*matrix_ATA_alloc_fn)(matrix *A); + +/* Fill values of C = AT @ diag(d) @ A */ +typedef void (*matrix_ATDA_fill_values_fn)(const matrix *A, const double *d, + matrix *C); + +/* Allocate AT = transpose(A) */ +typedef matrix *(*matrix_transpose_alloc_fn)(const matrix *A); - /* True iff self is a permuted_dense; lets bivariate dispatchers route to - type-specialized kernels without a vtable call. Set by the concrete - constructor (false by default via CALLOC). */ +/* Fill values of AT = transpose(A) */ +typedef void (*matrix_transpose_fill_values_fn)(const matrix *A, matrix *AT); + +/* Returns a CSR_matrix view of A */ +typedef CSR_matrix *(*matrix_to_csr_fn)(matrix *A); + +/* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / + ATDA calls reflect the current values. 
*/ +typedef void (*matrix_refresh_csc_values_fn)(matrix *A); + +/* Allocate C = A[indices, :] */ +typedef matrix *(*matrix_index_alloc_fn)(matrix *A, const int *indices, int n_idxs); + +/* Fill values of C = A[indices, :] */ +typedef void (*matrix_index_fill_values_fn)(matrix *A, const int *indices, + int n_idxs, matrix *C); + +/* Row-tiling for the promote atom: A must be a 1-row matrix; returns + a new matrix of shape (size, A->n) where every row is a copy of A's + single row. */ +typedef matrix *(*matrix_promote_alloc_fn)(matrix *A, int size); +typedef void (*matrix_promote_fill_values_fn)(matrix *A, matrix *out); + +/* Broadcast: lift the child Jacobian of a broadcast atom into the output + Jacobian. `type` is the broadcast variant; (d1, d2) is the output shape. */ +typedef matrix *(*matrix_broadcast_alloc_fn)(matrix *A, broadcast_type type, int d1, + int d2); +typedef void (*matrix_broadcast_fill_values_fn)(matrix *A, broadcast_type type, + int d1, int d2, matrix *out); + +/* diag_vec: A is an (n, A->n) Jacobian for a length-n vector; output is + (n*n, A->n) where row i lands at output row i*(n+1) (column-major + diagonal positions). Other output rows are structurally zero. */ +typedef matrix *(*matrix_diag_vec_alloc_fn)(matrix *A); +typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out); + +typedef void (*matrix_free_fn)(matrix *self); + +struct matrix +{ + int m, n, nnz; + double *x; /* non-owning pointer to the value buffer */ bool is_permuted_dense; - /* Operators for the left-multiply matrix in left_matmul. */ - void (*block_left_mult_vec)(const struct matrix *self, const double *x, - double *y, int p); - CSC_matrix *(*block_left_mult_sparsity)(const struct matrix *self, - const CSC_matrix *J, int p); - void (*block_left_mult_values)(const struct matrix *self, const CSC_matrix *J, - CSC_matrix *C); - - /* Chain-rule operations used by transformer atoms (elementwise, etc.). 
- All chain-rule outputs are the same concrete type as self (uniform - polymorphism). copy_sparsity returns a matrix of same shape and type as - self; DA_fill_values writes diag(d) * self into out; ATA_alloc allocates - a matrix with sparsity of self^T * self; ATDA_fill_values fills out with - self^T * diag(d) * self; to_csr returns a CSR_matrix view of self - (constant-time for sparse_matrix, lazily built/refreshed for other types). */ - struct matrix *(*copy_sparsity)(const struct matrix *self); - void (*DA_fill_values)(const double *d, const struct matrix *self, - struct matrix *out); - struct matrix *(*ATA_alloc)(struct matrix *self); - void (*ATDA_fill_values)(const struct matrix *self, const double *d, - struct matrix *out); - CSR_matrix *(*to_csr)(struct matrix *self); - - /* Transpose: returns a matrix of shape (self->n, self->m), same concrete - type as self. transpose_alloc sets up sparsity; transpose_fill_values - fills values into out, which must have been produced by a prior - transpose_alloc on a matrix with the same sparsity as self. */ - struct matrix *(*transpose_alloc)(const struct matrix *self); - void (*transpose_fill_values)(const struct matrix *self, struct matrix *out); - - /* Row-selection / indexing: returns a new matrix that selects rows - indices[0..n_idxs) of self. Output shape is (n_idxs, self->n). The - returned type matches self's concrete type. index_alloc sets up - sparsity (values uninitialized); index_fill_values fills values into - out, which must have been produced by a prior index_alloc with the - same indices/n_idxs. */ - struct matrix *(*index_alloc)(struct matrix *self, const int *indices, - int n_idxs); - void (*index_fill_values)(struct matrix *self, const int *indices, int n_idxs, - struct matrix *out); - - /* Row-tiling for the promote atom: self must be a 1-row matrix; returns - a new matrix of shape (size, self->n) where every row is a copy of - self's single row. Output type matches self's concrete type. 
- promote_alloc sets sparsity; promote_fill_values fills values. */ - struct matrix *(*promote_alloc)(struct matrix *self, int size); - void (*promote_fill_values)(struct matrix *self, struct matrix *out); - - /* Broadcast: lift the child Jacobian of a broadcast atom into the output - Jacobian. `type` is the broadcast variant; (d1, d2) is the output shape. - Output type matches self's concrete type. broadcast_alloc sets sparsity; - broadcast_fill_values fills values into out. */ - struct matrix *(*broadcast_alloc)(struct matrix *self, broadcast_type type, - int d1, int d2); - void (*broadcast_fill_values)(struct matrix *self, broadcast_type type, int d1, - int d2, struct matrix *out); - - /* diag_vec: child is an (n, self->n) Jacobian for a length-n vector; - output is (n*n, self->n) where child row i lands at output row - i*(n+1) (column-major diagonal positions). Other output rows are - structurally zero. Output type matches self's concrete type. */ - struct matrix *(*diag_vec_alloc)(struct matrix *self); - void (*diag_vec_fill_values)(struct matrix *self, struct matrix *out); - - /* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / - ATDA calls reflect the current values. Atoms whose child Jacobian is affine - can skip this on iterations after the first; non-affine children must - call it before every chain-rule call. No-op for types that don't have - a cache (e.g. permuted_dense). */ - void (*refresh_csc_values)(struct matrix *self); - - /* Lifecycle. 
*/ - void (*free_fn)(struct matrix *self); -} matrix; + /* Operator ops */ + matrix_block_left_mult_vec_fn block_left_mult_vec; + matrix_block_left_mult_sparsity_fn block_left_mult_sparsity; + matrix_block_left_mult_values_fn block_left_mult_values; + + /* Chain-rule ops */ + matrix_copy_sparsity_fn copy_sparsity; + matrix_DA_fill_values_fn DA_fill_values; + matrix_ATA_alloc_fn ATA_alloc; + matrix_ATDA_fill_values_fn ATDA_fill_values; + matrix_transpose_alloc_fn transpose_alloc; + matrix_transpose_fill_values_fn transpose_fill_values; + + /* Views and cache */ + matrix_to_csr_fn to_csr; + matrix_refresh_csc_values_fn refresh_csc_values; + + /* Atom-specific ops */ + matrix_index_alloc_fn index_alloc; + matrix_index_fill_values_fn index_fill_values; + matrix_promote_alloc_fn promote_alloc; + matrix_promote_fill_values_fn promote_fill_values; + matrix_broadcast_alloc_fn broadcast_alloc; + matrix_broadcast_fill_values_fn broadcast_fill_values; + matrix_diag_vec_alloc_fn diag_vec_alloc; + matrix_diag_vec_fill_values_fn diag_vec_fill_values; + + /* Lifecycle */ + matrix_free_fn free_fn; +}; /* Free helper */ static inline void free_matrix(matrix *m) From 3e4ea9fbb3c4f27b4a9b1690d0190dfcfb78c32d Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 12:01:15 +0200 Subject: [PATCH 22/31] lazy allocation of dwork in permuted_dense --- include/utils/permuted_dense.h | 6 +++-- src/utils/permuted_dense.c | 49 +++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 129b8c2..fbeebee 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -52,8 +52,10 @@ typedef struct permuted_dense - transpose: holds (diag(d) X)^T for the BA_pd_csc-based BTDA kernels (BTDA_pd_csc and, transitively, BTDA_csc_pd via its delegate). Size m0*n0 doubles. - Sized at alloc time for the largest role this PD could play. 
Functions - taking a const permuted_dense * may still mutate `dwork`. */ + Allocated lazily on the first kernel that needs it; grown in place + (free + SP_MALLOC, contents not preserved) if a later kernel needs + more. `dwork == NULL` and `dwork_size == 0` before first use. + Functions taking a const permuted_dense * may still mutate `dwork`. */ double *dwork; size_t dwork_size; diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 1cf1055..d975eae 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -380,12 +380,11 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->col_perm = (int *) SP_MALLOC(n0 * sizeof(int)); pd->X = (double *) SP_MALLOC(sz * sizeof(double)); pd->base.x = pd->X; - /* `dwork` sized for the Y-buffer role (Y = diag(d_perm) X) used by ATDA / - BTDA_pd_pd, and for the (diag(d) X)^T transpose in BTDA_pd_csc. The - legacy old-code BTA_pd_csr_alloc / BTA_csr_pd_alloc upgrade this to a - larger gather buffer when their output PD will instead play that role. */ - pd->dwork_size = sz; - pd->dwork = (double *) SP_MALLOC(pd->dwork_size * sizeof(double)); + /* dwork is allocated lazily by kernels via permuted_dense_ensure_dwork. + SP_CALLOC above already zeroed dwork / dwork_size, but make it + explicit. */ + pd->dwork = NULL; + pd->dwork_size = 0; pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); pd->row_inv = (int *) SP_MALLOC(m * sizeof(int)); @@ -470,6 +469,20 @@ void DA_pd_fill_values(const double *d, const permuted_dense *A, permuted_dense } } +/* Ensure pd->dwork is sized at least `size` doubles. Grows in place; + contents are NOT preserved. Called from allocator functions so that the + corresponding fill kernels never need to allocate. Takes a const pointer + and casts internally — this matches the dwork contract (header) that + dwork is mutable through a const permuted_dense *. 
*/ +static void permuted_dense_ensure_dwork(const permuted_dense *pd_const, size_t size) +{ + permuted_dense *pd = (permuted_dense *) pd_const; + if (pd->dwork_size >= size) return; + free(pd->dwork); + pd->dwork = (double *) SP_MALLOC(size * sizeof(double)); + pd->dwork_size = size; +} + matrix *ATA_pd_alloc(const permuted_dense *A) { int n = A->base.n; @@ -477,6 +490,10 @@ matrix *ATA_pd_alloc(const permuted_dense *A) sets given by A's col_perm. (This follows from Cij = ai^T aj where ai and aj are columns of A. Here, ai and aj always have overlapping entries, so Cij != 0 for (i, j) in A->col_perm x A->col_perm) */ + + /* Pre-size A's dwork for the ATDA fill (Y-buffer = diag(d_perm) X). */ + permuted_dense_ensure_dwork(A, (size_t) A->m0 * A->n0); + return new_permuted_dense(n, n, A->n0, A->n0, A->col_perm, A->col_perm, NULL); } @@ -510,6 +527,12 @@ matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) matrix *C = new_permuted_dense(B->base.n, A->base.n, B->n0, A->n0, B->col_perm, A->col_perm, NULL); + /* Pre-size A's and B's dwork for the BTA fill slow path (gathered row + buffers). Worst-case size is `m0 * n0` per operand; over-allocating + here lets the fill kernel never touch malloc. */ + permuted_dense_ensure_dwork(A, (size_t) A->m0 * A->n0); + permuted_dense_ensure_dwork(B, (size_t) B->m0 * B->n0); + /* Pre-allocate C->iwork for idx_A + idx_B in BTA / BTDA_pd_pd slow paths (each needs at most max_s = MIN(A->m0, B->m0) ints; we store both arrays back-to-back in iwork, hence 2 * max_s). */ @@ -601,9 +624,8 @@ void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, assert(s > 0); // ------------------------------------------------------------------------ - // Gather the matching rows into A->dwork and B->dwork (space is sufficient - // since A->dwork has at least space for A's full block, and we only need - // part of it. Same comment applies to B->dwork). + // Gather the matching rows into A->dwork and B->dwork. 
dwork is pre-sized + // by BTA_pd_pd_alloc (one ensure_dwork call per operand at alloc time). // ------------------------------------------------------------------------ for (int k = 0; k < s; k++) { @@ -741,6 +763,10 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) matrix *C = new_permuted_dense(B->base.n, A->n, B->n0, col_active->len, B->col_perm, col_active->data, NULL); iVec_free(col_active); + + /* Pre-size B's dwork for the BTDA fill (holds (diag(d) B)^T). */ + permuted_dense_ensure_dwork(B, (size_t) B->m0 * B->n0); + return C; } @@ -791,6 +817,10 @@ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A) matrix *C = new_permuted_dense(B->n, A->base.n, row_active->len, A->n0, row_active->data, A->col_perm, NULL); iVec_free(row_active); + + /* Pre-size A's dwork for the BTDA fill (holds (diag(d_perm) X_A)^T). */ + permuted_dense_ensure_dwork(A, (size_t) A->m0 * A->n0); + return C; } @@ -831,6 +861,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, int n0_A = A->n0; /* A->dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A). + Pre-sized by BTA_csc_pd_alloc; no allocation in fill. Column j of (diag(d) X_A) lives contiguously in dwork as row j — which is exactly the layout BTA_csc_pd_fill_values wants. 
*/ for (int kk = 0; kk < m0_A; kk++) From 7e3c23e55ca6e864cccefaa994a8f2bc2aa97ef1 Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 12:11:17 +0200 Subject: [PATCH 23/31] Implement block_left_mult on permuted_dense --- include/utils/linalg_dense_sparse_matmuls.h | 9 ++- src/utils/dense_matrix.c | 9 ++- src/utils/linalg_dense_sparse_matmuls.c | 14 ++--- src/utils/permuted_dense.c | 46 +++++++++++++- tests/all_tests.c | 2 + tests/utils/test_matrix.h | 67 +++++++++++++++++++++ 6 files changed, 136 insertions(+), 11 deletions(-) diff --git a/include/utils/linalg_dense_sparse_matmuls.h b/include/utils/linalg_dense_sparse_matmuls.h index 24d8f8c..b76c3a3 100644 --- a/include/utils/linalg_dense_sparse_matmuls.h +++ b/include/utils/linalg_dense_sparse_matmuls.h @@ -23,10 +23,15 @@ #include "matrix.h" /* C = (I_p kron A) @ J via the polymorphic matrix interface. - * A is dense m x n, J is (n*p) x k in CSC_matrix, C is (m*p) x k in CSC_matrix. */ + * A is dense m x n, J is (n*p) x k in CSC_matrix, C is (m*p) x k in CSC_matrix. + * `work` must be sized at least A->n doubles — used as a scratch buffer when + * a sparse column of J needs to be densified before dgemv. The caller is + * responsible for sizing it (typically pre-sized in the corresponding + * sparsity-build step). */ // TODO: maybe we can replace these with I_kron_X functionality? CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p); -void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C); +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C, + double *work); /* Sparsity and values of C = (Y^T kron I_m) @ J where Y is k x n, J is (m*k) x p, and C is (m*n) x p. Y is given in column-major dense format. 
*/ diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c index 390f3e9..cdfcbc3 100644 --- a/src/utils/dense_matrix.c +++ b/src/utils/dense_matrix.c @@ -42,6 +42,13 @@ static void dense_block_left_mult_vec(const matrix *A, const double *x, double * n, 0.0, y, m); } +static void dense_block_left_mult_values(const matrix *A, const CSC_matrix *J, + CSC_matrix *C) +{ + const dense_matrix *dm = (const dense_matrix *) A; + I_kron_A_fill_values(A, J, C, dm->work); +} + static void dense_free(matrix *A) { dense_matrix *dm = (dense_matrix *) A; @@ -58,7 +65,7 @@ matrix *new_dense_matrix(int m, int n, const double *data) dm->base.nnz = m * n; dm->base.block_left_mult_vec = dense_block_left_mult_vec; dm->base.block_left_mult_sparsity = I_kron_A_alloc; - dm->base.block_left_mult_values = I_kron_A_fill_values; + dm->base.block_left_mult_values = dense_block_left_mult_values; dm->base.free_fn = dense_free; dm->x = (double *) SP_MALLOC(m * n * sizeof(double)); if (data != NULL) diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index f1e3aaa..6b97e59 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ b/src/utils/linalg_dense_sparse_matmuls.c @@ -91,16 +91,16 @@ CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p) return C; } -void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C) +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C, + double *work) { - const dense_matrix *dm = (const dense_matrix *) A; - int m = dm->base.m; - int n = dm->base.n; + int m = A->m; + int n = A->n; int k = J->n; int i, j, s, block, block_start, block_end, start, end; - double *j_dense = dm->work; + double *j_dense = work; /* for each column of J (and C) */ for (j = 0; j < k; j++) @@ -131,7 +131,7 @@ void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C) /* Fast path: C column segment = val * A[:, row_in_block] */ int row_in_block = J->i[start] - 
block_start; double val = J->x[start]; - cblas_dcopy(m, dm->x + row_in_block, n, C->x + i, 1); + cblas_dcopy(m, A->x + row_in_block, n, C->x + i, 1); if (val != 1.0) { cblas_dscal(m, val, C->x + i, 1); @@ -147,7 +147,7 @@ void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C) j_dense[J->i[s] - block_start] = J->x[s]; } - cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, dm->x, n, + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n, j_dense, 1, 0.0, C->x + i, 1); } } diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index d975eae..4e1b932 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -18,6 +18,7 @@ #include "utils/permuted_dense.h" #include "utils/cblas_wrapper.h" #include "utils/iVec.h" +#include "utils/linalg_dense_sparse_matmuls.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -77,8 +78,9 @@ static void permuted_dense_vtable_ATDA_fill_values(const matrix *self, ATDA_pd_fill_values((const permuted_dense *) self, d, (permuted_dense *) out); } -/* Forward decl; definition lower in the file. */ +/* Forward decls; definitions lower in the file. */ static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A); +static void permuted_dense_ensure_dwork(const permuted_dense *pd_const, size_t size); /* Lazy CSR_matrix view: allocate structure on first call, then return the cache. The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so @@ -328,6 +330,44 @@ static void permuted_dense_vtable_diag_vec_fill_values(matrix *self, matrix *out memcpy(out_pd->X, pd->X, pd->m0 * pd->n0 * sizeof(double)); } +/* ===== Operator-role adapters: PD acting as the constant left operand of + left_matmul. Currently restricted to full-block PDs (m0 == m, n0 == n, + identity perms) — that's the case dense_matrix covers today. 
*/ + +static void permuted_dense_vtable_block_left_mult_vec(const matrix *A, + const double *x, double *y, + int p) +{ + assert(((const permuted_dense *) A)->m0 == A->m && + ((const permuted_dense *) A)->n0 == A->n); + /* y (p x m) = x (p x n) * A^T (n x m), all row-major. Matches the + dense_matrix implementation; A->x is the row-major value buffer. */ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, A->m, A->n, 1.0, x, + A->n, A->x, A->n, 0.0, y, A->m); +} + +static CSC_matrix *permuted_dense_vtable_block_left_mult_sparsity(const matrix *A, + const CSC_matrix *J, + int p) +{ + const permuted_dense *pd = (const permuted_dense *) A; + assert(pd->m0 == A->m && pd->n0 == A->n); + /* Pre-size dwork for the subsequent block_left_mult_values fill, which + densifies a sparse column of J (size A->n) before applying A. Honors + the no-alloc-in-fill rule. */ + permuted_dense_ensure_dwork(pd, (size_t) A->n); + return I_kron_A_alloc(A, J, p); +} + +static void permuted_dense_vtable_block_left_mult_values(const matrix *A, + const CSC_matrix *J, + CSC_matrix *C) +{ + const permuted_dense *pd = (const permuted_dense *) A; + assert(pd->m0 == A->m && pd->n0 == A->n); + I_kron_A_fill_values(A, J, C, pd->dwork); +} + matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, const int *col_perm, const double *X_data) { @@ -353,6 +393,10 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->base.m = m; pd->base.n = n; pd->base.nnz = m0 * n0; + pd->base.block_left_mult_vec = permuted_dense_vtable_block_left_mult_vec; + pd->base.block_left_mult_sparsity = + permuted_dense_vtable_block_left_mult_sparsity; + pd->base.block_left_mult_values = permuted_dense_vtable_block_left_mult_values; pd->base.copy_sparsity = permuted_dense_vtable_copy_sparsity; pd->base.DA_fill_values = permuted_dense_vtable_DA_fill_values; pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; diff --git a/tests/all_tests.c b/tests/all_tests.c index 
a98ed60..c1b38a8 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -363,6 +363,8 @@ int main(void) mu_run_test(test_sparse_vs_dense_mult_vec, tests_run); mu_run_test(test_dense_matrix_trans, tests_run); mu_run_test(test_sparse_vs_dense_mult_vec_blocks, tests_run); + mu_run_test(test_pd_operator_block_left_mult_vec, tests_run); + mu_run_test(test_pd_operator_vs_dense_block_left_mult, tests_run); mu_run_test(test_permuted_dense_to_csr_basic, tests_run); mu_run_test(test_permuted_dense_to_csr_empty, tests_run); mu_run_test(test_permuted_dense_to_csr_full, tests_run); diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index d0ccfe5..38001b8 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -4,6 +4,7 @@ #include "minunit.h" #include "test_helpers.h" #include "utils/dense_matrix.h" +#include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" #include #include @@ -132,4 +133,70 @@ const char *test_sparse_vs_dense_mult_vec_blocks(void) return 0; } +/* Full-block permuted_dense acting as operator must be byte-equivalent to + dense_matrix for all three block_left_mult_* slots. Mirrors the data of + test_dense_matrix_mult_vec and exercises the new PD-as-operator path. */ +const char *test_pd_operator_block_left_mult_vec(void) +{ + double data[] = {1.0, 2.0, 3.0, 4.0}; + int row_perm[2] = {0, 1}; + int col_perm[2] = {0, 1}; + matrix *A = new_permuted_dense(2, 2, 2, 2, row_perm, col_perm, data); + + double x[] = {1.0, 2.0}; + double y[2] = {0.0, 0.0}; + + A->block_left_mult_vec(A, x, y, 1); + + double y_expected[2] = {5.0, 11.0}; + mu_assert("y incorrect", cmp_double_array(y, y_expected, 2)); + + free_matrix(A); + return 0; +} + +/* Full-block PD operator vs dense_matrix: block_left_mult_sparsity and + block_left_mult_values must produce byte-equivalent CSC outputs. 
+ J is a 6x2 CSC representing two identity-like columns into a single + block (p=1), exercising both the single-nonzero fast path and the + multi-nonzero densify path. */ +const char *test_pd_operator_vs_dense_block_left_mult(void) +{ + /* A = [1 2 3; 4 5 6] (2x3). */ + double data[] = {1, 2, 3, 4, 5, 6}; + int row_perm[2] = {0, 1}; + int col_perm[3] = {0, 1, 2}; + matrix *A_pd = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, data); + matrix *A_dm = new_dense_matrix(2, 3, data); + + /* J is 3x2 CSC: col 0 = [1.0 at row 0], col 1 = [2.0 at row 0, 3.0 at row 2]. + p = 1; output C is 2x2. */ + CSC_matrix *J = new_CSC_matrix(3, 2, 3); + int Jp[3] = {0, 1, 3}; + int Ji[3] = {0, 0, 2}; + double Jx[3] = {1.0, 2.0, 3.0}; + memcpy(J->p, Jp, 3 * sizeof(int)); + memcpy(J->i, Ji, 3 * sizeof(int)); + memcpy(J->x, Jx, 3 * sizeof(double)); + + CSC_matrix *C_pd = A_pd->block_left_mult_sparsity(A_pd, J, 1); + CSC_matrix *C_dm = A_dm->block_left_mult_sparsity(A_dm, J, 1); + + mu_assert("nnz mismatch", C_pd->nnz == C_dm->nnz); + mu_assert("p mismatch", cmp_int_array(C_pd->p, C_dm->p, 3)); + mu_assert("i mismatch", cmp_int_array(C_pd->i, C_dm->i, C_pd->nnz)); + + A_pd->block_left_mult_values(A_pd, J, C_pd); + A_dm->block_left_mult_values(A_dm, J, C_dm); + + mu_assert("x mismatch", cmp_double_array(C_pd->x, C_dm->x, C_pd->nnz)); + + free_CSC_matrix(C_pd); + free_CSC_matrix(C_dm); + free_CSC_matrix(J); + free_matrix(A_pd); + free_matrix(A_dm); + return 0; +} + #endif /* TEST_MATRIX_H */ From fe1096ae4a4c54099910f0e7ca37000d6086edce Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 12:30:14 +0200 Subject: [PATCH 24/31] swap dense matrix for permuted dense and delete dense_matrix --- include/utils/dense_matrix.h | 42 ---------- include/utils/mini_numpy.h | 3 + include/utils/permuted_dense.h | 4 + src/atoms/affine/left_matmul.c | 32 +++---- src/atoms/affine/right_matmul.c | 11 +-- src/utils/dense_matrix.c | 102 ---------------------- src/utils/linalg_dense_sparse_matmuls.c 
| 2 +- src/utils/mini_numpy.c | 11 +++ src/utils/permuted_dense.c | 42 +++++++--- tests/all_tests.c | 11 ++- tests/utils/test_matrix.h | 107 +++++++----------------- 11 files changed, 108 insertions(+), 259 deletions(-) delete mode 100644 include/utils/dense_matrix.h delete mode 100644 src/utils/dense_matrix.c diff --git a/include/utils/dense_matrix.h b/include/utils/dense_matrix.h deleted file mode 100644 index 9b85a58..0000000 --- a/include/utils/dense_matrix.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2026 Daniel Cederberg and William Zhang - * - * This file is part of the SparseDiffEngine project. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DENSE_MATRIX_H -#define DENSE_MATRIX_H - -#include "matrix.h" - -/* TODO: do we need this matrix class? */ - -/* Dense matrix (row-major) */ -typedef struct dense_matrix -{ - matrix base; - double *x; - double *work; /* scratch buffer, length n */ -} dense_matrix; - -/* Constructors. If data is NULL, the value buffer is allocated but left - uninitialized; otherwise m*n entries are copied from data. 
*/ -matrix *new_dense_matrix(int m, int n, const double *data); - -/* Transpose helper */ -matrix *dense_matrix_trans(const dense_matrix *self); - -void A_transpose(double *AT, const double *A, int m, int n); - -#endif /* DENSE_MATRIX_H */ diff --git a/include/utils/mini_numpy.h b/include/utils/mini_numpy.h index a17ca9e..ac14051 100644 --- a/include/utils/mini_numpy.h +++ b/include/utils/mini_numpy.h @@ -33,6 +33,9 @@ void scaled_ones(double *result, int size, double value); /* Naive implementation of Z = X @ Y, X is m x k, Y is k x n, Z is m x n */ void mat_mat_mult(const double *X, const double *Y, double *Z, int m, int k, int n); +/* Row-major dense transpose: AT[j*m + i] = A[i*n + j] for an m x n A. */ +void A_transpose(double *AT, const double *A, int m, int n); + /* Compute v = (Y kron I_m) @ w where Y is k x n (col-major), len(w) = m * n, and len(v) = m * k. Equivalently, reshape w as the m x n matrix W (col-major) and compute v = vec(W @ Y^T). */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index fbeebee..7a9d926 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -82,6 +82,10 @@ typedef struct permuted_dense matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, const int *col_perm, const double *X_data); +/* Convenience constructor for the trivial-perm case: row_perm = [0..m-1], + col_perm = [0..n-1], dense block fills the full (m, n) shape. */ +matrix *new_permuted_dense_full(int m, int n, const double *data); + /* CSR_matrix view: callers should use the vtable, i.e. base.to_csr(base). The PD owns and caches the returned CSR_matrix; its value array aliases self->X, so values are always live with no separate fill needed. 
Callers must not diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 860ce88..daeb66d 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -17,12 +17,13 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/dense_matrix.h" +#include "utils/mini_numpy.h" #include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" #include #include #include +#include /* This file implement the atom 'left_matmul' corresponding to the operation y = A @ f(x), where A is a given matrix and f(x) is an arbitrary expression. @@ -118,9 +119,10 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian */ jacobian_init(x); - /* Fast path: A is a constant dense_matrix, child is a leaf variable, and - there are no Kronecker blocks. The Jacobian is A placed at the variable's - column slot — a full-dense permuted_dense. Skip the CSC_matrix mirror entirely. */ + /* Fast path: A is a constant full-block PD operator, child is a leaf + variable, and there are no Kronecker blocks. The Jacobian is A placed + at the variable's column slot — a full-dense permuted_dense. Skip the + CSC_matrix mirror entirely. 
*/ if (lnode->produce_pd_jacobian) { int m_loc = lnode->A->m; @@ -128,9 +130,9 @@ static void jacobian_init_impl(expr *node) int *col_perm = (int *) SP_MALLOC(lnode->A->n * sizeof(int)); for (int i = 0; i < m_loc; i++) row_perm[i] = i; for (int j = 0; j < lnode->A->n; j++) col_perm[j] = x->var_id + j; - dense_matrix *dm = (dense_matrix *) lnode->A; node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, - lnode->A->n, row_perm, col_perm, dm->x); + lnode->A->n, row_perm, col_perm, + lnode->A->x); free(row_perm); free(col_perm); return; @@ -198,16 +200,14 @@ static void eval_wsum_hess(expr *node, const double *w) static void refresh_dense_left(left_matmul_expr *lnode) { - dense_matrix *dm_A = (dense_matrix *) lnode->A; - dense_matrix *dm_AT = (dense_matrix *) lnode->AT; - int m = dm_A->base.m; - int n = dm_A->base.n; + int m = lnode->A->m; + int n = lnode->A->n; /* The parameter represents the A in left_matmul_dense(A, x) in column-major. In this diffengine, we store A in row-major order. Hence, param->vals actually corresponds to the transpose of A, and we transpose AT to get A. */ - memcpy(dm_AT->x, lnode->param_source->value, m * n * sizeof(double)); - A_transpose(dm_A->x, dm_AT->x, n, m); + memcpy(lnode->AT->x, lnode->param_source->value, m * n * sizeof(double)); + A_transpose(lnode->A->x, lnode->AT->x, n, m); } expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) @@ -320,8 +320,8 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, lnode->refresh_param_values = refresh_dense_left; /* A and AT buffers are filled by refresh_dense_left from the parameter. 
*/ - lnode->A = new_dense_matrix(m, n, NULL); - lnode->AT = new_dense_matrix(n, m, NULL); + lnode->A = new_permuted_dense_full(m, n, NULL); + lnode->AT = new_permuted_dense_full(n, m, NULL); node->needs_parameter_refresh = true; } /* constant matrix case */ @@ -333,8 +333,8 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, exit(1); } - lnode->A = new_dense_matrix(m, n, data); - lnode->AT = dense_matrix_trans((const dense_matrix *) lnode->A); + lnode->A = new_permuted_dense_full(m, n, data); + lnode->AT = lnode->A->transpose_alloc(lnode->A); /* If the child is a leaf variable and there are no blocks, the Jacobian is exactly A placed in the variable's column slot — a full-dense diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c index 90f031b..5e3b39f 100644 --- a/src/atoms/affine/right_matmul.c +++ b/src/atoms/affine/right_matmul.c @@ -18,10 +18,11 @@ #include "atoms/affine.h" #include "subexpr.h" #include "utils/CSR_matrix.h" -#include "utils/dense_matrix.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include +#include /* This file implements the atom 'right_matmul' corresponding to the operation y = f(x) @ A, where A is a given matrix and f(x) is an arbitrary expression. 
@@ -44,10 +45,10 @@ static void refresh_dense_right(left_matmul_expr *lnode) Furthermore, lnode->param_source->value corresponds to the column-major version of A, which is BT (an m x n matrix) */ - dense_matrix *B = (dense_matrix *) lnode->AT; - dense_matrix *BT = (dense_matrix *) lnode->A; - int m = B->base.n; - int n = B->base.m; + matrix *B = lnode->AT; + matrix *BT = lnode->A; + int m = B->n; + int n = B->m; memcpy(BT->x, lnode->param_source->value, m * n * sizeof(double)); A_transpose(B->x, BT->x, m, n); diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c deleted file mode 100644 index cdfcbc3..0000000 --- a/src/utils/dense_matrix.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2026 Daniel Cederberg and William Zhang - * - * This file is part of the SparseDiffEngine project. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "utils/dense_matrix.h" -#include "utils/cblas_wrapper.h" -#include "utils/linalg_dense_sparse_matmuls.h" -#include "utils/tracked_alloc.h" -#include -#include - -static void dense_block_left_mult_vec(const matrix *A, const double *x, double *y, - int p) -{ - const dense_matrix *dm = (const dense_matrix *) A; - int m = dm->base.m; - int n = dm->base.n; - - /* y = kron(I_p, A) @ x via a single dgemm call: - Treat x as n x p (column-major blocks) and y as m x p. - But x and y are stored as p blocks of length n and m - respectively (i.e. block-interleaved). 
This is the same as - treating them as row-major matrices of shape p x n and - p x m, so: - y (p x m) = x (p x n) * A^T (n x m), all row-major. - cblas with RowMajor: C = alpha * A * B + beta * C - where A = x (p x n), B = A^T (n x m), C = y (p x m). */ - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, m, n, 1.0, x, n, dm->x, - n, 0.0, y, m); -} - -static void dense_block_left_mult_values(const matrix *A, const CSC_matrix *J, - CSC_matrix *C) -{ - const dense_matrix *dm = (const dense_matrix *) A; - I_kron_A_fill_values(A, J, C, dm->work); -} - -static void dense_free(matrix *A) -{ - dense_matrix *dm = (dense_matrix *) A; - free(dm->x); - free(dm->work); - free(dm); -} - -matrix *new_dense_matrix(int m, int n, const double *data) -{ - dense_matrix *dm = (dense_matrix *) SP_CALLOC(1, sizeof(dense_matrix)); - dm->base.m = m; - dm->base.n = n; - dm->base.nnz = m * n; - dm->base.block_left_mult_vec = dense_block_left_mult_vec; - dm->base.block_left_mult_sparsity = I_kron_A_alloc; - dm->base.block_left_mult_values = dense_block_left_mult_values; - dm->base.free_fn = dense_free; - dm->x = (double *) SP_MALLOC(m * n * sizeof(double)); - if (data != NULL) - { - memcpy(dm->x, data, m * n * sizeof(double)); - } - dm->base.x = dm->x; - dm->work = (double *) SP_MALLOC(n * sizeof(double)); - return &dm->base; -} - -matrix *dense_matrix_trans(const dense_matrix *A) -{ - int m = A->base.m; - int n = A->base.n; - double *AT_x = (double *) SP_MALLOC(m * n * sizeof(double)); - - A_transpose(AT_x, A->x, m, n); - - matrix *result = new_dense_matrix(n, m, AT_x); - free(AT_x); - return result; -} - -void A_transpose(double *AT, const double *A, int m, int n) -{ - for (int i = 0; i < m; i++) - { - for (int j = 0; j < n; j++) - { - AT[j * m + i] = A[i * n + j]; - } - } -} diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index 6b97e59..22f133d 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ 
b/src/utils/linalg_dense_sparse_matmuls.c @@ -18,8 +18,8 @@ #include "utils/CSC_matrix.h" #include "utils/CSR_matrix.h" #include "utils/cblas_wrapper.h" -#include "utils/dense_matrix.h" #include "utils/iVec.h" +#include "utils/linalg_dense_sparse_matmuls.h" #include "utils/tracked_alloc.h" #include #include diff --git a/src/utils/mini_numpy.c b/src/utils/mini_numpy.c index de0d345..141e77e 100644 --- a/src/utils/mini_numpy.c +++ b/src/utils/mini_numpy.c @@ -69,6 +69,17 @@ void mat_mat_mult(const double *X, const double *Y, double *Z, int m, int k, int } } +void A_transpose(double *AT, const double *A, int m, int n) +{ + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + AT[j * m + i] = A[i * n + j]; + } + } +} + void Y_kron_I_vec(int m, int k, int n, const double *Y, const double *w, double *v) { for (int j = 0; j < k; j++) diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 4e1b932..2d45d0e 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -100,8 +100,8 @@ static matrix *permuted_dense_vtable_transpose_alloc(const matrix *self) const permuted_dense *pd = (const permuted_dense *) self; /* Swap (m, n), (m0, n0), and (row_perm, col_perm). The constructor asserts strict increase of both perms, which holds by construction. */ - return new_permuted_dense(pd->base.n, pd->base.m, pd->n0, pd->m0, - pd->col_perm, pd->row_perm, NULL); + return new_permuted_dense(pd->base.n, pd->base.m, pd->n0, pd->m0, pd->col_perm, + pd->row_perm, NULL); } static void permuted_dense_vtable_transpose_fill_values(const matrix *self, @@ -332,23 +332,33 @@ static void permuted_dense_vtable_diag_vec_fill_values(matrix *self, matrix *out /* ===== Operator-role adapters: PD acting as the constant left operand of left_matmul. Currently restricted to full-block PDs (m0 == m, n0 == n, - identity perms) — that's the case dense_matrix covers today. */ + identity perms) — the only operator shape any caller needs today. 
*/ static void permuted_dense_vtable_block_left_mult_vec(const matrix *A, const double *x, double *y, int p) { + /* Full-block precondition: A->x is a single contiguous row-major m x n + block (perms are identity). For a non-trivial PD, A->x still points + at pd->X but X only stores the values at the permuted positions; the + layout below assumes a full m x n matrix, hence the assert. */ assert(((const permuted_dense *) A)->m0 == A->m && ((const permuted_dense *) A)->n0 == A->n); - /* y (p x m) = x (p x n) * A^T (n x m), all row-major. Matches the - dense_matrix implementation; A->x is the row-major value buffer. */ - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, A->m, A->n, 1.0, x, - A->n, A->x, A->n, 0.0, y, A->m); + + /* y = kron(I_p, A) @ x via a single dgemm. + Input x is p blocks of length n (block-interleaved); output y is p + blocks of length m. That's identical in memory to row-major matrices + of shape (p, n) and (p, m) respectively, so we can compute + y (p x m) = x (p x n) * A^T (n x m) + in one shot. CblasRowMajor + CblasNoTrans on x + CblasTrans on A + gives exactly that. 
*/ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, A->m, A->n, 1.0, x, A->n, + A->x, A->n, 0.0, y, A->m); } -static CSC_matrix *permuted_dense_vtable_block_left_mult_sparsity(const matrix *A, - const CSC_matrix *J, - int p) +static CSC_matrix * +permuted_dense_vtable_block_left_mult_sparsity(const matrix *A, const CSC_matrix *J, + int p) { const permuted_dense *pd = (const permuted_dense *) A; assert(pd->m0 == A->m && pd->n0 == A->n); @@ -467,6 +477,18 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, return &pd->base; } +matrix *new_permuted_dense_full(int m, int n, const double *data) +{ + int *row_perm = (int *) SP_MALLOC(m * sizeof(int)); + int *col_perm = (int *) SP_MALLOC(n * sizeof(int)); + for (int i = 0; i < m; i++) row_perm[i] = i; + for (int j = 0; j < n; j++) col_perm[j] = j; + matrix *out = new_permuted_dense(m, n, m, n, row_perm, col_perm, data); + free(row_perm); + free(col_perm); + return out; +} + static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A) { int m0 = A->m0; diff --git a/tests/all_tests.c b/tests/all_tests.c index c1b38a8..253e749 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -358,13 +358,12 @@ int main(void) mu_run_test(test_csr_to_coo, tests_run); mu_run_test(test_csr_to_coo_lower_triangular, tests_run); mu_run_test(test_refresh_lower_triangular_coo, tests_run); - mu_run_test(test_dense_matrix_mult_vec, tests_run); - mu_run_test(test_dense_matrix_mult_vec_blocks, tests_run); - mu_run_test(test_sparse_vs_dense_mult_vec, tests_run); - mu_run_test(test_dense_matrix_trans, tests_run); - mu_run_test(test_sparse_vs_dense_mult_vec_blocks, tests_run); + mu_run_test(test_pd_mult_vec_basic, tests_run); + mu_run_test(test_pd_mult_vec_blocks, tests_run); + mu_run_test(test_sparse_vs_pd_mult_vec, tests_run); + mu_run_test(test_pd_trans_full_block, tests_run); + mu_run_test(test_sparse_vs_pd_mult_vec_blocks, tests_run); mu_run_test(test_pd_operator_block_left_mult_vec, tests_run); 
- mu_run_test(test_pd_operator_vs_dense_block_left_mult, tests_run); mu_run_test(test_permuted_dense_to_csr_basic, tests_run); mu_run_test(test_permuted_dense_to_csr_empty, tests_run); mu_run_test(test_permuted_dense_to_csr_full, tests_run); diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index 38001b8..d14dfbc 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -3,19 +3,18 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/dense_matrix.h" #include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" #include #include -/* Test dense block_left_mult_vec against known result. +/* Test full-block PD block_left_mult_vec against known result. A = [1 2; 3 4] (2x2), x = [1; 2], p = 1 y = A * x = [1*1+2*2; 3*1+4*2] = [5; 11] */ -const char *test_dense_matrix_mult_vec(void) +const char *test_pd_mult_vec_basic(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_permuted_dense_full(2, 2, data); double x[] = {1.0, 2.0}; double y[2] = {0.0, 0.0}; @@ -29,13 +28,13 @@ const char *test_dense_matrix_mult_vec(void) return 0; } -/* Test dense block_left_mult_vec with multiple blocks. +/* Test full-block PD block_left_mult_vec with multiple blocks. A = [1 2; 3 4] (2x2), x = [1; 2; 3; 4], p = 2 y = [A*[1;2]; A*[3;4]] = [5; 11; 11; 25] */ -const char *test_dense_matrix_mult_vec_blocks(void) +const char *test_pd_mult_vec_blocks(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_permuted_dense_full(2, 2, data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y[4] = {0}; @@ -49,9 +48,9 @@ const char *test_dense_matrix_mult_vec_blocks(void) return 0; } -/* Compare sparse vs dense block_left_mult_vec for a non-square matrix. +/* Compare sparse vs PD block_left_mult_vec for a non-square matrix. 
A = [1 2 3; 4 5 6] (2x3), x = [1; 2; 3], p = 1 */ -const char *test_sparse_vs_dense_mult_vec(void) +const char *test_sparse_vs_pd_mult_vec(void) { /* Build CSR_matrix for A = [1 2 3; 4 5 6] */ CSR_matrix *csr = new_CSR_matrix(2, 3, 6); @@ -62,48 +61,48 @@ const char *test_sparse_vs_dense_mult_vec(void) memcpy(csr->i, Ai, 6 * sizeof(int)); memcpy(csr->x, Ax, 6 * sizeof(double)); - double dense_data[] = {1, 2, 3, 4, 5, 6}; + double pd_data[] = {1, 2, 3, 4, 5, 6}; matrix *sparse = new_sparse_matrix(csr); - matrix *dense = new_dense_matrix(2, 3, dense_data); + matrix *pd = new_permuted_dense_full(2, 3, pd_data); double x[] = {1.0, 2.0, 3.0}; double y_sparse[2] = {0}; - double y_dense[2] = {0}; + double y_pd[2] = {0}; sparse->block_left_mult_vec(sparse, x, y_sparse, 1); - dense->block_left_mult_vec(dense, x, y_dense, 1); + pd->block_left_mult_vec(pd, x, y_pd, 1); - mu_assert("sparse vs dense mismatch", cmp_double_array(y_sparse, y_dense, 2)); + mu_assert("sparse vs pd mismatch", cmp_double_array(y_sparse, y_pd, 2)); free_matrix(sparse); - free_matrix(dense); + free_matrix(pd); return 0; } -/* Test dense transpose */ -const char *test_dense_matrix_trans(void) +/* Test full-block PD transpose via vtable. 
*/ +const char *test_pd_trans_full_block(void) { double data[] = {1, 2, 3, 4, 5, 6}; /* 2x3 */ - matrix *A = new_dense_matrix(2, 3, data); - matrix *AT = dense_matrix_trans((const dense_matrix *) A); + matrix *A = new_permuted_dense_full(2, 3, data); + matrix *AT = A->transpose_alloc(A); + A->transpose_fill_values(A, AT); mu_assert("transpose m", AT->m == 3); mu_assert("transpose n", AT->n == 2); /* AT should be [1 4; 2 5; 3 6] stored row-major */ - dense_matrix *dm = (dense_matrix *) AT; double AT_expected[6] = {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}; - mu_assert("AT vals incorrect", cmp_double_array(dm->x, AT_expected, 6)); + mu_assert("AT vals incorrect", cmp_double_array(AT->x, AT_expected, 6)); free_matrix(A); free_matrix(AT); return 0; } -/* Compare sparse vs dense block_left_mult_vec with p=2 blocks. +/* Compare sparse vs PD block_left_mult_vec with p=2 blocks. A = [1 2; 3 4], x = [1; 2; 3; 4], p = 2 */ -const char *test_sparse_vs_dense_mult_vec_blocks(void) +const char *test_sparse_vs_pd_mult_vec_blocks(void) { CSR_matrix *csr = new_CSR_matrix(2, 2, 4); int Ap[3] = {0, 2, 4}; @@ -113,29 +112,27 @@ const char *test_sparse_vs_dense_mult_vec_blocks(void) memcpy(csr->i, Ai, 4 * sizeof(int)); memcpy(csr->x, Ax, 4 * sizeof(double)); - double dense_data[] = {1, 2, 3, 4}; + double pd_data[] = {1, 2, 3, 4}; matrix *sparse = new_sparse_matrix(csr); - matrix *dense = new_dense_matrix(2, 2, dense_data); + matrix *pd = new_permuted_dense_full(2, 2, pd_data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y_sparse[4] = {0}; - double y_dense[4] = {0}; + double y_pd[4] = {0}; sparse->block_left_mult_vec(sparse, x, y_sparse, 2); - dense->block_left_mult_vec(dense, x, y_dense, 2); + pd->block_left_mult_vec(pd, x, y_pd, 2); - mu_assert("sparse vs dense blocks mismatch", - cmp_double_array(y_sparse, y_dense, 4)); + mu_assert("sparse vs pd blocks mismatch", cmp_double_array(y_sparse, y_pd, 4)); free_matrix(sparse); - free_matrix(dense); + free_matrix(pd); return 0; } -/* Full-block 
permuted_dense acting as operator must be byte-equivalent to - dense_matrix for all three block_left_mult_* slots. Mirrors the data of - test_dense_matrix_mult_vec and exercises the new PD-as-operator path. */ +/* Full-block permuted_dense acting as operator: smoke test mirroring + test_pd_mult_vec_basic with an explicit row_perm/col_perm. */ const char *test_pd_operator_block_left_mult_vec(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; @@ -155,48 +152,4 @@ const char *test_pd_operator_block_left_mult_vec(void) return 0; } -/* Full-block PD operator vs dense_matrix: block_left_mult_sparsity and - block_left_mult_values must produce byte-equivalent CSC outputs. - J is a 6x2 CSC representing two identity-like columns into a single - block (p=1), exercising both the single-nonzero fast path and the - multi-nonzero densify path. */ -const char *test_pd_operator_vs_dense_block_left_mult(void) -{ - /* A = [1 2 3; 4 5 6] (2x3). */ - double data[] = {1, 2, 3, 4, 5, 6}; - int row_perm[2] = {0, 1}; - int col_perm[3] = {0, 1, 2}; - matrix *A_pd = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, data); - matrix *A_dm = new_dense_matrix(2, 3, data); - - /* J is 3x2 CSC: col 0 = [1.0 at row 0], col 1 = [2.0 at row 0, 3.0 at row 2]. - p = 1; output C is 2x2. 
*/ - CSC_matrix *J = new_CSC_matrix(3, 2, 3); - int Jp[3] = {0, 1, 3}; - int Ji[3] = {0, 0, 2}; - double Jx[3] = {1.0, 2.0, 3.0}; - memcpy(J->p, Jp, 3 * sizeof(int)); - memcpy(J->i, Ji, 3 * sizeof(int)); - memcpy(J->x, Jx, 3 * sizeof(double)); - - CSC_matrix *C_pd = A_pd->block_left_mult_sparsity(A_pd, J, 1); - CSC_matrix *C_dm = A_dm->block_left_mult_sparsity(A_dm, J, 1); - - mu_assert("nnz mismatch", C_pd->nnz == C_dm->nnz); - mu_assert("p mismatch", cmp_int_array(C_pd->p, C_dm->p, 3)); - mu_assert("i mismatch", cmp_int_array(C_pd->i, C_dm->i, C_pd->nnz)); - - A_pd->block_left_mult_values(A_pd, J, C_pd); - A_dm->block_left_mult_values(A_dm, J, C_dm); - - mu_assert("x mismatch", cmp_double_array(C_pd->x, C_dm->x, C_pd->nnz)); - - free_CSC_matrix(C_pd); - free_CSC_matrix(C_dm); - free_CSC_matrix(J); - free_matrix(A_pd); - free_matrix(A_dm); - return 0; -} - #endif /* TEST_MATRIX_H */ From 2d421c731e0e51aedef09100319b6c67aab39f12 Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 13:27:33 +0200 Subject: [PATCH 25/31] more infrastructure --- include/utils/permuted_dense.h | 22 ++++++ src/atoms/affine/left_matmul.c | 11 +-- src/utils/permuted_dense.c | 97 ++++++++++++++++++++++++ tests/all_tests.c | 3 + tests/utils/test_permuted_dense.h | 122 ++++++++++++++++++++++++++++++ 5 files changed, 250 insertions(+), 5 deletions(-) diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 7a9d926..2e39f75 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -126,6 +126,28 @@ matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, const CSC_matrix *A, permuted_dense *C); +/* Allocate new permuted dense for C = B @ A where B and A are both PD. Both + may have arbitrary (sorted) row_perm / col_perm; no full-block assumption. 
+ If B->col_perm and A->row_perm have no overlap C is structurally empty; + otherwise C has row_perm = B->row_perm, col_perm = A->col_perm. */ +matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); + +/* Fill values of C = B @ A for two PDs (general row_perm / col_perm). + Intersects B->col_perm with A->row_perm, gathers the matching column + slice of B and row slice of A into the operands' dwork scratch, and + computes one cblas_dgemm. */ +void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C); + +/* Polymorphic dispatcher: C = B @ A where B is PD and A is any matrix + type (permuted_dense or sparse_matrix). C is always PD. Routes on A's + type — mirrors the BTA_matrices_alloc style in matrix_BTA.c. For the + sparse-A branch the caller must refresh A's csc_cache values before + BA_pd_matrices_fill_values (same contract as BTDA_matrices_fill_values). */ +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A); +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C); + /* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index daeb66d..a062c44 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -130,9 +130,8 @@ static void jacobian_init_impl(expr *node) int *col_perm = (int *) SP_MALLOC(lnode->A->n * sizeof(int)); for (int i = 0; i < m_loc; i++) row_perm[i] = i; for (int j = 0; j < lnode->A->n; j++) col_perm[j] = x->var_id + j; - node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, - lnode->A->n, row_perm, col_perm, - lnode->A->x); + node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, lnode->A->n, + row_perm, col_perm, lnode->A->x); free(row_perm); free(col_perm); return; @@ -162,11 +161,13 @@ static void eval_jacobian(expr 
*node) /* evaluate child's jacobian and convert to CSC_matrix */ x->eval_jacobian(x); - csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, node->work->iwork); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, + node->work->iwork); /* compute this node's jacobian: */ lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC); - csc_to_csr_fill_values(J_CSC, node->jacobian->to_csr(node->jacobian), lnode->csc_to_csr_work); + csc_to_csr_fill_values(J_CSC, node->jacobian->to_csr(node->jacobian), + lnode->csc_to_csr_work); } static void wsum_hess_init_impl(expr *node) diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 2d45d0e..487b000 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -19,6 +19,7 @@ #include "utils/cblas_wrapper.h" #include "utils/iVec.h" #include "utils/linalg_dense_sparse_matmuls.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -809,6 +810,102 @@ void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, } } +matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) +{ + /* C is structurally zero when B's columns and A's rows don't overlap. */ + if (!has_overlap(B->col_perm, B->n0, A->row_perm, A->m0, 0)) + { + return new_permuted_dense(B->base.m, A->base.n, 0, 0, NULL, NULL, NULL); + } + + /* Otherwise C has a dense block of size B->m0 x A->n0, with row index + set B->row_perm and column index set A->col_perm. */ + matrix *C = new_permuted_dense(B->base.m, A->base.n, B->m0, A->n0, B->row_perm, + A->col_perm, NULL); + + int s_max = MIN(B->n0, A->m0); + + /* Pre-size B's and A's dwork for the gathers in fill. Worst-case + intersection size is s_max; B_sub is (m0, s) and A_sub is (s, n0). 
*/ + permuted_dense_ensure_dwork(B, (size_t) B->m0 * s_max); + permuted_dense_ensure_dwork(A, (size_t) s_max * A->n0); + + /* Pre-allocate C->iwork for idx_B + idx_A back-to-back (2 * s_max ints), + same idiom as BTA_pd_pd_alloc. */ + permuted_dense *C_pd = (permuted_dense *) C; + C_pd->iwork_size = (size_t) 2 * s_max; + if (C_pd->iwork_size > 0) + { + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + } + return C; +} + +void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C) +{ + /* C may be empty when B->col_perm and A->row_perm don't overlap. */ + if (C->base.nnz == 0) + { + return; + } + + int s_max = MIN(B->n0, A->m0); + int *idx_B = C->iwork; + int *idx_A = C->iwork + s_max; + int s = sorted_intersect_indices(B->col_perm, B->n0, A->row_perm, A->m0, idx_B, + idx_A); + assert(s > 0); + + /* Gather B_sub into B->dwork, shape (B->m0, s), row-major. + B_sub[ii, kk] = B->X[ii, idx_B[kk]]. */ + for (int ii = 0; ii < B->m0; ii++) + { + for (int kk = 0; kk < s; kk++) + { + B->dwork[ii * s + kk] = B->X[ii * B->n0 + idx_B[kk]]; + } + } + + /* Gather A_sub into A->dwork, shape (s, A->n0), row-major. + A_sub[kk, :] = A->X[idx_A[kk], :]. */ + for (int kk = 0; kk < s; kk++) + { + memcpy(A->dwork + kk * A->n0, A->X + idx_A[kk] * A->n0, + A->n0 * sizeof(double)); + } + + /* C->X = B_sub @ A_sub, shape (B->m0, A->n0). */ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, B->m0, A->n0, s, 1.0, + B->dwork, s, A->dwork, A->n0, 0.0, C->X, A->n0); +} + +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A) +{ + if (A->is_permuted_dense) + { + return BA_pd_pd_alloc(B, (const permuted_dense *) A); + } + /* A is sparse — use the existing BA_pd_csc_* kernels. Ensure the + csc_cache structure exists at alloc time (Phase 1 contract). 
*/ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix_ensure_csc_cache(sm_A); + return BA_pd_csc_alloc(B, sm_A->csc_cache); +} + +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C) +{ + if (A->is_permuted_dense) + { + BA_pd_pd_fill_values(B, (const permuted_dense *) A, C); + return; + } + /* A is sparse — caller must have refreshed sm_A->csc_cache values. */ + sparse_matrix *sm_A = (sparse_matrix *) A; + BA_pd_csc_fill_values(B->X, B->n0, B->col_inv, sm_A->csc_cache, C); +} + matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) { /* Cij != 0 if column i of B overlaps with column j of A. So we loop diff --git a/tests/all_tests.c b/tests/all_tests.c index 253e749..ab0e52b 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -387,6 +387,9 @@ int main(void) mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); mu_run_test(test_BTA_pd_csc_matches_csr, tests_run); + mu_run_test(test_BA_pd_matrices_pd_pd_full_block_B, tests_run); + mu_run_test(test_BA_pd_matrices_pd_pd_general_B, tests_run); + mu_run_test(test_BA_pd_matrices_pd_csc, tests_run); mu_run_test(test_BTA_pd_csr_basic, tests_run); mu_run_test(test_BTA_pd_csr_leaf_variable, tests_run); mu_run_test(test_BTA_pd_csr_no_overlap, tests_run); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 5edf1ab..55ee49f 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -6,6 +6,7 @@ #include "test_helpers.h" #include "utils/CSC_matrix.h" #include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include "utils/utils.h" #include #include @@ -838,4 +839,125 @@ const char *test_BTA_pd_csc_matches_csr(void) return 0; } +/* BA_pd_matrices: C = B @ A where B is full-block PD (the production + shape gated by left_matmul.c) and A is PD with non-trivial perms. 
+ B (2x3) row_perm=[0,1], col_perm=[0,1,2], X_B=[[1,2,3],[4,5,6]]. + A (3x5) row_perm=[0,2], col_perm=[1,4], X_A=[[7,8],[9,10]]. + Hand-computed C (2x5) nonzero at cols {1,4}: X_C=[[34,38],[82,92]]. */ +const char *test_BA_pd_matrices_pd_pd_full_block_B(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[3] = {0, 1, 2}; + double XB[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *B_m = new_permuted_dense(2, 3, 2, 3, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {0, 2}; + int col_perm_A[2] = {1, 4}; + double XA[4] = {7.0, 8.0, 9.0, 10.0}; + matrix *A_m = new_permuted_dense(3, 5, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 5); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {1, 4}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {34.0, 38.0, 82.0, 92.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +/* BA_pd_matrices with general (non-full-block) B. B->col_perm and + A->row_perm only partially overlap, exercising the + sorted_intersect_indices gather path. + B (2x5) row_perm=[0,1], col_perm=[1,3], X_B=[[1,2],[3,4]]. + A (5x4) row_perm=[1,2], col_perm=[0,3], X_A=[[5,6],[7,8]]. + Intersection K = {1,3} ∩ {1,2} = {1}, s=1. + Hand-computed C (2x4) nonzero at cols {0,3}: X_C=[[5,6],[15,18]]. 
*/ +const char *test_BA_pd_matrices_pd_pd_general_B(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 5, 2, 2, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {1, 2}; + int col_perm_A[2] = {0, 3}; + double XA[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(5, 4, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 4); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 3}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {5.0, 6.0, 15.0, 18.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +/* BA_pd_matrices with sparse A. Same B and same global A content as the + pd_pd_general_B test — the dispatcher routes through BA_pd_csc_* + and should yield byte-identical output. */ +const char *test_BA_pd_matrices_pd_csc(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 5, 2, 2, row_perm_B, col_perm_B, XB); + + /* A as 5x4 sparse_matrix, same nonzero values as the PD case: + (1,0)=5, (1,3)=6, (2,0)=7, (2,3)=8. 
*/ + CSR_matrix *csr = new_CSR_matrix(5, 4, 4); + int Ap[6] = {0, 0, 2, 4, 4, 4}; + int Ai[4] = {0, 3, 0, 3}; + double Ax[4] = {5.0, 6.0, 7.0, 8.0}; + memcpy(csr->p, Ap, 6 * sizeof(int)); + memcpy(csr->i, Ai, 4 * sizeof(int)); + memcpy(csr->x, Ax, 4 * sizeof(double)); + matrix *A_m = new_sparse_matrix(csr); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + A_m->refresh_csc_values(A_m); /* values must be fresh before fill */ + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 4); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 3}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {5.0, 6.0, 15.0, 18.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + #endif /* TEST_PERMUTED_DENSE_H */ From 737515860be26d4b200a31eb422dc2e86853bb2e Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 13:55:12 +0200 Subject: [PATCH 26/31] better BA_pd_pd_fill_values --- include/utils/matrix_BTA.h | 11 +++ include/utils/permuted_dense.h | 9 --- src/utils/matrix_BTA.c | 26 +++++++ src/utils/permuted_dense.c | 110 +++++++++++++----------------- tests/all_tests.c | 1 + tests/utils/test_permuted_dense.h | 41 +++++++++++ 6 files changed, 127 insertions(+), 71 deletions(-) diff --git a/include/utils/matrix_BTA.h b/include/utils/matrix_BTA.h index 115d3d5..05b6d58 100644 --- a/include/utils/matrix_BTA.h +++ b/include/utils/matrix_BTA.h @@ -13,6 +13,7 @@ #define MATRIX_BTA_H #include "matrix.h" +#include "permuted_dense.h" /* Polymorphic dispatchers for C = BT @ A and C = BT @ diag(d) @ A. 
The output type depends on the input types: (PD, PD) → PD, (Sparse, PD) → PD, @@ -29,4 +30,14 @@ matrix *BTA_matrices_alloc(matrix *A, matrix *B); /* Fill values of C = BT @ diag(d) @ A. */ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C); +/* Polymorphic dispatcher: C = B @ A where B is PD and A is any matrix + type (permuted_dense or sparse_matrix). C is always PD. Routes on A's + type. For the sparse-A branch the dispatcher ensures sm_A->csc_cache + structure exists at alloc time; before BA_pd_matrices_fill_values the + caller must have refreshed sm_A->csc_cache values (same fill-side + contract as BTDA_matrices_fill_values). */ +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A); +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C); + #endif /* MATRIX_BTA_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h index 2e39f75..4869d97 100644 --- a/include/utils/permuted_dense.h +++ b/include/utils/permuted_dense.h @@ -139,15 +139,6 @@ matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, permuted_dense *C); -/* Polymorphic dispatcher: C = B @ A where B is PD and A is any matrix - type (permuted_dense or sparse_matrix). C is always PD. Routes on A's - type — mirrors the BTA_matrices_alloc style in matrix_BTA.c. For the - sparse-A branch the caller must refresh A's csc_cache values before - BA_pd_matrices_fill_values (same contract as BTDA_matrices_fill_values). 
*/ -matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A); -void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, - permuted_dense *C); - /* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c index 924fe86..cd4d815 100644 --- a/src/utils/matrix_BTA.c +++ b/src/utils/matrix_BTA.c @@ -69,3 +69,29 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) sparse_matrix *sm_C = (sparse_matrix *) C; BTDA_fill_values(sm_A->csc_cache, sm_B->csc_cache, d, sm_C->csr); } + +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A) +{ + if (A->is_permuted_dense) + { + return BA_pd_pd_alloc(B, (const permuted_dense *) A); + } + /* A is sparse — use the existing BA_pd_csc_* kernels. Ensure the + csc_cache structure exists at alloc time. */ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix_ensure_csc_cache(sm_A); + return BA_pd_csc_alloc(B, sm_A->csc_cache); +} + +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C) +{ + if (A->is_permuted_dense) + { + BA_pd_pd_fill_values(B, (const permuted_dense *) A, C); + return; + } + /* A is sparse — caller must have refreshed sm_A->csc_cache values. 
*/ + sparse_matrix *sm_A = (sparse_matrix *) A; + BA_pd_csc_fill_values(B->X, B->n0, B->col_inv, sm_A->csc_cache, C); +} diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 487b000..34d7466 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -19,7 +19,6 @@ #include "utils/cblas_wrapper.h" #include "utils/iVec.h" #include "utils/linalg_dense_sparse_matmuls.h" -#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -595,20 +594,19 @@ matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) A->col_perm, NULL); /* Pre-size A's and B's dwork for the BTA fill slow path (gathered row - buffers). Worst-case size is `m0 * n0` per operand; over-allocating - here lets the fill kernel never touch malloc. */ - permuted_dense_ensure_dwork(A, (size_t) A->m0 * A->n0); - permuted_dense_ensure_dwork(B, (size_t) B->m0 * B->n0); + buffers). Each operand needs s_max rows of its own n0 doubles, where + s_max = MIN(A->m0, B->m0) bounds the intersection of row_perms. */ + int s_max = MIN(A->m0, B->m0); + permuted_dense_ensure_dwork(A, (size_t) s_max * A->n0); + permuted_dense_ensure_dwork(B, (size_t) s_max * B->n0); /* Pre-allocate C->iwork for idx_A + idx_B in BTA / BTDA_pd_pd slow paths - (each needs at most max_s = MIN(A->m0, B->m0) ints; we store both - arrays back-to-back in iwork, hence 2 * max_s). */ + (each needs at most s_max ints; we store both arrays back-to-back + in iwork, hence 2 * s_max). 
*/ permuted_dense *C_pd = (permuted_dense *) C; - C_pd->iwork_size = (size_t) 2 * MIN(A->m0, B->m0); - if (C_pd->iwork_size > 0) - { - C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); - } + C_pd->iwork_size = (size_t) 2 * s_max; + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + return C; } @@ -673,19 +671,13 @@ void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, } // ----------------------------------------------------------------------- - // find intersection of row permutations. We use C->iwork as the storage - // for idx_A | idx_B (back-to-back) and grow it in place if too small + // find intersection of row permutations. C->iwork was pre-sized by + // BTA_pd_pd_alloc to 2 * MIN(A->m0, B->m0) ints (idx_A | idx_B back- + // to-back), so no allocation here. // ----------------------------------------------------------------------- - int max_s = MIN(A->m0, B->m0); - size_t needed = 2 * (size_t) max_s; - if (C->iwork_size < needed) - { - free(C->iwork); - C->iwork = (int *) SP_MALLOC(needed * sizeof(int)); - C->iwork_size = needed; - } + int s_max = MIN(A->m0, B->m0); int *idx_A = C->iwork; - int *idx_B = C->iwork + max_s; + int *idx_B = C->iwork + s_max; int s = sorted_intersect_indices(A->row_perm, A->m0, B->row_perm, B->m0, idx_A, idx_B); assert(s > 0); @@ -716,6 +708,12 @@ void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, return; } + /* TODO: must remove this allocation. Very important. The DA + intermediate PD is allocated and freed on every Hessian iteration + — violates the no-alloc-in-fill policy. Fix is to fold diag(d) + directly into BTA_pd_pd_fill_values's gather/dgemm (either via a + shared internal helper that takes an optional d, or by rewriting + this kernel inline using pre-sized A->dwork). 
*/ /* C = BT @ (DA) */ permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); DA_pd_fill_values(d, A, DA); @@ -812,13 +810,13 @@ void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) { - /* C is structurally zero when B's columns and A's rows don't overlap. */ + /* if B's columns don't overlap with A's rows, C = B @ A is empty */ if (!has_overlap(B->col_perm, B->n0, A->row_perm, A->m0, 0)) { return new_permuted_dense(B->base.m, A->base.n, 0, 0, NULL, NULL, NULL); } - /* Otherwise C has a dense block of size B->m0 x A->n0, with row index + /* otherwise C has a dense block of size B->m0 x A->n0, with row index set B->row_perm and column index set A->col_perm. */ matrix *C = new_permuted_dense(B->base.m, A->base.n, B->m0, A->n0, B->row_perm, A->col_perm, NULL); @@ -827,20 +825,19 @@ matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) /* Pre-size B's and A's dwork for the gathers in fill. Worst-case intersection size is s_max; B_sub is (m0, s) and A_sub is (s, n0). */ - permuted_dense_ensure_dwork(B, (size_t) B->m0 * s_max); permuted_dense_ensure_dwork(A, (size_t) s_max * A->n0); + permuted_dense_ensure_dwork(B, (size_t) s_max * B->m0); /* Pre-allocate C->iwork for idx_B + idx_A back-to-back (2 * s_max ints), same idiom as BTA_pd_pd_alloc. */ permuted_dense *C_pd = (permuted_dense *) C; C_pd->iwork_size = (size_t) 2 * s_max; - if (C_pd->iwork_size > 0) - { - C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); - } + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + return C; } +/* TODO: do we want to reuse BTA_pd_pd_fill_values? 
*/ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, permuted_dense *C) { @@ -850,6 +847,19 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, return; } + /* if B's col_perm and A's row_perm are identical, one matmul suffices */ + if (B->n0 == A->m0 && int_arrays_equal(B->col_perm, A->row_perm, B->n0)) + { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, B->m0, A->n0, B->n0, + 1.0, B->X, B->n0, A->X, A->n0, 0.0, C->X, A->n0); + return; + } + + // ----------------------------------------------------------------------- + // find intersection of B's col_perm and A's row_perm. C->iwork was + // pre-sized by BA_pd_pd_alloc to 2 * MIN(B->n0, A->m0) ints (idx_B | + // idx_A back-to-back), so no allocation here. + // ----------------------------------------------------------------------- int s_max = MIN(B->n0, A->m0); int *idx_B = C->iwork; int *idx_A = C->iwork + s_max; @@ -857,8 +867,12 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, idx_A); assert(s > 0); - /* Gather B_sub into B->dwork, shape (B->m0, s), row-major. - B_sub[ii, kk] = B->X[ii, idx_B[kk]]. */ + // ------------------------------------------------------------------------ + // Gather the matching slices into B->dwork (column gather) and A->dwork + // (row gather). dwork is pre-sized by BA_pd_pd_alloc (one ensure_dwork + // call per operand at alloc time). + // ------------------------------------------------------------------------ + /* B_sub shape (B->m0, s) row-major: B_sub[ii, kk] = B->X[ii, idx_B[kk]]. */ for (int ii = 0; ii < B->m0; ii++) { for (int kk = 0; kk < s; kk++) @@ -866,46 +880,18 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, B->dwork[ii * s + kk] = B->X[ii * B->n0 + idx_B[kk]]; } } - - /* Gather A_sub into A->dwork, shape (s, A->n0), row-major. - A_sub[kk, :] = A->X[idx_A[kk], :]. */ + /* A_sub shape (s, A->n0) row-major: A_sub[kk, :] = A->X[idx_A[kk], :]. 
*/ for (int kk = 0; kk < s; kk++) { memcpy(A->dwork + kk * A->n0, A->X + idx_A[kk] * A->n0, A->n0 * sizeof(double)); } - /* C->X = B_sub @ A_sub, shape (B->m0, A->n0). */ + /* matmul on the gathered slices */ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, B->m0, A->n0, s, 1.0, B->dwork, s, A->dwork, A->n0, 0.0, C->X, A->n0); } -matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A) -{ - if (A->is_permuted_dense) - { - return BA_pd_pd_alloc(B, (const permuted_dense *) A); - } - /* A is sparse — use the existing BA_pd_csc_* kernels. Ensure the - csc_cache structure exists at alloc time (Phase 1 contract). */ - sparse_matrix *sm_A = (sparse_matrix *) A; - sparse_matrix_ensure_csc_cache(sm_A); - return BA_pd_csc_alloc(B, sm_A->csc_cache); -} - -void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, - permuted_dense *C) -{ - if (A->is_permuted_dense) - { - BA_pd_pd_fill_values(B, (const permuted_dense *) A, C); - return; - } - /* A is sparse — caller must have refreshed sm_A->csc_cache values. */ - sparse_matrix *sm_A = (sparse_matrix *) A; - BA_pd_csc_fill_values(B->X, B->n0, B->col_inv, sm_A->csc_cache, C); -} - matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) { /* Cij != 0 if column i of B overlaps with column j of A. 
So we loop diff --git a/tests/all_tests.c b/tests/all_tests.c index ab0e52b..227b596 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -390,6 +390,7 @@ int main(void) mu_run_test(test_BA_pd_matrices_pd_pd_full_block_B, tests_run); mu_run_test(test_BA_pd_matrices_pd_pd_general_B, tests_run); mu_run_test(test_BA_pd_matrices_pd_csc, tests_run); + mu_run_test(test_BA_pd_matrices_fast_path, tests_run); mu_run_test(test_BTA_pd_csr_basic, tests_run); mu_run_test(test_BTA_pd_csr_leaf_variable, tests_run); mu_run_test(test_BTA_pd_csr_no_overlap, tests_run); diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 55ee49f..b5baca4 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -5,6 +5,7 @@ #include "old-code/old_permuted_dense.h" #include "test_helpers.h" #include "utils/CSC_matrix.h" +#include "utils/matrix_BTA.h" #include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" #include "utils/utils.h" @@ -960,4 +961,44 @@ const char *test_BA_pd_matrices_pd_csc(void) return 0; } +/* BA_pd_matrices fast path: B->col_perm == A->row_perm exactly, so the + slow-path gather is skipped and one cblas_dgemm runs directly on + B->X and A->X. + B (2x4) row_perm=[0,1], col_perm=[1,3], X_B=[[1,2],[3,4]]. + A (4x3) row_perm=[1,3], col_perm=[0,2], X_A=[[5,6],[7,8]]. + Matching col_perm_B == row_perm_A == [1,3] triggers the fast path. + Hand-computed C (2x3) nonzero at cols {0,2}: X_C=[[19,22],[43,50]]. 
*/ +const char *test_BA_pd_matrices_fast_path(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(4, 3, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 3); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 2}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {19.0, 22.0, 43.0, 50.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + #endif /* TEST_PERMUTED_DENSE_H */ From bc9aa3b690344922cc7aba2234e70d3f55cadc75 Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 14:04:02 +0200 Subject: [PATCH 27/31] add permuted dense polymorphism to left matmul --- include/subexpr.h | 7 +++ src/atoms/affine/left_matmul.c | 27 ++++++++++ tests/all_tests.c | 1 + .../jacobian_tests/affine/test_left_matmul.h | 51 +++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/include/subexpr.h b/include/subexpr.h index 1676690..7c6cdd8 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -138,6 +138,13 @@ typedef struct left_matmul_expr variable, and n_blocks == 1 — in that case the Jacobian is exactly A placed in the variable's column slot, which is naturally a full-dense permuted_dense. 
*/ bool produce_pd_jacobian; + + /* When true, jacobian_init_impl built node->jacobian via + BA_pd_matrices_alloc(lnode->A, x->jacobian) — i.e. A is a constant + full-block PD and the (composite) child has a polymorphic Jacobian + that the dispatcher routes on. eval_jacobian then refreshes values + via BA_pd_matrices_fill_values. */ + bool produce_pd_jacobian_from_child; } left_matmul_expr; /* Scalar multiplication: y = a * child where a comes from param_source */ diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index a062c44..ec67a09 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -17,6 +17,7 @@ */ #include "atoms/affine.h" #include "subexpr.h" +#include "utils/matrix_BTA.h" #include "utils/mini_numpy.h" #include "utils/permuted_dense.h" #include "utils/sparse_matrix.h" @@ -137,6 +138,18 @@ static void jacobian_init_impl(expr *node) return; } + /* PD A + composite child: route through the polymorphic dispatcher. + The dispatcher handles both PD and sparse child Jacobians internally, + so no x->jacobian->is_permuted_dense check here. */ + if (lnode->n_blocks == 1 && lnode->A->is_permuted_dense && + lnode->param_source == NULL) + { + node->jacobian = + BA_pd_matrices_alloc((permuted_dense *) lnode->A, x->jacobian); + lnode->produce_pd_jacobian_from_child = true; + return; + } + /* General path via CSC_matrix mirror. */ lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); @@ -156,6 +169,20 @@ static void eval_jacobian(expr *node) /* Fast path: PD Jacobian backed by constant A. Values never change. */ if (lnode->produce_pd_jacobian) return; + /* PD A + composite child: refresh values via the dispatcher. Values + always need recomputing because the child's Jacobian may change + (this branch fires even for affine children today — see + multiply.c::jacobian_csc_filled for a possible future + affine-tracking cache that would skip the dgemm). 
*/ + if (lnode->produce_pd_jacobian_from_child) + { + x->eval_jacobian(x); + x->jacobian->refresh_csc_values(x->jacobian); /* no-op for PD */ + BA_pd_matrices_fill_values((permuted_dense *) lnode->A, x->jacobian, + (permuted_dense *) node->jacobian); + return; + } + CSC_matrix *Jchild_CSC = lnode->Jchild_CSC; CSC_matrix *J_CSC = lnode->J_CSC; diff --git a/tests/all_tests.c b/tests/all_tests.c index 227b596..97ae9c1 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -224,6 +224,7 @@ int main(void) mu_run_test(test_jacobian_left_matmul_log, tests_run); mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); mu_run_test(test_jacobian_left_matmul_exp_composite, tests_run); + mu_run_test(test_jacobian_left_matmul_pd_from_composite_child, tests_run); mu_run_test(test_jacobian_right_matmul_log, tests_run); mu_run_test(test_jacobian_right_matmul_log_vector, tests_run); mu_run_test(test_jacobian_matmul, tests_run); diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index be70b25..1879aab 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -8,6 +8,7 @@ #include "minunit.h" #include "numerical_diff.h" #include "test_helpers.h" +#include "utils/permuted_dense.h" const char *test_jacobian_left_matmul_log(void) { @@ -145,3 +146,53 @@ const char *test_jacobian_left_matmul_exp_composite(void) free_expr(A_exp_Bx); return 0; } + +/* outer = A2 @ (A1 @ x). Inner left_matmul produces a PD Jacobian via the + leaf-var fast path. Outer left_matmul sees a PD child Jacobian and must + fire the produce_pd_jacobian_from_child branch via BA_pd_matrices_*. + + x is a length-2 leaf variable at var_id=0, n_vars=2. + A1 is 3x2: [[1,2],[3,4],[5,6]] (row-major). + A2 is 4x3: [[1,0,1],[0,1,0],[1,0,1],[0,1,0]] (row-major). + Expected outer->jacobian: PD of shape (4, 2), row_perm=[0..3], + col_perm=[0,1], X = A2 @ A1 = [[6,8],[3,4],[6,8],[3,4]]. 
*/ +const char *test_jacobian_left_matmul_pd_from_composite_child(void) +{ + double A1_data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double A2_data[12] = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0}; + + expr *x = new_variable(2, 1, 0, 2); + expr *A1_x = new_left_matmul_dense(NULL, x, 3, 2, A1_data); + expr *A2_A1_x = new_left_matmul_dense(NULL, A1_x, 4, 3, A2_data); + + double x_vals[2] = {0.5, -1.5}; + A2_A1_x->forward(A2_A1_x, x_vals); + jacobian_init(A2_A1_x); + A2_A1_x->eval_jacobian(A2_A1_x); + + /* Structural: outer's Jacobian must be PD (produced by the new + produce_pd_jacobian_from_child branch). */ + mu_assert("outer Jacobian should be PD", + A2_A1_x->jacobian->is_permuted_dense); + permuted_dense *pd = (permuted_dense *) A2_A1_x->jacobian; + mu_assert("global m", A2_A1_x->jacobian->m == 4); + mu_assert("global n", A2_A1_x->jacobian->n == 2); + mu_assert("m0", pd->m0 == 4); + mu_assert("n0", pd->n0 == 2); + int expected_row_perm[4] = {0, 1, 2, 3}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd->row_perm, expected_row_perm, 4)); + mu_assert("col_perm", cmp_int_array(pd->col_perm, expected_col_perm, 2)); + + /* Numerical: X = A2 @ A1 (row-major 4x2). */ + double expected_X[8] = {6.0, 8.0, 3.0, 4.0, 6.0, 8.0, 3.0, 4.0}; + mu_assert("X values", cmp_double_array(pd->X, expected_X, 8)); + + /* Cross-check against numerical differentiation for paranoia. 
*/ + mu_assert("check_jacobian failed", + check_jacobian_num(A2_A1_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A2_A1_x); + return 0; +} From f6afb1bd00449e233192d8146d321726a30f2218 Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 14:51:04 +0200 Subject: [PATCH 28/31] minor --- src/atoms/affine/left_matmul.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index ec67a09..75596d9 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -363,6 +363,10 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, lnode->A = new_permuted_dense_full(m, n, data); lnode->AT = lnode->A->transpose_alloc(lnode->A); + /* transpose_alloc only sets up structure; we must also fill values + or AT->X is uninitialized. eval_wsum_hess uses AT->X via + AT->block_left_mult_vec, so missing this corrupts every Hessian. */ + lnode->A->transpose_fill_values(lnode->A, lnode->AT); /* If the child is a leaf variable and there are no blocks, the Jacobian is exactly A placed in the variable's column slot — a full-dense From f8952d8bd6ae84dc075053c8a1c3a8402e835bcd Mon Sep 17 00:00:00 2001 From: dance858 Date: Wed, 13 May 2026 14:57:35 +0200 Subject: [PATCH 29/31] add test --- tests/all_tests.c | 1 + tests/wsum_hess/affine/test_left_matmul.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/tests/all_tests.c b/tests/all_tests.c index 97ae9c1..20deb03 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -292,6 +292,7 @@ int main(void) mu_run_test(test_wsum_hess_left_matmul, tests_run); mu_run_test(test_wsum_hess_left_matmul_matrix, tests_run); mu_run_test(test_wsum_hess_left_matmul_exp_composite, tests_run); + mu_run_test(test_wsum_hess_left_matmul_dense_matrix_exp, tests_run); mu_run_test(test_wsum_hess_matmul, tests_run); mu_run_test(test_wsum_hess_matmul_yx, tests_run); mu_run_test(test_wsum_hess_right_matmul, tests_run); diff --git 
a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index 0013efb..4869dba 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -196,3 +196,26 @@ const char *test_wsum_hess_left_matmul_matrix(void) free_expr(A_log_x); return 0; } + +/* Regression test for the Phase 3 transpose_fill_values omission in + new_left_matmul_dense. Mirrors the Python failure + cp.sum(A @ cp.exp(X)) with X a (2,2) Variable and A a 2x2 numpy array. + eval_wsum_hess reads lnode->AT->X via AT->block_left_mult_vec; before the + fix, AT->X was uninitialized memory (transpose_alloc allocates without + filling) and the analytic Hessian disagreed with finite differences. */ +const char *test_wsum_hess_left_matmul_dense_matrix_exp(void) +{ + double x_vals[4] = {0.5, -0.3, 0.7, -0.2}; + double w[4] = {1.0, 1.0, 1.0, 1.0}; /* cp.sum: unit weight everywhere */ + double A_data[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *X = new_variable(2, 2, 0, 4); + expr *exp_X = new_exp(X); + expr *A_exp_X = new_left_matmul_dense(NULL, exp_X, 2, 2, A_data); + + mu_assert("check_wsum_hess failed", + check_wsum_hess(A_exp_X, x_vals, w, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A_exp_X); + return 0; +} From c36f46aca3e1f898de327d62e326f86715728d1a Mon Sep 17 00:00:00 2001 From: dance858 Date: Thu, 14 May 2026 07:47:01 +0200 Subject: [PATCH 30/31] clean up left_matmul and enable parameter --- include/subexpr.h | 14 -- src/atoms/affine/left_matmul.c | 131 ++++++++---------- src/atoms/affine/transpose.c | 11 +- tests/all_tests.c | 1 + .../jacobian_tests/affine/test_left_matmul.h | 72 +++++++++- 5 files changed, 126 insertions(+), 103 deletions(-) diff --git a/include/subexpr.h b/include/subexpr.h index 7c6cdd8..0dd6c8e 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -131,20 +131,6 @@ typedef struct left_matmul_expr int *csc_to_csr_work; expr *param_source; void (*refresh_param_values)(struct left_matmul_expr *); - - /* When true, 
jacobian_init_impl produces a permuted_dense node->jacobian - directly (skipping the CSC_matrix mirror machinery). Set by - new_left_matmul_dense when A is a constant dense matrix, child is a leaf - variable, and n_blocks == 1 — in that case the Jacobian is exactly A placed in - the variable's column slot, which is naturally a full-dense permuted_dense. */ - bool produce_pd_jacobian; - - /* When true, jacobian_init_impl built node->jacobian via - BA_pd_matrices_alloc(lnode->A, x->jacobian) — i.e. A is a constant - full-block PD and the (composite) child has a polymorphic Jacobian - that the dispatcher routes on. eval_jacobian then refreshes values - via BA_pd_matrices_fill_values. */ - bool produce_pd_jacobian_from_child; } left_matmul_expr; /* Scalar multiplication: y = a * child where a comes from param_source */ diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index 75596d9..7fd2a13 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -112,86 +112,64 @@ static void free_type_data(expr *node) lnode->param_source = NULL; } -static void jacobian_init_impl(expr *node) +/* TODO: use better polymorphism here if you add another matrix type*/ + +/* jacobian_init when node->jacobian is permuted_dense */ +static void jacobian_init_pd(expr *node) { + /* initialize jacobian of child */ expr *x = node->left; left_matmul_expr *lnode = (left_matmul_expr *) node; - - /* initialize child's jacobian */ jacobian_init(x); - /* Fast path: A is a constant full-block PD operator, child is a leaf - variable, and there are no Kronecker blocks. The Jacobian is A placed - at the variable's column slot — a full-dense permuted_dense. Skip the - CSC_matrix mirror entirely. 
*/ - if (lnode->produce_pd_jacobian) - { - int m_loc = lnode->A->m; - int *row_perm = (int *) SP_MALLOC(m_loc * sizeof(int)); - int *col_perm = (int *) SP_MALLOC(lnode->A->n * sizeof(int)); - for (int i = 0; i < m_loc; i++) row_perm[i] = i; - for (int j = 0; j < lnode->A->n; j++) col_perm[j] = x->var_id + j; - node->jacobian = new_permuted_dense(m_loc, node->n_vars, m_loc, lnode->A->n, - row_perm, col_perm, lnode->A->x); - free(row_perm); - free(col_perm); - return; - } + /* initialize this node's jacobian */ + node->jacobian = BA_pd_matrices_alloc((permuted_dense *) lnode->A, x->jacobian); +} - /* PD A + composite child: route through the polymorphic dispatcher. - The dispatcher handles both PD and sparse child Jacobians internally, - so no x->jacobian->is_permuted_dense check here. */ - if (lnode->n_blocks == 1 && lnode->A->is_permuted_dense && - lnode->param_source == NULL) - { - node->jacobian = - BA_pd_matrices_alloc((permuted_dense *) lnode->A, x->jacobian); - lnode->produce_pd_jacobian_from_child = true; - return; - } +/* eval_jacobian when node->jacobian is permuted_dense */ +static void eval_jacobian_pd(expr *node) +{ + /* evaluate jacobian of child */ + left_matmul_expr *lnode = (left_matmul_expr *) node; + expr *x = node->left; + x->eval_jacobian(x); - /* General path via CSC_matrix mirror. 
*/ + /* must refresh CSC cache if x->jacobian is sparse_matrix */ + x->jacobian->refresh_csc_values(x->jacobian); + BA_pd_matrices_fill_values((permuted_dense *) lnode->A, x->jacobian, + (permuted_dense *) node->jacobian); +} + +/* jacobian_init when node->jacobian is sparse */ +static void jacobian_init_sparse(expr *node) +{ + /* initialize jacobian of child */ + expr *x = node->left; + left_matmul_expr *lnode = (left_matmul_expr *) node; + jacobian_init(x); + + /* initialize this node's jacobian */ lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); - - /* precompute sparsity of this node's jacobian in CSC_matrix and CSR_matrix */ lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC, lnode->n_blocks); node->jacobian = new_sparse_matrix(csc_to_csr_alloc(lnode->J_CSC, lnode->csc_to_csr_work)); } -static void eval_jacobian(expr *node) +/* eval_jacobian when node->jacobian is sparse */ +static void eval_jacobian_sparse(expr *node) { + /* evaluate jacobian of child */ left_matmul_expr *lnode = (left_matmul_expr *) node; expr *x = node->left; + x->eval_jacobian(x); - /* Fast path: PD Jacobian backed by constant A. Values never change. */ - if (lnode->produce_pd_jacobian) return; - - /* PD A + composite child: refresh values via the dispatcher. Values - always need recomputing because the child's Jacobian may change - (this branch fires even for affine children today — see - multiply.c::jacobian_csc_filled for a possible future - affine-tracking cache that would skip the dgemm). 
*/ - if (lnode->produce_pd_jacobian_from_child) - { - x->eval_jacobian(x); - x->jacobian->refresh_csc_values(x->jacobian); /* no-op for PD */ - BA_pd_matrices_fill_values((permuted_dense *) lnode->A, x->jacobian, - (permuted_dense *) node->jacobian); - return; - } - + /* evaluate this node's jacobian */ CSC_matrix *Jchild_CSC = lnode->Jchild_CSC; CSC_matrix *J_CSC = lnode->J_CSC; - - /* evaluate child's jacobian and convert to CSC_matrix */ - x->eval_jacobian(x); csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, node->work->iwork); - - /* compute this node's jacobian: */ lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC); csc_to_csr_fill_values(J_CSC, node->jacobian->to_csr(node->jacobian), lnode->csc_to_csr_work); @@ -267,23 +245,25 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) left_matmul_expr *lnode = (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; - init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_impl, eval_jacobian, - is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); + /* Sparse A — always the general CSC-mirror path. */ + init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_sparse, + eval_jacobian_sparse, is_affine, wsum_hess_init_impl, eval_wsum_hess, + free_type_data); node->left = u; expr_retain(u); /* allocate workspace. iwork is used for converting J_child csr to csc - (requiring size node->n_vars) and for transposing A (requiring size A->n). + (requiring size node->n_vars). csc_to_csr_work is used for converting J_CSC to CSR_matrix (requiring node->size) */ - node->work->iwork = (int *) SP_MALLOC(MAX(A->n, node->n_vars) * sizeof(int)); + node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int)); lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int)); lnode->n_blocks = n_blocks; /* store A and AT. new_sparse_matrix takes ownership, so clone first. 
*/ lnode->A = new_sparse_matrix(new_csr(A)); - lnode->AT = - sparse_matrix_trans((const sparse_matrix *) lnode->A, node->work->iwork); + lnode->AT = lnode->A->transpose_alloc(lnode->A); + lnode->A->transpose_fill_values(lnode->A, lnode->AT); /* parameter support */ lnode->param_source = param_node; @@ -325,8 +305,18 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, left_matmul_expr *lnode = (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; - init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_impl, eval_jacobian, - is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); + /* PD A: the BA_pd_matrices dispatcher applies whenever there is a single + Kronecker block, whether A is constant or parameterized. With a + parameter, A's structure is fixed at construction (full-block PD with + trivial permutations); refresh_dense_left updates A->X before each + forward, and eval_jacobian_pd reads those refreshed values via + BA_pd_matrices_fill_values. With n_blocks > 1 the Kronecker structure + forces the general CSC-mirror path. */ + bool pd_path = (n_blocks == 1); + init_expr(node, d1, d2, u->n_vars, forward, + pd_path ? jacobian_init_pd : jacobian_init_sparse, + pd_path ? eval_jacobian_pd : eval_jacobian_sparse, is_affine, + wsum_hess_init_impl, eval_wsum_hess, free_type_data); node->left = u; expr_retain(u); @@ -363,18 +353,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, lnode->A = new_permuted_dense_full(m, n, data); lnode->AT = lnode->A->transpose_alloc(lnode->A); - /* transpose_alloc only sets up structure; we must also fill values - or AT->X is uninitialized. eval_wsum_hess uses AT->X via - AT->block_left_mult_vec, so missing this corrupts every Hessian. */ lnode->A->transpose_fill_values(lnode->A, lnode->AT); - - /* If the child is a leaf variable and there are no blocks, the Jacobian - is exactly A placed in the variable's column slot — a full-dense - permuted_dense. 
Enable the fast path. */ - if (u->var_id != NOT_A_VARIABLE && n_blocks == 1) - { - lnode->produce_pd_jacobian = true; - } } return node; diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 7fd3720..59c961e 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include @@ -29,15 +30,7 @@ static void forward(expr *node, const double *u) /* local forward pass */ int d1 = node->d1; int d2 = node->d2; - double *X = node->left->value; - double *XT = node->value; - for (int i = 0; i < d1; ++i) - { - for (int j = 0; j < d2; ++j) - { - XT[j * d1 + i] = X[i * d2 + j]; - } - } + A_transpose(node->value, node->left->value, d1, d2); } static void jacobian_init_impl(expr *node) diff --git a/tests/all_tests.c b/tests/all_tests.c index 20deb03..853308f 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -225,6 +225,7 @@ int main(void) mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); mu_run_test(test_jacobian_left_matmul_exp_composite, tests_run); mu_run_test(test_jacobian_left_matmul_pd_from_composite_child, tests_run); + mu_run_test(test_jacobian_left_matmul_pd_param, tests_run); mu_run_test(test_jacobian_right_matmul_log, tests_run); mu_run_test(test_jacobian_right_matmul_log_vector, tests_run); mu_run_test(test_jacobian_matmul, tests_run); diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index 1879aab..18a27c8 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -171,10 +171,9 @@ const char *test_jacobian_left_matmul_pd_from_composite_child(void) jacobian_init(A2_A1_x); A2_A1_x->eval_jacobian(A2_A1_x); - /* Structural: outer's Jacobian must be PD (produced by the new - produce_pd_jacobian_from_child branch). 
*/ - mu_assert("outer Jacobian should be PD", - A2_A1_x->jacobian->is_permuted_dense); + /* Structural: outer's Jacobian must be PD (produced by the + jacobian_init_pd path via BA_pd_matrices_alloc). */ + mu_assert("outer Jacobian should be PD", A2_A1_x->jacobian->is_permuted_dense); permuted_dense *pd = (permuted_dense *) A2_A1_x->jacobian; mu_assert("global m", A2_A1_x->jacobian->m == 4); mu_assert("global n", A2_A1_x->jacobian->n == 2); @@ -196,3 +195,68 @@ const char *test_jacobian_left_matmul_pd_from_composite_child(void) free_expr(A2_A1_x); return 0; } + +/* Parameterized A: A_param @ x with leaf-variable x and n_blocks == 1. + Verifies the PD path (jacobian_init_pd / eval_jacobian_pd) handles a + parameterized A — the structure is fixed at construction, refresh_dense_left + updates A->X before each forward, and eval_jacobian_pd reads those values + via BA_pd_matrices_fill_values. + + x is length 2 at var_id=0, n_vars=2. + A_param is 3x2. The parameter convention is column-major, so the param's + value array is column-major of A. We test two parameter assignments: + A = [[1,2],[3,4],[5,6]] column-major: [1,3,5,2,4,6] + A = [[7,8],[9,10],[11,12]] column-major: [7,9,11,8,10,12] + The Jacobian of A @ x w.r.t. x is just A itself, placed at the + variable's column slot. */ +const char *test_jacobian_left_matmul_pd_param(void) +{ + expr *x = new_variable(2, 1, 0, 2); + + double theta[6] = {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}; + expr *A_param = new_parameter(3, 2, 0, 2, theta); + expr *A_x = new_left_matmul_dense(A_param, x, 3, 2, NULL); + + double x_vals[2] = {0.5, -1.5}; + A_x->forward(A_x, x_vals); + jacobian_init(A_x); + A_x->eval_jacobian(A_x); + + /* Structural: Jacobian must be PD. 
*/ + mu_assert("Jacobian should be PD", A_x->jacobian->is_permuted_dense); + permuted_dense *pd = (permuted_dense *) A_x->jacobian; + mu_assert("global m", A_x->jacobian->m == 3); + mu_assert("global n", A_x->jacobian->n == 2); + mu_assert("m0", pd->m0 == 3); + mu_assert("n0", pd->n0 == 2); + int expected_row_perm[3] = {0, 1, 2}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd->row_perm, expected_row_perm, 3)); + mu_assert("col_perm", cmp_int_array(pd->col_perm, expected_col_perm, 2)); + + /* Values: pd->X is row-major of A. */ + double expected_X1[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + mu_assert("X values (param 1)", cmp_double_array(pd->X, expected_X1, 6)); + + mu_assert("numerical check (param 1)", + check_jacobian_num(A_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + /* Update the parameter and re-evaluate. The PD output struct is reused; + only pd->X should change. */ + double theta2[6] = {7.0, 9.0, 11.0, 8.0, 10.0, 12.0}; + memcpy(A_param->value, theta2, 6 * sizeof(double)); + expr_set_needs_refresh(A_x); + + A_x->forward(A_x, x_vals); + A_x->eval_jacobian(A_x); + + mu_assert("Jacobian still PD after refresh", A_x->jacobian->is_permuted_dense); + double expected_X2[6] = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + mu_assert("X values (param 2)", cmp_double_array(pd->X, expected_X2, 6)); + + mu_assert("numerical check (param 2)", + check_jacobian_num(A_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A_x); + return 0; +} From d74ea3ed0dfe7561d54e2c892292d76358abd0be Mon Sep 17 00:00:00 2001 From: dance858 Date: Thu, 14 May 2026 08:07:22 +0200 Subject: [PATCH 31/31] fix filename --- include/utils/{COO_Matrix.h => COO_matrix.h} | 0 include/utils/utils.h | 5 ++--- src/utils/{COO_Matrix.c => COO_matrix.c} | 0 tests/utils/{test_coo_matrix.h => test_COO_matrix.h} | 0 4 files changed, 2 insertions(+), 3 deletions(-) rename include/utils/{COO_Matrix.h => COO_matrix.h} (100%) rename src/utils/{COO_Matrix.c => COO_matrix.c} (100%) rename 
tests/utils/{test_coo_matrix.h => test_COO_matrix.h} (100%) diff --git a/include/utils/COO_Matrix.h b/include/utils/COO_matrix.h similarity index 100% rename from include/utils/COO_Matrix.h rename to include/utils/COO_matrix.h diff --git a/include/utils/utils.h b/include/utils/utils.h index 1b9a6bd..5c3c5fe 100644 --- a/include/utils/utils.h +++ b/include/utils/utils.h @@ -30,11 +30,10 @@ /* Sort an array of integers in ascending order */ void sort_int_array(int *array, int size); -/* Return true iff sorted index arrays a_idx and b_idx (lengths a_len, b_len) +/* Return true if sorted index arrays a_idx and b_idx (lengths a_len, b_len) share any value, where b_idx entries are shifted by b_offset before comparison (a_idx[ai] == b_idx[bi] - b_offset). Use b_offset = 0 for a - plain intersection check. O(a_len + b_len) sorted-merge scan with early - termination on first match. */ + plain intersection check. */ bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len, int b_offset); diff --git a/src/utils/COO_Matrix.c b/src/utils/COO_matrix.c similarity index 100% rename from src/utils/COO_Matrix.c rename to src/utils/COO_matrix.c diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_COO_matrix.h similarity index 100% rename from tests/utils/test_coo_matrix.h rename to tests/utils/test_COO_matrix.h