diff --git a/include/atoms/affine.h b/include/atoms/affine.h index 9527d3c..49b4637 100644 --- a/include/atoms/affine.h +++ b/include/atoms/affine.h @@ -19,8 +19,7 @@ #define AFFINE_H #include "expr.h" -#include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" expr *new_add(expr *left, expr *right); expr *new_neg(expr *child); @@ -45,7 +44,7 @@ expr *new_transpose(expr *child); /* Left matrix multiplication: A @ f(x) where A is a constant sparse matrix. param_node is NULL for fixed constants. We currently do not support sparse parameters, so param_node should always be null. */ -expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A); +expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A); /* Left matrix multiplication: A @ f(x) where A is a constant dense matrix (in row-major, m x n, with values given by 'data') or a parameter @@ -59,7 +58,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, /* Right matrix multiplication: f(x) @ A where A is a constant sparse matrix. We currently do not support sparse parameters, so param_node should always be null. 
*/ -expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A); +expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A); /* Right matrix multiplication: f(x) @ A where A is a constant dense matrix (in row-major, m x n, with values given by 'data') or a parameter diff --git a/include/atoms/bivariate_full_dom.h b/include/atoms/bivariate_full_dom.h index a864c7b..44e3f01 100644 --- a/include/atoms/bivariate_full_dom.h +++ b/include/atoms/bivariate_full_dom.h @@ -22,7 +22,7 @@ expr *new_elementwise_mult(expr *left, expr *right); -/* Matrix multiplication: Z = X @ Y */ +/* matrix multiplication: Z = X @ Y */ expr *new_matmul(expr *x, expr *y); #endif /* BIVARIATE_FULL_DOM_H */ diff --git a/include/atoms/non_elementwise_full_dom.h b/include/atoms/non_elementwise_full_dom.h index 7de046c..65c6070 100644 --- a/include/atoms/non_elementwise_full_dom.h +++ b/include/atoms/non_elementwise_full_dom.h @@ -20,9 +20,9 @@ #include "expr.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -expr *new_quad_form(expr *child, CSR_Matrix *Q); +expr *new_quad_form(expr *child, CSR_matrix *Q); /* product of all entries, without axis argument */ expr *new_prod(expr *child); diff --git a/include/expr.h b/include/expr.h index 1c54b2e..fbe5a67 100644 --- a/include/expr.h +++ b/include/expr.h @@ -18,8 +18,9 @@ #ifndef EXPR_H #define EXPR_H -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/matrix.h" #include #include #include @@ -44,16 +45,16 @@ typedef struct { double *dwork; int *iwork; - CSC_Matrix *jacobian_csc; - int *csc_work; /* for CSR-CSC conversion */ + CSC_matrix *jacobian_csc; + int *csc_work; /* for CSR_matrix-CSC_matrix conversion */ /* jacobian_csc_filled is only used for affine functions to avoid redundant conversions. Could become relevant for non-affine functions if we start supporting common subexpressions on the Python side. 
*/ bool jacobian_csc_filled; double *local_jac_diag; /* cached f'(g(x)) diagonal */ - CSR_Matrix *hess_term1; /* Jg^T D Jg workspace */ - CSR_Matrix *hess_term2; /* child wsum_hess workspace */ + matrix *hess_term1; /* Jg^T D Jg workspace */ + matrix *hess_term2; /* child wsum_hess workspace */ } Expr_Work; /* Base expression node structure */ @@ -70,8 +71,8 @@ typedef struct expr // oracle related quantities // ------------------------------------------------------------------------ double *value; - CSR_Matrix *jacobian; - CSR_Matrix *wsum_hess; + matrix *jacobian; + matrix *wsum_hess; forward_fn forward; jacobian_init_fn jacobian_init_impl; wsum_hess_init_fn wsum_hess_init_impl; @@ -110,7 +111,7 @@ void free_expr(expr *node); void jacobian_init(expr *node); void wsum_hess_init(expr *node); -/* Initialize CSC form of the Jacobian from the CSR Jacobian. +/* Initialize CSC_matrix form of the Jacobian from the CSR_matrix Jacobian. * Must be called after jacobian_init. */ void jacobian_csc_init(expr *node); diff --git a/include/old-code/old_CSR.h b/include/old-code/old_CSR.h index 41af333..7033246 100644 --- a/include/old-code/old_CSR.h +++ b/include/old-code/old_CSR.h @@ -18,29 +18,29 @@ #ifndef OLD_CSR_H #define OLD_CSR_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* Build (I_p kron A) = blkdiag(A, A, ..., A) of size (p*A->m) x (p*A->n) */ -CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p); +CSR_matrix *block_diag_repeat_csr(const CSR_matrix *A, int p); /* Build (A kron I_p) of size (A->m * p) x (A->n * p) with nnz = A->nnz * p. 
*/ -CSR_matrix *kron_identity_csr(const CSR_matrix *A, int p); /* Computes values of the row matrix C = z^T A (column indices must have been pre-computed) and transposed matrix AT must be provided) */ -void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C); +void Ax_csr_fill_values(const CSR_matrix *AT, const double *z, CSR_matrix *C); -/* Insert value into CSR matrix A with just one row at col_idx. Assumes that A +/* Insert value into CSR matrix A with just one row at col_idx. Assumes that A has enough space and that A does not have an element at col_idx. It does update nnz. */ -void csr_insert_value(CSR_Matrix *A, int col_idx, double value); +void csr_insert_value(CSR_matrix *A, int col_idx, double value); -/* Compute C = diag(d) * A where d is an array and A, C are CSR matrices +/* Compute C = diag(d) * A where d is an array and A, C are CSR matrices * d must have length m * C must be pre-allocated with same dimensions as A */ -void diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C); +void diag_csr_mult(const double *d, const CSR_matrix *A, CSR_matrix *C); /* y = Ax, where y is returned as dense (no column offset) */ -void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y); +void Ax_csr_wo_offset(const CSR_matrix *A, const double *x, double *y); #endif /* OLD_CSR_H */ diff --git a/include/old-code/old_CSR_sum.h b/include/old-code/old_CSR_sum.h index 72d0b30..3582f31 100644 --- a/include/old-code/old_CSR_sum.h +++ b/include/old-code/old_CSR_sum.h @@ -18,44 +18,44 @@ #ifndef OLD_CSR_SUM_H #define OLD_CSR_SUM_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -/* Compute C = A + B where A, B, C are CSR matrices +/* Compute C = A + B where A, B, C are CSR matrices * A and B must have same dimensions * C must be pre-allocated with sufficient nnz capacity.
* C must be different from A and B */ -void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); -/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR matrices */ -void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C, +/* Compute C = diag(d1) * A + diag(d2) * B where A, B, C are CSR_matrix matrices */ +void sum_scaled_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C, const double *d1, const double *d2); /* forward declaration */ struct int_double_pair; /* Sum all rows of A into a single row matrix C */ -void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_all_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs); /* Sum blocks of rows of A into a matrix C */ -void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int row_block_size); /* Sum evenly spaced rows of A into a matrix C */ -void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int row_spacing); /* Sum evenly spaced rows of A starting at offset into a row matrix C */ -void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr(const CSR_matrix *A, CSR_matrix *C, struct int_double_pair *pairs, int offset, int spacing); /* Fill values of summed rows using precomputed idx_map and sparsity of C */ -void sum_all_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C, +void sum_all_rows_csr_fill_values(const CSR_matrix *A, CSR_matrix *C, const int *idx_map); /* Fill values of summed block rows using precomputed idx_map */ -void sum_block_of_rows_csr_fill_values(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_fill_values(const CSR_matrix 
*A, CSR_matrix *C, const int *idx_map); #endif /* OLD_CSR_SUM_H */ diff --git a/include/old-code/old_affine.h b/include/old-code/old_affine.h index 3b08491..cf0e598 100644 --- a/include/old-code/old_affine.h +++ b/include/old-code/old_affine.h @@ -19,8 +19,8 @@ #define OLD_AFFINE_H #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" -expr *new_linear(expr *u, const CSR_Matrix *A, const double *b); +expr *new_linear(expr *u, const CSR_matrix *A, const double *b); #endif /* OLD_AFFINE_H */ diff --git a/include/old-code/old_permuted_dense.h b/include/old-code/old_permuted_dense.h new file mode 100644 index 0000000..38ac5cf --- /dev/null +++ b/include/old-code/old_permuted_dense.h @@ -0,0 +1,76 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef OLD_PERMUTED_DENSE_H +#define OLD_PERMUTED_DENSE_H + +#include "utils/CSR_matrix.h" +#include "utils/permuted_dense.h" + +/* Legacy CSR-based (PD, Sparse) BTA / BTDA kernels. + + Mathematically equivalent to BTA_pd_csc_alloc / BTDA_pd_csc_fill_values + in src/utils/permuted_dense.c — they all compute C = B^T (diag(d)) A + for B PD and A sparse. The matrix_BTA dispatcher used to choose between + the CSR-here and CSC-in-utils variants; after a benchmark on + trimmed_log_reg-shaped workloads we committed to CSC and moved these + kernels out of production paths. 
+ + Kept here as a reference implementation, as cross-comparison fodder for + tests (test_BTA_pd_csc_matches_csr), and as the CSR side of the + profile_BTA_pd_csr_vs_csc microbenchmark. */ + +/* Allocate a new permuted_dense for C = B^T A where B is PD and A is + CSR-sparse. Output is PD with row_perm = B->col_perm and col_perm = the + sorted union of columns appearing in A's rows at positions row_perm_B. + Dense block size = (B->n0, |col_active|). Values uninitialized. */ +matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A); + +/* Fill C->X = X_B^T @ A_sub_dense, where A_sub_dense is A's rows at + positions row_perm_B, columns restricted to C's col_perm, scattered to a + dense buffer. C must have the structure produced by BTA_pd_csr_alloc. */ +void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A, + permuted_dense *C); + +/* BTDA variant: C->X = X_B^T diag(d) A_sub_dense. d may be NULL (treated + as identity scaling). C must have the structure produced by + BTA_pd_csr_alloc. */ +void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, + const CSR_matrix *A, permuted_dense *C); + +/* Legacy CSR-pd kernels (B=CSR, A=PD), formerly in src/utils/permuted_dense.c. + Production now dispatches the (PD A, sparse B) branch through CSC-pd + kernels (BTA_csc_pd_alloc / BTDA_csc_pd_fill_values in utils/permuted_dense.h), + so these CSR variants live here as reference implementations and as + targets for the direct unit tests in tests/old-code. */ + +/* Allocate a new permuted_dense for C = B^T A where B is CSR-sparse and A + is PD. Output is PD with row_perm = the sorted union of columns appearing + in B's rows at positions row_perm_A, and col_perm = A->col_perm. */ +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A); + +/* No-d BTA fill. C must have the structure produced by BTA_csr_pd_alloc. 
*/ +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, + permuted_dense *C); + +/* BTDA variant: C->X = B_sub_dense^T diag(d) X_A. d may be NULL (treated + as identity scaling). C must have the structure produced by + BTA_csr_pd_alloc. */ +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C); + +#endif /* OLD_PERMUTED_DENSE_H */ diff --git a/include/problem.h b/include/problem.h index 7e68a28..530907c 100644 --- a/include/problem.h +++ b/include/problem.h @@ -19,8 +19,8 @@ #define PROBLEM_H #include "expr.h" -#include "utils/COO_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/COO_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/Timer.h" #include @@ -59,11 +59,11 @@ typedef struct problem double *gradient_values; /* allocated by problem_init_derivatives */ - CSR_Matrix *jacobian; - CSR_Matrix *lagrange_hessian; + CSR_matrix *jacobian; + CSR_matrix *lagrange_hessian; int *hess_idx_map; /* maps all wsum_hess nnz to lagrange_hessian */ - COO_Matrix *jacobian_coo; - COO_Matrix *lagrange_hessian_coo; /* lower triangular part stored in COO */ + COO_matrix *jacobian_coo; + COO_matrix *lagrange_hessian_coo; /* lower triangular part stored in COO */ /* for the affine shortcut we keep track of the first time the jacobian and * hessian are called */ diff --git a/include/subexpr.h b/include/subexpr.h index f97feef..0dd6c8e 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -19,8 +19,8 @@ #define SUBEXPR_H #include "expr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/matrix.h" /* Forward declaration */ @@ -40,7 +40,7 @@ typedef struct parameter_expr } parameter_expr; /* Linear operator: y = A * x + b - * The matrix A is stored as node->jacobian (CSR). */ + * The matrix A is stored as node->jacobian (CSR_matrix). 
*/ typedef struct linear_op_expr { expr base; @@ -58,8 +58,8 @@ typedef struct power_expr typedef struct quad_form_expr { expr base; - CSR_Matrix *Q; - CSC_Matrix *QJf; /* Q * J_f in CSC (for chain rule hessian) */ + CSR_matrix *Q; + CSC_matrix *QJf; /* Q * J_f in CSC_matrix (for chain rule hessian) */ } quad_form_expr; /* Sum reduction along an axis */ @@ -102,19 +102,19 @@ typedef struct hstack_expr expr base; expr **args; int n_args; - CSR_Matrix *CSR_work; /* for summing Hessians of children */ + CSR_matrix *CSR_work; /* for summing Hessians of children */ } hstack_expr; /* Elementwise multiplication */ typedef struct elementwise_mult_expr { expr base; - CSR_Matrix *CSR_work1; /* C = Jg2^T diag(w) Jg1 */ - CSR_Matrix *CSR_work2; /* CT = C^T */ - int *idx_map_C; /* C[j] -> wsum_hess pos */ - int *idx_map_CT; /* CT[j] -> wsum_hess pos */ - int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ - int *idx_map_Hy; /* y->wsum_hess[j] -> pos */ + matrix *C; /* C = Jg2^T diag(w) Jg1 (Sparse or PD) */ + matrix *CT; /* CT = C^T; same concrete type as C */ + int *idx_map_C; /* C[j] -> wsum_hess pos */ + int *idx_map_CT; /* CT[j] -> wsum_hess pos */ + int *idx_map_Hx; /* x->wsum_hess[j] -> pos */ + int *idx_map_Hy; /* y->wsum_hess[j] -> pos */ } elementwise_mult_expr; /* Left matrix multiplication: y = A * f(x) where f(x) is an expression. Note that @@ -123,11 +123,11 @@ important distinction compared to linear_op_expr. */ typedef struct left_matmul_expr { expr base; - Matrix *A; - Matrix *AT; + matrix *A; + matrix *AT; int n_blocks; - CSC_Matrix *Jchild_CSC; - CSC_Matrix *J_CSC; + CSC_matrix *Jchild_CSC; + CSC_matrix *J_CSC; int *csc_to_csr_work; expr *param_source; void (*refresh_param_values)(struct left_matmul_expr *); @@ -151,7 +151,7 @@ typedef struct vector_mult_expr /* 1D convolution: y = conv(a, child) where a is a length-m kernel held by * param_source. Output has size (m + n - 1) where n is the child length. 
* Forward and wsum_hess backprop are computed as direct loops; for Jacobian - * we materialize T(a) as a CSR once at jacobian_init and reuse the engine's + * we materialize T(a) as a CSR_matrix once at jacobian_init and reuse the engine's * block-left-mult machinery for composite children. */ typedef struct convolve_expr { @@ -159,8 +159,8 @@ typedef struct convolve_expr expr *param_source; /* length-m kernel */ int m; /* kernel length */ int n; /* input length */ - CSR_Matrix *T; /* (m+n-1) x n convolution matrix */ - CSC_Matrix *Jchild_CSC; + CSR_matrix *T; /* (m+n-1) x n convolution matrix */ + CSC_matrix *Jchild_CSC; } convolve_expr; /* Bivariate matrix multiplication: Z = f(u) @ g(u) where both children @@ -169,16 +169,16 @@ typedef struct matmul_expr { expr base; /* Jacobian workspace */ - CSR_Matrix *term1_CSR; /* (Y^T x I_m) @ J_f */ - CSR_Matrix *term2_CSR; /* (I_n x X) @ J_g */ + CSR_matrix *term1_CSR; /* (Y^T x I_m) @ J_f */ + CSR_matrix *term2_CSR; /* (I_n x X) @ J_g */ /* Hessian workspace (composite only) */ - CSR_Matrix *B; /* cross-Hessian B(w), mk x kn */ - CSR_Matrix *BJg; /* B @ J_g */ - CSC_Matrix *BJg_CSC; /* BJg in CSC */ - int *BJg_csc_work; /* CSR-to-CSC workspace */ - CSR_Matrix *C; /* J_f^T @ B @ J_g */ - CSR_Matrix *CT; /* C^T */ + CSR_matrix *B; /* cross-Hessian B(w), mk x kn */ + CSR_matrix *BJg; /* B @ J_g */ + CSC_matrix *BJg_CSC; /* BJg in CSC_matrix */ + int *BJg_csc_work; /* CSR_matrix-to-CSC_matrix workspace */ + CSR_matrix *C; /* J_f^T @ B @ J_g */ + CSR_matrix *CT; /* C^T */ int *idx_map_C; int *idx_map_CT; int *idx_map_Hf; @@ -194,14 +194,6 @@ typedef struct index_expr bool has_duplicates; /* True if indices have duplicates (affects Hessian path) */ } index_expr; -/* Broadcast types */ -typedef enum -{ - BROADCAST_ROW, /* (1, n) -> (m, n) */ - BROADCAST_COL, /* (m, 1) -> (m, n) */ - BROADCAST_SCALAR /* (1, 1) -> (m, n) */ -} broadcast_type; - typedef struct broadcast_expr { expr base; diff --git a/include/utils/COO_Matrix.h 
b/include/utils/COO_matrix.h similarity index 57% rename from include/utils/COO_Matrix.h rename to include/utils/COO_matrix.h index 51a39e9..64b57b4 100644 --- a/include/utils/COO_Matrix.h +++ b/include/utils/COO_matrix.h @@ -15,24 +15,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef COO_MATRIX_H -#define COO_MATRIX_H +#ifndef COO_matrix_H +#define COO_matrix_H -#include "CSR_Matrix.h" +#include "CSR_matrix.h" -/* COO (Coordinate) Sparse Matrix Format +/* COO (Coordinate) Sparse matrix Format * * For an m x n matrix with nnz nonzeros: * - rows: array of size nnz containing row indices * - cols: array of size nnz containing column indices * - x: array of size nnz containing values - * - value_map: array of size nnz mapping CSR entries to COO entries (for + * - value_map: array of size nnz mapping CSR_matrix entries to COO entries (for * lower-triangular COO) * - m: number of rows * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct COO_Matrix +typedef struct COO_matrix { int *rows; int *cols; @@ -41,20 +41,19 @@ typedef struct COO_Matrix int m; int n; int nnz; -} COO_Matrix; +} COO_matrix; -/* Construct a COO matrix from a CSR matrix */ -COO_Matrix *new_coo_matrix(const CSR_Matrix *A); +/* COO from CSR */ +COO_matrix *new_COO_matrix(const CSR_matrix *A); -/* Construct a COO matrix containing only the lower-triangular - * entries (col <= row) of a symmetric CSR matrix. Populates - * value_map so that refresh_lower_triangular_coo can update - * values without recomputing structure. */ -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A); +/* Construct COO containing only the lower-triangular entries (col <= row) of a + symmetric CSR. Populates value_map so that refresh_lower_triangular_coo can + update values without recomputing structure. 
*/ +COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A); -/* Refresh COO values from a new CSR value array using value_map */ -void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals); +/* Refresh COO values from a new CSR_matrix value array using value_map */ +void refresh_lower_triangular_coo(COO_matrix *coo, const double *vals); -void free_coo_matrix(COO_Matrix *matrix); +void free_COO_matrix(COO_matrix *matrix); -#endif /* COO_MATRIX_H */ +#endif /* COO_matrix_H */ diff --git a/include/utils/CSC_Matrix.h b/include/utils/CSC_matrix.h similarity index 56% rename from include/utils/CSC_Matrix.h rename to include/utils/CSC_matrix.h index f5236f6..34d3155 100644 --- a/include/utils/CSC_Matrix.h +++ b/include/utils/CSC_matrix.h @@ -15,12 +15,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef CSC_MATRIX_H -#define CSC_MATRIX_H +#ifndef CSC_matrix_H +#define CSC_matrix_H -#include "CSR_Matrix.h" +#include "CSR_matrix.h" -/* CSC (Compressed Sparse Column) Matrix Format +/* CSC_matrix (Compressed Sparse Column) matrix Format * * For an m x n matrix with nnz nonzeros: * - p: array of size (n + 1) indicating start of each column @@ -30,7 +30,7 @@ * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct CSC_Matrix +typedef struct CSC_matrix { int *p; int *i; @@ -38,43 +38,43 @@ typedef struct CSC_Matrix int m; int n; int nnz; -} CSC_Matrix; +} CSC_matrix; /* constructor and destructor */ -CSC_Matrix *new_csc_matrix(int m, int n, int nnz); -void free_csc_matrix(CSC_Matrix *matrix); +CSC_matrix *new_CSC_matrix(int m, int n, int nnz); +void free_CSC_matrix(CSC_matrix *matrix); /* Fill sparsity of C = A^T D A for diagonal D */ -CSR_Matrix *ATA_alloc(const CSC_Matrix *A); +CSR_matrix *ATA_alloc(const CSC_matrix *A); /* Fill sparsity of C = B^T D A for diagonal D */ -CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B); +CSR_matrix *BTA_alloc(const 
CSC_matrix *A, const CSC_matrix *B); /* Fill sparsity of C = BA, where B is symmetric. */ -CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A); +CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A); /* Compute values for C = A^T D A (null d corresponds to D as identity) */ -void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C); +void ATDA_fill_values(const CSC_matrix *A, const double *d, CSR_matrix *C); /* Compute values for C = B^T D A (null d corresonds to D as identity) */ -void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, - CSR_Matrix *C); +void BTDA_fill_values(const CSC_matrix *A, const CSC_matrix *B, const double *d, + CSR_matrix *C); /* Fill values of C = BA. The matrix B does not have to be symmetric */ -void BA_fill_values(const CSR_Matrix *B, const CSC_Matrix *A, CSC_Matrix *C); +void BA_fill_values(const CSR_matrix *B, const CSC_matrix *A, CSC_matrix *C); /* Fill values of C = x^T A. The matrix C must have filled sparsity. 
*/ -void yTA_fill_values(const CSC_Matrix *A, const double *x, CSR_Matrix *C); +void yTA_fill_values(const CSC_matrix *A, const double *x, CSR_matrix *C); -/* Count nonzero columns of a CSC matrix */ -int count_nonzero_cols_csc(const CSC_Matrix *A); +/* Count nonzero columns of a CSC matrix */ +int count_nonzero_cols_csc(const CSC_matrix *A); -/* convert from CSR to CSC format */ -CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork); -void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork); +/* convert from CSR_matrix to CSC_matrix format */ +CSC_matrix *csr_to_csc_alloc(const CSR_matrix *A, int *iwork); +void csr_to_csc_fill_values(const CSR_matrix *A, CSC_matrix *C, int *iwork); -/* convert from CSC to CSR format */ -CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork); -void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork); +/* convert from CSC_matrix to CSR_matrix format */ +CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork); +void csc_to_csr_fill_values(const CSC_matrix *A, CSR_matrix *C, int *iwork); -#endif /* CSC_MATRIX_H */ +#endif /* CSC_matrix_H */ diff --git a/include/utils/CSR_Matrix.h b/include/utils/CSR_matrix.h similarity index 62% rename from include/utils/CSR_Matrix.h rename to include/utils/CSR_matrix.h index dd5ad91..a4880c9 100644 --- a/include/utils/CSR_Matrix.h +++ b/include/utils/CSR_matrix.h @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -#ifndef CSR_MATRIX_H -#define CSR_MATRIX_H +#ifndef CSR_matrix_H +#define CSR_matrix_H #include -/* CSR (Compressed Sparse Row) Matrix Format +/* CSR_matrix (Compressed Sparse Row) matrix Format * * For an m x n matrix with nnz nonzeros: * - p: array of size (m + 1) indicating start of each row @@ -29,7 +29,7 @@ * - n: number of columns * - nnz: number of nonzero entries */ -typedef struct CSR_Matrix +typedef struct CSR_matrix { int *p; int *i; @@ -37,37 +37,37 @@ typedef struct CSR_Matrix int m; int n; int nnz; -} CSR_Matrix; +} CSR_matrix; /* constructors and destructors */ -CSR_Matrix *new_csr_matrix(int m, int n, int nnz); -CSR_Matrix *new_csr(const CSR_Matrix *A); -CSR_Matrix *new_csr_copy_sparsity(const CSR_Matrix *A); -void free_csr_matrix(CSR_Matrix *matrix); -void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C); +CSR_matrix *new_CSR_matrix(int m, int n, int nnz); +CSR_matrix *new_csr(const CSR_matrix *A); +CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A); +void free_CSR_matrix(CSR_matrix *matrix); +void copy_CSR_matrix(const CSR_matrix *A, CSR_matrix *C); /* transpose functionality (iwork must be of size A->n) */ -CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork); -CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork); -void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork); +CSR_matrix *transpose(const CSR_matrix *A, int *iwork); +CSR_matrix *AT_alloc(const CSR_matrix *A, int *iwork); +void AT_fill_values(const CSR_matrix *A, CSR_matrix *AT, int *iwork); /* computes dense y = Ax */ -void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset); +void Ax_csr(const CSR_matrix *A, const double *x, double *y, int col_offset); /* fills values of C = diag(d) @ A */ -void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C); +void DA_fill_values(const double *d, const CSR_matrix *A, CSR_matrix *C); /* Count number of columns with nonzero entries in A and marks them in col_nz */ -int 
count_nonzero_cols(const CSR_Matrix *A, bool *col_nz); +int count_nonzero_cols(const CSR_matrix *A, bool *col_nz); /* inserts 'idx' into array 'arr' in sorted order, and moves the other elements */ void insert_idx(int idx, int *arr, int len); /* get value at position (row, col) in A */ -double csr_get_value(const CSR_Matrix *A, int row, int col); +double csr_get_value(const CSR_matrix *A, int row, int col); -/* Expand symmetric CSR matrix A to full matrix C. A is assumed to store +/* Expand symmetric CSR matrix A to full matrix C. A is assumed to store only upper triangle. C must be pre-allocated with sufficient nnz */ -void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C); +void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C); -#endif /* CSR_MATRIX_H */ +#endif /* CSR_matrix_H */ diff --git a/include/utils/CSR_sum.h b/include/utils/CSR_sum.h index f3ba700..62f49d9 100644 --- a/include/utils/CSR_sum.h +++ b/include/utils/CSR_sum.h @@ -18,60 +18,61 @@ #ifndef CSR_SUM_H #define CSR_SUM_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* forward declaration */ struct int_double_pair; /* Compute sparsity pattern of C = A + B (and sets C->nnz) */ -void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); /* Fills values of C = A + B (assuming C's sparsity pattern is set) */ -void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C); +void sum_csr_fill_values(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C); /* Fills values of C = diag(d1) * A + diag(d2) * B (assuming C's sparsity is set)*/ -void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, - CSR_Matrix *C, const double *d1, +void sum_scaled_csr_matrices_fill_values(const CSR_matrix *A, const CSR_matrix *B, + CSR_matrix *C, const double *d1, const double *d2); -/* The following five functions are used
for summing either more than two CSR - matrices or rows of CSR matrices. To implement the filling of values efficiently, - we compute an idx_map when we fill the sparsity pattern of the output matrix, - which maps each nonzero entry in the input matrix to its position in the output - matrix. This allows us to fill the values with a single pass of the output matrix - through the input matrices, without needing to search for the position of each - entry in the output matrix. So each idx_map should have size equal to the number - of nonzeros in the corresponding input matrix, and idx_map[j] should give the - index in the output matrix of the entry (in the value array of the output matrix) - corresponding to the j-th nonzero in the input matrix. +/* The following five functions are used for summing either more than two CSR_matrix + matrices or rows of CSR_matrix matrices. To implement the filling of values + efficiently, we compute an idx_map when we fill the sparsity pattern of the output + matrix, which maps each nonzero entry in the input matrix to its position in the + output matrix. This allows us to fill the values with a single pass of the output + matrix through the input matrices, without needing to search for the position of + each entry in the output matrix. So each idx_map should have size equal to the + number of nonzeros in the corresponding input matrix, and idx_map[j] should give + the index in the output matrix of the entry (in the value array of the output + matrix) corresponding to the j-th nonzero in the input matrix. Output matrix C, input matrix A, iwork->size = max(A->n, A->nnz) for the first four functions. The last function allocates the output matrix and returns it. 
*/ // ------------------------------------------------------------------------------------ -void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, +void sum_all_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int *iwork, int *idx_map); -void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_block_size, int *iwork, int *idx_map); -void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_spacing, int *iwork, int *idx_map); -void sum_spaced_rows_into_row_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int spacing, int *iwork, int *idx_map); /* Compute sparsity pattern of out = A + B + C + D */ -CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, - const CSR_Matrix *C, const CSR_Matrix *D, +CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, + const CSR_matrix *C, const CSR_matrix *D, int *idx_maps[4]); // ------------------------------------------------------------------------------------ -/* Accumulates values from A according to map. Must memset to zero before calling. */ -void accumulator(const CSR_Matrix *A, const int *idx_map, double *out); +/* Accumulates `nnz` values from `vals` into `out` at the positions given by + `idx_map` (length `nnz`). Caller must zero `out` before calling. */ +void accumulator(const double *vals, int nnz, const int *idx_map, double *out); /* Accumulates values from A according to map with spacing. Must memset to zero * before calling. 
*/ -void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *out, +void accumulator_with_spacing(const CSR_matrix *A, const int *idx_map, double *out, int spacing); #endif /* CSR_SUM_H */ diff --git a/include/utils/linalg_dense_sparse_matmuls.h b/include/utils/linalg_dense_sparse_matmuls.h index 949b2da..b76c3a3 100644 --- a/include/utils/linalg_dense_sparse_matmuls.h +++ b/include/utils/linalg_dense_sparse_matmuls.h @@ -18,26 +18,31 @@ #ifndef LINALG_DENSE_SPARSE_H #define LINALG_DENSE_SPARSE_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" #include "matrix.h" -/* C = (I_p kron A) @ J via the polymorphic Matrix interface. - * A is dense m x n, J is (n*p) x k in CSC, C is (m*p) x k in CSC. */ +/* C = (I_p kron A) @ J via the polymorphic matrix interface. + * A is dense m x n, J is (n*p) x k in CSC_matrix, C is (m*p) x k in CSC_matrix. + * `work` must be sized at least A->n doubles — used as a scratch buffer when + * a sparse column of J needs to be densified before dgemv. The caller is + * responsible for sizing it (typically pre-sized in the corresponding + * sparsity-build step). */ // TODO: maybe we can replace these with I_kron_X functionality? -CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p); -void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C); +CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p); +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C, + double *work); /* Sparsity and values of C = (Y^T kron I_m) @ J where Y is k x n, J is (m*k) x p, and C is (m*n) x p. Y is given in column-major dense format. 
*/ -CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J); -void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matrix *J, - CSR_Matrix *C); +CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J); +void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_matrix *J, + CSR_matrix *C); /* Sparsity and values of C = (I_n kron X) @ J where X is m x k (col-major dense), J is (k*n) x p, and C is (m*n) x p. */ -CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J); -void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_Matrix *J, - CSR_Matrix *C); +CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J); +void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_matrix *J, + CSR_matrix *C); #endif /* LINALG_DENSE_SPARSE_H */ diff --git a/include/utils/linalg_sparse_matmuls.h b/include/utils/linalg_sparse_matmuls.h index a833078..516ff18 100644 --- a/include/utils/linalg_sparse_matmuls.h +++ b/include/utils/linalg_sparse_matmuls.h @@ -18,41 +18,41 @@ #ifndef LINALG_H #define LINALG_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" /* Compute sparsity pattern and values for the matrix-matrix multiplication C = (I_p kron A) @ J where A is m x n, J is (n*p) x k, and C is (m*p) x k, without relying on generic sparse matrix-matrix multiplication. Specialized logic for this is much faster (50-100x) than generic sparse matmul. 
- * J is provided in CSC format and is split into p blocks of n rows each - * C is returned in CSC format + * J is provided in CSC_matrix format and is split into p blocks of n rows each + * C is returned in CSC_matrix format * Mathematically it corresponds to C = [A @ J1; A @ J2; ...; A @ Jp], where J = [J1; J2; ...; Jp] */ -CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, - const CSC_Matrix *J, int p); +CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, + const CSC_matrix *J, int p); -void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, - CSC_Matrix *C); +void block_left_multiply_fill_values(const CSR_matrix *A, const CSC_matrix *J, + CSC_matrix *C); /* Compute y = kron(I_p, A) @ x where A is m x n and x is(n*p)-length vector. The output y is m*p-length vector corresponding to y = [A @ x1; A @ x2; ...; A @ xp] where x is divided into p blocks of n elements. */ -void block_left_multiply_vec(const CSR_Matrix *A, const double *x, double *y, int p); +void block_left_multiply_vec(const CSR_matrix *A, const double *x, double *y, int p); -/* Fill values of C = A @ B where A is CSR, B is CSC. +/* Fill values of C = A @ B where A is CSR_matrix, B is CSC_matrix. * C must have sparsity pattern already computed. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C); +void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B, + CSR_matrix *C); -/* C = A @ B where A is CSR, B is CSC. Result C is CSR. +/* C = A @ B where A is CSR_matrix, B is CSC_matrix. Result C is CSR_matrix. * Allocates and precomputes sparsity pattern. No workspace required. 
*/ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B); +CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B); #endif /* LINALG_H */ diff --git a/include/utils/matrix.h b/include/utils/matrix.h index c54443a..17a84ad 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -18,38 +18,135 @@ #ifndef MATRIX_H #define MATRIX_H -#include "CSC_Matrix.h" -#include "CSR_Matrix.h" +#include "CSC_matrix.h" +#include "CSR_matrix.h" +#include -/* Base matrix type with function pointers for polymorphic dispatch */ -typedef struct Matrix +/* Broadcast shape used by the broadcast atom and its vtable methods. */ +typedef enum { - int m, n; - void (*block_left_mult_vec)(const struct Matrix *self, const double *x, - double *y, int p); - CSC_Matrix *(*block_left_mult_sparsity)(const struct Matrix *self, - const CSC_Matrix *J, int p); - void (*block_left_mult_values)(const struct Matrix *self, const CSC_Matrix *J, - CSC_Matrix *C); - void (*update_values)(struct Matrix *self, const double *new_values); - void (*free_fn)(struct Matrix *self); -} Matrix; - -/* Sparse matrix wrapping CSR */ -typedef struct Sparse_Matrix + BROADCAST_ROW, /* (1, n) -> (m, n) */ + BROADCAST_COL, /* (m, 1) -> (m, n) */ + BROADCAST_SCALAR /* (1, 1) -> (m, n) */ +} broadcast_type; + +/* Polymorphic matrix base. Concrete types embed `matrix` as their first + member and implement the vtable slots below. Currently implemented: + 1. sparse_matrix — generic CSR_matrix-backed matrix. + 2. permuted_dense — matrix whose nonzeros lie in a single dense block + located at chosen rows and columns of the global + index space. + A third type is potentially planned. 
*/ + +typedef struct matrix matrix; + +/* y = kron(I_p, A) @ x */ +typedef void (*matrix_block_left_mult_vec_fn)(const matrix *A, const double *x, + double *y, int p); + +/* Allocate sparsity of C = kron(I_p, A) @ J */ +typedef CSC_matrix *(*matrix_block_left_mult_sparsity_fn)(const matrix *A, + const CSC_matrix *J, + int p); + +/* Fill values of C = kron(I_p, A) @ J */ +typedef void (*matrix_block_left_mult_values_fn)(const matrix *A, + const CSC_matrix *J, CSC_matrix *C); + +/* Allocate a new matrix with the same sparsity as A */ +typedef matrix *(*matrix_copy_sparsity_fn)(const matrix *A); + +/* Fill values of C = diag(d) @ A */ +typedef void (*matrix_DA_fill_values_fn)(const double *d, const matrix *A, + matrix *C); + +/* Allocate C = AT @ A */ +typedef matrix *(*matrix_ATA_alloc_fn)(matrix *A); + +/* Fill values of C = AT @ diag(d) @ A */ +typedef void (*matrix_ATDA_fill_values_fn)(const matrix *A, const double *d, + matrix *C); + +/* Allocate AT = transpose(A) */ +typedef matrix *(*matrix_transpose_alloc_fn)(const matrix *A); + +/* Fill values of AT = transpose(A) */ +typedef void (*matrix_transpose_fill_values_fn)(const matrix *A, matrix *AT); + +/* Returns a CSR_matrix view of A */ +typedef CSR_matrix *(*matrix_to_csr_fn)(matrix *A); + +/* Refresh any internal caches (e.g. a CSC_matrix mirror) so subsequent ATA / + ATDA calls reflect the current values. */ +typedef void (*matrix_refresh_csc_values_fn)(matrix *A); + +/* Allocate C = A[indices, :] */ +typedef matrix *(*matrix_index_alloc_fn)(matrix *A, const int *indices, int n_idxs); + +/* Fill values of C = A[indices, :] */ +typedef void (*matrix_index_fill_values_fn)(matrix *A, const int *indices, + int n_idxs, matrix *C); + +/* Row-tiling for the promote atom: A must be a 1-row matrix; returns + a new matrix of shape (size, A->n) where every row is a copy of A's + single row. 
*/ +typedef matrix *(*matrix_promote_alloc_fn)(matrix *A, int size); +typedef void (*matrix_promote_fill_values_fn)(matrix *A, matrix *out); + +/* Broadcast: lift the child Jacobian of a broadcast atom into the output + Jacobian. `type` is the broadcast variant; (d1, d2) is the output shape. */ +typedef matrix *(*matrix_broadcast_alloc_fn)(matrix *A, broadcast_type type, int d1, + int d2); +typedef void (*matrix_broadcast_fill_values_fn)(matrix *A, broadcast_type type, + int d1, int d2, matrix *out); + +/* diag_vec: A is an (n, A->n) Jacobian for a length-n vector; output is + (n*n, A->n) where row i lands at output row i*(n+1) (column-major + diagonal positions). Other output rows are structurally zero. */ +typedef matrix *(*matrix_diag_vec_alloc_fn)(matrix *A); +typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out); + +typedef void (*matrix_free_fn)(matrix *self); + +struct matrix { - Matrix base; - CSR_Matrix *csr; -} Sparse_Matrix; + int m, n, nnz; + double *x; /* non-owning pointer to the value buffer */ + bool is_permuted_dense; + + /* Operator ops */ + matrix_block_left_mult_vec_fn block_left_mult_vec; + matrix_block_left_mult_sparsity_fn block_left_mult_sparsity; + matrix_block_left_mult_values_fn block_left_mult_values; + + /* Chain-rule ops */ + matrix_copy_sparsity_fn copy_sparsity; + matrix_DA_fill_values_fn DA_fill_values; + matrix_ATA_alloc_fn ATA_alloc; + matrix_ATDA_fill_values_fn ATDA_fill_values; + matrix_transpose_alloc_fn transpose_alloc; + matrix_transpose_fill_values_fn transpose_fill_values; + + /* Views and cache */ + matrix_to_csr_fn to_csr; + matrix_refresh_csc_values_fn refresh_csc_values; -/* Constructors */ -Matrix *new_sparse_matrix(const CSR_Matrix *A); + /* Atom-specific ops */ + matrix_index_alloc_fn index_alloc; + matrix_index_fill_values_fn index_fill_values; + matrix_promote_alloc_fn promote_alloc; + matrix_promote_fill_values_fn promote_fill_values; + matrix_broadcast_alloc_fn broadcast_alloc; + 
matrix_broadcast_fill_values_fn broadcast_fill_values; + matrix_diag_vec_alloc_fn diag_vec_alloc; + matrix_diag_vec_fill_values_fn diag_vec_fill_values; -/* Transpose helper */ -Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork); + /* Lifecycle */ + matrix_free_fn free_fn; +}; /* Free helper */ -static inline void free_matrix(Matrix *m) +static inline void free_matrix(matrix *m) { if (m) { diff --git a/include/utils/matrix_BTA.h b/include/utils/matrix_BTA.h new file mode 100644 index 0000000..05b6d58 --- /dev/null +++ b/include/utils/matrix_BTA.h @@ -0,0 +1,43 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +#ifndef MATRIX_BTA_H +#define MATRIX_BTA_H + +#include "matrix.h" +#include "permuted_dense.h" + +/* Polymorphic dispatchers for C = BT @ A and C = BT @ diag(d) @ A. The output + type depends on the input types: (PD, PD) → PD, (Sparse, PD) → PD, + (PD, Sparse) → PD, (Sparse, Sparse) → Sparse. (Here PD = permuted_dense.) + + Contract: neither function touches sparse_matrix internals. The caller must, + before calling either function, ensure each Sparse operand's csc_cache + exists (sparse_matrix_ensure_csc_cache). Before BTDA_matrices_fill_values + the caller must also refresh the cache values (refresh_csc_values). */ + +/* Allocate sparsity for C = BT @ A. */ +matrix *BTA_matrices_alloc(matrix *A, matrix *B); + +/* Fill values of C = BT @ diag(d) @ A. */ +void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C); + +/* Polymorphic dispatcher: C = B @ A where B is PD and A is any matrix + type (permuted_dense or sparse_matrix). C is always PD. Routes on A's + type. 
For the sparse-A branch the dispatcher ensures sm_A->csc_cache + structure exists at alloc time; before BA_pd_matrices_fill_values the + caller must have refreshed sm_A->csc_cache values (same fill-side + contract as BTDA_matrices_fill_values). */ +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A); +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C); + +#endif /* MATRIX_BTA_H */ diff --git a/include/utils/dense_matrix.h b/include/utils/matrix_sum.h similarity index 53% rename from include/utils/dense_matrix.h rename to include/utils/matrix_sum.h index bd1693b..95c6fcf 100644 --- a/include/utils/dense_matrix.h +++ b/include/utils/matrix_sum.h @@ -15,26 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef DENSE_MATRIX_H -#define DENSE_MATRIX_H +#ifndef MATRIX_SUM_H +#define MATRIX_SUM_H #include "matrix.h" -/* Dense matrix (row-major) */ -typedef struct Dense_Matrix -{ - Matrix base; - double *x; - double *work; /* scratch buffer, length n */ -} Dense_Matrix; +/* Polymorphic wrappers for allocating C = A + B. Right now we always + convert to CSR matrices internally for the sum. */ +void sum_matrices_alloc(matrix *A, matrix *B, matrix *C); -/* Constructors. If data is NULL, the value buffer is allocated but left - uninitialized; otherwise m*n entries are copied from data. */ -Matrix *new_dense_matrix(int m, int n, const double *data); +/* Fill values of C = A + B. Uses CSR matrices internally. */ +void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C); -/* Transpose helper */ -Matrix *dense_matrix_trans(const Dense_Matrix *self); +/* Fill values of C = diag(d1) * A + diag(d2) * B. Uses CSR matrices internally. 
*/ +void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, + const double *d1, const double *d2); -void A_transpose(double *AT, const double *A, int m, int n); - -#endif /* DENSE_MATRIX_H */ +#endif /* MATRIX_SUM_H */ diff --git a/include/utils/mini_numpy.h b/include/utils/mini_numpy.h index c87f80a..ac14051 100644 --- a/include/utils/mini_numpy.h +++ b/include/utils/mini_numpy.h @@ -18,7 +18,7 @@ #ifndef MINI_NUMPY_H #define MINI_NUMPY_H -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" /* Example: a = [1, 2], len = 2, repeats = 3, result = [1, 1, 1, 2, 2, 2] */ void repeat(double *result, const double *a, int len, int repeats); @@ -33,6 +33,9 @@ void scaled_ones(double *result, int size, double value); /* Naive implementation of Z = X @ Y, X is m x k, Y is k x n, Z is m x n */ void mat_mat_mult(const double *X, const double *Y, double *Z, int m, int k, int n); +/* Row-major dense transpose: AT[j*m + i] = A[i*n + j] for an m x n A. */ +void A_transpose(double *AT, const double *A, int m, int n); + /* Compute v = (Y kron I_m) @ w where Y is k x n (col-major), len(w) = m * n, and len(v) = m * k. Equivalently, reshape w as the m x n matrix W (col-major) and compute v = vec(W @ Y^T). */ @@ -46,10 +49,10 @@ void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double /* Fill T_csr's row pointers and column indices for the 1D full-convolution Toeplitz matrix T(a), sized (m+n-1) x n with m*n nonzeros. Values (x) are not written; call conv_matrix_fill_values to populate them. */ -void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n); +void conv_matrix_fill_sparsity(CSR_matrix *T_csr, int m, int n); /* Overwrite T_csr->x from kernel a, using the sparsity already written by conv_matrix_fill_sparsity. T[r, col] = a[r - col]. 
*/ -void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a); +void conv_matrix_fill_values(CSR_matrix *T_csr, const double *a); #endif /* MINI_NUMPY_H */ diff --git a/include/utils/permuted_dense.h b/include/utils/permuted_dense.h new file mode 100644 index 0000000..4869d97 --- /dev/null +++ b/include/utils/permuted_dense.h @@ -0,0 +1,156 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef PERMUTED_DENSE_H +#define PERMUTED_DENSE_H + +#include "matrix.h" +#include + +/* permuted_dense represents a matrix whose only nonzeros lie in a dense + block, after rows and columns are restricted to chosen subsets of the + global index space. For local indices (ii, jj) with 0 <= ii < m0 + and 0 <= jj < n0, + + M[row_perm[ii], col_perm[jj]] = X[ii, jj]. + + All other entries of M are zero. row_perm and col_perm are stored in + strictly increasing order; the constructor asserts this. */ +typedef struct permuted_dense +{ + matrix base; + int m0; /* rows of dense block (= len(row_perm)) */ + int n0; /* cols of dense block (= len(col_perm)) */ + int *row_perm; /* row_perm[ii] in [0, base.m), sorted */ + int *col_perm; /* col_perm[jj] in [0, base.n), sorted */ + double *X; /* m0 * n0, row-major */ + int *col_inv; /* length base.n: col_inv[col_perm[jj]] = jj, */ + /* otherwise -1; used by `x CSC_matrix` allocation. 
*/ + int *row_inv; /* length base.m: row_inv[row_perm[ii]] = ii, */ + /* otherwise -1; used by index_alloc. */ + CSR_matrix *csr_cache; /* lazy CSR_matrix view built by to_csr; structure */ + /* allocated on first call, values refilled */ + /* on every call. NULL until first call. */ + /* Mutable double-precision BLAS scratch shared across kernels that + operate on this PD. Two non-overlapping roles (a given fill call uses + at most one): + - Y-buffer: holds diag(d_perm) X for ATDA / BTDA_pd_pd (size m0*n0). + - transpose: holds (diag(d) X)^T for the BA_pd_csc-based BTDA + kernels (BTDA_pd_csc and, transitively, BTDA_csc_pd + via its delegate). Size m0*n0 doubles. + Allocated lazily on the first kernel that needs it; grown in place + (free + SP_MALLOC, contents not preserved) if a later kernel needs + more. `dwork == NULL` and `dwork_size == 0` before first use. + Functions taking a const permuted_dense * may still mutate `dwork`. */ + double *dwork; + size_t dwork_size; + + /* Mutable int scratch. Currently only used to hold the row-intersection + index arrays idx_A / idx_B in BTA_pd_pd_fill_values and the + slow path of BTDA_pd_pd_fill_values; allocated by + BTA_pd_pd_alloc for those outputs (NULL on PDs from other + allocators). Fill kernels fall back to a per-call SP_MALLOC if + iwork_size is too small. */ + int *iwork; + size_t iwork_size; + + /* CONTRACT: `dwork` and `iwork` are freely overwritten by every kernel + that takes this PD as input or output — contents do NOT survive + across calls. Do not use them to cache precomputed factors or carry + state between kernel invocations: any subsequent call (ATDA, BTDA, + BTA gather, …) may clobber them without warning. If you need + persistence, add a dedicated field. */ +} permuted_dense; + +/* Constructor. row_perm and col_perm must be strictly increasing in their + respective ranges. If X_data is NULL the value buffer is allocated but + left uninitialized; otherwise m0 * n0 entries are copied. 
*/ +matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, + const int *col_perm, const double *X_data); + +/* Convenience constructor for the trivial-perm case: row_perm = [0..m-1], + col_perm = [0..n-1], dense block fills the full (m, n) shape. */ +matrix *new_permuted_dense_full(int m, int n, const double *data); + +/* CSR_matrix view: callers should use the vtable, i.e. base.to_csr(base). The PD + owns and caches the returned CSR_matrix; its value array aliases self->X, + so values are always live with no separate fill needed. Callers must not + free the returned CSR_matrix — it's released by free_matrix on the PD. */ + +/* Fill values of C = diag(d) @ A where len(d) = number of (global) rows of A */ +void DA_pd_fill_values(const double *d, const permuted_dense *A, permuted_dense *C); + +/* Allocate new permuted dense for C = AT @ A */ +matrix *ATA_pd_alloc(const permuted_dense *A); + +/* Fill values of C = AT @ diag(d) @ A */ +void ATDA_pd_fill_values(const permuted_dense *A, const double *d, + permuted_dense *C); + +/* Allocate new permuted dense for C = BT @ A where A and B are both permuted_dense. + (If B and A have no overlapping rows, then C is empty) */ +matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); + +/* Fill values of C = BT @ A where A and B are both permuted dense. */ +void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C); + +/* Fill values of C = BT @ diag(d) @ A where A and B are both permuted dense. */ +void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, + const permuted_dense *A, permuted_dense *C); + +/* Allocate new permuted dense for C = B @ A where B is PD and A is CSC. + This function is currently never used in production, but we keep it + here because it is simple and might be useful in the future. 
*/ +matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); + +/* Fill values of C = B @ A where B is the value buffer of a permuted_dense and A is CSC. + + The raw-buffer signature for B lets callers pass a transposed dense block + (e.g. (diag(d) B)^T stored in B->dwork) without needing to build a transposed + permuted dense. */ +void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, + const CSC_matrix *A, permuted_dense *C); + +/* Allocate new permuted dense for C = B @ A where B and A are both PD. Both + may have arbitrary (sorted) row_perm / col_perm; no full-block assumption. + If B->col_perm and A->row_perm have no overlap C is structurally empty; + otherwise C has row_perm = B->row_perm, col_perm = A->col_perm. */ +matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A); + +/* Fill values of C = B @ A for two PDs (general row_perm / col_perm). + Intersects B->col_perm with A->row_perm, gathers the matching column + slice of B and row slice of A into the operands' dwork scratch, and + computes one cblas_dgemm. */ +void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C); + +/* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */ +matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); + +/* Fill values of C = B^T @ diag(d) @ A where B is PD and A is CSC */ +void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, + const CSC_matrix *A, permuted_dense *C); + +/* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. 
*/ +matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A); + +/* Fill values of C = B^T @ diag(d) @ A where B is CSC and A is PD */ +void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, + const permuted_dense *A, permuted_dense *C); + +#endif /* PERMUTED_DENSE_H */ diff --git a/include/utils/sparse_matrix.h b/include/utils/sparse_matrix.h new file mode 100644 index 0000000..210daa1 --- /dev/null +++ b/include/utils/sparse_matrix.h @@ -0,0 +1,56 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SPARSE_MATRIX_H +#define SPARSE_MATRIX_H + +#include "matrix.h" + +/* Sparse matrix wrapping CSR_matrix. csc_cache is a lazily-built CSC_matrix mirror + used by the chain-rule ATA / ATDA paths; it's allocated on first need and refilled + by refresh_csc_values. csc_iwork is the workspace for csr_to_csc. */ +typedef struct sparse_matrix +{ + matrix base; + CSR_matrix *csr; + CSC_matrix *csc_cache; + int *csc_iwork; + int *transpose_iwork; /* sized csr->n; allocated by sparse_transpose_alloc + on the output sm and reused by + sparse_transpose_fill_values. NULL when this + sm wasn't produced by transpose_alloc. */ +} sparse_matrix; + +/* Constructor. Takes ownership of A; the caller must not free A separately + (free_matrix on the returned matrix frees A). 
*/ +matrix *new_sparse_matrix(CSR_matrix *A); + +/* Convenience: allocate a sparse_matrix of shape (m, n) with capacity for + nnz entries. Equivalent to new_sparse_matrix(new_CSR_matrix(m, n, nnz)). + Sparsity pattern and values are uninitialized. */ +matrix *new_sparse_matrix_alloc(int m, int n, int nnz); + +/* Transpose helper */ +matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork); + +/* Build the CSC_matrix cache structure if absent. Idempotent; structure-only, + values are NOT filled (use refresh_csc_values for that). Exposed so the + bivariate dispatchers in matrix_BTA can prepare sparsity without touching + uninitialized values during the init phase. */ +void sparse_matrix_ensure_csc_cache(sparse_matrix *sm); + +#endif /* SPARSE_MATRIX_H */ diff --git a/include/utils/utils.h b/include/utils/utils.h index 8346858..5c3c5fe 100644 --- a/include/utils/utils.h +++ b/include/utils/utils.h @@ -18,9 +18,23 @@ #ifndef UTILS_H #define UTILS_H +#include + +#ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif /* Sort an array of integers in ascending order */ void sort_int_array(int *array, int size); +/* Return true if sorted index arrays a_idx and b_idx (lengths a_len, b_len) + share any value, where b_idx entries are shifted by b_offset before + comparison (a_idx[ai] == b_idx[bi] - b_offset). Use b_offset = 0 for a + plain intersection check. */ +bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len, + int b_offset); + #endif // UTILS_H diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index 631ffb6..288bfde 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -16,7 +16,8 @@ * limitations under the License. 
*/ #include "atoms/affine.h" -#include "utils/CSR_sum.h" +#include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -43,10 +44,10 @@ static void jacobian_init_impl(expr *node) /* we never have to store more than the sum of children's nnz */ int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max); + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, nnz_max); - /* fill sparsity pattern */ - sum_csr_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); + /* fill sparsity pattern */ + sum_matrices_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); } static void eval_jacobian(expr *node) @@ -56,7 +57,8 @@ static void eval_jacobian(expr *node) node->right->eval_jacobian(node->right); /* sum children's jacobians */ - sum_csr_fill_values(node->left->jacobian, node->right->jacobian, node->jacobian); + sum_matrices_fill_values(node->left->jacobian, node->right->jacobian, + node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -67,10 +69,11 @@ static void wsum_hess_init_impl(expr *node) /* we never have to store more than the sum of children's nnz */ int nnz_max = node->left->wsum_hess->nnz + node->right->wsum_hess->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz_max); + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, nnz_max); /* fill sparsity pattern of hessian */ - sum_csr_alloc(node->left->wsum_hess, node->right->wsum_hess, node->wsum_hess); + sum_matrices_alloc(node->left->wsum_hess, node->right->wsum_hess, + node->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -80,8 +83,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->right->eval_wsum_hess(node->right, w); /* sum children's wsum_hess */ - sum_csr_fill_values(node->left->wsum_hess, node->right->wsum_hess, - node->wsum_hess); + 
sum_matrices_fill_values(node->left->wsum_hess, node->right->wsum_hess, + node->wsum_hess); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index 94e32e1..213461b 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -17,7 +17,6 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include @@ -71,119 +70,21 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - broadcast_expr *bcast = (broadcast_expr *) node; - int total_nnz; - - // -------------------------------------------------------------------- - // count number of nonzeros - // -------------------------------------------------------------------- - if (bcast->type == BROADCAST_ROW) - { - /* Row broadcast: (1, n) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d1; - } - else if (bcast->type == BROADCAST_COL) - { - /* Column broadcast: (m, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->d2; - } - else - { - /* Scalar broadcast: (1, 1) -> (m, n) */ - total_nnz = x->jacobian->nnz * node->size; - } - - node->jacobian = new_csr_matrix(node->size, node->n_vars, total_nnz); - - // --------------------------------------------------------------------- - // fill sparsity pattern - // --------------------------------------------------------------------- - CSR_Matrix *Jx = x->jacobian; - CSR_Matrix *J = node->jacobian; - - if (bcast->type == BROADCAST_ROW) - { - J->nnz = 0; - for (int i = 0; i < node->d2; i++) - { - int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; - - /* copy columns indices */ - tile_int(J->i + J->nnz, Jx->i + Jx->p[i], nnz_in_row, node->d1); - - /* set row pointers */ - for (int rep = 0; rep < node->d1; rep++) - { - J->p[i * node->d1 + rep] = J->nnz; - J->nnz += nnz_in_row; - } - } - assert(J->nnz == total_nnz); - J->p[node->size] = total_nnz; - } - else if (bcast->type == BROADCAST_COL) - { - /* copy 
column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->d2); - /* set row pointers */ - int offset = 0; - for (int i = 0; i < node->d2; i++) - { - for (int j = 0; j < node->d1; j++) - { - int nnz_in_row = Jx->p[j + 1] - Jx->p[j]; - J->p[i * node->d1 + j] = offset; - offset += nnz_in_row; - } - } - assert(offset == total_nnz); - J->p[node->size] = total_nnz; - } - else - { - /* copy column indices */ - tile_int(J->i, Jx->i, Jx->nnz, node->size); - - /* set row pointers */ - int offset = 0; - int nnz = Jx->p[1] - Jx->p[0]; - for (int i = 0; i < node->size; i++) - { - J->p[i] = offset; - offset += nnz; - } - assert(offset == total_nnz); - J->p[node->size] = total_nnz; - } + /* allocate sparsity for the broadcast output; output type matches child's. */ + broadcast_expr *bcast = (broadcast_expr *) node; + node->jacobian = + x->jacobian->broadcast_alloc(x->jacobian, bcast->type, node->d1, node->d2); } static void eval_jacobian(expr *node) { node->left->eval_jacobian(node->left); + /* fill values into the preallocated output. 
*/ broadcast_expr *bcast = (broadcast_expr *) node; - CSR_Matrix *Jx = node->left->jacobian; - CSR_Matrix *J = node->jacobian; - - if (bcast->type == BROADCAST_ROW) - { - J->nnz = 0; - for (int i = 0; i < node->d2; i++) - { - int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; - tile_double(J->x + J->nnz, Jx->x + Jx->p[i], nnz_in_row, node->d1); - J->nnz += nnz_in_row * node->d1; - } - } - else if (bcast->type == BROADCAST_COL) - { - tile_double(J->x, Jx->x, Jx->nnz, node->d2); - } - else - { - tile_double(J->x, Jx->x, Jx->nnz, node->size); - } + node->left->jacobian->broadcast_fill_values(node->left->jacobian, bcast->type, + node->d1, node->d2, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -192,7 +93,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* Same sparsity as child - weights get summed */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* allocate space for weight vector */ node->work->dwork = SP_MALLOC(node->size * sizeof(double)); @@ -239,7 +140,8 @@ static void eval_wsum_hess(expr *node, const double *w) } x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 55e5857..9cdfeb7 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -17,9 +17,10 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -76,13 +77,14 @@ static void jacobian_init_impl(expr *node) jacobian_init(child); /* Build convolution matrix of size (m+n-1) x n with m*n nonzeros */ 
- cnode->T = new_csr_matrix(m + n - 1, n, m * n); + cnode->T = new_CSR_matrix(m + n - 1, n, m * n); conv_matrix_fill_sparsity(cnode->T, m, n); conv_matrix_fill_values(cnode->T, a); /* J = T @ J_child */ - cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian, node->work->iwork); - node->jacobian = csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC); + cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian), node->work->iwork); + node->jacobian = + new_sparse_matrix(csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC)); } static void eval_jacobian(expr *node) @@ -93,8 +95,10 @@ static void eval_jacobian(expr *node) child->eval_jacobian(child); /* J = T @ J_child */ - csr_to_csc_fill_values(child->jacobian, cnode->Jchild_CSC, node->work->iwork); - csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, node->jacobian); + csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian), cnode->Jchild_CSC, + node->work->iwork); + csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, + node->jacobian->to_csr(node->jacobian)); } static void wsum_hess_init_impl(expr *node) @@ -103,7 +107,7 @@ static void wsum_hess_init_impl(expr *node) convolve_expr *cnode = (convolve_expr *) node; wsum_hess_init(child); - node->wsum_hess = new_csr_copy_sparsity(child->wsum_hess); + node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); node->work->dwork = (double *) SP_MALLOC(cnode->n * sizeof(double)); } @@ -127,7 +131,7 @@ static void eval_wsum_hess(expr *node, const double *w) child->eval_wsum_hess(child, w_prime); memcpy(node->wsum_hess->x, child->wsum_hess->x, - child->wsum_hess->nnz * sizeof(double)); + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) @@ -138,8 +142,8 @@ static bool is_affine(const expr *node) static void free_type_data(expr *node) { convolve_expr *cnode = (convolve_expr *) node; - free_csr_matrix(cnode->T); - free_csc_matrix(cnode->Jchild_CSC); + free_CSR_matrix(cnode->T); + 
free_CSC_matrix(cnode->Jchild_CSC); free_expr(cnode->param_source); } diff --git a/src/atoms/affine/diag_mat.c b/src/atoms/affine/diag_mat.c index e168d02..0d5a7c1 100644 --- a/src/atoms/affine/diag_mat.c +++ b/src/atoms/affine/diag_mat.c @@ -28,7 +28,7 @@ expr *new_diag_mat(expr *child) assert(child->d1 == child->d2); int n = child->d1; - int *indices = (int *) malloc((size_t) n * sizeof(int)); + int *indices = (int *) malloc(n * sizeof(int)); for (int i = 0; i < n; i++) { indices[i] = i * (n + 1); diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index bd94c05..03fbd45 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -48,49 +48,18 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { expr *x = node->left; - int n = x->size; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian; - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); - - /* Output has n^2 rows but only n diagonal positions are non-empty. - * Diagonal position i is at row i*(n+1) in Fortran order. */ - int nnz = 0; - int next_diag = 0; - for (int row = 0; row < node->size; row++) - { - J->p[row] = nnz; - if (row == next_diag) - { - int child_row = row / (n + 1); - int len = Jx->p[child_row + 1] - Jx->p[child_row]; - memcpy(J->i + nnz, Jx->i + Jx->p[child_row], len * sizeof(int)); - nnz += len; - next_diag += n + 1; - } - } - J->p[node->size] = nnz; - - node->jacobian = J; + /* output type matches child's; rows i*(n+1) hold child row i, others zero. 
*/ + node->jacobian = x->jacobian->diag_vec_alloc(x->jacobian); } static void eval_jacobian(expr *node) { - expr *x = node->left; - int n = x->size; - x->eval_jacobian(x); - - CSR_Matrix *J = node->jacobian; - CSR_Matrix *Jx = x->jacobian; + node->left->eval_jacobian(node->left); - /* Copy values from child row i to output diagonal row i*(n+1) */ - for (int i = 0; i < n; i++) - { - int out_row = i * (n + 1); - int len = J->p[out_row + 1] - J->p[out_row]; - memcpy(J->x + J->p[out_row], Jx->x + Jx->p[i], len * sizeof(double)); - } + /* fill the diagonal rows of the preallocated output. */ + node->left->jacobian->diag_vec_fill_values(node->left->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -105,8 +74,7 @@ static void wsum_hess_init_impl(expr *node) /* Copy child's Hessian structure (diag_vec is linear, so its own Hessian is * zero) */ - CSR_Matrix *Hx = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(Hx); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -122,7 +90,8 @@ static void eval_wsum_hess(expr *node, const double *w) /* Evaluate child's Hessian with extracted weights */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index da670d2..4bf416e 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -16,7 +16,9 @@ * limitations under the License. 
*/ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -56,17 +58,16 @@ static void jacobian_init_impl(expr *node) nnz += hnode->args[i]->jacobian->nnz; } - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *A = new_CSR_matrix(node->size, node->n_vars, nnz); /* precompute sparsity pattern of this node's jacobian */ int row_offset = 0; - CSR_Matrix *A = node->jacobian; A->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; - CSR_Matrix *B = child->jacobian; + CSR_matrix *B = child->jacobian->to_csr(child->jacobian); /* copy columns */ memcpy(A->i + A->nnz, B->i, B->nnz * sizeof(int)); @@ -81,23 +82,22 @@ static void jacobian_init_impl(expr *node) row_offset += child->size; } A->p[node->size] = A->nnz; + node->jacobian = new_sparse_matrix(A); } static void eval_jacobian(expr *node) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *A = node->jacobian; - A->nnz = 0; + node->jacobian->nnz = 0; for (int i = 0; i < hnode->n_args; i++) { expr *child = hnode->args[i]; child->eval_jacobian(child); - /* copy values */ - memcpy(A->x + A->nnz, child->jacobian->x, + memcpy(node->jacobian->x + node->jacobian->nnz, child->jacobian->x, child->jacobian->nnz * sizeof(double)); - A->nnz += child->jacobian->nnz; + node->jacobian->nnz += child->jacobian->nnz; } } @@ -114,25 +114,24 @@ static void wsum_hess_init_impl(expr *node) /* worst-case scenario the nnz of node->wsum_hess is the sum of children's nnz */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); - hnode->CSR_work = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); + hnode->CSR_work = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ - CSR_Matrix *H = node->wsum_hess; H->nnz = 0; - for (int i = 0; i < hnode->n_args; i++) { - expr *child = 
hnode->args[i]; - copy_csr_matrix(H, hnode->CSR_work); - sum_csr_alloc(hnode->CSR_work, child->wsum_hess, H); + matrix *child_hess = hnode->args[i]->wsum_hess; + copy_CSR_matrix(H, hnode->CSR_work); + sum_csr_alloc(hnode->CSR_work, child_hess->to_csr(child_hess), H); } + node->wsum_hess = new_sparse_matrix(H); } static void wsum_hess_eval(expr *node, const double *w) { hstack_expr *hnode = (hstack_expr *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); int row_offset = 0; memset(H->x, 0, H->nnz * sizeof(double)); @@ -140,8 +139,9 @@ static void wsum_hess_eval(expr *node, const double *w) { expr *child = hnode->args[i]; child->eval_wsum_hess(child, w + row_offset); - copy_csr_matrix(H, hnode->CSR_work); - sum_csr_fill_values(hnode->CSR_work, child->wsum_hess, H); + copy_CSR_matrix(H, hnode->CSR_work); + sum_csr_fill_values(hnode->CSR_work, + child->wsum_hess->to_csr(child->wsum_hess), H); row_offset += child->size; } } @@ -169,7 +169,7 @@ static void free_type_data(expr *node) hnode->args[i] = NULL; } - free_csr_matrix(hnode->CSR_work); + free_CSR_matrix(hnode->CSR_work); hnode->CSR_work = NULL; free(hnode->args); hnode->args = NULL; diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 34c8310..3fd070e 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -64,21 +64,10 @@ static void jacobian_init_impl(expr *node) index_expr *idx = (index_expr *) node; jacobian_init(x); - CSR_Matrix *Jx = x->jacobian; - CSR_Matrix *J = new_csr_matrix(node->size, node->n_vars, Jx->nnz); - - /* set sparsity pattern */ - J->p[0] = 0; - for (int i = 0; i < idx->n_idxs; i++) - { - int row = idx->indices[i]; - int len = Jx->p[row + 1] - Jx->p[row]; - memcpy(J->i + J->p[i], Jx->i + Jx->p[row], len * sizeof(int)); - J->p[i + 1] = J->p[i] + len; - } - - J->nnz = J->p[idx->n_idxs]; - node->jacobian = J; + /* allocate sparsity pattern for the matrix consisting of rows + 'idx->indices' of the child's 
Jacobian */ + node->jacobian = + x->jacobian->index_alloc(x->jacobian, idx->indices, idx->n_idxs); } static void eval_jacobian(expr *node) @@ -87,14 +76,9 @@ static void eval_jacobian(expr *node) index_expr *idx = (index_expr *) node; x->eval_jacobian(x); - CSR_Matrix *J = node->jacobian; - CSR_Matrix *Jx = x->jacobian; - - for (int i = 0; i < idx->n_idxs; i++) - { - int len = J->p[i + 1] - J->p[i]; - memcpy(J->x + J->p[i], Jx->x + Jx->p[idx->indices[i]], len * sizeof(double)); - } + /* copy values of the selected rows into the preallocated output */ + x->jacobian->index_fill_values(x->jacobian, idx->indices, idx->n_idxs, + node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -113,8 +97,7 @@ static void wsum_hess_init_impl(expr *node) many numerical zeros in child->wsum_hess that are actually structural zeros, but we do not try to exploit that sparsity right now. */ - CSR_Matrix *Hx = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(Hx); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -142,7 +125,8 @@ static void eval_wsum_hess(expr *node, const double *w) /* evalute hessian of child */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index c083385..7fd2a13 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -17,10 +17,14 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/dense_matrix.h" +#include "utils/matrix_BTA.h" +#include "utils/mini_numpy.h" +#include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include #include #include +#include /* This file implement the atom 'left_matmul' corresponding to the operation y = A @ f(x), 
where A is a given matrix and f(x) is an arbitrary expression. @@ -78,7 +82,7 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A_kron @ vec(f(x)) */ - Matrix *A = lnode->A; + matrix *A = lnode->A; int n_blocks = lnode->n_blocks; A->block_left_mult_vec(A, x->value, node->value, n_blocks); } @@ -93,8 +97,8 @@ static void free_type_data(expr *node) left_matmul_expr *lnode = (left_matmul_expr *) node; free_matrix(lnode->A); free_matrix(lnode->AT); - free_csc_matrix(lnode->Jchild_CSC); - free_csc_matrix(lnode->J_CSC); + free_CSC_matrix(lnode->Jchild_CSC); + free_CSC_matrix(lnode->J_CSC); free(lnode->csc_to_csr_work); if (lnode->param_source != NULL) { @@ -108,36 +112,67 @@ static void free_type_data(expr *node) lnode->param_source = NULL; } -static void jacobian_init_impl(expr *node) +/* TODO: use better polymorphism here if you add another matrix type*/ + +/* jacobian_init when node->jacobian is permuted_dense */ +static void jacobian_init_pd(expr *node) { + /* initialize jacobian of child */ expr *x = node->left; left_matmul_expr *lnode = (left_matmul_expr *) node; + jacobian_init(x); + + /* initialize this node's jacobian */ + node->jacobian = BA_pd_matrices_alloc((permuted_dense *) lnode->A, x->jacobian); +} + +/* eval_jacobian when node->jacobian is permuted_dense */ +static void eval_jacobian_pd(expr *node) +{ + /* evaluate jacobian of child */ + left_matmul_expr *lnode = (left_matmul_expr *) node; + expr *x = node->left; + x->eval_jacobian(x); + + /* must refresh CSC cache if x->jacobian is sparse_matrix */ + x->jacobian->refresh_csc_values(x->jacobian); + BA_pd_matrices_fill_values((permuted_dense *) lnode->A, x->jacobian, + (permuted_dense *) node->jacobian); +} - /* initialize child's jacobian and precompute sparsity of its CSC */ +/* jacobian_init when node->jacobian is sparse */ +static void jacobian_init_sparse(expr *node) +{ + /* initialize jacobian of child */ + expr *x = node->left; + left_matmul_expr *lnode 
= (left_matmul_expr *) node; jacobian_init(x); - lnode->Jchild_CSC = csr_to_csc_alloc(x->jacobian, node->work->iwork); - /* precompute sparsity of this node's jacobian in CSC and CSR */ + /* initialize this node's jacobian */ + lnode->Jchild_CSC = + csr_to_csc_alloc(x->jacobian->to_csr(x->jacobian), node->work->iwork); lnode->J_CSC = lnode->A->block_left_mult_sparsity(lnode->A, lnode->Jchild_CSC, lnode->n_blocks); - node->jacobian = csc_to_csr_alloc(lnode->J_CSC, lnode->csc_to_csr_work); + node->jacobian = + new_sparse_matrix(csc_to_csr_alloc(lnode->J_CSC, lnode->csc_to_csr_work)); } -static void eval_jacobian(expr *node) +/* eval_jacobian when node->jacobian is sparse */ +static void eval_jacobian_sparse(expr *node) { + /* evaluate jacobian of child */ left_matmul_expr *lnode = (left_matmul_expr *) node; expr *x = node->left; - - CSC_Matrix *Jchild_CSC = lnode->Jchild_CSC; - CSC_Matrix *J_CSC = lnode->J_CSC; - - /* evaluate child's jacobian and convert to CSC */ x->eval_jacobian(x); - csr_to_csc_fill_values(x->jacobian, Jchild_CSC, node->work->iwork); - /* compute this node's jacobian: */ + /* evaluate this node's jacobian */ + CSC_matrix *Jchild_CSC = lnode->Jchild_CSC; + CSC_matrix *J_CSC = lnode->J_CSC; + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jchild_CSC, + node->work->iwork); lnode->A->block_left_mult_values(lnode->A, Jchild_CSC, J_CSC); - csc_to_csr_fill_values(J_CSC, node->jacobian, lnode->csc_to_csr_work); + csc_to_csr_fill_values(J_CSC, node->jacobian->to_csr(node->jacobian), + lnode->csc_to_csr_work); } static void wsum_hess_init_impl(expr *node) @@ -147,7 +182,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* allocate this node's hessian with the same sparsity as child's */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* work for computing A^T w*/ int n_blocks = ((left_matmul_expr *) node)->n_blocks; @@ -160,7 +195,7 @@ static void 
eval_wsum_hess(expr *node, const double *w) left_matmul_expr *lnode = (left_matmul_expr *) node; /* compute A^T w*/ - Matrix *AT = lnode->AT; + matrix *AT = lnode->AT; int n_blocks = lnode->n_blocks; AT->block_left_mult_vec(AT, w, node->work->dwork, n_blocks); @@ -171,19 +206,17 @@ static void eval_wsum_hess(expr *node, const double *w) static void refresh_dense_left(left_matmul_expr *lnode) { - Dense_Matrix *dm_A = (Dense_Matrix *) lnode->A; - Dense_Matrix *dm_AT = (Dense_Matrix *) lnode->AT; - int m = dm_A->base.m; - int n = dm_A->base.n; + int m = lnode->A->m; + int n = lnode->A->n; /* The parameter represents the A in left_matmul_dense(A, x) in column-major. In this diffengine, we store A in row-major order. Hence, param->vals actually corresponds to the transpose of A, and we transpose AT to get A. */ - memcpy(dm_AT->x, lnode->param_source->value, m * n * sizeof(double)); - A_transpose(dm_A->x, dm_AT->x, n, m); + memcpy(lnode->AT->x, lnode->param_source->value, m * n * sizeof(double)); + A_transpose(lnode->A->x, lnode->AT->x, n, m); } -expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) +expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) { /* We expect u->d1 == A->n. However, numpy's broadcasting rules allow users to do A @ u where u is (n, ) which in C is actually (1, n). In that case @@ -212,23 +245,25 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) left_matmul_expr *lnode = (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; - init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_impl, eval_jacobian, - is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); + /* Sparse A — always the general CSC-mirror path. */ + init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_sparse, + eval_jacobian_sparse, is_affine, wsum_hess_init_impl, eval_wsum_hess, + free_type_data); node->left = u; expr_retain(u); /* allocate workspace. 
iwork is used for converting J_child csr to csc - (requiring size node->n_vars) and for transposing A (requiring size A->n). - csc_to_csr_work is used for converting J_CSC to CSR (requiring + (requiring size node->n_vars). + csc_to_csr_work is used for converting J_CSC to CSR_matrix (requiring node->size) */ - node->work->iwork = (int *) SP_MALLOC(MAX(A->n, node->n_vars) * sizeof(int)); + node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int)); lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int)); lnode->n_blocks = n_blocks; - /* store A and AT */ - lnode->A = new_sparse_matrix(A); - lnode->AT = - sparse_matrix_trans((const Sparse_Matrix *) lnode->A, node->work->iwork); + /* store A and AT. new_sparse_matrix takes ownership, so clone first. */ + lnode->A = new_sparse_matrix(new_csr(A)); + lnode->AT = lnode->A->transpose_alloc(lnode->A); + lnode->A->transpose_fill_values(lnode->A, lnode->AT); /* parameter support */ lnode->param_source = param_node; @@ -245,6 +280,9 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_Matrix *A) expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, const double *data) { + /* TODO: do a helper function for this dimension check (so we can use it in both + * dense and sparse constructors). We could include even more code in that + * functon, all the day down to the parameter support I think*/ int d1, d2, n_blocks; if (u->d1 == n) { @@ -267,8 +305,18 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, left_matmul_expr *lnode = (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; - init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_impl, eval_jacobian, - is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); + /* PD A: the BA_pd_matrices dispatcher applies whenever there is a single + Kronecker block, whether A is constant or parameterized. 
With a + parameter, A's structure is fixed at construction (full-block PD with + trivial permutations); refresh_dense_left updates A->X before each + forward, and eval_jacobian_pd reads those refreshed values via + BA_pd_matrices_fill_values. With n_blocks > 1 the Kronecker structure + forces the general CSC-mirror path. */ + bool pd_path = (n_blocks == 1); + init_expr(node, d1, d2, u->n_vars, forward, + pd_path ? jacobian_init_pd : jacobian_init_sparse, + pd_path ? eval_jacobian_pd : eval_jacobian_sparse, is_affine, + wsum_hess_init_impl, eval_wsum_hess, free_type_data); node->left = u; expr_retain(u); @@ -290,8 +338,8 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, lnode->refresh_param_values = refresh_dense_left; /* A and AT buffers are filled by refresh_dense_left from the parameter. */ - lnode->A = new_dense_matrix(m, n, NULL); - lnode->AT = new_dense_matrix(n, m, NULL); + lnode->A = new_permuted_dense_full(m, n, NULL); + lnode->AT = new_permuted_dense_full(n, m, NULL); node->needs_parameter_refresh = true; } /* constant matrix case */ @@ -303,8 +351,9 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, exit(1); } - lnode->A = new_dense_matrix(m, n, data); - lnode->AT = dense_matrix_trans((const Dense_Matrix *) lnode->A); + lnode->A = new_permuted_dense_full(m, n, data); + lnode->AT = lnode->A->transpose_alloc(lnode->A); + lnode->A->transpose_fill_values(lnode->A, lnode->AT); } return node; diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c index f453876..01bbf5e 100644 --- a/src/atoms/affine/neg.c +++ b/src/atoms/affine/neg.c @@ -40,7 +40,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity pattern as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -49,10 +49,9 @@ static void eval_jacobian(expr *node) node->left->eval_jacobian(node->left); /* negate values 
only (sparsity pattern set in jacobian_init_impl) */ - CSR_Matrix *child_jac = node->left->jacobian; - for (int k = 0; k < child_jac->nnz; k++) + for (int k = 0; k < node->left->jacobian->nnz; k++) { - node->jacobian->x[k] = -child_jac->x[k]; + node->jacobian->x[k] = -node->left->jacobian->x[k]; } } @@ -63,9 +62,8 @@ static void wsum_hess_init_impl(expr *node) /* initialize child's wsum_hess */ wsum_hess_init(x); - /* same sparsity pattern as child */ - CSR_Matrix *child_hess = x->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(child_hess); + /* same sparsity pattern as child (polymorphic copy) */ + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -74,10 +72,9 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, w); /* negate values (sparsity pattern set in wsum_hess_init_impl) */ - CSR_Matrix *child_hess = node->left->wsum_hess; - for (int k = 0; k < child_hess->nnz; k++) + for (int k = 0; k < node->left->wsum_hess->nnz; k++) { - node->wsum_hess->x[k] = -child_hess->x[k]; + node->wsum_hess->x[k] = -node->left->wsum_hess->x[k]; } } diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index 57458b0..0b9ffbe 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -17,6 +17,7 @@ */ #include "atoms/affine.h" #include "subexpr.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -32,7 +33,7 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { /* Zero jacobian: size x n_vars with 0 nonzeros. */ - node->jacobian = new_csr_matrix(node->size, node->n_vars, 0); + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, 0); } static void eval_jacobian(expr *node) @@ -43,7 +44,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Zero Hessian: n_vars x n_vars with 0 nonzeros. 
*/ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, 0); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index ffa21a6..1a031d8 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -40,46 +40,23 @@ static void jacobian_init_impl(expr *node) expr *x = node->left; jacobian_init(x); - /* each output row copies the single row from child's jacobian */ - int nnz = node->size * x->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); - - /* fill sparsity pattern */ - CSR_Matrix *J = node->jacobian; - J->nnz = 0; - for (int row = 0; row < node->size; row++) - { - J->p[row] = J->nnz; - memcpy(J->i + J->nnz, x->jacobian->i, x->jacobian->nnz * sizeof(int)); - J->nnz += x->jacobian->nnz; - } - assert(J->nnz == nnz); - J->p[node->size] = J->nnz; + /* allocate sparsity for an (node->size, n_vars) matrix whose rows are all + copies of the child's single row; output type matches child's type. */ + node->jacobian = x->jacobian->promote_alloc(x->jacobian, node->size); } static void eval_jacobian(expr *node) { node->left->eval_jacobian(node->left); - CSR_Matrix *child_jac = node->left->jacobian; - CSR_Matrix *jac = node->jacobian; - int child_nnz = child_jac->p[1] - child_jac->p[0]; - - /* Copy child's row values to each output row */ - for (int row = 0; row < node->size; row++) - { - memcpy(jac->x + row * child_nnz, child_jac->x + child_jac->p[0], - child_nnz * sizeof(double)); - } + /* tile the child's single row into the preallocated output. 
*/ + node->left->jacobian->promote_fill_values(node->left->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) { wsum_hess_init(node->left); - - /* same sparsity as child since we're summing weights */ - CSR_Matrix *child_hess = node->left->wsum_hess; - node->wsum_hess = new_csr_copy_sparsity(child_hess); + node->wsum_hess = node->left->wsum_hess->copy_sparsity(node->left->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -95,8 +72,8 @@ static void eval_wsum_hess(expr *node, const double *w) node->left->eval_wsum_hess(node->left, &sum_w); /* copy values */ - CSR_Matrix *child_hess = node->left->wsum_hess; - memcpy(node->wsum_hess->x, child_hess->x, child_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->left->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index d654b17..e59a7ec 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -36,7 +36,7 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; jacobian_init(x); - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -50,14 +50,15 @@ static void wsum_hess_init_impl(expr *node) { expr *x = node->left; wsum_hess_init(x); - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) { expr *x = node->left; x->eval_wsum_hess(x, w); - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c index c3344f2..5e3b39f 100644 --- a/src/atoms/affine/right_matmul.c 
+++ b/src/atoms/affine/right_matmul.c @@ -17,11 +17,12 @@ */ #include "atoms/affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" -#include "utils/dense_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include +#include /* This file implements the atom 'right_matmul' corresponding to the operation y = f(x) @ A, where A is a given matrix and f(x) is an arbitrary expression. @@ -29,12 +30,12 @@ transpose: f(x) @ A = (A^T @ f(x)^T)^T. For the parameter case: - - param_source stores A values in CSR data order + - param_source stores A values in CSR_matrix data order - inner left_matmul stores AT as its A-matrix and A as its AT-matrix - on refresh: update AT (inner's AT, the original A) from param_source, then recompute A^T (inner's A) from the updated A. */ -/* Refresh for sparse right_matmul: param stores A in CSR data order. +/* Refresh for sparse right_matmul: param stores A in CSR_matrix data order. Inner left_matmul: lnode->A = AT (transposed), lnode->AT = A (original). So: update lnode->AT from param values, then recompute lnode->A. */ static void refresh_dense_right(left_matmul_expr *lnode) @@ -44,21 +45,21 @@ static void refresh_dense_right(left_matmul_expr *lnode) Furthermore, lnode->param_source->value corresponds to the column-major version of A, which is BT (an m x n matrix) */ - Dense_Matrix *B = (Dense_Matrix *) lnode->AT; - Dense_Matrix *BT = (Dense_Matrix *) lnode->A; - int m = B->base.n; - int n = B->base.m; + matrix *B = lnode->AT; + matrix *BT = lnode->A; + int m = B->n; + int n = B->m; memcpy(BT->x, lnode->param_source->value, m * n * sizeof(double)); A_transpose(B->x, BT->x, m, n); } -expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A) +expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A) { /* We can express right matmul using left matmul and transpose: u @ A = (A^T @ u^T)^T. 
*/ int *work_transpose = (int *) SP_MALLOC(A->n * sizeof(int)); - CSR_Matrix *AT = transpose(A, work_transpose); + CSR_matrix *AT = transpose(A, work_transpose); expr *u_transpose = new_transpose(u); expr *left_matmul = new_left_matmul(NULL, u_transpose, AT); @@ -74,7 +75,7 @@ expr *new_right_matmul(expr *param_node, expr *u, const CSR_Matrix *A) expr *node = new_transpose(left_matmul); - free_csr_matrix(AT); + free_CSR_matrix(AT); free(work_transpose); return node; } diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c index e54142a..433f3e1 100644 --- a/src/atoms/affine/scalar_mult.c +++ b/src/atoms/affine/scalar_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -85,7 +85,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index c43d2ab..74f720b 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -16,9 +16,11 @@ * limitations under the License. 
*/ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -86,12 +88,12 @@ static void jacobian_init_impl(expr *node) /* initialize child's jacobian */ jacobian_init(x); + CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); /* we never have to store more than the child's nnz */ - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->jacobian->nnz); - node->work->iwork = - SP_MALLOC(MAX(node->jacobian->n, x->jacobian->nnz) * sizeof(int)); - snode->idx_map = SP_MALLOC(x->jacobian->nnz * sizeof(int)); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, Jx->nnz); + node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int)); + snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int)); /* the idx_map array maps each nonzero entry j in x->jacobian to the corresponding index in the output row matrix C. 
Specifically, for @@ -100,68 +102,70 @@ static void jacobian_init_impl(expr *node) if (axis == -1) { - sum_all_rows_csr_alloc(x->jacobian, node->jacobian, node->work->iwork, - snode->idx_map); + sum_all_rows_csr_alloc(Jx, jac, node->work->iwork, snode->idx_map); } else if (axis == 0) { - sum_block_of_rows_csr_alloc(x->jacobian, node->jacobian, x->d1, - node->work->iwork, snode->idx_map); + sum_block_of_rows_csr_alloc(Jx, jac, x->d1, node->work->iwork, + snode->idx_map); } else if (axis == 1) { - sum_evenly_spaced_rows_csr_alloc(x->jacobian, node->jacobian, node->size, - node->work->iwork, snode->idx_map); + sum_evenly_spaced_rows_csr_alloc(Jx, jac, node->size, node->work->iwork, + snode->idx_map); } + + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) { - expr *x = node->left; + expr *child = node->left; /* evaluate child's jacobian */ - x->eval_jacobian(x); + child->eval_jacobian(child); /* we have precomputed an idx map between the nonzeros of the child's jacobian and this node's jacobian, so we just accumulate accordingly */ memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator(x->jacobian, ((sum_expr *) node)->idx_map, node->jacobian->x); + accumulator(child->jacobian->x, child->jacobian->nnz, + ((sum_expr *) node)->idx_map, node->jacobian->x); } static void wsum_hess_init_impl(expr *node) { - expr *x = node->left; + expr *child = node->left; /* initialize child's wsum_hess */ - wsum_hess_init(x); + wsum_hess_init(child); /* we never have to store more than the child's nnz */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); - node->work->dwork = SP_MALLOC(x->size * sizeof(double)); + node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); + node->work->dwork = SP_MALLOC(child->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { - expr *x = node->left; + expr *child = node->left; sum_expr *snode = (sum_expr *) node; int axis = snode->axis; if (axis == 
-1) { - scaled_ones(node->work->dwork, x->size, *w); + scaled_ones(node->work->dwork, child->size, *w); } else if (axis == 0) { - repeat(node->work->dwork, w, x->d2, x->d1); + repeat(node->work->dwork, w, child->d2, child->d1); } else if (axis == 1) { - tile_double(node->work->dwork, w, x->d1, x->d2); + tile_double(node->work->dwork, w, child->d1, child->d2); } - x->eval_wsum_hess(x, node->work->dwork); + child->eval_wsum_hess(child, node->work->dwork); - /* copy values */ - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, child->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index c421b3f..846a45b 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -16,8 +16,10 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "subexpr.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -55,7 +57,7 @@ static void jacobian_init_impl(expr *node) // --------------------------------------------------------------- // count total nnz and allocate matrix with sufficient space // --------------------------------------------------------------- - const CSR_Matrix *A = x->jacobian; + const CSR_matrix *A = x->jacobian->to_csr(x->jacobian); int total_nnz = 0; int row_spacing = x->d1 + 1; @@ -64,22 +66,23 @@ static void jacobian_init_impl(expr *node) total_nnz += A->p[row + 1] - A->p[row]; } - node->jacobian = new_csr_matrix(1, node->n_vars, total_nnz); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, total_nnz); // --------------------------------------------------------------- // fill sparsity pattern and idx_map // --------------------------------------------------------------- trace_expr *tnode = (trace_expr *) node; - node->work->iwork = 
SP_MALLOC(MAX(node->jacobian->n, total_nnz) * sizeof(int)); + node->work->iwork = SP_MALLOC(MAX(jac->n, total_nnz) * sizeof(int)); /* the idx_map array maps each nonzero entry j in the original matrix A (from the selected, evenly spaced rows) to the corresponding index in the output row matrix C. Specifically, for each nonzero entry j in A (from the selected rows), idx_map[j] gives the position in C->x where the value from A->x[j] should be accumulated. */ - tnode->idx_map = SP_MALLOC(x->jacobian->nnz * sizeof(int)); - sum_spaced_rows_into_row_csr_alloc(A, node->jacobian, row_spacing, - node->work->iwork, tnode->idx_map); + tnode->idx_map = SP_MALLOC(A->nnz * sizeof(int)); + sum_spaced_rows_into_row_csr_alloc(A, jac, row_spacing, node->work->iwork, + tnode->idx_map); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -92,8 +95,8 @@ static void eval_jacobian(expr *node) /* local jacobian */ memset(node->jacobian->x, 0, node->jacobian->nnz * sizeof(double)); - accumulator_with_spacing(x->jacobian, tnode->idx_map, node->jacobian->x, - x->d1 + 1); + accumulator_with_spacing(x->jacobian->to_csr(x->jacobian), tnode->idx_map, + node->jacobian->x, x->d1 + 1); } /* Placeholders for Hessian-related functions */ @@ -110,7 +113,7 @@ static void wsum_hess_init_impl(expr *node) contribution to wsum_hess of entries of the child that will always have zero weight in eval_wsum_hess. We do this for simplicity. But the Hessian can for sure be made more sophisticated. 
*/ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); } static void eval_wsum_hess(expr *node, const double *w) @@ -125,7 +128,8 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); - memcpy(node->wsum_hess->x, x->wsum_hess->x, sizeof(double) * x->wsum_hess->nnz); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static bool is_affine(const expr *node) diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index d4a7998..59c961e 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -16,8 +16,8 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" -#include #include #include @@ -30,60 +30,38 @@ static void forward(expr *node, const double *u) /* local forward pass */ int d1 = node->d1; int d2 = node->d2; - double *X = node->left->value; - double *XT = node->value; - for (int i = 0; i < d1; ++i) - { - for (int j = 0; j < d2; ++j) - { - XT[j * d1 + i] = X[i * d2 + j]; - } - } + A_transpose(node->value, node->left->value, d1, d2); } static void jacobian_init_impl(expr *node) { expr *child = node->left; jacobian_init(child); - CSR_Matrix *Jc = child->jacobian; - node->jacobian = new_csr_matrix(node->size, node->n_vars, Jc->nnz); - /* fill sparsity */ - CSR_Matrix *J = node->jacobian; + int n_out = node->size; int d1 = node->d1; int d2 = node->d2; - int nnz = 0; - J->p[0] = 0; - /* 'k' is the old row that gets swapped to 'row'*/ - int k, len; - for (int row = 0; row < J->m; ++row) + /* The transpose's Jacobian is a row permutation of the child's: + J_node[r, :] = J_child[k(r), :] where k(r) = (r/d1) + (r%d1)*d2. 
*/ + int *indices = (int *) SP_MALLOC(n_out * sizeof(int)); + for (int r = 0; r < n_out; r++) { - k = (row / d1) + (row % d1) * d2; - len = Jc->p[k + 1] - Jc->p[k]; - memcpy(J->i + nnz, Jc->i + Jc->p[k], len * sizeof(int)); - nnz += len; - J->p[row + 1] = nnz; + indices[r] = (r / d1) + (r % d1) * d2; } + + node->jacobian = child->jacobian->index_alloc(child->jacobian, indices, n_out); + + /* save indices for eval_jacobian */ + node->work->iwork = indices; } static void eval_jacobian(expr *node) { expr *child = node->left; child->eval_jacobian(child); - CSR_Matrix *Jc = child->jacobian; - CSR_Matrix *J = node->jacobian; - - int d1 = node->d1; - int d2 = node->d2; - int nnz = 0; - for (int row = 0; row < J->m; ++row) - { - int k = (row / d1) + (row % d1) * d2; - int len = Jc->p[k + 1] - Jc->p[k]; - memcpy(J->x + nnz, Jc->x + Jc->p[k], len * sizeof(double)); - nnz += len; - } + child->jacobian->index_fill_values(child->jacobian, node->work->iwork, + node->size, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -93,7 +71,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity pattern as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* for computing Kw where K is the commutation matrix */ node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); @@ -102,7 +80,6 @@ static void eval_wsum_hess(expr *node, const double *w) { int d2 = node->d2; int d1 = node->d1; - // TODO: meaybe more efficient to do this with memcpy first /* evaluate hessian of child at Kw */ for (int i = 0; i < d2; ++i) diff --git a/src/atoms/affine/upper_tri.c b/src/atoms/affine/upper_tri.c index 51c339b..2da4d36 100644 --- a/src/atoms/affine/upper_tri.c +++ b/src/atoms/affine/upper_tri.c @@ -40,7 +40,7 @@ expr *new_upper_tri(expr *child) int *indices = NULL; if (n_elems > 0) { - indices = (int *) malloc((size_t) n_elems * sizeof(int)); + indices = (int *) 
malloc(n_elems * sizeof(int)); int k = 0; for (int i = 0; i < n; i++) { diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index 0dc12d2..fe010e7 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/affine.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -27,14 +28,15 @@ static void forward(expr *node, const double *u) static void jacobian_init_impl(expr *node) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - node->jacobian->i[j] = j + node->var_id; - node->jacobian->x[j] = 1.0; + jac->p[j] = j; + jac->i[j] = j + node->var_id; + jac->x[j] = 1.0; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian(expr *node) @@ -46,7 +48,7 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Variables have zero Hessian */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, 0); } static void wsum_hess_eval(expr *node, const double *w) diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index 3647f8b..ee7992b 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -59,7 +59,7 @@ static void jacobian_init_impl(expr *node) jacobian_init(x); /* same sparsity as child */ - node->jacobian = new_csr_copy_sparsity(x->jacobian); + node->jacobian = x->jacobian->copy_sparsity(x->jacobian); } static void eval_jacobian(expr *node) @@ -67,17 +67,11 @@ static void eval_jacobian(expr *node) expr *x = node->left; const double *a = ((vector_mult_expr *) node)->param_source->value; - /* evaluate x */ + /* 
evaluate jacobian of child */ x->eval_jacobian(x); - /* row-wise scale child's jacobian */ - for (int i = 0; i < node->size; i++) - { - for (int j = x->jacobian->p[i]; j < x->jacobian->p[i + 1]; j++) - { - node->jacobian->x[j] = a[i] * x->jacobian->x[j]; - } - } + /* row-wise scale child's jacobian: diag(a) @ Jx */ + x->jacobian->DA_fill_values(a, x->jacobian, node->jacobian); } static void wsum_hess_init_impl(expr *node) @@ -88,8 +82,9 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* same sparsity as child */ - node->wsum_hess = new_csr_copy_sparsity(x->wsum_hess); + node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); + /* workspace for storing scaled weights */ node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } @@ -107,7 +102,8 @@ static void eval_wsum_hess(expr *node, const double *w) x->eval_wsum_hess(x, node->work->dwork); /* copy values from child to this node */ - memcpy(node->wsum_hess->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->wsum_hess->x, x->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); } static void free_type_data(expr *node) diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 5fb8af6..e3ef874 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -17,12 +17,13 @@ */ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" #include "utils/linalg_dense_sparse_matmuls.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/mini_numpy.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" #include @@ -41,10 +42,10 @@ // column positions (offset by j in the Y-variable indexing). 
// ------------------------------------------------------------------------------ -static CSR_Matrix *build_cross_hessian_sparsity(int m, int k, int n) +static CSR_matrix *build_cross_hessian_sparsity(int m, int k, int n) { int total_nnz = m * k * n; - CSR_Matrix *B = new_csr_matrix(m * k, k * n, total_nnz); + CSR_matrix *B = new_CSR_matrix(m * k, k * n, total_nnz); int idx = 0; for (int j = 0; j < k; j++) @@ -64,7 +65,7 @@ static CSR_Matrix *build_cross_hessian_sparsity(int m, int k, int n) } static void fill_cross_hessian_values(int m, int k, int n, const double *w, - CSR_Matrix *B) + CSR_matrix *B) { int idx = 0; for (int j = 0; j < k; j++) @@ -102,15 +103,15 @@ static void free_matmul_data(expr *node) { matmul_expr *mnode = (matmul_expr *) node; /* Jacobian workspace */ - free_csr_matrix(mnode->term1_CSR); - free_csr_matrix(mnode->term2_CSR); + free_CSR_matrix(mnode->term1_CSR); + free_CSR_matrix(mnode->term2_CSR); /* Hessian workspace */ - free_csr_matrix(mnode->B); - free_csr_matrix(mnode->BJg); - free_csc_matrix(mnode->BJg_CSC); + free_CSR_matrix(mnode->B); + free_CSR_matrix(mnode->BJg); + free_CSC_matrix(mnode->BJg_CSC); free(mnode->BJg_csc_work); - free_csr_matrix(mnode->C); - free_csr_matrix(mnode->CT); + free_CSR_matrix(mnode->C); + free_CSR_matrix(mnode->CT); free(mnode->idx_map_C); free(mnode->idx_map_CT); free(mnode->idx_map_Hf); @@ -139,7 +140,7 @@ static void jacobian_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int nnz = m * n * 2 * k; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, nnz); int nnz_idx = 0; for (int i = 0; i < node->size; i++) @@ -147,33 +148,34 @@ static void jacobian_init_no_chain_rule(expr *node) int row = i % m; int col = i / m; - node->jacobian->p[i] = nnz_idx; + jac->p[i] = nnz_idx; if (x->var_id < y->var_id) { for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = x->var_id + row + j * m; + jac->i[nnz_idx++] = x->var_id + row 
+ j * m; } for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = y->var_id + col * k + j; + jac->i[nnz_idx++] = y->var_id + col * k + j; } } else { for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = y->var_id + col * k + j; + jac->i[nnz_idx++] = y->var_id + col * k + j; } for (int j = 0; j < k; j++) { - node->jacobian->i[nnz_idx++] = x->var_id + row + j * m; + jac->i[nnz_idx++] = x->var_id + row + j * m; } } } - node->jacobian->p[node->size] = nnz_idx; + jac->p[node->size] = nnz_idx; assert(nnz_idx == nnz); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_no_chain_rule(expr *node) @@ -182,13 +184,14 @@ static void eval_jacobian_no_chain_rule(expr *node) expr *y = node->right; int m = x->d1; int k = x->d2; - double *Jx = node->jacobian->x; + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); + double *Jx = jac->x; for (int i = 0; i < node->size; i++) { int row = i % m; int col = i / m; - int pos = node->jacobian->p[i]; + int pos = jac->p[i]; if (x->var_id < y->var_id) { @@ -234,8 +237,9 @@ static void jacobian_init_chain_rule(expr *node) mnode->term1_CSR = YT_kron_I_alloc(m, k, n, f->work->jacobian_csc); mnode->term2_CSR = I_kron_X_alloc(m, k, n, g->work->jacobian_csc); int max_nnz = mnode->term1_CSR->nnz + mnode->term2_CSR->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, max_nnz); - sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, node->jacobian); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, max_nnz); + sum_csr_alloc(mnode->term1_CSR, mnode->term2_CSR, jac); + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_chain_rule(expr *node) @@ -250,14 +254,16 @@ static void eval_jacobian_chain_rule(expr *node) /* evaluate Jacobians of children */ f->eval_jacobian(f); g->eval_jacobian(g); - csr_to_csc_fill_values(f->jacobian, f->work->jacobian_csc, f->work->csc_work); - csr_to_csc_fill_values(g->jacobian, g->work->jacobian_csc, g->work->csc_work); + 
csr_to_csc_fill_values(f->jacobian->to_csr(f->jacobian), f->work->jacobian_csc, + f->work->csc_work); + csr_to_csc_fill_values(g->jacobian->to_csr(g->jacobian), g->work->jacobian_csc, + g->work->csc_work); /* evaluate term1, term2, and their sum */ YT_kron_I_fill_values(m, k, n, g->value, f->work->jacobian_csc, mnode->term1_CSR); I_kron_X_fill_values(m, k, n, f->value, g->work->jacobian_csc, mnode->term2_CSR); - sum_csr_fill_values(mnode->term1_CSR, mnode->term2_CSR, node->jacobian); + sum_csr_fill_values(mnode->term1_CSR, mnode->term2_CSR, node->jacobian->to_csr(node->jacobian)); } // ------------------------------------------------------------------------------------ @@ -272,10 +278,10 @@ static void wsum_hess_init_no_chain_rule(expr *node) int k = x->d2; int n = y->d2; int total_nnz = 2 * m * k * n; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, total_nnz); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, total_nnz); int nnz = 0; - int *Hi = node->wsum_hess->i; - int *Hp = node->wsum_hess->p; + int *Hi = hess->i; + int *Hp = hess->p; int start, i; if (x->var_id < y->var_id) @@ -338,6 +344,7 @@ static void wsum_hess_init_no_chain_rule(expr *node) } Hp[node->n_vars] = nnz; assert(nnz == total_nnz); + node->wsum_hess = new_sparse_matrix(hess); } static void eval_wsum_hess_no_chain_rule(expr *node, const double *w) @@ -414,8 +421,8 @@ static void wsum_hess_init_chain_rule(expr *node) int m = f->d1; int k = f->d2; int n = g->d2; - CSC_Matrix *Jf = f->work->jacobian_csc; - CSC_Matrix *Jg = g->work->jacobian_csc; + CSC_matrix *Jf = f->work->jacobian_csc; + CSC_matrix *Jg = g->work->jacobian_csc; /* initialize C = Jf^T @ B @ Jg = Jf^T @ (B @ Jg) */ mnode->B = build_cross_hessian_sparsity(m, k, n); @@ -435,8 +442,10 @@ static void wsum_hess_init_chain_rule(expr *node) /* sum the four terms and fill idx maps */ int *maps[4]; - node->wsum_hess = - sum_4_csr_alloc(mnode->C, mnode->CT, f->wsum_hess, g->wsum_hess, maps); + CSR_matrix *hess = + 
sum_4_csr_alloc(mnode->C, mnode->CT, f->wsum_hess->to_csr(f->wsum_hess), + g->wsum_hess->to_csr(g->wsum_hess), maps); + node->wsum_hess = new_sparse_matrix(hess); mnode->idx_map_C = maps[0]; mnode->idx_map_CT = maps[1]; mnode->idx_map_Hf = maps[2]; @@ -460,23 +469,23 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) int n = g->d2; bool is_f_affine = f->is_affine(f); bool is_g_affine = g->is_affine(g); - CSC_Matrix *Jf = f->work->jacobian_csc; - CSC_Matrix *Jg = g->work->jacobian_csc; + CSC_matrix *Jf = f->work->jacobian_csc; + CSC_matrix *Jg = g->work->jacobian_csc; - /* refresh child Jacobian CSC values (cache if affine) */ + /* refresh child Jacobian CSC_matrix values (cache if affine) */ if (!f->work->jacobian_csc_filled) { - csr_to_csc_fill_values(f->jacobian, Jf, f->work->csc_work); + csr_to_csc_fill_values(f->jacobian->to_csr(f->jacobian), Jf, f->work->csc_work); if (is_f_affine) { f->work->jacobian_csc_filled = true; } } - /* refresh child Jacobian CSC values (cache if affine) */ + /* refresh child Jacobian CSC_matrix values (cache if affine) */ if (!g->work->jacobian_csc_filled) { - csr_to_csc_fill_values(g->jacobian, Jg, g->work->csc_work); + csr_to_csc_fill_values(g->jacobian->to_csr(g->jacobian), Jg, g->work->csc_work); if (is_g_affine) { g->work->jacobian_csc_filled = true; @@ -508,10 +517,12 @@ static void eval_wsum_hess_chain_rule(expr *node, const double *w) /* accumulate H = C + C^T + H_f + H_g */ memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(mnode->C, mnode->idx_map_C, node->wsum_hess->x); - accumulator(mnode->CT, mnode->idx_map_CT, node->wsum_hess->x); - accumulator(f->wsum_hess, mnode->idx_map_Hf, node->wsum_hess->x); - accumulator(g->wsum_hess, mnode->idx_map_Hg, node->wsum_hess->x); + accumulator(mnode->C->x, mnode->C->nnz, mnode->idx_map_C, node->wsum_hess->x); + accumulator(mnode->CT->x, mnode->CT->nnz, mnode->idx_map_CT, node->wsum_hess->x); + accumulator(f->wsum_hess->x, 
f->wsum_hess->nnz, mnode->idx_map_Hf, + node->wsum_hess->x); + accumulator(g->wsum_hess->x, g->wsum_hess->nnz, mnode->idx_map_Hg, + node->wsum_hess->x); } expr *new_matmul(expr *x, expr *y) diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 5f606f8..7ae7841 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -17,7 +17,11 @@ */ #include "atoms/bivariate_full_dom.h" #include "subexpr.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" +#include "utils/matrix_BTA.h" +#include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -50,10 +54,10 @@ static void jacobian_init_impl(expr *node) jacobian_init(node->left); jacobian_init(node->right); int nnz_max = node->left->jacobian->nnz + node->right->jacobian->nnz; - node->jacobian = new_csr_matrix(node->size, node->n_vars, nnz_max); + node->jacobian = new_sparse_matrix_alloc(node->size, node->n_vars, nnz_max); /* fill sparsity pattern */ - sum_csr_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); + sum_matrices_alloc(node->left->jacobian, node->right->jacobian, node->jacobian); } static void eval_jacobian(expr *node) @@ -66,8 +70,8 @@ static void eval_jacobian(expr *node) /* chain rule: the jacobian of h(x) = f(g1(x), g2(x))) is Jh = J_{f, 1} J_{g1} + * J_{f, 2} J_{g2} */ - sum_scaled_csr_matrices_fill_values(x->jacobian, y->jacobian, node->jacobian, - y->value, x->value); + sum_scaled_matrices_fill_values(x->jacobian, y->jacobian, node->jacobian, + y->value, x->value); } static void wsum_hess_init_impl(expr *node) @@ -80,7 +84,8 @@ static void wsum_hess_init_impl(expr *node) x->var_id != y->var_id) { assert(y->var_id != NOT_A_VARIABLE); - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 2 * node->size); + CSR_matrix *hess = + new_CSR_matrix(node->n_vars, node->n_vars, 2 * node->size); int i, var1_id, var2_id; @@ -98,8 +103,8 @@ 
static void wsum_hess_init_impl(expr *node) /* var1 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var1_id + i] = i; - node->wsum_hess->i[i] = var2_id + i; + hess->p[var1_id + i] = i; + hess->i[i] = var2_id + i; } int nnz = node->size; @@ -107,22 +112,23 @@ static void wsum_hess_init_impl(expr *node) /* rows between var1 and var2 */ for (i = var1_id + node->size; i < var2_id; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } /* var2 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var2_id + i] = nnz + i; - node->wsum_hess->i[nnz + i] = var1_id + i; + hess->p[var2_id + i] = nnz + i; + hess->i[nnz + i] = var1_id + i; } /* remaining rows */ nnz += node->size; for (i = var2_id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } + node->wsum_hess = new_sparse_matrix(hess); } else { @@ -145,30 +151,37 @@ static void wsum_hess_init_impl(expr *node) node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); } - /* prepare sparsity pattern of csc conversion */ - jacobian_csc_init(x); - jacobian_csc_init(y); - CSC_Matrix *Jg1 = x->work->jacobian_csc; - CSC_Matrix *Jg2 = y->work->jacobian_csc; + /* For sparse matrices we need the CSC cache to be valid for the + BTA_matrices_alloc / BTDA_matrices_fill_values calls below. 
*/ + if (!x->jacobian->is_permuted_dense) + { + sparse_matrix_ensure_csc_cache((sparse_matrix *) x->jacobian); + } + if (!y->jacobian->is_permuted_dense) + { + sparse_matrix_ensure_csc_cache((sparse_matrix *) y->jacobian); + } /* compute sparsity of C and prepare CT */ - CSR_Matrix *C = BTA_alloc(Jg1, Jg2); - node->work->iwork = (int *) SP_MALLOC(C->m * sizeof(int)); - CSR_Matrix *CT = AT_alloc(C, node->work->iwork); + matrix *C = BTA_matrices_alloc(x->jacobian, y->jacobian); + matrix *CT = C->transpose_alloc(C); /* initialize wsum_hessians of children */ wsum_hess_init(x); wsum_hess_init(y); elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - mul_node->CSR_work1 = C; - mul_node->CSR_work2 = CT; + mul_node->C = C; + mul_node->CT = CT; /* compute sparsity pattern of H = C + C^T + term2 + term3 (we also fill index maps telling us where to accumulate each element of each matrix in the sum) */ int *maps[4]; - node->wsum_hess = sum_4_csr_alloc(C, CT, x->wsum_hess, y->wsum_hess, maps); + CSR_matrix *hess = sum_4_csr_alloc(C->to_csr(C), CT->to_csr(CT), + x->wsum_hess->to_csr(x->wsum_hess), + y->wsum_hess->to_csr(y->wsum_hess), maps); + node->wsum_hess = new_sparse_matrix(hess); mul_node->idx_map_C = maps[0]; mul_node->idx_map_CT = maps[1]; mul_node->idx_map_Hx = maps[2]; @@ -193,44 +206,34 @@ static void eval_wsum_hess(expr *node, const double *w) bool is_x_affine = x->is_affine(x); bool is_y_affine = y->is_affine(y); // ---------------------------------------------------------------------- - // convert Jacobians of children to CSC format - // (we only need to do this once if the child is affine) - // TODO: what if we have parameters? Should we set jacobian_csc_filled - // to false whenever parameters change value? + // Refresh each operand's CSC_matrix cache as needed for the (Sparse, + // Sparse) dispatch path. For PD operands, refresh_csc_values is a no-op. 
+ // The jacobian_csc_filled flag preserves the affine optimization: we only + // refresh on the first eval for affine children. // ---------------------------------------------------------------------- if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, - x->work->csc_work); - + x->jacobian->refresh_csc_values(x->jacobian); if (is_x_affine) { x->work->jacobian_csc_filled = true; } } - if (!y->work->jacobian_csc_filled) { - csr_to_csc_fill_values(y->jacobian, y->work->jacobian_csc, - y->work->csc_work); - + y->jacobian->refresh_csc_values(y->jacobian); if (is_y_affine) { y->work->jacobian_csc_filled = true; } } - CSC_Matrix *Jg1 = x->work->jacobian_csc; - CSC_Matrix *Jg2 = y->work->jacobian_csc; - // --------------------------------------------------------------- // compute C and CT // --------------------------------------------------------------- elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - CSR_Matrix *C = mul_node->CSR_work1; - CSR_Matrix *CT = mul_node->CSR_work2; - BTDA_fill_values(Jg1, Jg2, w, C); - AT_fill_values(C, CT, node->work->iwork); + BTDA_matrices_fill_values(x->jacobian, w, y->jacobian, mul_node->C); + mul_node->C->transpose_fill_values(mul_node->C, mul_node->CT); // --------------------------------------------------------------- // compute term2 and term 3 @@ -257,18 +260,22 @@ static void eval_wsum_hess(expr *node, const double *w) // compute H = C + C^T + term2 + term3 // --------------------------------------------------------------- memset(node->wsum_hess->x, 0, node->wsum_hess->nnz * sizeof(double)); - accumulator(C, mul_node->idx_map_C, node->wsum_hess->x); - accumulator(CT, mul_node->idx_map_CT, node->wsum_hess->x); - accumulator(x->wsum_hess, mul_node->idx_map_Hx, node->wsum_hess->x); - accumulator(y->wsum_hess, mul_node->idx_map_Hy, node->wsum_hess->x); + accumulator(mul_node->C->x, mul_node->C->nnz, mul_node->idx_map_C, + node->wsum_hess->x); + 
accumulator(mul_node->CT->x, mul_node->CT->nnz, mul_node->idx_map_CT, + node->wsum_hess->x); + accumulator(x->wsum_hess->x, x->wsum_hess->nnz, mul_node->idx_map_Hx, + node->wsum_hess->x); + accumulator(y->wsum_hess->x, y->wsum_hess->nnz, mul_node->idx_map_Hy, + node->wsum_hess->x); } } static void free_type_data(expr *node) { elementwise_mult_expr *mul_node = (elementwise_mult_expr *) node; - free_csr_matrix(mul_node->CSR_work1); - free_csr_matrix(mul_node->CSR_work2); + free_matrix(mul_node->C); + free_matrix(mul_node->CT); free(mul_node->idx_map_C); free(mul_node->idx_map_CT); free(mul_node->idx_map_Hx); diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index ecf2741..12be5a1 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -17,7 +17,8 @@ */ #include "atoms/bivariate_restricted_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -58,27 +59,28 @@ static void jacobian_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size + 1); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size + 1; + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size + 1); + jac->p[0] = 0; + jac->p[1] = x->size + 1; /* if x has lower idx than y*/ if (x->var_id < y->var_id) { for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } - node->jacobian->i[x->size] = y->var_id; + jac->i[x->size] = y->var_id; } else /* y has lower idx than x */ { - node->jacobian->i[0] = y->var_id; + jac->i[0] = y->var_id; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j + 1] = x->var_id + j; + jac->i[j + 1] = x->var_id + j; } } + node->jacobian = new_sparse_matrix(jac); } else /* left node 
is not a variable (guaranteed to be a linear operator) */ { @@ -87,46 +89,48 @@ static void jacobian_init_impl(expr *node) /* compute required allocation and allocate jacobian */ bool *col_nz = (bool *) SP_CALLOC( node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ - int nonzero_cols = count_nonzero_cols(x->jacobian, col_nz); - node->jacobian = new_csr_matrix(1, node->n_vars, nonzero_cols + 1); + CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); + int nonzero_cols = count_nonzero_cols(Jx, col_nz); + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, nonzero_cols + 1); /* precompute column indices */ - node->jacobian->nnz = 0; + jac->nnz = 0; for (int j = 0; j < node->n_vars; j++) { if (col_nz[j]) { - node->jacobian->i[node->jacobian->nnz] = j; - node->jacobian->nnz++; + jac->i[jac->nnz] = j; + jac->nnz++; } } - assert(nonzero_cols == node->jacobian->nnz); + assert(nonzero_cols == jac->nnz); free(col_nz); /* insert y variable index at correct position */ - insert_idx(y->var_id, node->jacobian->i, node->jacobian->nnz); - node->jacobian->nnz += 1; - node->jacobian->p[0] = 0; - node->jacobian->p[1] = node->jacobian->nnz; + insert_idx(y->var_id, jac->i, jac->nnz); + jac->nnz += 1; + jac->p[0] = 0; + jac->p[1] = jac->nnz; /* find position where y should be inserted */ node->work->iwork = (int *) SP_MALLOC(sizeof(int)); - for (int j = 0; j < node->jacobian->nnz; j++) + for (int j = 0; j < jac->nnz; j++) { - if (node->jacobian->i[j] == y->var_id) + if (jac->i[j] == y->var_id) { node->work->iwork[0] = j; break; } } - /* prepare CSC form of child jacobian for chain rule. + node->jacobian = new_sparse_matrix(jac); + + /* prepare CSC_matrix form of child jacobian for chain rule. * For a linear operator the values are constant, so fill * them once here. 
*/ jacobian_csc_init(x); - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, - x->work->csc_work); + csr_to_csc_fill_values(Jx, x->work->jacobian_csc, x->work->csc_work); } } @@ -134,6 +138,7 @@ static void eval_jacobian(expr *node) { expr *x = node->left; expr *y = node->right; + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) @@ -143,16 +148,16 @@ static void eval_jacobian(expr *node) { for (int j = 0; j < x->size; j++) { - node->jacobian->x[j] = (2.0 * x->value[j]) / y->value[0]; + jac->x[j] = (2.0 * x->value[j]) / y->value[0]; } - node->jacobian->x[x->size] = -node->value[0] / y->value[0]; + jac->x[x->size] = -node->value[0] / y->value[0]; } else /* y has lower idx than x */ { - node->jacobian->x[0] = -node->value[0] / y->value[0]; + jac->x[0] = -node->value[0] / y->value[0]; for (int j = 0; j < x->size; j++) { - node->jacobian->x[j + 1] = (2.0 * x->value[j]) / y->value[0]; + jac->x[j + 1] = (2.0 * x->value[j]) / y->value[0]; } } } @@ -164,13 +169,13 @@ static void eval_jacobian(expr *node) node->work->dwork[j] = (2.0 * x->value[j]) / y->value[0]; } - /* chain rule (no derivative wrt y) using CSC format */ - yTA_fill_values(x->work->jacobian_csc, node->work->dwork, node->jacobian); + /* chain rule (no derivative wrt y) using CSC_matrix format */ + yTA_fill_values(x->work->jacobian_csc, node->work->dwork, jac); /* insert derivative wrt y at right place (for correctness this assumes that y does not appear in the numerator, but this will always be the case since y is a new variable for the denominator */ - node->jacobian->x[node->work->iwork[0]] = -node->value[0] / y->value[0]; + jac->x[node->work->iwork[0]] = -node->value[0] / y->value[0]; } } @@ -184,9 +189,8 @@ static void wsum_hess_init_impl(expr *node) /* if left node is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->wsum_hess = - new_csr_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); - CSR_Matrix *H = 
node->wsum_hess; + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * x->size + 1); + node->wsum_hess = new_sparse_matrix(H); /* if x has lower idx than y*/ if (var_id_x < var_id_y) diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index d2ab721..ecd4520 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -45,7 +46,7 @@ static void forward_vector_args(expr *node, const double *u) static void jacobian_init_vectors_args(expr *node) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); expr *x = node->left; expr *y = node->right; @@ -57,29 +58,29 @@ static void jacobian_init_vectors_args(expr *node) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = j + x->var_id; - node->jacobian->i[2 * j + 1] = j + y->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = j + x->var_id; + jac->i[2 * j + 1] = j + y->var_id; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = j + y->var_id; - node->jacobian->i[2 * j + 1] = j + x->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = j + y->var_id; + jac->i[2 * j + 1] = j + x->var_id; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_vector_args(expr *node) { expr *x = node->left; expr *y = node->right; - /* if x has lower variable idx than y */ if (x->var_id < y->var_id) { @@ -101,7 +102,7 @@ static void eval_jacobian_vector_args(expr *node) static void wsum_hess_init_vector_args(expr *node) { - node->wsum_hess = 
new_csr_matrix(node->n_vars, node->n_vars, 4 * node->size); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 4 * node->size); expr *x = node->left; expr *y = node->right; @@ -121,9 +122,9 @@ static void wsum_hess_init_vector_args(expr *node) /* var1 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var1_id + i] = 2 * i; - node->wsum_hess->i[2 * i] = var1_id + i; - node->wsum_hess->i[2 * i + 1] = var2_id + i; + H->p[var1_id + i] = 2 * i; + H->i[2 * i] = var1_id + i; + H->i[2 * i + 1] = var2_id + i; } int nnz = 2 * node->size; @@ -131,21 +132,22 @@ static void wsum_hess_init_vector_args(expr *node) /* rows between var1 and var2 */ for (i = var1_id + node->size; i < var2_id; i++) { - node->wsum_hess->p[i] = nnz; + H->p[i] = nnz; } /* var2 rows of Hessian */ for (i = 0; i < node->size; i++) { - node->wsum_hess->p[var2_id + i] = nnz + 2 * i; + H->p[var2_id + i] = nnz + 2 * i; } - memcpy(node->wsum_hess->i + nnz, node->wsum_hess->i, nnz * sizeof(int)); + memcpy(H->i + nnz, H->i, nnz * sizeof(int)); /* remaining rows */ for (i = var2_id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = 4 * node->size; + H->p[i] = 4 * node->size; } + node->wsum_hess = new_sparse_matrix(H); } static void eval_wsum_hess_vector_args(expr *node, const double *w) diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index 5f09cc9..a356289 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -50,35 +51,35 @@ static void jacobian_init_scalar_vector(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = x->var_id; - node->jacobian->i[2 * j + 1] = y->var_id + j; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = x->var_id; + jac->i[2 * j + 1] = y->var_id + j; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = y->var_id + j; - node->jacobian->i[2 * j + 1] = x->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = y->var_id + j; + jac->i[2 * j + 1] = x->var_id; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_scalar_vector(expr *node) { expr *x = node->left; expr *y = node->right; - if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) @@ -104,8 +105,8 @@ static void wsum_hess_init_scalar_vector(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = + new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { @@ -161,6 +162,7 @@ static void wsum_hess_init_scalar_vector(expr *node) H->p[i] = 3 * node->size + 1; } } + node->wsum_hess = new_sparse_matrix(H); } static void eval_wsum_hess_scalar_vector(expr *node, const double *w) diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c 
b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index f425357..f2b7928 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/bivariate_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -50,35 +51,35 @@ static void jacobian_init_vector_scalar(expr *node) assert(x->var_id != NOT_A_VARIABLE && y->var_id != NOT_A_VARIABLE); assert(x->var_id != y->var_id); - node->jacobian = new_csr_matrix(node->size, node->n_vars, 2 * node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, 2 * node->size); if (x->var_id < y->var_id) { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = x->var_id + j; - node->jacobian->i[2 * j + 1] = y->var_id; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = x->var_id + j; + jac->i[2 * j + 1] = y->var_id; + jac->p[j] = 2 * j; } } else { for (int j = 0; j < node->size; j++) { - node->jacobian->i[2 * j] = y->var_id; - node->jacobian->i[2 * j + 1] = x->var_id + j; - node->jacobian->p[j] = 2 * j; + jac->i[2 * j] = y->var_id; + jac->i[2 * j + 1] = x->var_id + j; + jac->p[j] = 2 * j; } } - node->jacobian->p[node->size] = 2 * node->size; + jac->p[node->size] = 2 * node->size; + node->jacobian = new_sparse_matrix(jac); } static void eval_jacobian_vector_scalar(expr *node) { expr *x = node->left; expr *y = node->right; - if (x->var_id < y->var_id) { for (int i = 0; i < node->size; i++) @@ -104,8 +105,8 @@ static void wsum_hess_init_vector_scalar(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = + new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { @@ -161,6 +162,7 @@ static void wsum_hess_init_vector_scalar(expr *node) 
H->p[i] = 3 * node->size + 1; } } + node->wsum_hess = new_sparse_matrix(H); } static void eval_wsum_hess_vector_scalar(expr *node, const double *w) diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index b9735a4..352f784 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -17,9 +17,10 @@ */ #include "atoms/elementwise_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" -#include "utils/CSR_sum.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/matrix_sum.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -32,20 +33,20 @@ void jacobian_init_elementwise(expr *node) /* if the variable is a child */ if (child->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - node->jacobian->i[j] = j + child->var_id; + jac->p[j] = j; + jac->i[j] = j + child->var_id; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } else { /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */ jacobian_init(child); - CSR_Matrix *Jg = child->jacobian; - node->jacobian = new_csr_copy_sparsity(Jg); + node->jacobian = child->jacobian->copy_sparsity(child->jacobian); node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); node->work->local_jac_diag = (double *) SP_MALLOC(node->size * sizeof(double)); @@ -64,11 +65,11 @@ void eval_jacobian_elementwise(expr *node) { /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */ child->eval_jacobian(child); - CSR_Matrix *Jg = child->jacobian; node->local_jacobian(node, node->work->local_jac_diag); memcpy(node->work->dwork, node->work->local_jac_diag, 
node->size * sizeof(double)); - DA_fill_values(node->work->dwork, Jg, node->jacobian); + child->jacobian->DA_fill_values(node->work->dwork, child->jacobian, + node->jacobian); } } @@ -81,18 +82,19 @@ void wsum_hess_init_elementwise(expr *node) /* if the variable is a child */ if (id != NOT_A_VARIABLE) { - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { - node->wsum_hess->p[id + i] = i; - node->wsum_hess->i[i] = id + i; + hess->p[id + i] = i; + hess->i[i] = id + i; } for (i = id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = node->size; + hess->p[i] = node->size; } + node->wsum_hess = new_sparse_matrix(hess); } else { @@ -101,28 +103,27 @@ void wsum_hess_init_elementwise(expr *node) term2 = sum_i (J_f^T w)_i^T Hg_i. For elementwise functions, D is diagonal. */ - jacobian_csc_init(child); - CSC_Matrix *Jg = child->work->jacobian_csc; - if (child->is_affine(child)) { - node->wsum_hess = ATA_alloc(Jg); + node->wsum_hess = child->jacobian->ATA_alloc(child->jacobian); } else { /* term1: Jg^T @ D @ Jg */ - node->work->hess_term1 = ATA_alloc(Jg); + node->work->hess_term1 = child->jacobian->ATA_alloc(child->jacobian); - /* term2: child's Hessian */ + /* term2: child's Hessian (mirror its sparsity polymorphically) */ wsum_hess_init(child); - CSR_Matrix *Hg = child->wsum_hess; - node->work->hess_term2 = new_csr_copy_sparsity(Hg); + node->work->hess_term2 = + child->wsum_hess->copy_sparsity(child->wsum_hess); /* wsum_hess = term1 + term2 */ - int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + int max_nnz = + node->work->hess_term1->nnz + node->work->hess_term2->nnz; + node->wsum_hess = + new_sparse_matrix_alloc(node->n_vars, node->n_vars, 
max_nnz); + sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } } @@ -139,27 +140,27 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) { if (child->is_affine(child)) { + /* Refresh the child Jacobian's CSC_matrix mirror once; subsequent calls + skip since the affine child's values don't change. */ if (!child->work->jacobian_csc_filled) { - csr_to_csc_fill_values(child->jacobian, child->work->jacobian_csc, - child->work->csc_work); + child->jacobian->refresh_csc_values(child->jacobian); child->work->jacobian_csc_filled = true; } node->local_wsum_hess(node, node->work->dwork, w); - ATDA_fill_values(child->work->jacobian_csc, node->work->dwork, - node->wsum_hess); + child->jacobian->ATDA_fill_values(child->jacobian, node->work->dwork, + node->wsum_hess); } else { - /* refresh CSC jacobian values */ - csr_to_csc_fill_values(child->jacobian, child->work->jacobian_csc, - child->work->csc_work); + /* Non-affine child: values change every iteration, must refresh. 
*/ + child->jacobian->refresh_csc_values(child->jacobian); /* term1: Jg^T @ D @ Jg */ node->local_wsum_hess(node, node->work->dwork, w); - ATDA_fill_values(child->work->jacobian_csc, node->work->dwork, - node->work->hess_term1); + child->jacobian->ATDA_fill_values(child->jacobian, node->work->dwork, + node->work->hess_term1); /* term2: child Hessian with weight Jf^T w */ memcpy(node->work->dwork, node->work->local_jac_diag, @@ -174,8 +175,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) child->wsum_hess->nnz * sizeof(double)); /* wsum_hess = term1 + term2 */ - sum_csr_fill_values(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + sum_matrices_fill_values(node->work->hess_term1, + node->work->hess_term2, node->wsum_hess); } } } diff --git a/src/atoms/elementwise_restricted_dom/atanh.c b/src/atoms/elementwise_restricted_dom/atanh.c index 79a08f2..c2c71c2 100644 --- a/src/atoms/elementwise_restricted_dom/atanh.c +++ b/src/atoms/elementwise_restricted_dom/atanh.c @@ -30,19 +30,21 @@ static void atanh_forward(expr *node, const double *u) static void atanh_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = 1.0 / (1.0 - x[j] * x[j]); + jx[j] = 1.0 / (1.0 - x[j] * x[j]); } } static void atanh_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { double c = 1.0 - x[j] * x[j]; - node->wsum_hess->x[j] = w[j] * (2.0 * x[j]) / (c * c); + hx[j] = w[j] * (2.0 * x[j]) / (c * c); } } diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index 9a2f561..5c88733 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/elementwise_restricted_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include @@ -23,13 +24,14 @@ void jacobian_init_restricted(expr *node) { expr *child = node->left; - node->jacobian = new_csr_matrix(node->size, node->n_vars, node->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, node->size); for (int j = 0; j < node->size; j++) { - node->jacobian->p[j] = j; - node->jacobian->i[j] = j + child->var_id; + jac->p[j] = j; + jac->i[j] = j + child->var_id; } - node->jacobian->p[node->size] = node->size; + jac->p[node->size] = node->size; + node->jacobian = new_sparse_matrix(jac); } void wsum_hess_init_restricted(expr *node) @@ -38,18 +40,19 @@ void wsum_hess_init_restricted(expr *node) int id = child->var_id; int i; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, node->size); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, node->size); for (i = 0; i < node->size; i++) { - node->wsum_hess->p[id + i] = i; - node->wsum_hess->i[i] = id + i; + hess->p[id + i] = i; + hess->i[i] = id + i; } for (i = id + node->size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = node->size; + hess->p[i] = node->size; } + node->wsum_hess = new_sparse_matrix(hess); } bool is_affine_restricted(const expr *node) diff --git a/src/atoms/elementwise_restricted_dom/entr.c b/src/atoms/elementwise_restricted_dom/entr.c index 53ac275..c3d29cb 100644 --- a/src/atoms/elementwise_restricted_dom/entr.c +++ b/src/atoms/elementwise_restricted_dom/entr.c @@ -32,18 +32,20 @@ static void entr_forward(expr *node, const double *u) static void entr_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = -log(x[j]) - 1.0; + jx[j] = -log(x[j]) - 1.0; } } static void entr_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) 
{ - node->wsum_hess->x[j] = -w[j] / x[j]; + hx[j] = -w[j] / x[j]; } } diff --git a/src/atoms/elementwise_restricted_dom/log.c b/src/atoms/elementwise_restricted_dom/log.c index ce96425..d9753e1 100644 --- a/src/atoms/elementwise_restricted_dom/log.c +++ b/src/atoms/elementwise_restricted_dom/log.c @@ -31,18 +31,20 @@ static void log_forward(expr *node, const double *u) static void log_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { - node->jacobian->x[j] = 1.0 / x[j]; + jx[j] = 1.0 / x[j]; } } static void log_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { - node->wsum_hess->x[j] = -w[j] / (x[j] * x[j]); + hx[j] = -w[j] / (x[j] * x[j]); } } diff --git a/src/atoms/elementwise_restricted_dom/tan.c b/src/atoms/elementwise_restricted_dom/tan.c index ccf945e..3a4e4dc 100644 --- a/src/atoms/elementwise_restricted_dom/tan.c +++ b/src/atoms/elementwise_restricted_dom/tan.c @@ -30,20 +30,22 @@ static void tan_forward(expr *node, const double *u) static void tan_eval_jacobian(expr *node) { double *x = node->left->value; + double *jx = node->jacobian->x; for (int j = 0; j < node->size; j++) { double c = cos(x[j]); - node->jacobian->x[j] = 1.0 / (c * c); + jx[j] = 1.0 / (c * c); } } static void tan_eval_wsum_hess(expr *node, const double *w) { double *x = node->left->value; + double *hx = node->wsum_hess->x; for (int j = 0; j < node->size; j++) { double c = cos(x[j]); - node->wsum_hess->x[j] = 2.0 * w[j] * node->value[j] / (c * c); + hx[j] = 2.0 * w[j] * node->value[j] / (c * c); } } diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index 083e4c1..aeb7226 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -16,6 +16,7 @@ * limitations under the License. 
*/ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -71,13 +72,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size; + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size); + jac->p[0] = 0; + jac->p[1] = x->size; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } + node->jacobian = new_sparse_matrix(jac); } else { @@ -96,21 +98,22 @@ static void eval_jacobian(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { + double *jx = node->jacobian->x; if (num_of_zeros == 0) { for (int j = 0; j < x->size; j++) { - node->jacobian->x[j] = node->value[0] / x->value[j]; + jx[j] = node->value[0] / x->value[j]; } } else if (num_of_zeros == 1) { - memset(node->jacobian->x, 0, sizeof(double) * x->size); - node->jacobian->x[pnode->zero_index] = pnode->prod_nonzero; + memset(jx, 0, sizeof(double) * x->size); + jx[pnode->zero_index] = pnode->prod_nonzero; } else { - memset(node->jacobian->x, 0, sizeof(double) * x->size); + memset(jx, 0, sizeof(double) * x->size); } } else @@ -128,21 +131,21 @@ static void wsum_hess_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - /* allocate n_vars x n_vars CSR matrix with dense block */ + /* allocate n_vars x n_vars CSR_matrix with dense block */ int block_size = x->size; int nnz = block_size * block_size; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); + CSR_matrix *hess = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the dense block */ for (int i = 0; i < block_size; i++) { - node->wsum_hess->p[x->var_id + i] = i * block_size; + hess->p[x->var_id + i] = i * block_size; } /* fill row pointers for rows after the block */ for 
(int i = x->var_id + block_size; i <= node->n_vars; i++) { - node->wsum_hess->p[i] = nnz; + hess->p[i] = nnz; } /* fill column indices for the dense block */ @@ -150,9 +153,10 @@ static void wsum_hess_init_impl(expr *node) { for (int j = 0; j < block_size; j++) { - node->wsum_hess->i[i * block_size + j] = x->var_id + j; + hess->i[i * block_size + j] = x->var_id + j; } } + node->wsum_hess = new_sparse_matrix(hess); } else { diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index ac2583e..2e0e971 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -16,6 +16,7 @@ * limitations under the License. */ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -81,14 +82,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d2 nnzs) */ for (int row = 0; row < x->d1; row++) { - node->jacobian->p[row] = row * x->d2; + jac->p[row] = row * x->d2; } - node->jacobian->p[x->d1] = x->size; + jac->p[x->d1] = x->size; /* set column indices */ for (int row = 0; row < x->d1; row++) @@ -96,9 +97,10 @@ static void jacobian_init_impl(expr *node) int start = row * x->d2; for (int col = 0; col < x->d2; col++) { - node->jacobian->i[start + col] = x->var_id + col * x->d1 + row; + jac->i[start + col] = x->var_id + col * x->d1 + row; } } + node->jacobian = new_sparse_matrix(jac); } else { @@ -161,8 +163,7 @@ static void wsum_hess_init_impl(expr *node) /* each row i has d2-1 non-zero entries, with column indices corresponding to the columns in that row (except the diagonal element). 
*/ int nnz = x->d1 * x->d2 * (x->d2 - 1); - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill sparsity pattern */ int nnz_per_row = x->d2 - 1; @@ -192,6 +193,7 @@ static void wsum_hess_init_impl(expr *node) { H->p[i] = nnz; } + node->wsum_hess = new_sparse_matrix(H); } else { @@ -205,7 +207,7 @@ static inline void wsum_hess_row_no_zeros(expr *node, const double *w, int row, int d2) { expr *x = node->left; - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double scale = w[row] * node->value[row]; /* for each variable xk in this row, fill in Hessian entries @@ -232,7 +234,7 @@ static inline void wsum_hess_row_one_zero(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; int p = pnode->zero_index[row]; /* zero column index */ double w_prod = w[row] * pnode->prod_nonzero[row]; @@ -277,7 +279,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, { expr *x = node->left; prod_axis *pnode = (prod_axis *) node; - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; /* find indices p and q where row has zeros */ @@ -330,7 +332,7 @@ static inline void wsum_hess_row_two_zeros(expr *node, const double *w, int row, static inline void wsum_hess_row_many_zeros(expr *node, int row, int d2) { - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = node->wsum_hess->to_csr(node->wsum_hess); double *H_vals = H->x; expr *x = node->left; diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index 20fac1b..55bedd2 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -16,6 +16,7 @@ * limitations under the 
License. */ #include "atoms/non_elementwise_full_dom.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -76,14 +77,14 @@ static void jacobian_init_impl(expr *node) /* if x is a variable */ if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(node->size, node->n_vars, x->size); + CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, x->size); /* set row pointers (each row has d1 nnzs) */ for (int row = 0; row < x->d2; row++) { - node->jacobian->p[row] = row * x->d1; + jac->p[row] = row * x->d1; } - node->jacobian->p[x->d2] = x->size; + jac->p[x->d2] = x->size; /* set column indices */ for (int col = 0; col < x->d2; col++) @@ -91,9 +92,10 @@ static void jacobian_init_impl(expr *node) int start = col * x->d1; for (int i = 0; i < x->d1; i++) { - node->jacobian->i[start + i] = x->var_id + start + i; + jac->i[start + i] = x->var_id + start + i; } } + node->jacobian = new_sparse_matrix(jac); } else { @@ -154,8 +156,7 @@ static void wsum_hess_init_impl(expr *node) { /* Hessian has block diagonal structure: d2 blocks of size d1 x d1 */ int nnz = x->d2 * x->d1 * x->d1; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, nnz); - CSR_Matrix *H = node->wsum_hess; + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, nnz); /* fill row pointers for the variable's rows (block diagonal) */ for (int i = 0; i < x->size; i++) @@ -184,6 +185,7 @@ static void wsum_hess_init_impl(expr *node) } } } + node->wsum_hess = new_sparse_matrix(H); } else { diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index 72474df..d49c16b 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -17,9 +17,10 @@ */ #include "atoms/non_elementwise_full_dom.h" #include "subexpr.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_sum.h" +#include "utils/CSC_matrix.h" +#include "utils/matrix_sum.h" #include "utils/cblas_wrapper.h" +#include "utils/sparse_matrix.h" #include 
"utils/tracked_alloc.h" #include #include @@ -35,7 +36,7 @@ static void forward(expr *node, const double *u) x->forward(x, u); /* local forward pass */ - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; Ax_csr(Q, x->value, node->work->dwork, 0); node->value[0] = 0.0; @@ -51,27 +52,28 @@ static void jacobian_init_impl(expr *node) if (x->var_id != NOT_A_VARIABLE) { - node->jacobian = new_csr_matrix(1, node->n_vars, x->size); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = x->size; + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, x->size); + jac->p[0] = 0; + jac->p[1] = x->size; for (int j = 0; j < x->size; j++) { - node->jacobian->i[j] = x->var_id + j; + jac->i[j] = x->var_id + j; } + node->jacobian = new_sparse_matrix(jac); } else { /* chain rule: J = 2 * (Q @ f(x))^T * J_f */ jacobian_init(x); jacobian_csc_init(x); - CSC_Matrix *J_csc = x->work->jacobian_csc; + CSC_matrix *J_csc = x->work->jacobian_csc; /* allocate the right number of nnz */ int nnz = count_nonzero_cols_csc(J_csc); - node->jacobian = new_csr_matrix(1, node->n_vars, nnz); - node->jacobian->p[0] = 0; - node->jacobian->p[1] = nnz; + CSR_matrix *jac = new_CSR_matrix(1, node->n_vars, nnz); + jac->p[0] = 0; + jac->p[1] = nnz; /* fill sparsity pattern */ int idx = 0; @@ -79,22 +81,24 @@ static void jacobian_init_impl(expr *node) { if (J_csc->p[j + 1] > J_csc->p[j]) { - node->jacobian->i[idx++] = j; + jac->i[idx++] = j; } } + node->jacobian = new_sparse_matrix(jac); } } static void eval_jacobian(expr *node) { expr *x = node->left; - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); if (x->var_id != NOT_A_VARIABLE) { /* jacobian = 2 * (Q @ x)^T */ - Ax_csr(Q, x->value, node->jacobian->x, 0); - cblas_dscal(x->size, 2.0, node->jacobian->x, 1); + Ax_csr(Q, x->value, jac->x, 0); + cblas_dscal(x->size, 2.0, jac->x, 1); } else { @@ -103,7 +107,7 @@ static 
void eval_jacobian(expr *node) if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, x->work->jacobian_csc, + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc, x->work->csc_work); if (x->is_affine(x)) @@ -114,20 +118,20 @@ static void eval_jacobian(expr *node) /* The jacobian has same values as the gradient, which is J_f^T (Q @ f(x)). Here, dwork stores Q @ f(x) from forward */ - yTA_fill_values(x->work->jacobian_csc, node->work->dwork, node->jacobian); + yTA_fill_values(x->work->jacobian_csc, node->work->dwork, jac); - cblas_dscal(node->jacobian->nnz, 2.0, node->jacobian->x, 1); + cblas_dscal(jac->nnz, 2.0, jac->x, 1); } } static void wsum_hess_init_impl(expr *node) { - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; expr *x = node->left; if (x->var_id != NOT_A_VARIABLE) { - CSR_Matrix *H = new_csr_matrix(node->n_vars, node->n_vars, Q->nnz); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, Q->nnz); /* set global row pointers */ memcpy(H->p + x->var_id, Q->p, (x->size + 1) * sizeof(int)); @@ -142,7 +146,7 @@ static void wsum_hess_init_impl(expr *node) H->i[i] = Q->i[i] + x->var_id; } - node->wsum_hess = H; + node->wsum_hess = new_sparse_matrix(H); } else { @@ -157,28 +161,30 @@ static void wsum_hess_init_impl(expr *node) /* jacobian_csc_init(x) already called in jacobian_init */ quad_form_expr *qnode = (quad_form_expr *) node; - CSC_Matrix *Jf = x->work->jacobian_csc; + CSC_matrix *Jf = x->work->jacobian_csc; /* term1 = Jf^T W Jf = Jf^T B*/ - CSC_Matrix *B = symBA_alloc(Q, Jf); + CSC_matrix *B = symBA_alloc(Q, Jf); qnode->QJf = B; - node->work->hess_term1 = BTA_alloc(Jf, B); + node->work->hess_term1 = new_sparse_matrix(BTA_alloc(Jf, B)); /* term2 = sum_i (Qf(x))_i nabla^2 f_i */ wsum_hess_init(x); - node->work->hess_term2 = new_csr_copy_sparsity(x->wsum_hess); + node->work->hess_term2 = x->wsum_hess->copy_sparsity(x->wsum_hess); /* hess = term1 + term2 */ - 
int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, max_nnz); - sum_csr_alloc(node->work->hess_term1, node->work->hess_term2, - node->wsum_hess); + int max_nnz = + node->work->hess_term1->nnz + node->work->hess_term2->nnz; + node->wsum_hess = + new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz); + sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } static void eval_wsum_hess(expr *node, const double *w) { - CSR_Matrix *Q = ((quad_form_expr *) node)->Q; + CSR_matrix *Q = ((quad_form_expr *) node)->Q; expr *x = node->left; double two_w = 2.0 * w[0]; @@ -191,11 +197,11 @@ static void eval_wsum_hess(expr *node, const double *w) } else { - /* fill the CSC representation of the Jacobian of the child */ - CSC_Matrix *Jf = x->work->jacobian_csc; + /* fill the CSC_matrix representation of the Jacobian of the child */ + CSC_matrix *Jf = x->work->jacobian_csc; if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian, Jf, x->work->csc_work); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, x->work->csc_work); if (x->is_affine(x)) { @@ -203,9 +209,8 @@ static void eval_wsum_hess(expr *node, const double *w) } } - CSC_Matrix *QJf = ((quad_form_expr *) node)->QJf; - CSR_Matrix *term1 = node->work->hess_term1; - CSR_Matrix *term2 = node->work->hess_term2; + CSC_matrix *QJf = ((quad_form_expr *) node)->QJf; + CSR_matrix *term1 = node->work->hess_term1->to_csr(node->work->hess_term1); /* term1 = J_f^T Q J_f = J_f^T B */ BA_fill_values(Q, Jf, QJf); @@ -213,25 +218,27 @@ static void eval_wsum_hess(expr *node, const double *w) /* term2 */ x->eval_wsum_hess(x, node->work->dwork); - memcpy(term2->x, x->wsum_hess->x, x->wsum_hess->nnz * sizeof(double)); + memcpy(node->work->hess_term2->x, x->wsum_hess->x, + x->wsum_hess->nnz * sizeof(double)); /* scale both terms by 2w */ - cblas_dscal(term1->nnz, two_w, term1->x, 1); - 
cblas_dscal(term2->nnz, two_w, term2->x, 1); + cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x, 1); + cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x, 1); /* sum the two terms */ - sum_csr_fill_values(term1, term2, node->wsum_hess); + sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } static void free_type_data(expr *node) { quad_form_expr *qnode = (quad_form_expr *) node; - free_csr_matrix(qnode->Q); + free_CSR_matrix(qnode->Q); qnode->Q = NULL; if (qnode->QJf != NULL) { - free_csc_matrix(qnode->QJf); + free_CSC_matrix(qnode->QJf); qnode->QJf = NULL; } } @@ -243,7 +250,7 @@ static bool is_affine(const expr *node) return false; } -expr *new_quad_form(expr *left, CSR_Matrix *Q) +expr *new_quad_form(expr *left, CSR_matrix *Q) { assert(left->d1 == 1 || left->d2 == 1); /* left must be a vector */ quad_form_expr *qnode = (quad_form_expr *) SP_CALLOC(1, sizeof(quad_form_expr)); @@ -255,8 +262,8 @@ expr *new_quad_form(expr *left, CSR_Matrix *Q) expr_retain(left); /* Set type-specific field */ - qnode->Q = new_csr_matrix(Q->m, Q->n, Q->nnz); - copy_csr_matrix(Q, qnode->Q); + qnode->Q = new_CSR_matrix(Q->m, Q->n, Q->nnz); + copy_CSR_matrix(Q, qnode->Q); /* dwork stores the result of Q @ f(x) in the forward pass */ node->work->dwork = (double *) SP_MALLOC(left->size * sizeof(double)); diff --git a/src/expr.c b/src/expr.c index 64f0410..b99b266 100644 --- a/src/expr.c +++ b/src/expr.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ #include "expr.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include @@ -52,7 +52,7 @@ void jacobian_csc_init(expr *node) } node->work->csc_work = (int *) SP_MALLOC(node->n_vars * sizeof(int)); node->work->jacobian_csc = - csr_to_csc_alloc(node->jacobian, node->work->csc_work); + csr_to_csc_alloc(node->jacobian->to_csr(node->jacobian), node->work->csc_work); } void free_expr(expr *node) @@ -76,19 +76,19 @@ void free_expr(expr *node) /* free value array and derivative matrices */ free(node->value); - free_csr_matrix(node->jacobian); - free_csr_matrix(node->wsum_hess); + free_matrix(node->jacobian); + free_matrix(node->wsum_hess); /* free workspace */ if (node->work) { free(node->work->dwork); free(node->work->iwork); - free_csc_matrix(node->work->jacobian_csc); + free_CSC_matrix(node->work->jacobian_csc); free(node->work->csc_work); free(node->work->local_jac_diag); - free_csr_matrix(node->work->hess_term1); - free_csr_matrix(node->work->hess_term2); + free_matrix(node->work->hess_term1); + free_matrix(node->work->hess_term2); free(node->work); } diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c index a63bc4d..8637ee6 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -17,7 +17,8 @@ */ #include "old-code/old_affine.h" #include "subexpr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include #include @@ -32,7 +33,7 @@ static void forward(expr *node, const double *u) node->left->forward(node->left, u); /* y = A * x (A is stored as node->jacobian) */ - Ax_csr(node->jacobian, x->value, node->value, x->var_id); + Ax_csr(node->jacobian->to_csr(node->jacobian), x->value, node->value, x->var_id); /* y += b (if offset exists) */ if (lin_node->b != NULL) @@ -74,7 +75,8 @@ static void eval_jacobian(expr *node) static void wsum_hess_init_impl(expr *node) { /* Linear 
operator Hessian is always zero */ - node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, 0); + node->wsum_hess = + new_sparse_matrix(new_CSR_matrix(node->n_vars, node->n_vars, 0)); } static void eval_wsum_hess(expr *node, const double *w) @@ -84,7 +86,7 @@ static void eval_wsum_hess(expr *node, const double *w) (void) w; } -expr *new_linear(expr *u, const CSR_Matrix *A, const double *b) +expr *new_linear(expr *u, const CSR_matrix *A, const double *b) { assert(u->d2 == 1); /* Allocate the type-specific struct */ @@ -97,8 +99,9 @@ expr *new_linear(expr *u, const CSR_Matrix *A, const double *b) expr_retain(u); /* Store A directly as the jacobian (linear op jacobian is constant) */ - node->jacobian = new_csr_matrix(A->m, A->n, A->nnz); - copy_csr_matrix(A, node->jacobian); + CSR_matrix *jac = new_CSR_matrix(A->m, A->n, A->nnz); + copy_CSR_matrix(A, jac); + node->jacobian = new_sparse_matrix(jac); /* Initialize offset (copy b if provided, otherwise NULL) */ if (b != NULL) diff --git a/src/old-code/old_CSR.c b/src/old-code/old_CSR.c index 09225ea..e4137ab 100644 --- a/src/old-code/old_CSR.c +++ b/src/old-code/old_CSR.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ #include "old-code/old_CSR.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include #include -CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) +CSR_matrix *block_diag_repeat_csr(const CSR_matrix *A, int p) { assert(p > 0); @@ -28,7 +28,7 @@ CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_CSR_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int block = 0; block < p; block++) @@ -55,7 +55,7 @@ CSR_Matrix *block_diag_repeat_csr(const CSR_Matrix *A, int p) return A_kron; } -CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) +CSR_matrix *kron_identity_csr(const CSR_matrix *A, int p) { assert(p > 0); @@ -63,7 +63,7 @@ CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) int n = A->n; int nnz = A->nnz; - CSR_Matrix *A_kron = new_csr_matrix(m * p, n * p, nnz * p); + CSR_matrix *A_kron = new_CSR_matrix(m * p, n * p, nnz * p); int nnz_cursor = 0; for (int row_block = 0; row_block < m; row_block++) @@ -90,7 +90,7 @@ CSR_Matrix *kron_identity_csr(const CSR_Matrix *A, int p) return A_kron; } -void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C) +void Ax_csr_fill_values(const CSR_matrix *AT, const double *z, CSR_matrix *C) { int A_ncols = AT->m; @@ -116,7 +116,7 @@ void Ax_csr_fill_values(const CSR_Matrix *AT, const double *z, CSR_Matrix *C) } } -void csr_insert_value(CSR_Matrix *A, int col_idx, double value) +void csr_insert_value(CSR_matrix *A, int col_idx, double value) { assert(A->m == 1); @@ -144,7 +144,7 @@ void csr_insert_value(CSR_Matrix *A, int col_idx, double value) A->nnz++; } -void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y) +void Ax_csr_wo_offset(const CSR_matrix *A, const double *x, double *y) { for (int row = 0; row < A->m; row++) { @@ -157,9 +157,9 @@ void Ax_csr_wo_offset(const CSR_Matrix *A, const double *x, double *y) } } -void 
diag_csr_mult(const double *d, const CSR_Matrix *A, CSR_Matrix *C) +void diag_csr_mult(const double *d, const CSR_matrix *A, CSR_matrix *C) { - copy_csr_matrix(A, C); + copy_CSR_matrix(A, C); for (int row = 0; row < C->m; row++) { diff --git a/src/old-code/old_CSR_sum.c b/src/old-code/old_CSR_sum.c index e1cea50..c6a659d 100644 --- a/src/old-code/old_CSR_sum.c +++ b/src/old-code/old_CSR_sum.c @@ -16,13 +16,13 @@ * limitations under the License. */ #include "old-code/old_CSR_sum.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include #include #include -void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* A and B must be different from C */ assert(A != C && B != C); @@ -84,7 +84,7 @@ void sum_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) C->p[A->m] = C->nnz; } -void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C, +void sum_scaled_csr_matrices(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C, const double *d1, const double *d2) { C->nnz = 0; @@ -150,7 +150,7 @@ void sum_scaled_csr_matrices(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matri C->p[A->m] = C->nnz; } -void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, int_double_pair *pairs) +void sum_all_rows_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs) { assert(C->m == 1); C->n = A->n; @@ -185,7 +185,7 @@ void sum_all_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, int_double_pair *pairs C->p[1] = C->nnz; } -void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs, int row_block_size) { assert(A->m % row_block_size == 0); @@ -238,7 +238,7 @@ void sum_block_of_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, } } -void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, +void 
sum_evenly_spaced_rows_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs, int row_spacing) { assert(C->m == row_spacing); @@ -286,7 +286,7 @@ void sum_evenly_spaced_rows_csr(const CSR_Matrix *A, CSR_Matrix *C, } } -void sum_spaced_rows_into_row_csr(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr(const CSR_matrix *A, CSR_matrix *C, int_double_pair *pairs, int offset, int spacing) { assert(C->m == 1); diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c new file mode 100644 index 0000000..453bdef --- /dev/null +++ b/src/old-code/old_permuted_dense.c @@ -0,0 +1,286 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "old-code/old_permuted_dense.h" + +#include "utils/cblas_wrapper.h" +#include "utils/tracked_alloc.h" +#include +#include + +matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A) +{ + /* Cij != 0 only if i is in B's column permutation and column j of A + overlaps with column i of B. */ + + /* Gather the union of columns appearing in A's rows at positions + row_perm_B. Use a bitmap of size A->n for O(nnz) collection. 
 */ + int p = A->n; + char *seen = (char *) SP_CALLOC(p, sizeof(char)); + int s_A = 0; + for (int kk = 0; kk < B->m0; kk++) + { + int row = B->row_perm[kk]; + for (int e = A->p[row]; e < A->p[row + 1]; e++) + { + int j = A->i[e]; + if (!seen[j]) + { + seen[j] = 1; + s_A++; + } + } + } + + int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int)); + int idx = 0; + for (int j = 0; j < p; j++) + { + if (seen[j]) + { + col_active[idx++] = j; + } + } + + matrix *C = + new_permuted_dense(B->base.n, p, B->n0, s_A, B->col_perm, col_active, NULL); + free(col_active); + free(seen); + + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = B->n0 * + s_A) to fit the gather buffer A_sub_dense used by BTA_pd_csr / + BTDA_pd_csr_fill_values: shape (B->m0, s_A) row-major. The dgemm + reads it as (B->m0, s_A), so size B->m0 * s_A doubles suffices. */ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = B->m0 * s_A; + if (gather_size > C_pd->dwork_size) + { + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + } + return C; +} + +void BTA_pd_csr_fill_values(const permuted_dense *B, const CSR_matrix *A_csr, + permuted_dense *C) +{ + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; + + if (s_A == 0 || m0 == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use C->col_inv (pre-built by new_permuted_dense) as col_inv_out and + C->dwork as A_sub_dense; both are owned by C. dwork is sized at alloc + time to cover m0 * s_A; only that prefix is touched. 
*/ + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = B->row_perm[kk]; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = C->col_inv[j]; + /* jj should always be valid (we built col_perm from these entries), + but guard against asymmetry between alloc and fill calls. */ + if (jj >= 0) + { + A_sub_dense[kk * s_A + jj] = A_csr->x[e]; + } + } + } + + /* C->X = X_B^T @ A_sub_dense */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); +} + +/* BTDA variant of BTA_csr_pd: C->X = X_B^T diag(d) A_sub_dense. Folds d + into the scatter step. */ +void BTDA_pd_csr_fill_values(const permuted_dense *B, const double *d, + const CSR_matrix *A_csr, permuted_dense *C) +{ + int m0 = B->m0; + int dn_B = B->n0; + int s_A = C->n0; + + if (s_A == 0 || m0 == 0) + { + return; + } + + double *A_sub_dense = C->dwork; + size_t used = m0 * s_A; + memset(A_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = B->row_perm[kk]; + double dk = d ? d[row] : 1.0; + for (int e = A_csr->p[row]; e < A_csr->p[row + 1]; e++) + { + int j = A_csr->i[e]; + int jj = C->col_inv[j]; + if (jj >= 0) + { + A_sub_dense[kk * s_A + jj] = dk * A_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dn_B, s_A, m0, 1.0, B->X, + dn_B, A_sub_dense, s_A, 0.0, C->X, s_A); +} + +/* Legacy CSR-pd kernels (B=CSR, A=PD), formerly in src/utils/permuted_dense.c. + Production now goes through BTA_csc_pd_alloc / BTDA_csc_pd_fill_values; + these are kept here for reference + direct unit tests. */ + +matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) +{ + /* Gather the union of columns appearing in B's rows at positions + row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. 
*/ + int q = B_csr->n; + char *seen = (char *) SP_CALLOC(q, sizeof(char)); + int r_B = 0; + for (int kk = 0; kk < A->m0; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + if (!seen[i]) + { + seen[i] = 1; + r_B++; + } + } + } + + int *row_active = (int *) SP_MALLOC((r_B > 0 ? r_B : 1) * sizeof(int)); + int idx = 0; + for (int i = 0; i < q; i++) + { + if (seen[i]) + { + row_active[idx++] = i; + } + } + + matrix *C = + new_permuted_dense(q, A->base.n, r_B, A->n0, row_active, A->col_perm, NULL); + free(row_active); + free(seen); + + /* Upgrade `dwork` (currently sized for the Y-role at m0_C * n0_C = r_B * + A->n0) to fit the gather buffer B_sub_dense used by BTA_csr_pd / + BTDA_csr_pd_fill_values: shape (A->m0, r_B) row-major. */ + permuted_dense *C_pd = (permuted_dense *) C; + size_t gather_size = A->m0 * r_B; + if (gather_size > C_pd->dwork_size) + { + free(C_pd->dwork); + C_pd->dwork_size = gather_size; + C_pd->dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + } + return C; +} + +/* No-d BTA fill for the legacy CSR-pd kernel. */ +void BTA_csr_pd_fill_values(const CSR_matrix *B_csr, const permuted_dense *A, + permuted_dense *C) +{ + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; + + if (r_B == 0 || m0 == 0) + { + /* Output dense block is empty; nothing to fill. */ + return; + } + + /* Use C->row_inv (pre-built by new_permuted_dense) as row_inv_out and + C->dwork as B_sub_dense; both are owned by C. dwork is sized at alloc + time to cover m0 * r_B; only that prefix is touched. 
*/ + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = A->row_perm[kk]; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = C->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[kk * r_B + ii] = B_csr->x[e]; + } + } + } + + /* C->X = B_sub_dense^T @ X_A */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); +} + +/* BTDA variant: C->X = B_sub_dense^T diag(d) X_A. Folds d into the scatter + step. d may be NULL (treated as identity). */ +void BTDA_csr_pd_fill_values(const CSR_matrix *B_csr, const double *d, + const permuted_dense *A, permuted_dense *C) +{ + int m0 = A->m0; + int dn_A = A->n0; + int r_B = C->m0; + + if (r_B == 0 || m0 == 0) + { + return; + } + + double *B_sub_dense = C->dwork; + size_t used = m0 * r_B; + memset(B_sub_dense, 0, used * sizeof(double)); + + for (int kk = 0; kk < m0; kk++) + { + int row = A->row_perm[kk]; + double dk = d ? 
d[row] : 1.0; + for (int e = B_csr->p[row]; e < B_csr->p[row + 1]; e++) + { + int i = B_csr->i[e]; + int ii = C->row_inv[i]; + if (ii >= 0) + { + B_sub_dense[kk * r_B + ii] = dk * B_csr->x[e]; + } + } + } + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, r_B, dn_A, m0, 1.0, + B_sub_dense, r_B, A->X, dn_A, 0.0, C->X, dn_A); +} diff --git a/src/problem.c b/src/problem.c index 691921a..e462fa4 100644 --- a/src/problem.c +++ b/src/problem.c @@ -83,9 +83,10 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) int *cols = iwork; int *col_to_pos = iwork; /* reused after qsort */ int nnz = 0; - CSR_Matrix *H_obj = prob->objective->wsum_hess; - CSR_Matrix *H_c; - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H_obj = + prob->objective->wsum_hess->to_csr(prob->objective->wsum_hess); + CSR_matrix *H_c; + CSR_matrix *H = prob->lagrange_hessian; H->p[0] = 0; // ---------------------------------------------------------------------- @@ -100,7 +101,7 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) /* gather columns from constraint hessians */ for (int c_idx = 0; c_idx < prob->n_constraints; c_idx++) { - H_c = constrs[c_idx]->wsum_hess; + H_c = constrs[c_idx]->wsum_hess->to_csr(constrs[c_idx]->wsum_hess); int c_len = H_c->p[row + 1] - H_c->p[row]; memcpy(cols + count, H_c->i + H_c->p[row], c_len * sizeof(int)); count += c_len; @@ -146,7 +147,7 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork) /* map constraint hessian entries */ for (int c_idx = 0; c_idx < prob->n_constraints; c_idx++) { - H_c = constrs[c_idx]->wsum_hess; + H_c = constrs[c_idx]->wsum_hess->to_csr(constrs[c_idx]->wsum_hess); for (int row = 0; row < H->m; row++) { for (int idx = H->p[row]; idx < H->p[row + 1]; idx++) @@ -176,37 +177,39 @@ void problem_init_jacobian(problem *prob) { expr *c = prob->constraints[i]; jacobian_init(c); - nnz += c->jacobian->nnz; + CSR_matrix *Jc = c->jacobian->to_csr(c->jacobian); + nnz += 
Jc->nnz; if (c->is_affine(c)) { - prob->stats.nnz_affine += c->jacobian->nnz; + prob->stats.nnz_affine += Jc->nnz; } else { - prob->stats.nnz_nonlinear += c->jacobian->nnz; + prob->stats.nnz_nonlinear += Jc->nnz; } } - prob->jacobian = new_csr_matrix(prob->total_constraint_size, prob->n_vars, nnz); + prob->jacobian = new_CSR_matrix(prob->total_constraint_size, prob->n_vars, nnz); /* set sparsity pattern of jacobian */ - CSR_Matrix *H = prob->jacobian; + CSR_matrix *H = prob->jacobian; H->p[0] = 0; int row_offset = 0; int nnz_offset = 0; for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; + CSR_matrix *Jc = c->jacobian->to_csr(c->jacobian); - for (int r = 1; r <= c->jacobian->m; r++) + for (int r = 1; r <= Jc->m; r++) { - H->p[row_offset + r] = nnz_offset + c->jacobian->p[r]; + H->p[row_offset + r] = nnz_offset + Jc->p[r]; } - memcpy(H->i + nnz_offset, c->jacobian->i, c->jacobian->nnz * sizeof(int)); - row_offset += c->jacobian->m; - nnz_offset += c->jacobian->nnz; + memcpy(H->i + nnz_offset, Jc->i, Jc->nnz * sizeof(int)); + row_offset += Jc->m; + nnz_offset += Jc->nnz; } assert(nnz_offset == nnz); @@ -231,7 +234,7 @@ void problem_init_hessian(problem *prob) nnz += prob->constraints[i]->wsum_hess->nnz; } - prob->lagrange_hessian = new_csr_matrix(prob->n_vars, prob->n_vars, nnz); + prob->lagrange_hessian = new_CSR_matrix(prob->n_vars, prob->n_vars, nnz); memset(prob->lagrange_hessian->x, 0, nnz * sizeof(double)); /* affine shortcut */ prob->stats.nnz_hessian = nnz; prob->hess_idx_map = (int *) SP_MALLOC(nnz * sizeof(int)); @@ -248,7 +251,7 @@ void problem_init_jacobian_coo(problem *prob) problem_init_jacobian(prob); Timer timer; clock_gettime(CLOCK_MONOTONIC, &timer.start); - prob->jacobian_coo = new_coo_matrix(prob->jacobian); + prob->jacobian_coo = new_COO_matrix(prob->jacobian); clock_gettime(CLOCK_MONOTONIC, &timer.end); prob->stats.time_init_derivatives += GET_ELAPSED_SECONDS(timer); } @@ -259,7 +262,7 @@ void 
problem_init_hessian_coo_lower_triangular(problem *prob) Timer timer; clock_gettime(CLOCK_MONOTONIC, &timer.start); prob->lagrange_hessian_coo = - new_coo_matrix_lower_triangular(prob->lagrange_hessian); + new_COO_matrix_lower_triangular(prob->lagrange_hessian); clock_gettime(CLOCK_MONOTONIC, &timer.end); prob->stats.time_init_derivatives += GET_ELAPSED_SECONDS(timer); } @@ -342,10 +345,10 @@ void free_problem(problem *prob) /* Free allocated arrays */ free(prob->constraint_values); free(prob->gradient_values); - free_csr_matrix(prob->jacobian); - free_csr_matrix(prob->lagrange_hessian); - free_coo_matrix(prob->jacobian_coo); - free_coo_matrix(prob->lagrange_hessian_coo); + free_CSR_matrix(prob->jacobian); + free_CSR_matrix(prob->lagrange_hessian); + free_COO_matrix(prob->jacobian_coo); + free_COO_matrix(prob->lagrange_hessian_coo); free(prob->hess_idx_map); /* Release expression references (decrements refcount) */ @@ -466,7 +469,7 @@ void problem_gradient(problem *prob) /* copy sparse jacobian to dense gradient */ memset(prob->gradient_values, 0, prob->n_vars * sizeof(double)); - CSR_Matrix *jac = prob->objective->jacobian; + CSR_matrix *jac = prob->objective->jacobian->to_csr(prob->objective->jacobian); for (int k = jac->p[0]; k < jac->p[1]; k++) { prob->gradient_values[jac->i[k]] = jac->x[k]; @@ -482,13 +485,12 @@ void problem_jacobian(problem *prob) clock_gettime(CLOCK_MONOTONIC, &timer.start); bool first_call = !prob->jacobian_called; - CSR_Matrix *J = prob->jacobian; + CSR_matrix *J = prob->jacobian; int nnz_offset = 0; for (int i = 0; i < prob->n_constraints; i++) { expr *c = prob->constraints[i]; - if (!first_call && c->is_affine(c)) { /* skip evaluation for affine constraints after first call */ @@ -537,21 +539,22 @@ void problem_hessian(problem *prob, double obj_w, const double *w) // ------------------------------------------------------------------------ // assemble Lagrange hessian using index map // 
------------------------------------------------------------------------ - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H = prob->lagrange_hessian; int *idx_map = prob->hess_idx_map; /* zero out hessian before adding contribution from obj and constraints */ memset(H->x, 0, H->nnz * sizeof(double)); /* accumulate objective function */ - accumulator(obj->wsum_hess, idx_map, H->x); + accumulator(obj->wsum_hess->x, obj->wsum_hess->nnz, idx_map, H->x); offset = obj->wsum_hess->nnz; /* accumulate constraint functions */ for (int i = 0; i < prob->n_constraints; i++) { - accumulator(constrs[i]->wsum_hess, idx_map + offset, H->x); - offset += constrs[i]->wsum_hess->nnz; + matrix *c_hess = constrs[i]->wsum_hess; + accumulator(c_hess->x, c_hess->nnz, idx_map + offset, H->x); + offset += c_hess->nnz; } clock_gettime(CLOCK_MONOTONIC, &timer.end); diff --git a/src/utils/COO_Matrix.c b/src/utils/COO_matrix.c similarity index 87% rename from src/utils/COO_Matrix.c rename to src/utils/COO_matrix.c index 727be25..8602c09 100644 --- a/src/utils/COO_Matrix.c +++ b/src/utils/COO_matrix.c @@ -15,14 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/COO_Matrix.h" +#include "utils/COO_matrix.h" #include "utils/tracked_alloc.h" #include #include -COO_Matrix *new_coo_matrix(const CSR_Matrix *A) +COO_matrix *new_COO_matrix(const CSR_matrix *A) { - COO_Matrix *coo = (COO_Matrix *) SP_MALLOC(sizeof(COO_Matrix)); + COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = A->nnz; @@ -45,7 +45,7 @@ COO_Matrix *new_coo_matrix(const CSR_Matrix *A) return coo; } -COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A) +COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A) { /* Pass 1: count lower-triangular entries (col <= row) */ int count = 0; @@ -60,7 +60,7 @@ COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A) } } - COO_Matrix *coo = (COO_Matrix *) SP_MALLOC(sizeof(COO_Matrix)); + COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = count; @@ -89,7 +89,7 @@ COO_Matrix *new_coo_matrix_lower_triangular(const CSR_Matrix *A) return coo; } -void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals) +void refresh_lower_triangular_coo(COO_matrix *coo, const double *vals) { for (int i = 0; i < coo->nnz; i++) { @@ -97,7 +97,7 @@ void refresh_lower_triangular_coo(COO_Matrix *coo, const double *vals) } } -void free_coo_matrix(COO_Matrix *matrix) +void free_COO_matrix(COO_matrix *matrix) { if (matrix) { diff --git a/src/utils/CSC_Matrix.c b/src/utils/CSC_matrix.c similarity index 90% rename from src/utils/CSC_Matrix.c rename to src/utils/CSC_matrix.c index 4718040..08eec98 100644 --- a/src/utils/CSC_Matrix.c +++ b/src/utils/CSC_matrix.c @@ -15,16 +15,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" #include "utils/iVec.h" #include "utils/tracked_alloc.h" #include #include #include -CSC_Matrix *new_csc_matrix(int m, int n, int nnz) +CSC_matrix *new_CSC_matrix(int m, int n, int nnz) { - CSC_Matrix *matrix = (CSC_Matrix *) SP_MALLOC(sizeof(CSC_Matrix)); + CSC_matrix *matrix = (CSC_matrix *) SP_MALLOC(sizeof(CSC_matrix)); if (!matrix) return NULL; matrix->p = (int *) SP_MALLOC((n + 1) * sizeof(int)); @@ -47,7 +47,7 @@ CSC_Matrix *new_csc_matrix(int m, int n, int nnz) return matrix; } -void free_csc_matrix(CSC_Matrix *matrix) +void free_CSC_matrix(CSC_matrix *matrix) { if (matrix) { @@ -58,7 +58,7 @@ void free_csc_matrix(CSC_Matrix *matrix) } } -CSR_Matrix *ATA_alloc(const CSC_Matrix *A) +CSR_matrix *ATA_alloc(const CSC_matrix *A) { /* A is m x n, A^T A is n x n */ int n = A->n; @@ -102,7 +102,7 @@ CSR_Matrix *ATA_alloc(const CSC_Matrix *A) } /* Allocate C and symmetrize it */ - CSR_Matrix *C = new_csr_matrix(n, n, nnz); + CSR_matrix *C = new_CSR_matrix(n, n, nnz); symmetrize_csr(Cp, Ci->data, n, C); /* free workspace */ @@ -169,7 +169,7 @@ static inline double sparse_wdot(const double *a_x, const int *a_i, int a_nnz, return sum; } -void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) +void ATDA_fill_values(const CSC_matrix *A, const double *d, CSR_matrix *C) { int j, ii, jj; for (ii = 0; ii < C->m; ii++) @@ -203,9 +203,9 @@ void ATDA_fill_values(const CSC_Matrix *A, const double *d, CSR_Matrix *C) } } -CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork) +CSC_matrix *csr_to_csc_alloc(const CSR_matrix *A, int *iwork) { - CSC_Matrix *C = new_csc_matrix(A->m, A->n, A->nnz); + CSC_matrix *C = new_CSC_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -247,7 +247,7 @@ CSC_Matrix *csr_to_csc_alloc(const CSR_Matrix *A, int *iwork) return C; } -void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) +void csr_to_csc_fill_values(const CSR_matrix 
*A, CSC_matrix *C, int *iwork) { int i, j; int *count = iwork; @@ -266,9 +266,9 @@ void csr_to_csc_fill_values(const CSR_Matrix *A, CSC_Matrix *C, int *iwork) } } -CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork) +CSR_matrix *csc_to_csr_alloc(const CSC_matrix *A, int *iwork) { - CSR_Matrix *C = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *C = new_CSR_matrix(A->m, A->n, A->nnz); int i, j; int *count = iwork; @@ -312,7 +312,7 @@ CSR_Matrix *csc_to_csr_alloc(const CSC_Matrix *A, int *iwork) return C; } -void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) +void csc_to_csr_fill_values(const CSC_matrix *A, CSR_matrix *C, int *iwork) { int i, j; int *count = iwork; @@ -332,7 +332,7 @@ void csc_to_csr_fill_values(const CSC_Matrix *A, CSR_Matrix *C, int *iwork) } } -CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) +CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B) { /* A is m x n, B is m x p, C = B^T A is p x n */ int n = A->n; @@ -378,7 +378,7 @@ CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) } /* Allocate C */ - CSR_Matrix *C = new_csr_matrix(p, n, nnz); + CSR_matrix *C = new_CSR_matrix(p, n, nnz); memcpy(C->p, Cp, (p + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); @@ -389,7 +389,7 @@ CSR_Matrix *BTA_alloc(const CSC_Matrix *A, const CSC_Matrix *B) return C; } -void yTA_fill_values(const CSC_Matrix *A, const double *y, CSR_Matrix *C) +void yTA_fill_values(const CSC_matrix *A, const double *y, CSR_matrix *C) { for (int col = 0; col < A->n; col++) { @@ -413,9 +413,9 @@ void yTA_fill_values(const CSC_Matrix *A, const double *y, CSR_Matrix *C) } } -/* computes C = B^T * D * A in CSR */ -void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, - CSR_Matrix *C) +/* computes C = B^T * D * A in CSR_matrix */ +void BTDA_fill_values(const CSC_matrix *A, const CSC_matrix *B, const double *d, + CSR_matrix *C) { int i, j, jj; for (i = 0; i < C->m; i++) @@ 
-446,7 +446,7 @@ void BTDA_fill_values(const CSC_Matrix *A, const CSC_Matrix *B, const double *d, * faster when Q is dense, since it touches each Q entry exactly once. * The sparse_dot approach below is simpler but redundantly scans * column j of A for each nonzero row of C. */ -void BA_fill_values(const CSR_Matrix *Q, const CSC_Matrix *A, CSC_Matrix *C) +void BA_fill_values(const CSR_matrix *Q, const CSC_matrix *A, CSC_matrix *C) { /* fill values of C = Q * A, given the sparsity pattern of C. */ int i, j, ii; @@ -467,10 +467,10 @@ void BA_fill_values(const CSR_Matrix *Q, const CSC_Matrix *A, CSC_Matrix *C) } } -CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) +CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A) { /* Allocate C = B * A (sparsity only). B must be symmetric. - * B is CSR (m x m), A is CSC (m x n), C is CSC (m x n). + * B is in CSR format (m x m), A is in CSC format (m x n), C is in CSC format (m x n). * * Column j of C is B * a_j = sum_k A_{k,j} B[:, k], so the nonzero * rows of column j of C are the union of the nonzero rows of B[:, k]. 
@@ -525,7 +525,7 @@ CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) /* allocate C and copy the computed structure */ int total_nnz = Cp[n]; - CSC_Matrix *C = new_csc_matrix(m, n, total_nnz); + CSC_matrix *C = new_CSC_matrix(m, n, total_nnz); memcpy(C->p, Cp, (n + 1) * sizeof(int)); memcpy(C->i, Ci->data, total_nnz * sizeof(int)); @@ -536,7 +536,7 @@ CSC_Matrix *symBA_alloc(const CSR_Matrix *B, const CSC_Matrix *A) return C; } -int count_nonzero_cols_csc(const CSC_Matrix *A) +int count_nonzero_cols_csc(const CSC_matrix *A) { int count = 0; for (int j = 0; j < A->n; j++) diff --git a/src/utils/CSR_Matrix.c b/src/utils/CSR_matrix.c similarity index 86% rename from src/utils/CSR_Matrix.c rename to src/utils/CSR_matrix.c index c52b97d..1123068 100644 --- a/src/utils/CSR_Matrix.c +++ b/src/utils/CSR_matrix.c @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" @@ -25,9 +25,9 @@ #include #include -CSR_Matrix *new_csr_matrix(int m, int n, int nnz) +CSR_matrix *new_CSR_matrix(int m, int n, int nnz) { - CSR_Matrix *matrix = (CSR_Matrix *) SP_MALLOC(sizeof(CSR_Matrix)); + CSR_matrix *matrix = (CSR_matrix *) SP_MALLOC(sizeof(CSR_matrix)); matrix->p = (int *) SP_CALLOC(m + 1, sizeof(int)); matrix->i = (int *) SP_CALLOC(nnz, sizeof(int)); matrix->x = (double *) SP_MALLOC(nnz * sizeof(double)); @@ -37,24 +37,24 @@ CSR_Matrix *new_csr_matrix(int m, int n, int nnz) return matrix; } -CSR_Matrix *new_csr(const CSR_Matrix *A) +CSR_matrix *new_csr(const CSR_matrix *A) { - CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_CSR_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); memcpy(copy->x, A->x, A->nnz * sizeof(double)); return copy; } 
-CSR_Matrix *new_csr_copy_sparsity(const CSR_Matrix *A) +CSR_matrix *new_csr_copy_sparsity(const CSR_matrix *A) { - CSR_Matrix *copy = new_csr_matrix(A->m, A->n, A->nnz); + CSR_matrix *copy = new_CSR_matrix(A->m, A->n, A->nnz); memcpy(copy->p, A->p, (A->m + 1) * sizeof(int)); memcpy(copy->i, A->i, A->nnz * sizeof(int)); return copy; } -void free_csr_matrix(CSR_Matrix *matrix) +void free_CSR_matrix(CSR_matrix *matrix) { if (matrix) { @@ -65,7 +65,7 @@ void free_csr_matrix(CSR_Matrix *matrix) } } -void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C) +void copy_CSR_matrix(const CSR_matrix *A, CSR_matrix *C) { C->m = A->m; C->n = A->n; @@ -75,7 +75,7 @@ void copy_csr_matrix(const CSR_Matrix *A, CSR_Matrix *C) memcpy(C->x, A->x, A->nnz * sizeof(double)); } -void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset) +void Ax_csr(const CSR_matrix *A, const double *x, double *y, int col_offset) { for (int row = 0; row < A->m; row++) { @@ -88,7 +88,7 @@ void Ax_csr(const CSR_Matrix *A, const double *x, double *y, int col_offset) } } -int count_nonzero_cols(const CSR_Matrix *A, bool *col_nz) +int count_nonzero_cols(const CSR_matrix *A, bool *col_nz) { for (int row = 0; row < A->m; row++) { @@ -120,7 +120,7 @@ void insert_idx(int idx, int *arr, int len) arr[j] = idx; } -void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C) +void DA_fill_values(const double *d, const CSR_matrix *A, CSR_matrix *C) { memcpy(C->x, A->x, A->nnz * sizeof(double)); @@ -133,9 +133,9 @@ void DA_fill_values(const double *d, const CSR_Matrix *A, CSR_Matrix *C) } } -CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork) +CSR_matrix *transpose(const CSR_matrix *A, int *iwork) { - CSR_Matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_CSR_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; @@ -178,10 +178,10 @@ CSR_Matrix *transpose(const CSR_Matrix *A, int *iwork) return AT; } -CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork) 
+CSR_matrix *AT_alloc(const CSR_matrix *A, int *iwork) { /* Allocate A^T and compute sparsity pattern without filling values */ - CSR_Matrix *AT = new_csr_matrix(A->n, A->m, A->nnz); + CSR_matrix *AT = new_CSR_matrix(A->n, A->m, A->nnz); int i, j; int *count = iwork; @@ -223,7 +223,7 @@ CSR_Matrix *AT_alloc(const CSR_Matrix *A, int *iwork) return AT; } -void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork) +void AT_fill_values(const CSR_matrix *A, CSR_matrix *AT, int *iwork) { /* Fill values of A^T given sparsity pattern is already computed */ int i, j; @@ -241,7 +241,7 @@ void AT_fill_values(const CSR_Matrix *A, CSR_Matrix *AT, int *iwork) } } -double csr_get_value(const CSR_Matrix *A, int row, int col) +double csr_get_value(const CSR_matrix *A, int row, int col) { for (int j = A->p[row]; j < A->p[row + 1]; j++) { @@ -253,7 +253,7 @@ double csr_get_value(const CSR_Matrix *A, int row, int col) return 0.0; } -void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_Matrix *C) +void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C) { int i, j, col; diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c index 0e5d4cf..286fd38 100644 --- a/src/utils/CSR_sum.c +++ b/src/utils/CSR_sum.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ #include "utils/CSR_sum.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/int_double_pair.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" @@ -24,7 +24,7 @@ #include #include -void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* A and B must be different from C */ assert(A != C && B != C); @@ -81,7 +81,7 @@ void sum_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) C->p[A->m] = C->nnz; } -void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C) +void sum_csr_fill_values(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *C) { /* Assumes C->p and C->i already contain the sparsity pattern of A+B. Fills only C->x accordingly. */ @@ -113,8 +113,8 @@ void sum_csr_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, CSR_Matrix *C } } -void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix *B, - CSR_Matrix *C, const double *d1, +void sum_scaled_csr_matrices_fill_values(const CSR_matrix *A, const CSR_matrix *B, + CSR_matrix *C, const double *d1, const double *d2) { /* Assumes C->p and C->i already contain the sparsity pattern of A+B. 
@@ -148,7 +148,7 @@ void sum_scaled_csr_matrices_fill_values(const CSR_Matrix *A, const CSR_Matrix * } /* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */ -void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_block_of_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_block_size, int *iwork, int *idx_map) { assert(A->m % row_block_size == 0); @@ -220,7 +220,7 @@ void sum_block_of_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, } /* iwork must have size max(A->n, A->nnz), and idx_map must have size A->nnz */ -void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_evenly_spaced_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int row_spacing, int *iwork, int *idx_map) { assert(C->m == row_spacing); @@ -285,16 +285,16 @@ void sum_evenly_spaced_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, } } -void accumulator(const CSR_Matrix *A, const int *idx_map, double *out) +void accumulator(const double *vals, int nnz, const int *idx_map, double *out) { /* don't forget to initialize accumulator to 0 before calling this */ - for (int j = 0; j < A->nnz; j++) + for (int j = 0; j < nnz; j++) { - out[idx_map[j]] += A->x[j]; + out[idx_map[j]] += vals[j]; } } -void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *out, +void accumulator_with_spacing(const CSR_matrix *A, const int *idx_map, double *out, int spacing) { /* don't forget to initialze accumulator to 0 before calling this */ @@ -307,7 +307,7 @@ void accumulator_with_spacing(const CSR_Matrix *A, const int *idx_map, double *o } } -void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, +void sum_all_rows_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int *iwork, int *idx_map) { // ------------------------------------------------------------------- @@ -354,23 +354,23 @@ void sum_all_rows_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, int *iwork, /* * Sums evenly spaced rows from A into a 
single row in C and fills an index map. - * A: input CSR matrix - * C: output CSR matrix (must have m=1) + * A: input CSR_matrix matrix + * C: output CSR_matrix matrix (must have m=1) * spacing: row spacing * iwork: workspace of size at least max(A->n, A->nnz) * idx_map: output index map, size at least A->nnz */ -CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, - const CSR_Matrix *C, const CSR_Matrix *D, +CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, + const CSR_matrix *C, const CSR_matrix *D, int *idx_maps[4]) { - const CSR_Matrix *inputs[4] = {A, B, C, D}; + const CSR_matrix *inputs[4] = {A, B, C, D}; int m = A->m; int n = A->n; int nnz_ub = A->nnz + B->nnz + C->nnz + D->nnz; /* allocate output and index maps */ - CSR_Matrix *out = new_csr_matrix(m, n, nnz_ub); + CSR_matrix *out = new_CSR_matrix(m, n, nnz_ub); for (int k = 0; k < 4; k++) { idx_maps[k] = (int *) SP_MALLOC(inputs[k]->nnz * sizeof(int)); @@ -430,7 +430,7 @@ CSR_Matrix *sum_4_csr_alloc(const CSR_Matrix *A, const CSR_Matrix *B, return out; } -void sum_spaced_rows_into_row_csr_alloc(const CSR_Matrix *A, CSR_Matrix *C, +void sum_spaced_rows_into_row_csr_alloc(const CSR_matrix *A, CSR_matrix *C, int spacing, int *iwork, int *idx_map) { assert(C->m == 1); diff --git a/src/utils/dense_matrix.c b/src/utils/dense_matrix.c deleted file mode 100644 index 5c628c2..0000000 --- a/src/utils/dense_matrix.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2026 Daniel Cederberg and William Zhang - * - * This file is part of the SparseDiffEngine project. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "utils/dense_matrix.h" -#include "utils/cblas_wrapper.h" -#include "utils/linalg_dense_sparse_matmuls.h" -#include "utils/tracked_alloc.h" -#include -#include - -static void dense_block_left_mult_vec(const Matrix *A, const double *x, double *y, - int p) -{ - const Dense_Matrix *dm = (const Dense_Matrix *) A; - int m = dm->base.m; - int n = dm->base.n; - - /* y = kron(I_p, A) @ x via a single dgemm call: - Treat x as n x p (column-major blocks) and y as m x p. - But x and y are stored as p blocks of length n and m - respectively (i.e. block-interleaved). This is the same as - treating them as row-major matrices of shape p x n and - p x m, so: - y (p x m) = x (p x n) * A^T (n x m), all row-major. - cblas with RowMajor: C = alpha * A * B + beta * C - where A = x (p x n), B = A^T (n x m), C = y (p x m). 
*/ - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, m, n, 1.0, x, n, dm->x, - n, 0.0, y, m); -} - -static void dense_update_values(Matrix *self, const double *new_values) -{ - Dense_Matrix *dm = (Dense_Matrix *) self; - memcpy(dm->x, new_values, dm->base.m * dm->base.n * sizeof(double)); -} - -static void dense_free(Matrix *A) -{ - Dense_Matrix *dm = (Dense_Matrix *) A; - free(dm->x); - free(dm->work); - free(dm); -} - -Matrix *new_dense_matrix(int m, int n, const double *data) -{ - Dense_Matrix *dm = (Dense_Matrix *) SP_CALLOC(1, sizeof(Dense_Matrix)); - dm->base.m = m; - dm->base.n = n; - dm->base.block_left_mult_vec = dense_block_left_mult_vec; - dm->base.block_left_mult_sparsity = I_kron_A_alloc; - dm->base.block_left_mult_values = I_kron_A_fill_values; - dm->base.update_values = dense_update_values; - dm->base.free_fn = dense_free; - dm->x = (double *) SP_MALLOC(m * n * sizeof(double)); - if (data != NULL) - { - memcpy(dm->x, data, m * n * sizeof(double)); - } - dm->work = (double *) SP_MALLOC(n * sizeof(double)); - return &dm->base; -} - -Matrix *dense_matrix_trans(const Dense_Matrix *A) -{ - int m = A->base.m; - int n = A->base.n; - double *AT_x = (double *) SP_MALLOC(m * n * sizeof(double)); - - A_transpose(AT_x, A->x, m, n); - - Matrix *result = new_dense_matrix(n, m, AT_x); - free(AT_x); - return result; -} - -void A_transpose(double *AT, const double *A, int m, int n) -{ - for (int i = 0; i < m; i++) - { - for (int j = 0; j < n; j++) - { - AT[j * m + i] = A[i * n + j]; - } - } -} diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index 74cd5fe..22f133d 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ b/src/utils/linalg_dense_sparse_matmuls.c @@ -15,21 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/cblas_wrapper.h" -#include "utils/dense_matrix.h" #include "utils/iVec.h" +#include "utils/linalg_dense_sparse_matmuls.h" #include "utils/tracked_alloc.h" #include #include #include /* --------------------------------------------------------------- - * C = (I_p kron A) @ J via the polymorphic Matrix interface. - * A is dense m x n, J is (n*p) x k CSC, C is (m*p) x k CSC. + * C = (I_p kron A) @ J via the polymorphic matrix interface. + * A is dense m x n, J is (n*p) x k CSC_matrix, C is (m*p) x k CSC_matrix. * --------------------------------------------------------------- */ -CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) +CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p) { int m = A->m; int n = A->n; @@ -82,7 +82,7 @@ CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) Cp[j + 1] = Ci->len; } - CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_CSC_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -91,16 +91,16 @@ CSC_Matrix *I_kron_A_alloc(const Matrix *A, const CSC_Matrix *J, int p) return C; } -void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) +void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C, + double *work) { - const Dense_Matrix *dm = (const Dense_Matrix *) A; - int m = dm->base.m; - int n = dm->base.n; + int m = A->m; + int n = A->n; int k = J->n; int i, j, s, block, block_start, block_end, start, end; - double *j_dense = dm->work; + double *j_dense = work; /* for each column of J (and C) */ for (j = 0; j < k; j++) @@ -131,7 +131,7 @@ void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) /* Fast path: C column segment = val * A[:, row_in_block] */ int row_in_block 
= J->i[start] - block_start; double val = J->x[start]; - cblas_dcopy(m, dm->x + row_in_block, n, C->x + i, 1); + cblas_dcopy(m, A->x + row_in_block, n, C->x + i, 1); if (val != 1.0) { cblas_dscal(m, val, C->x + i, 1); @@ -147,7 +147,7 @@ void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) j_dense[J->i[s] - block_start] = J->x[s]; } - cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, dm->x, n, + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n, j_dense, 1, 0.0, C->x + i, 1); } } @@ -156,9 +156,9 @@ void I_kron_A_fill_values(const Matrix *A, const CSC_Matrix *J, CSC_Matrix *C) /* --------------------------------------------------------------- * C = (Y^T kron I_m) @ J - * Y is k x n (col-major), J is (m*k) x p CSC, C is (m*n) x p CSR + * Y is k x n (col-major), J is (m*k) x p CSC_matrix, C is (m*n) x p CSR_matrix * --------------------------------------------------------------- */ -CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) +CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J) { (void) k; /* C has n blocks of m rows. 
All rows at the same position within @@ -198,7 +198,7 @@ CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) // --------------------------------------------------------------- // replicate sparsity pattern across blocks // --------------------------------------------------------------- - CSR_Matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_CSR_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { @@ -219,8 +219,8 @@ CSR_Matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_Matrix *J) return C; } -void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matrix *J, - CSR_Matrix *C) +void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_matrix *J, + CSR_matrix *C) { (void) n; assert(C->m == m * n); @@ -256,7 +256,7 @@ void YT_kron_I_fill_values(int m, int k, int n, const double *Y, const CSC_Matri } } -CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) +CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J) { /* Step 1: for each block, find which columns of J have any * nonzero in row range [blk*k, blk*k + k). */ @@ -287,7 +287,7 @@ CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) /* Step 2: replicate each block's pattern for all m rows * within that block. 
*/ - CSR_Matrix *C = new_csr_matrix(m * n, J->n, total_nnz); + CSR_matrix *C = new_CSR_matrix(m * n, J->n, total_nnz); int idx = 0; for (i = 0; i < m * n; i++) { @@ -308,8 +308,8 @@ CSR_Matrix *I_kron_X_alloc(int m, int k, int n, const CSC_Matrix *J) return C; } -void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_Matrix *J, - CSR_Matrix *C) +void I_kron_X_fill_values(int m, int k, int n, const double *X, const CSC_matrix *J, + CSR_matrix *C) { (void) n; assert(C->m == m * n); diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c index 42ee992..7720f70 100644 --- a/src/utils/linalg_sparse_matmuls.c +++ b/src/utils/linalg_sparse_matmuls.c @@ -15,35 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/iVec.h" #include "utils/tracked_alloc.h" +#include "utils/utils.h" #include #include #include #include #include -static inline bool has_overlap(const int *a_idx, int a_len, const int *b_idx, - int b_len, int b_offset) -{ - int ai = 0, bi = 0; - while (ai < a_len && bi < b_len) - { - if (a_idx[ai] == b_idx[bi] - b_offset) return true; - if (a_idx[ai] < b_idx[bi] - b_offset) - { - ai++; - } - else - { - bi++; - } - } - return false; -} - /* Unweighted sparse dot product of two sorted index arrays */ static inline double sparse_dot(const double *a_x, const int *a_i, int a_nnz, const double *b_x, const int *b_i, int b_nnz, @@ -104,8 +86,8 @@ static inline double sparse_dot_offset(const double *a_x, const int *a_idx, return sum; } -CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, - const CSC_Matrix *J, int p) +CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, + const CSC_matrix *J, int p) { /* A is m x n, J is (n*p) x k, C is (m*p) x k */ int m = A->m; @@ -175,7 +157,7 @@ CSC_Matrix 
*block_left_multiply_fill_sparsity(const CSR_Matrix *A, Cp[j + 1] = Ci->len; } - CSC_Matrix *C = new_csc_matrix(m * p, J->n, Ci->len); + CSC_matrix *C = new_CSC_matrix(m * p, J->n, Ci->len); memcpy(C->p, Cp, (J->n + 1) * sizeof(int)); memcpy(C->i, Ci->data, Ci->len * sizeof(int)); free(Cp); @@ -184,8 +166,8 @@ CSC_Matrix *block_left_multiply_fill_sparsity(const CSR_Matrix *A, return C; } -void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, - CSC_Matrix *C) +void block_left_multiply_fill_values(const CSR_matrix *A, const CSC_matrix *J, + CSC_matrix *C) { /* A is m x n, J is (n*p) x k, C is (m*p) x k */ int m = A->m; @@ -246,9 +228,9 @@ void block_left_multiply_fill_values(const CSR_Matrix *A, const CSC_Matrix *J, } } -/* Fill values of C = A @ B where A is CSR, B is CSC. */ -void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, - CSR_Matrix *C) +/* Fill values of C = A @ B where A is CSR_matrix, B is CSC_matrix. */ +void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B, + CSR_matrix *C) { for (int i = 0; i < A->m; i++) { @@ -268,9 +250,9 @@ void csr_csc_matmul_fill_values(const CSR_Matrix *A, const CSC_Matrix *B, } } -/* C = A @ B where A is CSR (m x n), B is CSC (n x p). Result C is CSR (m x p) +/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). 
Result C is CSR_matrix (m x p) with precomputed sparsity pattern */ -CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) +CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B) { int m = A->m; int p = B->n; @@ -304,7 +286,7 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) Cp[i + 1] = nnz; } - CSR_Matrix *C = new_csr_matrix(m, p, nnz); + CSR_matrix *C = new_CSR_matrix(m, p, nnz); memcpy(C->p, Cp, (m + 1) * sizeof(int)); memcpy(C->i, Ci->data, nnz * sizeof(int)); free(Cp); @@ -317,7 +299,7 @@ CSR_Matrix *csr_csc_matmul_alloc(const CSR_Matrix *A, const CSC_Matrix *B) * y = [A @ x1; A @ x2; ...; A @ xp] where A is m x n and x is (n*p)-length vector. * x is split into p blocks of n elements each. */ -void block_left_multiply_vec(const struct CSR_Matrix *A, const double *x, double *y, +void block_left_multiply_vec(const struct CSR_matrix *A, const double *x, double *y, int p) { /* For each block */ diff --git a/src/utils/matrix_BTA.c b/src/utils/matrix_BTA.c new file mode 100644 index 0000000..cd4d815 --- /dev/null +++ b/src/utils/matrix_BTA.c @@ -0,0 +1,97 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +#include "utils/matrix_BTA.h" + +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" + +matrix *BTA_matrices_alloc(matrix *A, matrix *B) +{ + if (A->is_permuted_dense && B->is_permuted_dense) + { + return BTA_pd_pd_alloc((permuted_dense *) B, (permuted_dense *) A); + } + if (B->is_permuted_dense) + { + sparse_matrix *sm_A = (sparse_matrix *) A; + return BTA_pd_csc_alloc((permuted_dense *) B, sm_A->csc_cache); + } + if (A->is_permuted_dense) + { + sparse_matrix *sm_B = (sparse_matrix *) B; + return BTA_csc_pd_alloc(sm_B->csc_cache, (permuted_dense *) A); + } + + /* both sparse */ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix *sm_B = (sparse_matrix *) B; + CSR_matrix *C_csr = BTA_alloc(sm_A->csc_cache, sm_B->csc_cache); + return new_sparse_matrix(C_csr); +} + +void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) +{ + if (A->is_permuted_dense && B->is_permuted_dense) + { + BTDA_pd_pd_fill_values((permuted_dense *) B, d, (permuted_dense *) A, + (permuted_dense *) C); + return; + } + if (B->is_permuted_dense) + { + sparse_matrix *sm_A = (sparse_matrix *) A; + BTDA_pd_csc_fill_values((permuted_dense *) B, d, sm_A->csc_cache, + (permuted_dense *) C); + return; + } + if (A->is_permuted_dense) + { + sparse_matrix *sm_B = (sparse_matrix *) B; + BTDA_csc_pd_fill_values(sm_B->csc_cache, d, (permuted_dense *) A, + (permuted_dense *) C); + return; + } + + /* both sparse */ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix *sm_B = (sparse_matrix *) B; + sparse_matrix *sm_C = (sparse_matrix *) C; + BTDA_fill_values(sm_A->csc_cache, sm_B->csc_cache, d, sm_C->csr); +} + +matrix *BA_pd_matrices_alloc(const permuted_dense *B, const matrix *A) +{ + if (A->is_permuted_dense) + { + return BA_pd_pd_alloc(B, (const permuted_dense *) A); + } + /* A is sparse — 
use the existing BA_pd_csc_* kernels. Ensure the + csc_cache structure exists at alloc time. */ + sparse_matrix *sm_A = (sparse_matrix *) A; + sparse_matrix_ensure_csc_cache(sm_A); + return BA_pd_csc_alloc(B, sm_A->csc_cache); +} + +void BA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C) +{ + if (A->is_permuted_dense) + { + BA_pd_pd_fill_values(B, (const permuted_dense *) A, C); + return; + } + /* A is sparse — caller must have refreshed sm_A->csc_cache values. */ + sparse_matrix *sm_A = (sparse_matrix *) A; + BA_pd_csc_fill_values(B->X, B->n0, B->col_inv, sm_A->csc_cache, C); +} diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c new file mode 100644 index 0000000..c8b12b5 --- /dev/null +++ b/src/utils/matrix_sum.c @@ -0,0 +1,38 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "utils/matrix_sum.h" +#include "utils/CSR_sum.h" + +void sum_matrices_alloc(matrix *A, matrix *B, matrix *C) +{ + CSR_matrix *cc = C->to_csr(C); + sum_csr_alloc(A->to_csr(A), B->to_csr(B), cc); + C->nnz = cc->nnz; +} + +void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C) +{ + sum_csr_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C)); +} + +void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, + const double *d1, const double *d2) +{ + sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), + d1, d2); +} diff --git a/src/utils/mini_numpy.c b/src/utils/mini_numpy.c index baa00c5..141e77e 100644 --- a/src/utils/mini_numpy.c +++ b/src/utils/mini_numpy.c @@ -69,6 +69,17 @@ void mat_mat_mult(const double *X, const double *Y, double *Z, int m, int k, int } } +void A_transpose(double *AT, const double *A, int m, int n) +{ + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + AT[j * m + i] = A[i * n + j]; + } + } +} + void Y_kron_I_vec(int m, int k, int n, const double *Y, const double *w, double *v) { for (int j = 0; j < k; j++) @@ -101,7 +112,7 @@ void I_kron_XT_vec(int m, int k, int n, const double *X, const double *w, double } } -void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n) +void conv_matrix_fill_sparsity(CSR_matrix *T_csr, int m, int n) { int nnz = 0; for (int r = 0; r < T_csr->m; r++) @@ -118,7 +129,7 @@ void conv_matrix_fill_sparsity(CSR_Matrix *T_csr, int m, int n) T_csr->p[T_csr->m] = nnz; } -void conv_matrix_fill_values(CSR_Matrix *T_csr, const double *a) +void conv_matrix_fill_values(CSR_matrix *T_csr, const double *a) { for (int r = 0; r < T_csr->m; r++) { diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c new file mode 100644 index 0000000..34d7466 --- /dev/null +++ b/src/utils/permuted_dense.c @@ -0,0 +1,1061 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the SparseDiffEngine project. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utils/permuted_dense.h" +#include "utils/cblas_wrapper.h" +#include "utils/iVec.h" +#include "utils/linalg_dense_sparse_matmuls.h" +#include "utils/tracked_alloc.h" +#include "utils/utils.h" +#include +#include +#include +#include + +static void permuted_dense_free(matrix *self) +{ + permuted_dense *pd = (permuted_dense *) self; + free(pd->row_perm); + free(pd->col_perm); + free(pd->col_inv); + free(pd->row_inv); + /* csr_cache->x aliases pd->X (set in permuted_dense_to_csr_alloc); NULL it + so free_CSR_matrix doesn't double-free the shared buffer. */ + if (pd->csr_cache != NULL) + { + pd->csr_cache->x = NULL; + } + free_CSR_matrix(pd->csr_cache); + free(pd->X); + free(pd->dwork); + free(pd->iwork); + free(pd); +} + +/* permuted_dense has no CSC_matrix mirror; chain-rule kernels operate on X directly. + */ +static void permuted_dense_refresh_csc_values(matrix *self) +{ + (void) self; +} + +/* Vtable adapters — each delegates to the existing permuted_dense_* kernel. 
*/ +static matrix *permuted_dense_vtable_copy_sparsity(const matrix *self) +{ + const permuted_dense *pd = (const permuted_dense *) self; + return new_permuted_dense(pd->base.m, pd->base.n, pd->m0, pd->n0, pd->row_perm, + pd->col_perm, NULL); +} + +static void permuted_dense_vtable_DA_fill_values(const double *d, const matrix *self, + matrix *out) +{ + DA_pd_fill_values(d, (const permuted_dense *) self, (permuted_dense *) out); +} + +static matrix *permuted_dense_vtable_ATA_alloc(matrix *self) +{ + return ATA_pd_alloc((const permuted_dense *) self); +} + +static void permuted_dense_vtable_ATDA_fill_values(const matrix *self, + const double *d, matrix *out) +{ + ATDA_pd_fill_values((const permuted_dense *) self, d, (permuted_dense *) out); +} + +/* Forward decls; definitions lower in the file. */ +static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A); +static void permuted_dense_ensure_dwork(const permuted_dense *pd_const, size_t size); + +/* Lazy CSR_matrix view: allocate structure on first call, then return the cache. + The cache's x array aliases pd->X (see permuted_dense_to_csr_alloc), so + values are always live without a per-call refresh. */ +static CSR_matrix *permuted_dense_to_csr(matrix *self) +{ + permuted_dense *pd = (permuted_dense *) self; + if (pd->csr_cache == NULL) + { + pd->csr_cache = permuted_dense_to_csr_alloc(pd); + } + return pd->csr_cache; +} + +static matrix *permuted_dense_vtable_transpose_alloc(const matrix *self) +{ + const permuted_dense *pd = (const permuted_dense *) self; + /* Swap (m, n), (m0, n0), and (row_perm, col_perm). The constructor + asserts strict increase of both perms, which holds by construction. 
*/ + return new_permuted_dense(pd->base.n, pd->base.m, pd->n0, pd->m0, pd->col_perm, + pd->row_perm, NULL); +} + +static void permuted_dense_vtable_transpose_fill_values(const matrix *self, + matrix *out) +{ + const permuted_dense *pd_in = (const permuted_dense *) self; + permuted_dense *pd_out = (permuted_dense *) out; + int m0 = pd_in->m0; + int n0 = pd_in->n0; + /* pd_out has shape (n0, m0); transpose pd_in->X into pd_out->X. */ + for (int ii = 0; ii < m0; ii++) + { + for (int jj = 0; jj < n0; jj++) + { + pd_out->X[jj * m0 + ii] = pd_in->X[ii * n0 + jj]; + } + } +} + +static matrix *permuted_dense_vtable_index_alloc(matrix *self, const int *indices, + int n_idxs) +{ + const permuted_dense *pd = (const permuted_dense *) self; + + /* Scan indices: which output positions i hit a row in pd->row_perm? */ + int *new_row_perm = (int *) SP_MALLOC(n_idxs * sizeof(int)); + int new_m0 = 0; + for (int i = 0; i < n_idxs; i++) + { + if (pd->row_inv[indices[i]] >= 0) + { + new_row_perm[new_m0++] = i; + } + } + + matrix *out = new_permuted_dense(n_idxs, pd->base.n, new_m0, pd->n0, + new_row_perm, pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_index_fill_values(matrix *self, const int *indices, + int n_idxs, matrix *out) +{ + (void) n_idxs; + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + int n0 = pd->n0; + for (int k = 0; k < out_pd->m0; k++) + { + int i = out_pd->row_perm[k]; + int old_ii = pd->row_inv[indices[i]]; + memcpy(out_pd->X + k * n0, pd->X + old_ii * n0, n0 * sizeof(double)); + } +} + +static matrix *permuted_dense_vtable_promote_alloc(matrix *self, int size) +{ + const permuted_dense *pd = (const permuted_dense *) self; + assert(pd->m0 <= 1); + + if (pd->m0 == 0) + { + /* source row is all-zero; output is also structurally all-zero. 
*/ + return new_permuted_dense(size, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(size * sizeof(int)); + for (int i = 0; i < size; i++) + { + new_row_perm[i] = i; + } + matrix *out = new_permuted_dense(size, pd->base.n, size, pd->n0, new_row_perm, + pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_promote_fill_values(matrix *self, matrix *out) +{ + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) return; + int n0 = pd->n0; + for (int k = 0; k < out_pd->m0; k++) + { + memcpy(out_pd->X + k * n0, pd->X, n0 * sizeof(double)); + } +} + +static matrix *permuted_dense_vtable_broadcast_alloc(matrix *self, + broadcast_type type, int d1, + int d2) +{ + const permuted_dense *pd = (const permuted_dense *) self; + int out_m = d1 * d2; + + int new_m0; + if (type == BROADCAST_SCALAR) + { + new_m0 = (pd->m0 == 0) ? 0 : out_m; + } + else if (type == BROADCAST_ROW) + { + new_m0 = d1 * pd->m0; + } + else /* BROADCAST_COL */ + { + new_m0 = d2 * pd->m0; + } + + if (new_m0 == 0) + { + return new_permuted_dense(out_m, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(new_m0 * sizeof(int)); + int k = 0; + if (type == BROADCAST_SCALAR) + { + for (int i = 0; i < out_m; i++) + { + new_row_perm[k++] = i; + } + } + else if (type == BROADCAST_ROW) + { + for (int j_ii = 0; j_ii < pd->m0; j_ii++) + { + int j_old = pd->row_perm[j_ii]; + for (int i = 0; i < d1; i++) + { + new_row_perm[k++] = j_old * d1 + i; + } + } + } + else /* BROADCAST_COL */ + { + for (int j = 0; j < d2; j++) + { + for (int ii_old = 0; ii_old < pd->m0; ii_old++) + { + new_row_perm[k++] = j * d1 + pd->row_perm[ii_old]; + } + } + } + + matrix *out = new_permuted_dense(out_m, pd->base.n, new_m0, pd->n0, new_row_perm, + pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void 
permuted_dense_vtable_broadcast_fill_values(matrix *self, + broadcast_type type, int d1, + int d2, matrix *out) +{ + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) + { + return; + } + int n0 = pd->n0; + + if (type == BROADCAST_SCALAR) + { + for (int k = 0; k < out_pd->m0; k++) + { + memcpy(out_pd->X + k * n0, pd->X, n0 * sizeof(double)); + } + } + else if (type == BROADCAST_ROW) + { + /* output row k corresponds to child dense row (k / d1). */ + (void) d2; + for (int k = 0; k < out_pd->m0; k++) + { + memcpy(out_pd->X + k * n0, pd->X + (k / d1) * n0, n0 * sizeof(double)); + } + } + else /* BROADCAST_COL */ + { + (void) d1; + size_t child_block = pd->m0 * n0; + for (int j = 0; j < d2; j++) + { + memcpy(out_pd->X + j * child_block, pd->X, child_block * sizeof(double)); + } + } +} + +static matrix *permuted_dense_vtable_diag_vec_alloc(matrix *self) +{ + const permuted_dense *pd = (const permuted_dense *) self; + int n = pd->base.m; + int out_m = n * n; + + if (pd->m0 == 0) + { + return new_permuted_dense(out_m, pd->base.n, 0, pd->n0, NULL, pd->col_perm, + NULL); + } + + int *new_row_perm = (int *) SP_MALLOC(pd->m0 * sizeof(int)); + for (int ii = 0; ii < pd->m0; ii++) + { + new_row_perm[ii] = pd->row_perm[ii] * (n + 1); + } + matrix *out = new_permuted_dense(out_m, pd->base.n, pd->m0, pd->n0, new_row_perm, + pd->col_perm, NULL); + free(new_row_perm); + return out; +} + +static void permuted_dense_vtable_diag_vec_fill_values(matrix *self, matrix *out) +{ + const permuted_dense *pd = (const permuted_dense *) self; + permuted_dense *out_pd = (permuted_dense *) out; + if (pd->m0 == 0) + { + return; + } + memcpy(out_pd->X, pd->X, pd->m0 * pd->n0 * sizeof(double)); +} + +/* ===== Operator-role adapters: PD acting as the constant left operand of + left_matmul. Currently restricted to full-block PDs (m0 == m, n0 == n, + identity perms) — the only operator shape any caller needs today. 
*/ + +static void permuted_dense_vtable_block_left_mult_vec(const matrix *A, + const double *x, double *y, + int p) +{ + /* Full-block precondition: A->x is a single contiguous row-major m x n + block (perms are identity). For a non-trivial PD, A->x still points + at pd->X but X only stores the values at the permuted positions; the + layout below assumes a full m x n matrix, hence the assert. */ + assert(((const permuted_dense *) A)->m0 == A->m && + ((const permuted_dense *) A)->n0 == A->n); + + /* y = kron(I_p, A) @ x via a single dgemm. + Input x is p blocks of length n (block-interleaved); output y is p + blocks of length m. That's identical in memory to row-major matrices + of shape (p, n) and (p, m) respectively, so we can compute + y (p x m) = x (p x n) * A^T (n x m) + in one shot. CblasRowMajor + CblasNoTrans on x + CblasTrans on A + gives exactly that. */ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, p, A->m, A->n, 1.0, x, A->n, + A->x, A->n, 0.0, y, A->m); +} + +static CSC_matrix * +permuted_dense_vtable_block_left_mult_sparsity(const matrix *A, const CSC_matrix *J, + int p) +{ + const permuted_dense *pd = (const permuted_dense *) A; + assert(pd->m0 == A->m && pd->n0 == A->n); + /* Pre-size dwork for the subsequent block_left_mult_values fill, which + densifies a sparse column of J (size A->n) before applying A. Honors + the no-alloc-in-fill rule. */ + permuted_dense_ensure_dwork(pd, (size_t) A->n); + return I_kron_A_alloc(A, J, p); +} + +static void permuted_dense_vtable_block_left_mult_values(const matrix *A, + const CSC_matrix *J, + CSC_matrix *C) +{ + const permuted_dense *pd = (const permuted_dense *) A; + assert(pd->m0 == A->m && pd->n0 == A->n); + I_kron_A_fill_values(A, J, C, pd->dwork); +} + +matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, + const int *col_perm, const double *X_data) +{ + /* Validate sorted invariants. 
*/ + for (int ii = 1; ii < m0; ii++) + { + assert(row_perm[ii] > row_perm[ii - 1]); + } + for (int jj = 1; jj < n0; jj++) + { + assert(col_perm[jj] > col_perm[jj - 1]); + } + if (m0 > 0) + { + assert(row_perm[0] >= 0 && row_perm[m0 - 1] < m); + } + if (n0 > 0) + { + assert(col_perm[0] >= 0 && col_perm[n0 - 1] < n); + } + + permuted_dense *pd = (permuted_dense *) SP_CALLOC(1, sizeof(permuted_dense)); + pd->base.m = m; + pd->base.n = n; + pd->base.nnz = m0 * n0; + pd->base.block_left_mult_vec = permuted_dense_vtable_block_left_mult_vec; + pd->base.block_left_mult_sparsity = + permuted_dense_vtable_block_left_mult_sparsity; + pd->base.block_left_mult_values = permuted_dense_vtable_block_left_mult_values; + pd->base.copy_sparsity = permuted_dense_vtable_copy_sparsity; + pd->base.DA_fill_values = permuted_dense_vtable_DA_fill_values; + pd->base.ATA_alloc = permuted_dense_vtable_ATA_alloc; + pd->base.ATDA_fill_values = permuted_dense_vtable_ATDA_fill_values; + pd->base.to_csr = permuted_dense_to_csr; + pd->base.transpose_alloc = permuted_dense_vtable_transpose_alloc; + pd->base.transpose_fill_values = permuted_dense_vtable_transpose_fill_values; + pd->base.is_permuted_dense = true; + pd->base.index_alloc = permuted_dense_vtable_index_alloc; + pd->base.index_fill_values = permuted_dense_vtable_index_fill_values; + pd->base.promote_alloc = permuted_dense_vtable_promote_alloc; + pd->base.promote_fill_values = permuted_dense_vtable_promote_fill_values; + pd->base.broadcast_alloc = permuted_dense_vtable_broadcast_alloc; + pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values; + pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc; + pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values; + pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; + pd->base.free_fn = permuted_dense_free; + + pd->m0 = m0; + pd->n0 = n0; + + int sz = m0 * n0; + pd->row_perm = (int *) SP_MALLOC(m0 * sizeof(int)); + pd->col_perm = (int 
*) SP_MALLOC(n0 * sizeof(int)); + pd->X = (double *) SP_MALLOC(sz * sizeof(double)); + pd->base.x = pd->X; + /* dwork is allocated lazily by kernels via permuted_dense_ensure_dwork. + SP_CALLOC above already zeroed dwork / dwork_size, but make it + explicit. */ + pd->dwork = NULL; + pd->dwork_size = 0; + pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); + pd->row_inv = (int *) SP_MALLOC(m * sizeof(int)); + + if (m0 > 0) + { + memcpy(pd->row_perm, row_perm, m0 * sizeof(int)); + } + if (n0 > 0) + { + memcpy(pd->col_perm, col_perm, n0 * sizeof(int)); + } + + for (int j = 0; j < n; j++) + { + pd->col_inv[j] = -1; + } + for (int jj = 0; jj < n0; jj++) + { + pd->col_inv[col_perm[jj]] = jj; + } + + for (int i = 0; i < m; i++) + { + pd->row_inv[i] = -1; + } + for (int ii = 0; ii < m0; ii++) + { + pd->row_inv[row_perm[ii]] = ii; + } + + if (X_data != NULL && sz > 0) + { + memcpy(pd->X, X_data, sz * sizeof(double)); + } + + return &pd->base; +} + +matrix *new_permuted_dense_full(int m, int n, const double *data) +{ + int *row_perm = (int *) SP_MALLOC(m * sizeof(int)); + int *col_perm = (int *) SP_MALLOC(n * sizeof(int)); + for (int i = 0; i < m; i++) row_perm[i] = i; + for (int j = 0; j < n; j++) col_perm[j] = j; + matrix *out = new_permuted_dense(m, n, m, n, row_perm, col_perm, data); + free(row_perm); + free(col_perm); + return out; +} + +static CSR_matrix *permuted_dense_to_csr_alloc(const permuted_dense *A) +{ + int m0 = A->m0; + int n0 = A->n0; + int m = A->base.m; + CSR_matrix *C = new_CSR_matrix(m, A->base.n, m0 * n0); + + /* Alias C->x to A->X: the dense block layout already matches what the + CSR_matrix view's value array would hold, so values are always live with no + memcpy needed. The PD owns the buffer; permuted_dense_free nulls + C->x before free_CSR_matrix to avoid double-free. 
*/ + free(C->x); + C->x = A->X; + + /* fill column indices (each dense row contributes a copy of col_perm) */ + for (int ii = 0; ii < m0; ii++) + { + memcpy(C->i + ii * n0, A->col_perm, n0 * sizeof(int)); + } + + /* set row pointers via count and then cumulative sum */ + memset(C->p, 0, (m + 1) * sizeof(int)); + for (int ii = 0; ii < m0; ii++) + { + C->p[A->row_perm[ii] + 1] = n0; + } + + for (int i = 0; i < m; i++) + { + C->p[i + 1] += C->p[i]; + } + + return C; +} + +void DA_pd_fill_values(const double *d, const permuted_dense *A, permuted_dense *C) +{ + int m0 = A->m0; + int n0 = A->n0; + cblas_dcopy(m0 * n0, A->X, 1, C->X, 1); + for (int ii = 0; ii < m0; ii++) + { + cblas_dscal(n0, d[A->row_perm[ii]], C->X + ii * n0, 1); + } +} + +/* Ensure pd->dwork is sized at least `size` doubles. Grows in place; + contents are NOT preserved. Called from allocator functions so that the + corresponding fill kernels never need to allocate. Takes a const pointer + and casts internally — this matches the dwork contract (header) that + dwork is mutable through a const permuted_dense *. */ +static void permuted_dense_ensure_dwork(const permuted_dense *pd_const, size_t size) +{ + permuted_dense *pd = (permuted_dense *) pd_const; + if (pd->dwork_size >= size) return; + free(pd->dwork); + pd->dwork = (double *) SP_MALLOC(size * sizeof(double)); + pd->dwork_size = size; +} + +matrix *ATA_pd_alloc(const permuted_dense *A) +{ + int n = A->base.n; + /* C = AT @ A has a dense block of size n0 x n0, with row and column index + sets given by A's col_perm. (This follows from Cij = ai^T aj where + ai and aj are columns of A. Here, ai and aj always have overlapping entries, + so Cij != 0 for (i, j) in A->col_perm x A->col_perm) */ + + /* Pre-size A's dwork for the ATDA fill (Y-buffer = diag(d_perm) X). 
*/ + permuted_dense_ensure_dwork(A, (size_t) A->m0 * A->n0); + + return new_permuted_dense(n, n, A->n0, A->n0, A->col_perm, A->col_perm, NULL); +} + +void ATDA_pd_fill_values(const permuted_dense *A, const double *d, permuted_dense *C) +{ + int m0 = A->m0; + int n0 = A->n0; + + /* dwork = diag(d_perm) @ X, where d_perm[ii] = d[row_perm[ii]]. */ + cblas_dcopy(m0 * n0, A->X, 1, A->dwork, 1); + for (int ii = 0; ii < m0; ii++) + { + cblas_dscal(n0, d[A->row_perm[ii]], A->dwork + ii * n0, 1); + } + + /* C = XT @ dwork = XT @ diag(d_perm) @ X */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n0, n0, m0, 1.0, A->X, n0, + A->dwork, n0, 0.0, C->X, n0); +} + +matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) +{ + /* if A and B have no overlapping rows, then C = BT @ A is empty */ + if (!has_overlap(A->row_perm, A->m0, B->row_perm, B->m0, 0)) + { + return new_permuted_dense(B->base.n, A->base.n, 0, 0, NULL, NULL, NULL); + } + + /* otherwise C has a dense block of size B->n0 x A->n0, with row and column + index sets given by B->col_perm and A->col_perm, respectively */ + matrix *C = new_permuted_dense(B->base.n, A->base.n, B->n0, A->n0, B->col_perm, + A->col_perm, NULL); + + /* Pre-size A's and B's dwork for the BTA fill slow path (gathered row + buffers). Each operand needs s_max rows of its own n0 doubles, where + s_max = MIN(A->m0, B->m0) bounds the intersection of row_perms. */ + int s_max = MIN(A->m0, B->m0); + permuted_dense_ensure_dwork(A, (size_t) s_max * A->n0); + permuted_dense_ensure_dwork(B, (size_t) s_max * B->n0); + + /* Pre-allocate C->iwork for idx_A + idx_B in BTA / BTDA_pd_pd slow paths + (each needs at most s_max ints; we store both arrays back-to-back + in iwork, hence 2 * s_max). */ + permuted_dense *C_pd = (permuted_dense *) C; + C_pd->iwork_size = (size_t) 2 * s_max; + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + + return C; +} + +/* Return 1 iff arrays a and b of length n are element-wise equal. 
*/ +static int int_arrays_equal(const int *a, const int *b, int n) +{ + for (int i = 0; i < n; i++) + { + if (a[i] != b[i]) return 0; + } + return 1; +} + +/* Find intersection of two sorted, ascending int arrays. For each pair of positions + (ii, jj) where a[ii] == b[jj], write ii into idx_a and jj into idx_b. Returns the + count of matches. Buffers idx_a and idx_b must have capacity >= min(a_len, b_len); + no allocation is performed. */ +static inline int sorted_intersect_indices(const int *a, int a_len, const int *b, + int b_len, int *idx_a, int *idx_b) +{ + int s = 0; + int ii = 0, jj = 0; + while (ii < a_len && jj < b_len) + { + int ra = a[ii]; + int rb = b[jj]; + if (ra == rb) + { + idx_a[s] = ii; + idx_b[s] = jj; + s++; + ii++; + jj++; + } + else if (ra < rb) + { + ii++; + } + else + { + jj++; + } + } + return s; +} + +void BTA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C) +{ + /* C may be empty if there is no overlap in row permutations */ + if (C->base.nnz == 0) + { + return; + } + + /* if B and A have identical row_perms, one matmul suffices */ + if (A->m0 == B->m0 && int_arrays_equal(A->row_perm, B->row_perm, A->m0)) + { + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, B->n0, A->n0, A->m0, + 1.0, B->X, B->n0, A->X, A->n0, 0.0, C->X, A->n0); + return; + } + + // ----------------------------------------------------------------------- + // find intersection of row permutations. C->iwork was pre-sized by + // BTA_pd_pd_alloc to 2 * MIN(A->m0, B->m0) ints (idx_A | idx_B back- + // to-back), so no allocation here. 
+ // ----------------------------------------------------------------------- + int s_max = MIN(A->m0, B->m0); + int *idx_A = C->iwork; + int *idx_B = C->iwork + s_max; + int s = sorted_intersect_indices(A->row_perm, A->m0, B->row_perm, B->m0, idx_A, + idx_B); + assert(s > 0); + + // ------------------------------------------------------------------------ + // Gather the matching rows into A->dwork and B->dwork. dwork is pre-sized + // by BTA_pd_pd_alloc (one ensure_dwork call per operand at alloc time). + // ------------------------------------------------------------------------ + for (int k = 0; k < s; k++) + { + memcpy(A->dwork + k * A->n0, A->X + idx_A[k] * A->n0, + A->n0 * sizeof(double)); + memcpy(B->dwork + k * B->n0, B->X + idx_B[k] * B->n0, + B->n0 * sizeof(double)); + } + + /* matmul on the gathered rows */ + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, B->n0, A->n0, s, 1.0, + B->dwork, B->n0, A->dwork, A->n0, 0.0, C->X, A->n0); +} + +void BTDA_pd_pd_fill_values(const permuted_dense *B, const double *d, + const permuted_dense *A, permuted_dense *C) +{ + /* C may be empty if there is no overlap in row permutations of A and B */ + if (C->base.nnz == 0) + { + return; + } + + /* TODO: must remove this allocation. Very important. The DA + intermediate PD is allocated and freed on every Hessian iteration + — violates the no-alloc-in-fill policy. Fix is to fold diag(d) + directly into BTA_pd_pd_fill_values's gather/dgemm (either via a + shared internal helper that takes an optional d, or by rewriting + this kernel inline using pre-sized A->dwork). 
*/ + /* C = BT @ (DA) */ + permuted_dense *DA = (permuted_dense *) A->base.copy_sparsity(&A->base); + DA_pd_fill_values(d, A, DA); + BTA_pd_pd_fill_values(B, DA, C); + free_matrix(&DA->base); +} + +/* The CSR-flavored kernels for (B=Sparse, A=PD) live in src/old-code; the + production path uses BTA_csc_pd_alloc / BTDA_csc_pd_fill_values defined + further below, which delegate to BTA_pd_csc via the (A^T B)^T identity. */ + +/* Return true if any of the 'len' integers in 'indices' exist in the set + marked by 'inv' (inv[k] != -1 iff k is in the set). */ +static inline bool idxs_hits_set(const int *idxs, int len, const int *inv) +{ + for (int ii = 0; ii < len; ii++) + { + if (inv[idxs[ii]] != -1) + { + return true; + } + } + return false; +} + +/* Inner product of a sparse vector (vals[0..len) at positions idxs[0..len)) + with a dense vector, where inv maps each idxs value to a position in + 'dense' (inv[k] == -1 means skip that entry). */ +static inline double sparse_dot_dense(const double *vals, const int *idxs, int len, + const int *inv, const double *dense) +{ + double sum = 0.0; + for (int e = 0; e < len; e++) + { + int kk = inv[idxs[e]]; + if (kk == -1) + { + continue; + } + sum += vals[e] * dense[kk]; + } + return sum; +} + +matrix *BA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) +{ + /* Cij != 0 if row i of B overlaps with column j of A. So we loop through + the columns of A. For each column of A, we check if it has any nonzeros in + rows that are in B's col_perm. 
If yes, column j of C will have a nonzero + block corresponding to the rows of B */ + iVec *col_perm_C = iVec_new(10); + for (int j = 0; j < A->n; j++) + { + int start = A->p[j]; + int len = A->p[j + 1] - start; + if (idxs_hits_set(A->i + start, len, B->col_inv)) + { + iVec_append(col_perm_C, j); + } + } + + matrix *C = new_permuted_dense(B->base.m, A->n, B->m0, col_perm_C->len, + B->row_perm, col_perm_C->data, NULL); + iVec_free(col_perm_C); + return C; +} + +void BA_pd_csc_fill_values(const double *B, int n0_B, const int *inv, + const CSC_matrix *A, permuted_dense *C) +{ + /* C[i, j] = bi^T @ ajj, where bi is the ith row of B_X (length n0_B, + row stride n0_B) and ajj is the jjth column of A's sparse block + (column jj = C->col_perm[j]). inv maps A's row indices to positions + in B_X (entries with inv[r] == -1 are skipped). */ + + /* row i of C */ + for (int i = 0; i < C->m0; i++) + { + double *ci = C->X + i * C->n0; + + /* col j of C */ + for (int j = 0; j < C->n0; j++) + { + + int jj = C->col_perm[j]; + int start = A->p[jj]; + int len = A->p[jj + 1] - start; + /* we compute entry C[i, j] */ + ci[j] = + sparse_dot_dense(A->x + start, A->i + start, len, inv, B + i * n0_B); + } + } +} + +matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) +{ + /* if B's columns don't overlap with A's rows, C = B @ A is empty */ + if (!has_overlap(B->col_perm, B->n0, A->row_perm, A->m0, 0)) + { + return new_permuted_dense(B->base.m, A->base.n, 0, 0, NULL, NULL, NULL); + } + + /* otherwise C has a dense block of size B->m0 x A->n0, with row index + set B->row_perm and column index set A->col_perm. */ + matrix *C = new_permuted_dense(B->base.m, A->base.n, B->m0, A->n0, B->row_perm, + A->col_perm, NULL); + + int s_max = MIN(B->n0, A->m0); + + /* Pre-size B's and A's dwork for the gathers in fill. Worst-case + intersection size is s_max; B_sub is (m0, s) and A_sub is (s, n0). 
*/ + permuted_dense_ensure_dwork(A, (size_t) s_max * A->n0); + permuted_dense_ensure_dwork(B, (size_t) s_max * B->m0); + + /* Pre-allocate C->iwork for idx_B + idx_A back-to-back (2 * s_max ints), + same idiom as BTA_pd_pd_alloc. */ + permuted_dense *C_pd = (permuted_dense *) C; + C_pd->iwork_size = (size_t) 2 * s_max; + C_pd->iwork = (int *) SP_MALLOC(C_pd->iwork_size * sizeof(int)); + + return C; +} + +/* TODO: do we want to reuse BTA_pd_pd_fill_values? */ +void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, + permuted_dense *C) +{ + /* C may be empty when B->col_perm and A->row_perm don't overlap. */ + if (C->base.nnz == 0) + { + return; + } + + /* if B's col_perm and A's row_perm are identical, one matmul suffices */ + if (B->n0 == A->m0 && int_arrays_equal(B->col_perm, A->row_perm, B->n0)) + { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, B->m0, A->n0, B->n0, + 1.0, B->X, B->n0, A->X, A->n0, 0.0, C->X, A->n0); + return; + } + + // ----------------------------------------------------------------------- + // find intersection of B's col_perm and A's row_perm. C->iwork was + // pre-sized by BA_pd_pd_alloc to 2 * MIN(B->n0, A->m0) ints (idx_B | + // idx_A back-to-back), so no allocation here. + // ----------------------------------------------------------------------- + int s_max = MIN(B->n0, A->m0); + int *idx_B = C->iwork; + int *idx_A = C->iwork + s_max; + int s = sorted_intersect_indices(B->col_perm, B->n0, A->row_perm, A->m0, idx_B, + idx_A); + assert(s > 0); + + // ------------------------------------------------------------------------ + // Gather the matching slices into B->dwork (column gather) and A->dwork + // (row gather). dwork is pre-sized by BA_pd_pd_alloc (one ensure_dwork + // call per operand at alloc time). + // ------------------------------------------------------------------------ + /* B_sub shape (B->m0, s) row-major: B_sub[ii, kk] = B->X[ii, idx_B[kk]]. 
*/ + for (int ii = 0; ii < B->m0; ii++) + { + for (int kk = 0; kk < s; kk++) + { + B->dwork[ii * s + kk] = B->X[ii * B->n0 + idx_B[kk]]; + } + } + /* A_sub shape (s, A->n0) row-major: A_sub[kk, :] = A->X[idx_A[kk], :]. */ + for (int kk = 0; kk < s; kk++) + { + memcpy(A->dwork + kk * A->n0, A->X + idx_A[kk] * A->n0, + A->n0 * sizeof(double)); + } + + /* matmul on the gathered slices */ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, B->m0, A->n0, s, 1.0, + B->dwork, s, A->dwork, A->n0, 0.0, C->X, A->n0); +} + +matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) +{ + /* Cij != 0 if column i of B overlaps with column j of A. So we loop + through the columns of A. For each column of A, we check if it has any + nonzeros in rows that are in B's row_perm. If yes, column j of C will + have a nonzero block corresponding to the columns of B */ + iVec *col_active = iVec_new(8); + for (int j = 0; j < A->n; j++) + { + int start = A->p[j]; + int len = A->p[j + 1] - start; + if (idxs_hits_set(A->i + start, len, B->row_inv)) + { + iVec_append(col_active, j); + } + } + + matrix *C = new_permuted_dense(B->base.n, A->n, B->n0, col_active->len, + B->col_perm, col_active->data, NULL); + iVec_free(col_active); + + /* Pre-size B's dwork for the BTDA fill (holds (diag(d) B)^T). 
*/ + permuted_dense_ensure_dwork(B, (size_t) B->m0 * B->n0); + + return C; +} + +/* C = B^T diag(d) A = (diag(d) B)^T A */ +void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, + const CSC_matrix *A, permuted_dense *C) +{ + /* C may be empty */ + if (C->base.nnz == 0) + { + return; + } + + int m0 = B->m0; + int n0 = B->n0; + + /* compute B->dwork = (diag(d) B)^T */ + for (int kk = 0; kk < m0; kk++) + { + double dk = d[B->row_perm[kk]]; + for (int ii = 0; ii < n0; ii++) + { + B->dwork[ii * m0 + kk] = dk * B->X[kk * n0 + ii]; + } + } + + BA_pd_csc_fill_values(B->dwork, m0, B->row_inv, A, C); +} + +matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A) +{ + /* Cij != 0 if column i of B overlaps with row j of A. So we loop through the + columns of B. For each column of B, we check if it has any nonzeros in rows + that are in A->row_perm. If yes, column i of C will have a nonzero block + corresponding to the columns of A */ + + iVec *row_active = iVec_new(10); + for (int i = 0; i < B->n; i++) + { + int start = B->p[i]; + int len = B->p[i + 1] - start; + if (idxs_hits_set(B->i + start, len, A->row_inv)) + { + iVec_append(row_active, i); + } + } + + matrix *C = new_permuted_dense(B->n, A->base.n, row_active->len, A->n0, + row_active->data, A->col_perm, NULL); + iVec_free(row_active); + + /* Pre-size A's dwork for the BTDA fill (holds (diag(d_perm) X_A)^T). 
*/ + for (int i_C = 0; i_C < C->m0; i_C++) + { + int B_col = C->row_perm[i_C]; + int start = B->p[B_col]; + int len = B->p[B_col + 1] - start; + double *ci = C->X + i_C * C->n0; + for (int j_C = 0; j_C < C->n0; j_C++) + { + ci[j_C] = sparse_dot_dense(B->x + start, B->i + start, len, inv, + A_T + j_C * m0_A); + } + } +} + +/* C = B^T diag(d) A. Folds diag(d) into A's dense block (writing + (diag(d_perm) X_A)^T into A->dwork) and delegates to BTA_csc_pd_fill_values. + Mirrors how BTDA_pd_csc_fill_values wraps BA_pd_csc_fill_values. */ +void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, + const permuted_dense *A, permuted_dense *C) +{ + if (C->base.nnz == 0) + { + return; + } + + int m0_A = A->m0; + int n0_A = A->n0; + + /* A->dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A). + Pre-sized by BTA_csc_pd_alloc; no allocation in fill. + Column j of (diag(d) X_A) lives contiguously in dwork as row j — + which is exactly the layout BTA_csc_pd_fill_values wants. */ + for (int kk = 0; kk < m0_A; kk++) + { + double dk = d[A->row_perm[kk]]; + for (int jj = 0; jj < n0_A; jj++) + { + A->dwork[jj * m0_A + kk] = dk * A->X[kk * n0_A + jj]; + } + } + + BTA_csc_pd_fill_values(B, A->dwork, m0_A, A->row_inv, C); +} + +/* Original transpose-via-Cprime implementation of BTDA_csc_pd_fill_values. + No longer linked; preserved here as in-file reference for the math + identity C = (A^T diag(d) B)^T and the BA_pd_csc_fill_values delegation. */ +#if defined(__GNUC__) || defined(__clang__) +__attribute__((unused)) +#endif +static void +BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, const double *d, + const permuted_dense *A, + permuted_dense *C) +{ + if (C->base.nnz == 0) + { + return; + } + + /* Cprime has shape (A->n0, |row_active|) — i.e. C transposed. */ + matrix *Cprime_m = BTA_pd_csc_alloc(A, B); + permuted_dense *Cprime = (permuted_dense *) Cprime_m; + BTDA_pd_csc_fill_values(A, d, B, Cprime); + + /* C->X = Cprime->X^T. 
Cprime has dims (C->n0, C->m0). */ + int m0 = C->m0; + int n0 = C->n0; + for (int i = 0; i < m0; i++) + { + for (int j = 0; j < n0; j++) + { + C->X[i * n0 + j] = Cprime->X[j * m0 + i]; + } + } + + free_matrix(Cprime_m); +} diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 0237346..211ac31 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -15,71 +15,350 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "utils/sparse_matrix.h" + +#include "utils/CSC_matrix.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/matrix.h" +#include "utils/mini_numpy.h" #include "utils/tracked_alloc.h" #include #include -static void sparse_block_left_mult_vec(const Matrix *self, const double *x, +static void sparse_block_left_mult_vec(const matrix *self, const double *x, double *y, int p) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; block_left_multiply_vec(sm->csr, x, y, p); } -static CSC_Matrix *sparse_block_left_mult_sparsity(const Matrix *self, - const CSC_Matrix *J, int p) +static CSC_matrix *sparse_block_left_mult_sparsity(const matrix *self, + const CSC_matrix *J, int p) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; return block_left_multiply_fill_sparsity(sm->csr, J, p); } -static void sparse_block_left_mult_values(const Matrix *self, const CSC_Matrix *J, - CSC_Matrix *C) +static void sparse_block_left_mult_values(const matrix *self, const CSC_matrix *J, + CSC_matrix *C) { - const Sparse_Matrix *sm = (const Sparse_Matrix *) self; + const sparse_matrix *sm = (const sparse_matrix *) self; block_left_multiply_fill_values(sm->csr, J, C); } -static void sparse_update_values(Matrix *self, const double *new_values) +static void sparse_free(matrix *self) { - Sparse_Matrix *sm = (Sparse_Matrix *) self; - memcpy(sm->csr->x, 
new_values, sm->csr->nnz * sizeof(double)); + sparse_matrix *sm = (sparse_matrix *) self; + free_CSR_matrix(sm->csr); + free_CSC_matrix(sm->csc_cache); + free(sm->csc_iwork); + free(sm->transpose_iwork); + free(sm); } -static void sparse_free(Matrix *self) +/* Forward decl: ctor is referenced by copy_sparsity below. */ +matrix *new_sparse_matrix(CSR_matrix *A); + +/* Build the CSC_matrix cache structure if absent. Values are NOT filled here; caller + must call refresh_csc_values before consuming. ATA_alloc only needs structure, + so it's safe to call without a subsequent refresh. */ +void sparse_matrix_ensure_csc_cache(sparse_matrix *sm) { - Sparse_Matrix *sm = (Sparse_Matrix *) self; - free_csr_matrix(sm->csr); - free(sm); + if (sm->csc_cache != NULL) return; + sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + sm->csc_cache = csr_to_csc_alloc(sm->csr, sm->csc_iwork); } -Matrix *new_sparse_matrix(const CSR_Matrix *A) +static matrix *sparse_copy_sparsity(const matrix *self) +{ + const sparse_matrix *sm = (const sparse_matrix *) self; + return new_sparse_matrix(new_csr_copy_sparsity(sm->csr)); +} + +static void sparse_DA_fill_values(const double *d, const matrix *self, matrix *out) +{ + const sparse_matrix *sm = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; + DA_fill_values(d, sm->csr, sm_out->csr); +} + +static matrix *sparse_ATA_alloc(matrix *self) +{ + sparse_matrix *sm = (sparse_matrix *) self; + sparse_matrix_ensure_csc_cache(sm); + return new_sparse_matrix(ATA_alloc(sm->csc_cache)); +} + +/* Caller must have called refresh_csc_values since the last change to csr->x. 
*/ +static void sparse_ATDA_fill_values(const matrix *self, const double *d, matrix *out) +{ + const sparse_matrix *sm = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; + ATDA_fill_values(sm->csc_cache, d, sm_out->csr); +} + +static CSR_matrix *sparse_to_csr(matrix *self) +{ + return ((sparse_matrix *) self)->csr; +} + +static matrix *sparse_transpose_alloc(const matrix *self) +{ + const sparse_matrix *sm = (const sparse_matrix *) self; + int *iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + CSR_matrix *AT = AT_alloc(sm->csr, iwork); + sparse_matrix *out = (sparse_matrix *) new_sparse_matrix(AT); + out->transpose_iwork = iwork; + return &out->base; +} + +static void sparse_transpose_fill_values(const matrix *self, matrix *out) +{ + const sparse_matrix *sm_in = (const sparse_matrix *) self; + sparse_matrix *sm_out = (sparse_matrix *) out; + AT_fill_values(sm_in->csr, sm_out->csr, sm_out->transpose_iwork); +} + +static matrix *sparse_index_alloc(matrix *self, const int *indices, int n_idxs) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = new_CSR_matrix(n_idxs, self->n, Jx->nnz); + + J->p[0] = 0; + for (int i = 0; i < n_idxs; i++) + { + int row = indices[i]; + int len = Jx->p[row + 1] - Jx->p[row]; + memcpy(J->i + J->p[i], Jx->i + Jx->p[row], len * sizeof(int)); + J->p[i + 1] = J->p[i] + len; + } + J->nnz = J->p[n_idxs]; + return new_sparse_matrix(J); +} + +static void sparse_index_fill_values(matrix *self, const int *indices, int n_idxs, + matrix *out) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = ((sparse_matrix *) out)->csr; + for (int i = 0; i < n_idxs; i++) + { + int len = J->p[i + 1] - J->p[i]; + memcpy(J->x + J->p[i], Jx->x + Jx->p[indices[i]], len * sizeof(double)); + } +} + +static matrix *sparse_promote_alloc(matrix *self, int size) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + int row_nnz = Jx->nnz; + CSR_matrix *J = new_CSR_matrix(size, self->n, size * row_nnz); + 
+ for (int row = 0; row < size; row++) + { + J->p[row] = row * row_nnz; + memcpy(J->i + row * row_nnz, Jx->i, row_nnz * sizeof(int)); + } + J->p[size] = size * row_nnz; + J->nnz = size * row_nnz; + return new_sparse_matrix(J); +} + +static void sparse_promote_fill_values(matrix *self, matrix *out) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + int row_nnz = Jx->nnz; + for (int row = 0; row < out->m; row++) + { + memcpy(out->x + row * row_nnz, Jx->x, row_nnz * sizeof(double)); + } +} + +static matrix *sparse_broadcast_alloc(matrix *self, broadcast_type type, int d1, + int d2) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + int out_m = d1 * d2; + int total_nnz; + if (type == BROADCAST_ROW) + { + total_nnz = Jx->nnz * d1; + } + else if (type == BROADCAST_COL) + { + total_nnz = Jx->nnz * d2; + } + else /* BROADCAST_SCALAR */ + { + total_nnz = Jx->nnz * out_m; + } + + CSR_matrix *J = new_CSR_matrix(out_m, self->n, total_nnz); + + if (type == BROADCAST_ROW) + { + int acc = 0; + for (int i = 0; i < d2; i++) + { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; + tile_int(J->i + acc, Jx->i + Jx->p[i], nnz_in_row, d1); + for (int rep = 0; rep < d1; rep++) + { + J->p[i * d1 + rep] = acc; + acc += nnz_in_row; + } + } + J->p[out_m] = total_nnz; + } + else if (type == BROADCAST_COL) + { + tile_int(J->i, Jx->i, Jx->nnz, d2); + int offset = 0; + for (int i = 0; i < d2; i++) + { + for (int j = 0; j < d1; j++) + { + int nnz_in_row = Jx->p[j + 1] - Jx->p[j]; + J->p[i * d1 + j] = offset; + offset += nnz_in_row; + } + } + J->p[out_m] = total_nnz; + } + else /* BROADCAST_SCALAR */ + { + tile_int(J->i, Jx->i, Jx->nnz, out_m); + int row_nnz = Jx->nnz; + for (int i = 0; i < out_m; i++) + { + J->p[i] = i * row_nnz; + } + J->p[out_m] = total_nnz; + } + return new_sparse_matrix(J); +} + +static void sparse_broadcast_fill_values(matrix *self, broadcast_type type, int d1, + int d2, matrix *out) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + if (type == BROADCAST_ROW) + { + int 
acc = 0; + for (int i = 0; i < d2; i++) + { + int nnz_in_row = Jx->p[i + 1] - Jx->p[i]; + tile_double(out->x + acc, Jx->x + Jx->p[i], nnz_in_row, d1); + acc += nnz_in_row * d1; + } + } + else if (type == BROADCAST_COL) + { + tile_double(out->x, Jx->x, Jx->nnz, d2); + } + else /* BROADCAST_SCALAR */ + { + tile_double(out->x, Jx->x, Jx->nnz, d1 * d2); + } +} + +static matrix *sparse_diag_vec_alloc(matrix *self) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + int n = self->m; + int out_m = n * n; + CSR_matrix *J = new_CSR_matrix(out_m, self->n, Jx->nnz); + + int nnz = 0; + int next_diag = 0; + for (int row = 0; row < out_m; row++) + { + J->p[row] = nnz; + if (row == next_diag) + { + int child_row = row / (n + 1); + int len = Jx->p[child_row + 1] - Jx->p[child_row]; + memcpy(J->i + nnz, Jx->i + Jx->p[child_row], len * sizeof(int)); + nnz += len; + next_diag += n + 1; + } + } + J->p[out_m] = nnz; + J->nnz = nnz; + return new_sparse_matrix(J); +} + +static void sparse_diag_vec_fill_values(matrix *self, matrix *out) +{ + CSR_matrix *Jx = ((sparse_matrix *) self)->csr; + CSR_matrix *J = ((sparse_matrix *) out)->csr; + int n = self->m; + for (int i = 0; i < n; i++) + { + int out_row = i * (n + 1); + int len = J->p[out_row + 1] - J->p[out_row]; + memcpy(J->x + J->p[out_row], Jx->x + Jx->p[i], len * sizeof(double)); + } +} + +/* Build CSC_matrix structure on first call; refill values from csr->x on every call. 
*/ +static void sparse_refresh_csc_values(matrix *self) +{ + sparse_matrix *sm = (sparse_matrix *) self; + sparse_matrix_ensure_csc_cache(sm); + csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); +} + +static void wire_vtable(sparse_matrix *sm) { - Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); - sm->base.m = A->m; - sm->base.n = A->n; sm->base.block_left_mult_vec = sparse_block_left_mult_vec; sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; sm->base.block_left_mult_values = sparse_block_left_mult_values; - sm->base.update_values = sparse_update_values; + sm->base.copy_sparsity = sparse_copy_sparsity; + sm->base.DA_fill_values = sparse_DA_fill_values; + sm->base.ATA_alloc = sparse_ATA_alloc; + sm->base.ATDA_fill_values = sparse_ATDA_fill_values; + sm->base.to_csr = sparse_to_csr; + sm->base.transpose_alloc = sparse_transpose_alloc; + sm->base.transpose_fill_values = sparse_transpose_fill_values; + sm->base.index_alloc = sparse_index_alloc; + sm->base.index_fill_values = sparse_index_fill_values; + sm->base.promote_alloc = sparse_promote_alloc; + sm->base.promote_fill_values = sparse_promote_fill_values; + sm->base.broadcast_alloc = sparse_broadcast_alloc; + sm->base.broadcast_fill_values = sparse_broadcast_fill_values; + sm->base.diag_vec_alloc = sparse_diag_vec_alloc; + sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values; + sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; - sm->csr = new_csr(A); +} + +matrix *new_sparse_matrix(CSR_matrix *A) +{ + sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); + sm->base.m = A->m; + sm->base.n = A->n; + sm->base.nnz = A->nnz; + sm->base.x = A->x; + wire_vtable(sm); + sm->csr = A; return &sm->base; } -Matrix *sparse_matrix_trans(const Sparse_Matrix *self, int *iwork) +matrix *new_sparse_matrix_alloc(int m, int n, int nnz) +{ + return new_sparse_matrix(new_CSR_matrix(m, n, nnz)); +} + +matrix 
*sparse_matrix_trans(const sparse_matrix *self, int *iwork) { - CSR_Matrix *AT = transpose(self->csr, iwork); - Sparse_Matrix *sm = (Sparse_Matrix *) SP_CALLOC(1, sizeof(Sparse_Matrix)); + CSR_matrix *AT = transpose(self->csr, iwork); + sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); sm->base.m = AT->m; sm->base.n = AT->n; - sm->base.block_left_mult_vec = sparse_block_left_mult_vec; - sm->base.block_left_mult_sparsity = sparse_block_left_mult_sparsity; - sm->base.block_left_mult_values = sparse_block_left_mult_values; - sm->base.update_values = sparse_update_values; - sm->base.free_fn = sparse_free; + sm->base.nnz = AT->nnz; + sm->base.x = AT->x; + wire_vtable(sm); sm->csr = AT; return &sm->base; } diff --git a/src/utils/utils.c b/src/utils/utils.c index 5163b3c..9d504a1 100644 --- a/src/utils/utils.c +++ b/src/utils/utils.c @@ -30,3 +30,22 @@ void sort_int_array(int *array, int size) { qsort(array, size, sizeof(int), compare_int_asc); } + +bool has_overlap(const int *a_idx, int a_len, const int *b_idx, int b_len, + int b_offset) +{ + int ai = 0, bi = 0; + while (ai < a_len && bi < b_len) + { + if (a_idx[ai] == b_idx[bi] - b_offset) return true; + if (a_idx[ai] < b_idx[bi] - b_offset) + { + ai++; + } + else + { + bi++; + } + } + return false; +} diff --git a/tests/all_tests.c b/tests/all_tests.c index 76807b6..853308f 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -55,17 +55,20 @@ #include "jacobian_tests/other/test_prod_axis_zero.h" #include "jacobian_tests/other/test_quad_form.h" #include "numerical_diff/test_numerical_diff.h" +#include "old-code/test_old_permuted_dense.h" #include "problem/test_param_broadcast.h" #include "problem/test_param_prob.h" #include "problem/test_problem.h" #include "utils/test_cblas.h" -#include "utils/test_coo_matrix.h" +#include "utils/test_COO_matrix.h" #include "utils/test_csc_matrix.h" #include "utils/test_csr_csc_conversion.h" #include "utils/test_csr_matrix.h" #include 
"utils/test_linalg_sparse_matmuls.h" #include "utils/test_linalg_utils_matmul_chain_rule.h" #include "utils/test_matrix.h" +#include "utils/test_matrix_BTA.h" +#include "utils/test_permuted_dense.h" #include "wsum_hess/affine/test_broadcast.h" #include "wsum_hess/affine/test_convolve.h" #include "wsum_hess/affine/test_diag_mat.h" @@ -102,7 +105,10 @@ #endif /* PROFILE_ONLY */ #ifdef PROFILE_ONLY +#include "profiling/profile_BTA_pd_csr_vs_csc.h" #include "profiling/profile_left_matmul.h" +#include "profiling/profile_log_reg.h" +#include "profiling/profile_trimmed_log_reg.h" #endif /* PROFILE_ONLY */ int main(void) @@ -218,12 +224,15 @@ int main(void) mu_run_test(test_jacobian_left_matmul_log, tests_run); mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); mu_run_test(test_jacobian_left_matmul_exp_composite, tests_run); + mu_run_test(test_jacobian_left_matmul_pd_from_composite_child, tests_run); + mu_run_test(test_jacobian_left_matmul_pd_param, tests_run); mu_run_test(test_jacobian_right_matmul_log, tests_run); mu_run_test(test_jacobian_right_matmul_log_vector, tests_run); mu_run_test(test_jacobian_matmul, tests_run); mu_run_test(test_jacobian_convolve, tests_run); mu_run_test(test_jacobian_convolve_composite, tests_run); mu_run_test(test_jacobian_transpose, tests_run); + mu_run_test(test_jacobian_transpose_pd_preserved, tests_run); mu_run_test(test_diag_mat_jacobian_variable, tests_run); mu_run_test(test_diag_mat_jacobian_of_log, tests_run); mu_run_test(test_upper_tri_jacobian_variable, tests_run); @@ -284,6 +293,7 @@ int main(void) mu_run_test(test_wsum_hess_left_matmul, tests_run); mu_run_test(test_wsum_hess_left_matmul_matrix, tests_run); mu_run_test(test_wsum_hess_left_matmul_exp_composite, tests_run); + mu_run_test(test_wsum_hess_left_matmul_dense_matrix_exp, tests_run); mu_run_test(test_wsum_hess_matmul, tests_run); mu_run_test(test_wsum_hess_matmul_yx, tests_run); mu_run_test(test_wsum_hess_right_matmul, tests_run); @@ -351,11 +361,48 @@ int 
main(void) mu_run_test(test_csr_to_coo, tests_run); mu_run_test(test_csr_to_coo_lower_triangular, tests_run); mu_run_test(test_refresh_lower_triangular_coo, tests_run); - mu_run_test(test_dense_matrix_mult_vec, tests_run); - mu_run_test(test_dense_matrix_mult_vec_blocks, tests_run); - mu_run_test(test_sparse_vs_dense_mult_vec, tests_run); - mu_run_test(test_dense_matrix_trans, tests_run); - mu_run_test(test_sparse_vs_dense_mult_vec_blocks, tests_run); + mu_run_test(test_pd_mult_vec_basic, tests_run); + mu_run_test(test_pd_mult_vec_blocks, tests_run); + mu_run_test(test_sparse_vs_pd_mult_vec, tests_run); + mu_run_test(test_pd_trans_full_block, tests_run); + mu_run_test(test_sparse_vs_pd_mult_vec_blocks, tests_run); + mu_run_test(test_pd_operator_block_left_mult_vec, tests_run); + mu_run_test(test_permuted_dense_to_csr_basic, tests_run); + mu_run_test(test_permuted_dense_to_csr_empty, tests_run); + mu_run_test(test_permuted_dense_to_csr_full, tests_run); + mu_run_test(test_permuted_dense_to_csr_single_row, tests_run); + mu_run_test(test_permuted_dense_to_csr_single_col, tests_run); + mu_run_test(test_DA_pd_fill_values, tests_run); + mu_run_test(test_ATA_pd_alloc, tests_run); + mu_run_test(test_ATDA_pd_fill_values, tests_run); + mu_run_test(test_permuted_dense_times_csc, tests_run); + mu_run_test(test_permuted_dense_times_csc_no_active, tests_run); + mu_run_test(test_permuted_dense_to_csr_lazy, tests_run); + mu_run_test(test_permuted_dense_col_inv, tests_run); + mu_run_test(test_permuted_dense_index, tests_run); + mu_run_test(test_permuted_dense_promote, tests_run); + mu_run_test(test_permuted_dense_broadcast_scalar, tests_run); + mu_run_test(test_permuted_dense_broadcast_row, tests_run); + mu_run_test(test_permuted_dense_broadcast_col, tests_run); + mu_run_test(test_permuted_dense_diag_vec, tests_run); + mu_run_test(test_permuted_dense_BTA_matching_row_perm, tests_run); + mu_run_test(test_permuted_dense_BTA_empty_overlap, tests_run); + 
mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); + mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); + mu_run_test(test_BTA_pd_csc_matches_csr, tests_run); + mu_run_test(test_BA_pd_matrices_pd_pd_full_block_B, tests_run); + mu_run_test(test_BA_pd_matrices_pd_pd_general_B, tests_run); + mu_run_test(test_BA_pd_matrices_pd_csc, tests_run); + mu_run_test(test_BA_pd_matrices_fast_path, tests_run); + mu_run_test(test_BTA_pd_csr_basic, tests_run); + mu_run_test(test_BTA_pd_csr_leaf_variable, tests_run); + mu_run_test(test_BTA_pd_csr_no_overlap, tests_run); + mu_run_test(test_BTA_csr_pd_basic, tests_run); + mu_run_test(test_BTA_csr_pd_leaf_variable, tests_run); + mu_run_test(test_BTA_csr_pd_no_overlap, tests_run); + mu_run_test(test_BTDA_matrices_pd_pd, tests_run); + mu_run_test(test_BTDA_matrices_csr_pd, tests_run); + mu_run_test(test_BTDA_matrices_pd_csr, tests_run); mu_run_test(test_YT_kron_I, tests_run); mu_run_test(test_YT_kron_I_larger, tests_run); mu_run_test(test_I_kron_X, tests_run); @@ -401,6 +448,9 @@ int main(void) #ifdef PROFILE_ONLY printf("\n--- Profiling Tests ---\n"); mu_run_test(profile_left_matmul, tests_run); + mu_run_test(profile_log_reg, tests_run); + mu_run_test(profile_trimmed_log_reg, tests_run); + mu_run_test(profile_BTA_pd_csr_vs_csc, tests_run); #endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/forward_pass/affine/test_add.h b/tests/forward_pass/affine/test_add.h index 1f8a61a..4b14ae3 100644 --- a/tests/forward_pass/affine/test_add.h +++ b/tests/forward_pass/affine/test_add.h @@ -5,6 +5,7 @@ #include "atoms/affine.h" #include "expr.h" #include "minunit.h" +#include "subexpr.h" #include "test_helpers.h" const char *test_addition(void) diff --git a/tests/forward_pass/affine/test_diag_mat.h b/tests/forward_pass/affine/test_diag_mat.h index cb20a05..d5703ac 100644 --- a/tests/forward_pass/affine/test_diag_mat.h +++ b/tests/forward_pass/affine/test_diag_mat.h @@ -8,7 +8,7 
@@ const char *test_diag_mat_forward(void) { /* 3x3 matrix variable (column-major): [1,2,3,4,5,6,7,8,9] - * Matrix: 1 4 7 + * matrix: 1 4 7 * 2 5 8 * 3 6 9 * Diagonal: (0,0)=1, (1,1)=5, (2,2)=9 */ diff --git a/tests/forward_pass/affine/test_linear_op.h b/tests/forward_pass/affine/test_linear_op.h index 801a403..8a8f9f8 100644 --- a/tests/forward_pass/affine/test_linear_op.h +++ b/tests/forward_pass/affine/test_linear_op.h @@ -11,14 +11,14 @@ const char *test_linear_op(void) { - /* create CSR matrix + /* create CSR_matrix matrix A = [0 0 2 3 0 0] [0 0 1 0 2 0] [0 0 3 4 5 0] */ double Ax[7] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[7] = {2, 3, 2, 4, 2, 3, 4}; int Ap[4] = {0, 2, 4, 7}; - CSR_Matrix *A = new_csr_matrix(3, 6, 7); + CSR_matrix *A = new_CSR_matrix(3, 6, 7); memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); @@ -31,6 +31,6 @@ const char *test_linear_op(void) double expected[3] = {8, 7, 26}; mu_assert("fail", cmp_double_array(linear_node->value, expected, 3)); free_expr(linear_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/forward_pass/affine/test_upper_tri.h b/tests/forward_pass/affine/test_upper_tri.h index 056c3a5..3db43e8 100644 --- a/tests/forward_pass/affine/test_upper_tri.h +++ b/tests/forward_pass/affine/test_upper_tri.h @@ -8,7 +8,7 @@ const char *test_upper_tri_forward_4x4(void) { /* 4x4 matrix variable (column-major): [1..16] - * Matrix: 1 5 9 13 + * matrix: 1 5 9 13 * 2 6 10 14 * 3 7 11 15 * 4 8 12 16 diff --git a/tests/jacobian_tests/affine/test_broadcast.h b/tests/jacobian_tests/affine/test_broadcast.h index f41082d..a1f79ed 100644 --- a/tests/jacobian_tests/affine/test_broadcast.h +++ b/tests/jacobian_tests/affine/test_broadcast.h @@ -37,12 +37,9 @@ const char *test_broadcast_row_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 1, 1, 2, 2}; - mu_assert("broadcast row jacobian vals fail", - 
cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast row jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 4)); - mu_assert("broadcast row jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -82,12 +79,9 @@ const char *test_broadcast_col_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 0, 1, 2}; - mu_assert("broadcast col jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast col jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 7)); - mu_assert("broadcast col jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -123,12 +117,9 @@ const char *test_broadcast_scalar_to_matrix_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 0, 0, 0, 0}; - mu_assert("broadcast scalar jacobian vals fail", - cmp_double_array(bcast->jacobian->x, expected_x, 6)); - mu_assert("broadcast scalar jacobian rows fail", - cmp_int_array(bcast->jacobian->p, expected_p, 7)); - mu_assert("broadcast scalar jacobian cols fail", - cmp_int_array(bcast->jacobian->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(bcast->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->jacobian, expected_p, expected_i, 6, 6)); free_expr(bcast); return 0; @@ -150,17 +141,18 @@ const char *test_double_broadcast(void) jacobian_init(sum); sum->eval_jacobian(sum); + /* TODO: what is this test? 
*/ + /* All 6 elements depend on the single input variable */ // double expected_x[6] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; // int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; // int expected_i[6] = {0, 0, 0, 0, 0, 0}; // - // mu_assert("broadcast scalar jacobian vals fail", - // cmp_double_array(sum->jacobian->x, expected_x, 6)); - // mu_assert("broadcast scalar jacobian rows fail", - // cmp_int_array(sum ->jacobian->p, expected_p, 7)); - // mu_assert("broadcast scalar jacobian cols fail", - // cmp_int_array(bcast->jacobian->i, expected_i, 6)); + // mu_assert("broadcast scalar jacobian vals fail", // + // cmp_double_array(sum->jacobian->x, expected_x, 6)); mu_assert("broadcast + // scalar jacobian rows fail", // cmp_int_array(sum ->jacobian->p, expected_p, + // 7)); mu_assert("broadcast scalar jacobian cols fail", // + // cmp_int_array(bcast->jacobian->to_csr(bcast->jacobian)->i, // expected_i, 6)); free_expr(sum); return 0; diff --git a/tests/jacobian_tests/affine/test_convolve.h b/tests/jacobian_tests/affine/test_convolve.h index c3aae98..0eb4db1 100644 --- a/tests/jacobian_tests/affine/test_convolve.h +++ b/tests/jacobian_tests/affine/test_convolve.h @@ -18,7 +18,7 @@ const char *test_jacobian_convolve(void) * [3, 2, 1], * [0, 3, 2], * [0, 0, 3]] - * stored in CSR with nnz = 9, shape 5 x 3. */ + * stored in CSR_matrix with nnz = 9, shape 5 x 3. 
*/ double kernel[3] = {1.0, 2.0, 3.0}; expr *kernel_param = new_parameter(3, 1, PARAM_FIXED, 3, kernel); expr *x = new_variable(3, 1, 0, 3); @@ -37,12 +37,9 @@ const char *test_jacobian_convolve(void) int expected_i[9] = {0, 0, 1, 0, 1, 2, 1, 2, 2}; double expected_x[9] = {1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0}; - mu_assert("Convolve Jacobian row pointers incorrect", - cmp_int_array(y->jacobian->p, expected_p, 6)); - mu_assert("Convolve Jacobian column indices incorrect", - cmp_int_array(y->jacobian->i, expected_i, 9)); - mu_assert("Convolve Jacobian values incorrect", - cmp_double_array(y->jacobian->x, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 5, 9)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 9)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_diag_mat.h b/tests/jacobian_tests/affine/test_diag_mat.h index b1031ac..9b669a0 100644 --- a/tests/jacobian_tests/affine/test_diag_mat.h +++ b/tests/jacobian_tests/affine/test_diag_mat.h @@ -10,7 +10,7 @@ const char *test_diag_mat_jacobian_variable(void) { /* diag_mat of a 2x2 variable (4 vars total) * Diagonal indices in column-major: [0, 3] - * Jacobian is 2x4 CSR: row 0 has col 0, row 1 has col 3 */ + * Jacobian is 2x4 CSR_matrix: row 0 has col 0, row 1 has col 3 */ double u[4] = {1.0, 2.0, 3.0, 4.0}; expr *var = new_variable(2, 2, 0, 4); expr *dm = new_diag_mat(var); @@ -23,9 +23,9 @@ const char *test_diag_mat_jacobian_variable(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat jac vals", cmp_double_array(dm->jacobian->x, expected_x, 2)); - mu_assert("diag_mat jac p", cmp_int_array(dm->jacobian->p, expected_p, 3)); - mu_assert("diag_mat jac i", cmp_int_array(dm->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(dm->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(dm->jacobian, expected_p, expected_i, 2, 2)); free_expr(dm); return 0; @@ -49,12 
+49,12 @@ const char *test_diag_mat_jacobian_of_log(void) dm->eval_jacobian(dm); double expected_x[2] = {1.0, 0.25}; + int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 3}; - mu_assert("diag_mat log jac vals", - cmp_double_array(dm->jacobian->x, expected_x, 2)); - mu_assert("diag_mat log jac cols", - cmp_int_array(dm->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(dm->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(dm->jacobian, expected_p, expected_i, 2, 2)); free_expr(dm); return 0; diff --git a/tests/jacobian_tests/affine/test_hstack.h b/tests/jacobian_tests/affine/test_hstack.h index a5b163c..017730e 100644 --- a/tests/jacobian_tests/affine/test_hstack.h +++ b/tests/jacobian_tests/affine/test_hstack.h @@ -42,9 +42,9 @@ const char *test_jacobian_hstack_vectors(void) int expected_Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; int expected_Ap[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 9)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->i, expected_Ai, 9)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->p, expected_Ap, 10)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_Ap, expected_Ai, 9, 9)); free_expr(stack); return 0; @@ -86,9 +86,9 @@ const char *test_jacobian_hstack_matrix(void) int expected_Ap[19] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - mu_assert("vals fail", cmp_double_array(stack->jacobian->x, expected_Ax, 18)); - mu_assert("cols fail", cmp_int_array(stack->jacobian->i, expected_Ai, 18)); - mu_assert("rows fail", cmp_int_array(stack->jacobian->p, expected_Ap, 19)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_Ax, 18)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_Ap, expected_Ai, 18, 18)); free_expr(stack); return 0; diff --git 
a/tests/jacobian_tests/affine/test_index.h b/tests/jacobian_tests/affine/test_index.h index 8cdd9f7..eeda8c2 100644 --- a/tests/jacobian_tests/affine/test_index.h +++ b/tests/jacobian_tests/affine/test_index.h @@ -53,12 +53,12 @@ const char *test_index_jacobian_of_variable(void) /* Jacobian is 2x3 with pattern: row 0 selects col 0, row 1 selects col 2 */ double expected_x[2] = {1.0, 1.0}; - int expected_p[3] = {0, 1, 2}; /* CSR row ptrs */ + int expected_p[3] = {0, 1, 2}; /* CSR_matrix row ptrs */ int expected_i[2] = {0, 2}; /* column indices */ - mu_assert("index jac vals", cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index jac p", cmp_int_array(idx->jacobian->p, expected_p, 3)); - mu_assert("index jac i", cmp_int_array(idx->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -76,16 +76,17 @@ const char *test_index_jacobian_of_log(void) jacobian_init(idx); idx->eval_jacobian(idx); - /* d/dx log(x) = diag(1/x), then select rows 0 and 2 + /* d/dx log(x) = diag(1/x), then select rows 0 and 2. + * Each selected row has exactly one nonzero (the diagonal entry). 
* Row 0: 1/1 = 1.0 at col 0 * Row 1: 1/4 = 0.25 at col 2 */ double expected_x[2] = {1.0, 0.25}; + int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 2}; - mu_assert("index of log jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index of log jac cols", - cmp_int_array(idx->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -107,12 +108,9 @@ const char *test_index_jacobian_repeated(void) int expected_p[3] = {0, 1, 2}; int expected_i[2] = {0, 0}; /* Both reference col 0 */ - mu_assert("index repeated jac vals", - cmp_double_array(idx->jacobian->x, expected_x, 2)); - mu_assert("index repeated row ptr", - cmp_int_array(idx->jacobian->p, expected_p, 3)); - mu_assert("index repeated jac i", - cmp_int_array(idx->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(idx->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(idx->jacobian, expected_p, expected_i, 2, 2)); free_expr(idx); return 0; @@ -133,12 +131,14 @@ const char *test_sum_of_index(void) jacobian_init(s); s->eval_jacobian(s); - /* Gradient: [1, 0, 1] in sparse form */ + /* Gradient: [1, 0, 1] in sparse form. Single output row holds both nnz. 
*/ double expected_x[2] = {1.0, 1.0}; + int expected_p[2] = {0, 2}; int expected_i[2] = {0, 2}; - mu_assert("sum of index vals", cmp_double_array(s->jacobian->x, expected_x, 2)); - mu_assert("sum of index cols", cmp_int_array(s->jacobian->i, expected_i, 2)); + mu_assert("vals fail", cmp_values(s->jacobian, expected_x, 2)); + mu_assert("sparsity fail", + cmp_sparsity(s->jacobian, expected_p, expected_i, 1, 2)); free_expr(s); return 0; diff --git a/tests/jacobian_tests/affine/test_left_matmul.h b/tests/jacobian_tests/affine/test_left_matmul.h index a0c1385..18a27c8 100644 --- a/tests/jacobian_tests/affine/test_left_matmul.h +++ b/tests/jacobian_tests/affine/test_left_matmul.h @@ -8,6 +8,7 @@ #include "minunit.h" #include "numerical_diff.h" #include "test_helpers.h" +#include "utils/permuted_dense.h" const char *test_jacobian_left_matmul_log(void) { @@ -23,7 +24,7 @@ const char *test_jacobian_left_matmul_log(void) * [5, 0, 2 ] * [7, 0, 0 ] * - * Stored in CSR format (4x3 sparse): + * Stored in CSR_matrix format (4x3 sparse): * nnz = 7 * p = [0, 2, 4, 6, 7] * i = [0, 2, 0, 2, 0, 2, 0] @@ -32,8 +33,8 @@ const char *test_jacobian_left_matmul_log(void) double x_vals[3] = {1.0, 2.0, 3.0}; expr *x = new_variable(3, 1, 0, 3); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format */ + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -61,11 +62,11 @@ const char *test_jacobian_left_matmul_log(void) int expected_Ai[7] = {0, 2, 0, 2, 0, 2, 0}; int expected_Ap[5] = {0, 2, 4, 6, 7}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 7)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 7)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 5)); + mu_assert("vals fail", cmp_values(A_log_x->jacobian, expected_Ax, 
7)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 4, 7)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } @@ -76,8 +77,8 @@ const char *test_jacobian_left_matmul_log_matrix(void) double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; expr *x = new_variable(3, 2, 0, 6); - /* Create sparse matrix A in CSR format (4x3) */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format (4x3) */ + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -100,11 +101,11 @@ const char *test_jacobian_left_matmul_log_matrix(void) int expected_Ai[14] = {0, 2, 0, 2, 0, 2, 0, 3, 5, 3, 5, 3, 5, 3}; int expected_Ap[9] = {0, 2, 4, 6, 7, 9, 11, 13, 14}; - mu_assert("vals fail", cmp_double_array(A_log_x->jacobian->x, expected_Ax, 14)); - mu_assert("cols fail", cmp_int_array(A_log_x->jacobian->i, expected_Ai, 14)); - mu_assert("rows fail", cmp_int_array(A_log_x->jacobian->p, expected_Ap, 9)); + mu_assert("vals fail", cmp_values(A_log_x->jacobian, expected_Ax, 14)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->jacobian, expected_Ap, expected_Ai, 8, 14)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } @@ -116,7 +117,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_Matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_CSR_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -125,7 +126,7 @@ const char *test_jacobian_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1, 
2, 3, 4, 5, 6, 7}; @@ -140,8 +141,122 @@ const char *test_jacobian_left_matmul_exp_composite(void) mu_assert("check_jacobian failed", check_jacobian_num(A_exp_Bx, x_vals, NUMERICAL_DIFF_DEFAULT_H)); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); free_expr(A_exp_Bx); return 0; } + +/* outer = A2 @ (A1 @ x). Inner left_matmul produces a PD Jacobian via the + leaf-var fast path. Outer left_matmul sees a PD child Jacobian and must + fire the produce_pd_jacobian_from_child branch via BA_pd_matrices_*. + + x is a length-2 leaf variable at var_id=0, n_vars=2. + A1 is 3x2: [[1,2],[3,4],[5,6]] (row-major). + A2 is 4x3: [[1,0,1],[0,1,0],[1,0,1],[0,1,0]] (row-major). + Expected outer->jacobian: PD of shape (4, 2), row_perm=[0..3], + col_perm=[0,1], X = A2 @ A1 = [[6,8],[3,4],[6,8],[3,4]]. */ +const char *test_jacobian_left_matmul_pd_from_composite_child(void) +{ + double A1_data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double A2_data[12] = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0}; + + expr *x = new_variable(2, 1, 0, 2); + expr *A1_x = new_left_matmul_dense(NULL, x, 3, 2, A1_data); + expr *A2_A1_x = new_left_matmul_dense(NULL, A1_x, 4, 3, A2_data); + + double x_vals[2] = {0.5, -1.5}; + A2_A1_x->forward(A2_A1_x, x_vals); + jacobian_init(A2_A1_x); + A2_A1_x->eval_jacobian(A2_A1_x); + + /* Structural: outer's Jacobian must be PD (produced by the + jacobian_init_pd path via BA_pd_matrices_alloc). 
*/ + mu_assert("outer Jacobian should be PD", A2_A1_x->jacobian->is_permuted_dense); + permuted_dense *pd = (permuted_dense *) A2_A1_x->jacobian; + mu_assert("global m", A2_A1_x->jacobian->m == 4); + mu_assert("global n", A2_A1_x->jacobian->n == 2); + mu_assert("m0", pd->m0 == 4); + mu_assert("n0", pd->n0 == 2); + int expected_row_perm[4] = {0, 1, 2, 3}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd->row_perm, expected_row_perm, 4)); + mu_assert("col_perm", cmp_int_array(pd->col_perm, expected_col_perm, 2)); + + /* Numerical: X = A2 @ A1 (row-major 4x2). */ + double expected_X[8] = {6.0, 8.0, 3.0, 4.0, 6.0, 8.0, 3.0, 4.0}; + mu_assert("X values", cmp_double_array(pd->X, expected_X, 8)); + + /* Cross-check against numerical differentiation for paranoia. */ + mu_assert("check_jacobian failed", + check_jacobian_num(A2_A1_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A2_A1_x); + return 0; +} + +/* Parameterized A: A_param @ x with leaf-variable x and n_blocks == 1. + Verifies the PD path (jacobian_init_pd / eval_jacobian_pd) handles a + parameterized A — the structure is fixed at construction, refresh_dense_left + updates A->X before each forward, and eval_jacobian_pd reads those values + via BA_pd_matrices_fill_values. + + x is length 2 at var_id=0, n_vars=2. + A_param is 3x2. The parameter convention is column-major, so the param's + value array is column-major of A. We test two parameter assignments: + A = [[1,2],[3,4],[5,6]] column-major: [1,3,5,2,4,6] + A = [[7,8],[9,10],[11,12]] column-major: [7,9,11,8,10,12] + The Jacobian of A @ x w.r.t. x is just A itself, placed at the + variable's column slot. 
*/ +const char *test_jacobian_left_matmul_pd_param(void) +{ + expr *x = new_variable(2, 1, 0, 2); + + double theta[6] = {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}; + expr *A_param = new_parameter(3, 2, 0, 2, theta); + expr *A_x = new_left_matmul_dense(A_param, x, 3, 2, NULL); + + double x_vals[2] = {0.5, -1.5}; + A_x->forward(A_x, x_vals); + jacobian_init(A_x); + A_x->eval_jacobian(A_x); + + /* Structural: Jacobian must be PD. */ + mu_assert("Jacobian should be PD", A_x->jacobian->is_permuted_dense); + permuted_dense *pd = (permuted_dense *) A_x->jacobian; + mu_assert("global m", A_x->jacobian->m == 3); + mu_assert("global n", A_x->jacobian->n == 2); + mu_assert("m0", pd->m0 == 3); + mu_assert("n0", pd->n0 == 2); + int expected_row_perm[3] = {0, 1, 2}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd->row_perm, expected_row_perm, 3)); + mu_assert("col_perm", cmp_int_array(pd->col_perm, expected_col_perm, 2)); + + /* Values: pd->X is row-major of A. */ + double expected_X1[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + mu_assert("X values (param 1)", cmp_double_array(pd->X, expected_X1, 6)); + + mu_assert("numerical check (param 1)", + check_jacobian_num(A_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + /* Update the parameter and re-evaluate. The PD output struct is reused; + only pd->X should change. 
*/ + double theta2[6] = {7.0, 9.0, 11.0, 8.0, 10.0, 12.0}; + memcpy(A_param->value, theta2, 6 * sizeof(double)); + expr_set_needs_refresh(A_x); + + A_x->forward(A_x, x_vals); + A_x->eval_jacobian(A_x); + + mu_assert("Jacobian still PD after refresh", A_x->jacobian->is_permuted_dense); + double expected_X2[6] = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + mu_assert("X values (param 2)", cmp_double_array(pd->X, expected_X2, 6)); + + mu_assert("numerical check (param 2)", + check_jacobian_num(A_x, x_vals, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A_x); + return 0; +} diff --git a/tests/jacobian_tests/affine/test_neg.h b/tests/jacobian_tests/affine/test_neg.h index ca4e595..f4d859c 100644 --- a/tests/jacobian_tests/affine/test_neg.h +++ b/tests/jacobian_tests/affine/test_neg.h @@ -19,12 +19,9 @@ const char *test_neg_jacobian(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("neg jacobian vals fail", - cmp_double_array(neg_node->jacobian->x, expected_x, 3)); - mu_assert("neg jacobian rows fail", - cmp_int_array(neg_node->jacobian->p, expected_p, 4)); - mu_assert("neg jacobian cols fail", - cmp_int_array(neg_node->jacobian->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(neg_node->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(neg_node->jacobian, expected_p, expected_i, 3, 3)); free_expr(neg_node); return 0; @@ -50,12 +47,9 @@ const char *test_neg_chain(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("neg chain jacobian vals fail", - cmp_double_array(neg2->jacobian->x, expected_x, 3)); - mu_assert("neg chain jacobian rows fail", - cmp_int_array(neg2->jacobian->p, expected_p, 4)); - mu_assert("neg chain jacobian cols fail", - cmp_int_array(neg2->jacobian->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(neg2->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(neg2->jacobian, expected_p, expected_i, 3, 3)); free_expr(neg2); return 0; diff --git 
a/tests/jacobian_tests/affine/test_promote.h b/tests/jacobian_tests/affine/test_promote.h index ad2ab49..c0e31ed 100644 --- a/tests/jacobian_tests/affine/test_promote.h +++ b/tests/jacobian_tests/affine/test_promote.h @@ -22,12 +22,9 @@ const char *test_promote_scalar_jacobian(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 0, 0}; - mu_assert("promote jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 3)); - mu_assert("promote jacobian rows fail", - cmp_int_array(promote_node->jacobian->p, expected_p, 4)); - mu_assert("promote jacobian cols fail", - cmp_int_array(promote_node->jacobian->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(promote_node->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(promote_node->jacobian, expected_p, expected_i, 3, 3)); free_expr(promote_node); return 0; @@ -54,12 +51,9 @@ const char *test_promote_scalar_to_matrix_jacobian(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 0, 0, 0, 0, 0}; - mu_assert("promote matrix jacobian vals fail", - cmp_double_array(promote_node->jacobian->x, expected_x, 6)); - mu_assert("promote matrix jacobian rows fail", - cmp_int_array(promote_node->jacobian->p, expected_p, 7)); - mu_assert("promote matrix jacobian cols fail", - cmp_int_array(promote_node->jacobian->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(promote_node->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(promote_node->jacobian, expected_p, expected_i, 6, 6)); free_expr(promote_node); return 0; diff --git a/tests/jacobian_tests/affine/test_right_matmul.h b/tests/jacobian_tests/affine/test_right_matmul.h index 8c2fa04..4259172 100644 --- a/tests/jacobian_tests/affine/test_right_matmul.h +++ b/tests/jacobian_tests/affine/test_right_matmul.h @@ -17,8 +17,8 @@ const char *test_jacobian_right_matmul_log(void) double x_vals[4] = {1.0, 3.0, 2.0, 4.0}; // column-wise vectorization expr *x = new_variable(2, 2, 0, 4); - /* 
Create sparse matrix A in CSR format (2x3) */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + /* Create sparse matrix A in CSR_matrix format (2x3) */ + CSR_matrix *A = new_CSR_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -47,11 +47,11 @@ const char *test_jacobian_right_matmul_log(void) int expected_Ai[8] = {0, 2, 1, 3, 0, 2, 1, 3}; int expected_Ap[7] = {0, 2, 4, 4, 4, 6, 8}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 8)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 8)); - mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->p, expected_Ap, 7)); + mu_assert("vals fail", cmp_values(log_x_A->jacobian, expected_Ax, 8)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 6, 8)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } @@ -66,8 +66,8 @@ const char *test_jacobian_right_matmul_log_vector(void) double x_vals[3] = {1.0, 2.0, 3.0}; expr *x = new_variable(1, 3, 0, 3); - /* Create sparse matrix A in CSR format (3x2) */ - CSR_Matrix *A = new_csr_matrix(3, 2, 4); + /* Create sparse matrix A in CSR_matrix format (3x2) */ + CSR_matrix *A = new_CSR_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -92,11 +92,11 @@ const char *test_jacobian_right_matmul_log_vector(void) int expected_Ai[4] = {0, 1, 1, 2}; int expected_Ap[3] = {0, 2, 4}; - mu_assert("vals fail", cmp_double_array(log_x_A->jacobian->x, expected_Ax, 4)); - mu_assert("cols fail", cmp_int_array(log_x_A->jacobian->i, expected_Ai, 4)); - mu_assert("rows fail", cmp_int_array(log_x_A->jacobian->p, expected_Ap, 3)); + mu_assert("vals fail", cmp_values(log_x_A->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->jacobian, expected_Ap, expected_Ai, 2, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } diff --git 
a/tests/jacobian_tests/affine/test_scalar_mult.h b/tests/jacobian_tests/affine/test_scalar_mult.h index a16a6fa..afdcdd4 100644 --- a/tests/jacobian_tests/affine/test_scalar_mult.h +++ b/tests/jacobian_tests/affine/test_scalar_mult.h @@ -36,12 +36,9 @@ const char *test_jacobian_scalar_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("scalar mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); - mu_assert("scalar mult log: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 4)); - mu_assert("scalar mult log: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -73,12 +70,9 @@ const char *test_jacobian_scalar_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("scalar mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); - mu_assert("scalar mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 5)); - mu_assert("scalar mult log matrix: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h index f34ee91..150fe53 100644 --- a/tests/jacobian_tests/affine/test_sum.h +++ b/tests/jacobian_tests/affine/test_sum.h @@ -25,9 +25,9 @@ const char *test_jacobian_sum_log(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 2)); - 
mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(sum_node); return 0; @@ -59,9 +59,9 @@ const char *test_jacobian_sum_mult(void) int expected_Ap[2] = {0, 6}; /* 1x10 matrix: row 0 spans all 6 nonzeros */ int expected_Ai[6] = {2, 3, 4, 6, 7, 8}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 1, 6)); free_expr(sum_node); return 0; @@ -99,9 +99,9 @@ const char *test_jacobian_sum_log_axis_0(void) int expected_Ap[3] = {0, 3, 6}; int expected_Ai[6] = {2, 3, 4, 5, 6, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 2, 6)); free_expr(sum_node); return 0; @@ -146,9 +146,9 @@ const char *test_jacobian_sum_add_log_axis_0(void) int expected_Ai[12] = {2, 3, 4, 8, 9, 10, /* row 0 columns */ 5, 6, 7, 11, 12, 13}; /* row 1 columns */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 3)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 12)); + mu_assert("vals fail", 
cmp_values(sum_node->jacobian, expected_Ax, 12)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 2, 12)); free_expr(sum_node); return 0; @@ -188,9 +188,9 @@ const char *test_jacobian_sum_log_axis_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 5, 3, 6, 4, 7}; /* column indices */ - mu_assert("vals fail", cmp_double_array(sum_node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(sum_node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(sum_node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(sum_node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(sum_node); return 0; diff --git a/tests/jacobian_tests/affine/test_trace.h b/tests/jacobian_tests/affine/test_trace.h index 61701bd..808d634 100644 --- a/tests/jacobian_tests/affine/test_trace.h +++ b/tests/jacobian_tests/affine/test_trace.h @@ -39,10 +39,9 @@ const char *test_jacobian_trace_variable(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ - mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(trace_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(trace_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(trace_node); return 0; @@ -94,10 +93,9 @@ const char *test_jacobian_trace_composite(void) int expected_Ai[3] = {1, 5, 9}; /* column indices (global variable indices) */ mu_assert("vals match count", trace_node->jacobian->nnz == 3); - mu_assert("rows fail", cmp_int_array(trace_node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(trace_node->jacobian->i, 
expected_Ai, 3)); - mu_assert("vals fail", - cmp_double_array(trace_node->jacobian->x, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->jacobian, expected_Ap, expected_Ai, 1, 3)); + mu_assert("vals fail", cmp_values(trace_node->jacobian, expected_Ax, 3)); free_expr(trace_node); return 0; diff --git a/tests/jacobian_tests/affine/test_transpose.h b/tests/jacobian_tests/affine/test_transpose.h index a02e22d..3302545 100644 --- a/tests/jacobian_tests/affine/test_transpose.h +++ b/tests/jacobian_tests/affine/test_transpose.h @@ -5,13 +5,14 @@ #include "atoms/affine.h" #include "minunit.h" #include "test_helpers.h" +#include "utils/permuted_dense.h" #include #include const char *test_jacobian_transpose(void) { // A = [1 2; 3 4] - CSR_Matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_CSR_matrix(2, 2, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 1, 0, 1}; double A_x[4] = {1, 2, 3, 4}; @@ -33,14 +34,59 @@ const char *test_jacobian_transpose(void) int expected_p[5] = {0, 2, 4, 6, 8}; int expected_i[8] = {0, 1, 2, 3, 0, 1, 2, 3}; - mu_assert("jacobian values fail", - cmp_double_array(transpose_AX->jacobian->x, expected_x, 8)); - mu_assert("jacobian row ptr fail", - cmp_int_array(transpose_AX->jacobian->p, expected_p, 5)); - mu_assert("jacobian col idx fail", - cmp_int_array(transpose_AX->jacobian->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(transpose_AX->jacobian, expected_x, 8)); + mu_assert("sparsity fail", + cmp_sparsity(transpose_AX->jacobian, expected_p, expected_i, 4, 8)); free_expr(transpose_AX); - free_csr_matrix(A); + free_CSR_matrix(A); + return 0; +} + +/* When the child of transpose has a PD Jacobian, the output should also be PD + with the same col_perm and a permuted row_perm. Setup: + u : 2x1 column variable, n_vars = 2. + AU = left_matmul_dense(A, u) with A a 6x2 dense matrix => AU is 6x1. + PD Jacobian: global (6, 2), m0=6, n0=2, row_perm=[0..5], col_perm=[0,1]. + R = reshape(AU, 3, 2). copy_sparsity preserves PD. 
+ T = transpose(R) with d1=2, d2=3. k(r) = (r/2) + (r%2)*3 = [0,3,1,4,2,5]. + All r are active, so output row_perm stays [0..5] and the dense + block X is row-permuted: X_out[i, :] = X_c[k(i), :]. */ +const char *test_jacobian_transpose_pd_preserved(void) +{ + double A_data[12] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + expr *u = new_variable(2, 1, 0, 2); + expr *AU = new_left_matmul_dense(NULL, u, 6, 2, A_data); + expr *R = new_reshape(AU, 3, 2); + expr *T = new_transpose(R); + + double u_vals[2] = {0.5, -1.5}; + T->forward(T, u_vals); + jacobian_init(T); + T->eval_jacobian(T); + + /* Structural: output Jacobian must be a PD. */ + mu_assert("transpose Jacobian should be PD", T->jacobian->is_permuted_dense); + permuted_dense *pd_T = (permuted_dense *) T->jacobian; + mu_assert("global m", T->jacobian->m == 6); + mu_assert("global n", T->jacobian->n == 2); + mu_assert("m0", pd_T->m0 == 6); + mu_assert("n0", pd_T->n0 == 2); + int expected_row_perm[6] = {0, 1, 2, 3, 4, 5}; + int expected_col_perm[2] = {0, 1}; + mu_assert("row_perm", cmp_int_array(pd_T->row_perm, expected_row_perm, 6)); + mu_assert("col_perm", cmp_int_array(pd_T->col_perm, expected_col_perm, 2)); + + /* Numerical: X_out rows = A rows permuted by k(r) = [0,3,1,4,2,5]. 
*/ + double expected_X[12] = {1.0, 2.0, /* row 0 from A row 0 */ + 7.0, 8.0, /* row 1 from A row 3 */ + 3.0, 4.0, /* row 2 from A row 1 */ + 9.0, 10.0, /* row 3 from A row 4 */ + 5.0, 6.0, /* row 4 from A row 2 */ + 11.0, 12.0}; /* row 5 from A row 5 */ + mu_assert("X values", cmp_double_array(pd_T->X, expected_X, 12)); + + free_expr(T); return 0; } diff --git a/tests/jacobian_tests/affine/test_upper_tri.h b/tests/jacobian_tests/affine/test_upper_tri.h index f0bffa1..82d7419 100644 --- a/tests/jacobian_tests/affine/test_upper_tri.h +++ b/tests/jacobian_tests/affine/test_upper_tri.h @@ -10,7 +10,7 @@ const char *test_upper_tri_jacobian_variable(void) { /* upper_tri of a 4x4 variable (16 vars total) * Row-major upper tri indices: [4, 8, 12, 9, 13, 14] - * Jacobian is 6x16 CSR: row k has a single 1.0 at col indices[k] */ + * Jacobian is 6x16 CSR_matrix: row k has a single 1.0 at col indices[k] */ double u[16]; for (int k = 0; k < 16; k++) { @@ -27,10 +27,9 @@ const char *test_upper_tri_jacobian_variable(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {4, 8, 12, 9, 13, 14}; - mu_assert("upper_tri jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); - mu_assert("upper_tri jac p", cmp_int_array(ut->jacobian->p, expected_p, 7)); - mu_assert("upper_tri jac i", cmp_int_array(ut->jacobian->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(ut->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(ut->jacobian, expected_p, expected_i, 6, 6)); free_expr(ut); return 0; @@ -57,12 +56,12 @@ const char *test_upper_tri_jacobian_of_log(void) ut->eval_jacobian(ut); double expected_x[6] = {0.2, 1.0 / 9.0, 1.0 / 13.0, 0.1, 1.0 / 14.0, 1.0 / 15.0}; + int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {4, 8, 12, 9, 13, 14}; - mu_assert("upper_tri log jac vals", - cmp_double_array(ut->jacobian->x, expected_x, 6)); - mu_assert("upper_tri log jac cols", - cmp_int_array(ut->jacobian->i, expected_i, 6)); + mu_assert("vals 
fail", cmp_values(ut->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(ut->jacobian, expected_p, expected_i, 6, 6)); free_expr(ut); return 0; diff --git a/tests/jacobian_tests/affine/test_vector_mult.h b/tests/jacobian_tests/affine/test_vector_mult.h index cd6fa1a..8196f9e 100644 --- a/tests/jacobian_tests/affine/test_vector_mult.h +++ b/tests/jacobian_tests/affine/test_vector_mult.h @@ -35,12 +35,9 @@ const char *test_jacobian_vector_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vector mult log: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 3)); - mu_assert("vector mult log: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 4)); - mu_assert("vector mult log: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -71,12 +68,9 @@ const char *test_jacobian_vector_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("vector mult log matrix: x values fail", - cmp_double_array(y->jacobian->x, expected_x, 4)); - mu_assert("vector mult log matrix: row pointers fail", - cmp_int_array(y->jacobian->p, expected_p, 5)); - mu_assert("vector mult log matrix: column indices fail", - cmp_int_array(y->jacobian->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->jacobian, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->jacobian, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/jacobian_tests/affine/test_vstack.h b/tests/jacobian_tests/affine/test_vstack.h index fc4229f..86f4ac8 100644 --- a/tests/jacobian_tests/affine/test_vstack.h +++ b/tests/jacobian_tests/affine/test_vstack.h @@ -38,12 +38,9 @@ const char *test_jacobian_vstack_vectors(void) int expected_i[6] = {0, 1, 2, 0, 1, 2}; 
int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; - mu_assert("vstack jac vectors: vals", - cmp_double_array(stack->jacobian->x, expected_x, 6)); - mu_assert("vstack jac vectors: cols", - cmp_int_array(stack->jacobian->i, expected_i, 6)); - mu_assert("vstack jac vectors: rows", - cmp_int_array(stack->jacobian->p, expected_p, 7)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_p, expected_i, 6, 6)); free_expr(stack); return 0; @@ -88,12 +85,9 @@ const char *test_jacobian_vstack_matrix(void) int expected_i[9] = {0, 1, 6, 2, 3, 7, 4, 5, 8}; int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("vstack jac matrix: vals", - cmp_double_array(stack->jacobian->x, expected_x, 9)); - mu_assert("vstack jac matrix: cols", - cmp_int_array(stack->jacobian->i, expected_i, 9)); - mu_assert("vstack jac matrix: rows", - cmp_int_array(stack->jacobian->p, expected_p, 10)); + mu_assert("vals fail", cmp_values(stack->jacobian, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->jacobian, expected_p, expected_i, 9, 9)); free_expr(stack); return 0; diff --git a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h index 9e7e6e5..08d50e4 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_elementwise_mult.h @@ -24,9 +24,8 @@ const char *test_jacobian_elementwise_mult_1(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 6)); free_expr(node); return 0; } @@ -49,9 +48,8 @@ 
const char *test_jacobian_elementwise_mult_2(void) int rows[4] = {0, 2, 4, 6}; int cols[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 6)); free_expr(node); return 0; } @@ -66,7 +64,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_Matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_CSR_matrix(3, 10, 9); double Ax_vals[9] = {1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; @@ -80,7 +78,7 @@ const char *test_jacobian_elementwise_mult_3(void) 0 0 0 0 0 0 0 1 -2 1] */ - CSR_Matrix *B = new_csr_matrix(3, 10, 9); + CSR_matrix *B = new_CSR_matrix(3, 10, 9); double Bx_vals[9] = {1.0, 3.0, 1.0, 1.0, 4.0, 1.0, -2.0, 1.0}; int Bi[9] = {7, 8, 7, 8, 9, 7, 8, 9}; int Bp[4] = {0, 2, 5, 8}; @@ -112,12 +110,11 @@ const char *test_jacobian_elementwise_mult_3(void) int rows[4] = {0, 4, 10, 16}; int cols[16] = {2, 3, 7, 8, 2, 3, 4, 7, 8, 9, 2, 3, 4, 7, 8, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 16)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 16)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 16)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 16)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -131,7 +128,7 @@ const char *test_jacobian_elementwise_mult_4(void) 0 0 1 1 3 0 0 0 0 0 0 0 1 -1 1 0 0 0 0 0] */ - CSR_Matrix *A = new_csr_matrix(3, 10, 9); + CSR_matrix *A = new_CSR_matrix(3, 10, 9); double Ax_vals[9] = 
{1.0, 2.0, 1.0, 1.0, 3.0, 1.0, -1.0, 1.0}; int Ai[9] = {2, 3, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 2, 5, 8}; @@ -157,10 +154,9 @@ const char *test_jacobian_elementwise_mult_4(void) int rows[4] = {0, 2, 5, 8}; int cols[8] = {2, 3, 2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, vals, 8)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, rows, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, cols, 8)); + mu_assert("vals fail", cmp_values(node->jacobian, vals, 8)); + mu_assert("sparsity fail", cmp_sparsity(node->jacobian, rows, cols, 3, 8)); free_expr(node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } diff --git a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h index 386efa1..6155252 100644 --- a/tests/jacobian_tests/bivariate_full_dom/test_matmul.h +++ b/tests/jacobian_tests/bivariate_full_dom/test_matmul.h @@ -49,15 +49,10 @@ const char *test_jacobian_matmul(void) Z->eval_jacobian(Z); /* Verify sparsity pattern */ - mu_assert("Jacobian should have 8 rows", Z->jacobian->m == z_size); mu_assert("Jacobian should have 18 columns", Z->jacobian->n == n_vars); - mu_assert("Jacobian should have 48 nonzeros", Z->jacobian->nnz == 48); - /* Check row pointers: each row should have 6 entries */ + /* Each row should have 6 entries. 
*/ int expected_p[9] = {0, 6, 12, 18, 24, 30, 36, 42, 48}; - mu_assert("Row pointers incorrect", - cmp_int_array(Z->jacobian->p, expected_p, 9)); - int expected_i[48] = {0, 2, 4, 6, 7, 8, /* row 0 */ 1, 3, 5, 6, 7, 8, /* row 1 */ 0, 2, 4, 9, 10, 11, /* row 2 */ @@ -66,8 +61,8 @@ const char *test_jacobian_matmul(void) 1, 3, 5, 12, 13, 14, /* row 5 */ 0, 2, 4, 15, 16, 17, /* row 6 */ 1, 3, 5, 15, 16, 17}; /* row 7 */ - mu_assert("Column indices incorrect", - cmp_int_array(Z->jacobian->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->jacobian, expected_p, expected_i, z_size, 48)); /* Verify Jacobian values row-wise: for each row, values are [Y^T row for the column, X row values] since X has lower var_id */ @@ -81,8 +76,7 @@ const char *test_jacobian_matmul(void) /* row 6 (col 3) */ 16.0, 17.0, 18.0, 1.0, 3.0, 5.0, /* row 7 (col 3) */ 16.0, 17.0, 18.0, 2.0, 4.0, 6.0}; - mu_assert("Jacobian values incorrect", - cmp_double_array(Z->jacobian->x, expected_x, 48)); + mu_assert("vals fail", cmp_values(Z->jacobian, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h index e1007f8..1a44469 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_quad_over_lin.h @@ -25,9 +25,9 @@ const char *test_quad_over_lin1(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(node); return 0; } @@ -49,9 +49,9 @@ const char 
*test_quad_over_lin2(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(node); return 0; } @@ -63,7 +63,7 @@ const char *test_quad_over_lin3(void) // A = [0 0 1 2 3 0 0 0 // 0 0 4 5 6 0 0] - CSR_Matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A = new_CSR_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -86,11 +86,11 @@ const char *test_quad_over_lin3(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } @@ -103,7 +103,7 @@ const char *test_quad_over_lin4(void) // 0 0 0 0 0 4 5 6 // - CSR_Matrix *A = new_csr_matrix(2, 8, 6); + CSR_matrix *A = new_CSR_matrix(2, 8, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {5, 6, 7, 5, 6, 7}; int Ap[3] = {0, 3, 6}; @@ -126,11 +126,11 @@ const char *test_quad_over_lin4(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 4)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", 
cmp_int_array(node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 4)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } @@ -143,7 +143,7 @@ const char *test_quad_over_lin5(void) // 0 2 0 0 0 4 5 6 // - CSR_Matrix *A = new_csr_matrix(2, 8, 9); + CSR_matrix *A = new_CSR_matrix(2, 8, 9); double Ax[9] = {1, 3, 1.0, 2.0, 3.0, 2, 4.0, 5.0, 6.0}; int Ai[9] = {0, 3, 5, 6, 7, 1, 5, 6, 7}; int Ap[3] = {0, 5, 9}; @@ -166,11 +166,11 @@ const char *test_quad_over_lin5(void) int expected_Ap[2] = {0, 7}; int expected_Ai[7] = {0, 1, 2, 3, 5, 6, 7}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_vals, 7)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 7)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_vals, 7)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 7)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h index 883e233..e9998fa 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_jacobian_rel_entr_vector_args_1(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); + mu_assert("vals fail", 
cmp_values(node->jacobian, expected_Ax, 6)); free_expr(node); return 0; } @@ -61,9 +61,9 @@ const char *test_jacobian_rel_entr_vector_args_2(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {2, 7, 3, 8, 4, 9}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); free_expr(node); return 0; } @@ -103,9 +103,9 @@ const char *test_jacobian_rel_entr_matrix_args(void) int expected_Ap[7] = {0, 2, 4, 6, 8, 10, 12}; int expected_Ai[12] = {0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 12)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 7)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 6, 12)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 12)); free_expr(node); return 0; diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index 20cd6fa..5236d3e 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_scalar_vector(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 1, 0, 2, 0, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + 
mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 9a454dd..d2f594f 100644 --- a/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/jacobian_tests/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -28,9 +28,9 @@ const char *test_jacobian_rel_entr_vector_scalar(void) int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {0, 3, 1, 3, 2, 3}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(node); return 0; } diff --git a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h index bb7a23c..1fd6a65 100644 --- a/tests/jacobian_tests/composite/test_chain_rule_jacobian.h +++ b/tests/jacobian_tests/composite/test_chain_rule_jacobian.h @@ -6,7 +6,7 @@ #include "minunit.h" #include "numerical_diff.h" #include "test_helpers.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" const char *test_jacobian_exp_sum(void) { @@ -79,8 +79,8 @@ const char *test_jacobian_Ax_Bx_multiply(void) /* the first and last values are not used, but good to include them in test */ double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *x = new_variable(2, 
1, 1, 4); expr *Ax = new_left_matmul(NULL, x, A); expr *Bx = new_left_matmul(NULL, x, B); @@ -90,8 +90,8 @@ const char *test_jacobian_Ax_Bx_multiply(void) check_jacobian_num(multiply, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -99,8 +99,8 @@ const char *test_jacobian_AX_BX_multiply(void) { double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 4); expr *AX = new_left_matmul(NULL, X, A); expr *BX = new_left_matmul(NULL, X, B); @@ -110,8 +110,8 @@ const char *test_jacobian_AX_BX_multiply(void) check_jacobian_num(multiply, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -120,10 +120,10 @@ const char *test_jacobian_quad_form_Ax(void) /* (Ax)^T Q (Ax) where Q is symmetric */ double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -140,8 +140,8 @@ const char *test_jacobian_quad_form_Ax(void) check_jacobian_num(node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -151,7 +151,7 @@ const char *test_jacobian_quad_form_exp(void) double u_vals[3] = {0.5, 1.0, 1.5}; /* Q = [1 2 0; 2 3 0; 0 0 4] */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int 
Qp[4] = {0, 2, 4, 5}; @@ -167,7 +167,7 @@ const char *test_jacobian_quad_form_exp(void) check_jacobian_num(node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } @@ -212,8 +212,8 @@ const char *test_jacobian_matmul_Ax_By(void) /* Z = (A @ X) @ (B @ Y) with constant matrices A, B */ double u_vals[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; - CSR_Matrix *A = new_csr_random(3, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(3, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(2, 2, 0, 10); /* 2x2, vars 0-3 */ expr *Y = new_variable(3, 2, 4, 10); /* 3x2, vars 4-9 */ @@ -225,8 +225,8 @@ const char *test_jacobian_matmul_Ax_By(void) check_jacobian_num(Z, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -235,8 +235,8 @@ const char *test_jacobian_matmul_sin_Ax_cos_Bx(void) /* Z = sin(A @ X) @ cos(B @ X), shared variable X */ double u_vals[6] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0}; - CSR_Matrix *A = new_csr_random(2, 3, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(2, 3, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(3, 2, 0, 6); /* 3x2, vars 0-5 */ expr *AX = new_left_matmul(NULL, X, A); /* 2x2 */ @@ -249,8 +249,8 @@ const char *test_jacobian_matmul_sin_Ax_cos_Bx(void) check_jacobian_num(Z, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } diff --git a/tests/jacobian_tests/composite/test_composite_exp.h b/tests/jacobian_tests/composite/test_composite_exp.h index 32671d1..73a2e95 100644 --- a/tests/jacobian_tests/composite/test_composite_exp.h +++ b/tests/jacobian_tests/composite/test_composite_exp.h @@ -11,7 +11,7 @@ const char *test_jacobian_composite_exp(void) { double 
u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_Matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_CSR_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -33,11 +33,10 @@ const char *test_jacobian_composite_exp(void) double vals[6] = {3 * e10, 2 * e10, 1 * e10, 2 * e7, 1 * e7, 1 * e7}; int rows[3] = {0, 3, 6}; int cols[6] = {2, 3, 4, 2, 3, 4}; - mu_assert("vals fail", cmp_double_array(exp_node->jacobian->x, vals, 6)); - mu_assert("rows fail", cmp_int_array(exp_node->jacobian->p, rows, 3)); - mu_assert("cols fail", cmp_int_array(exp_node->jacobian->i, cols, 6)); + mu_assert("vals fail", cmp_values(exp_node->jacobian, vals, 6)); + mu_assert("sparsity fail", cmp_sparsity(exp_node->jacobian, rows, cols, 2, 6)); free_expr(exp_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -46,7 +45,7 @@ const char *test_jacobian_composite_exp_add(void) { double u_vals[7] = {0, 0, 1, 1, 1, 2, 2}; - CSR_Matrix *A = new_csr_matrix(3, 7, 9); + CSR_matrix *A = new_CSR_matrix(3, 7, 9); double Ax[9] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; int Ai[9] = {2, 3, 4, 2, 3, 4, 2, 3, 4}; int Ap[4] = {0, 3, 6, 9}; @@ -54,7 +53,7 @@ const char *test_jacobian_composite_exp_add(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 7, 6); + CSR_matrix *B = new_CSR_matrix(3, 7, 6); double Bx[6] = {1, 1, 2, 2, 3, 3}; int Bi[6] = {5, 6, 5, 6, 5, 6}; int Bp[4] = {0, 2, 4, 6}; @@ -74,7 +73,7 @@ const char *test_jacobian_composite_exp_add(void) check_jacobian_num(sum, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(sum); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } diff --git a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h index ff99d64..b671910 100644 --- a/tests/jacobian_tests/elementwise_restricted_dom/test_log.h +++ 
b/tests/jacobian_tests/elementwise_restricted_dom/test_log.h @@ -18,9 +18,9 @@ const char *test_jacobian_log(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(log_node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(log_node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->jacobian, expected_Ap, expected_Ai, 3, 3)); free_expr(log_node); return 0; } @@ -36,9 +36,9 @@ const char *test_jacobian_log_matrix(void) log_node->forward(log_node, u_vals); jacobian_init(log_node); log_node->eval_jacobian(log_node); - mu_assert("vals fail", cmp_double_array(log_node->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(log_node->jacobian->p, expected_Ap, 5)); - mu_assert("cols fail", cmp_int_array(log_node->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(log_node->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->jacobian, expected_Ap, expected_Ai, 4, 4)); free_expr(log_node); return 0; } diff --git a/tests/jacobian_tests/other/test_prod.h b/tests/jacobian_tests/other/test_prod.h index ef59250..7875437 100644 --- a/tests/jacobian_tests/other/test_prod.h +++ b/tests/jacobian_tests/other/test_prod.h @@ -25,9 +25,9 @@ const char *test_jacobian_prod_no_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(p); 
return 0; @@ -50,9 +50,9 @@ const char *test_jacobian_prod_one_zero(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(p); return 0; @@ -73,9 +73,9 @@ const char *test_jacobian_prod_two_zeros(void) int expected_Ap[2] = {0, 4}; int expected_Ai[4] = {2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 4)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 4)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 1, 4)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_one.h b/tests/jacobian_tests/other/test_prod_axis_one.h index b4757ee..8310f86 100644 --- a/tests/jacobian_tests/other/test_prod_axis_one.h +++ b/tests/jacobian_tests/other/test_prod_axis_one.h @@ -37,14 +37,14 @@ const char *test_jacobian_prod_axis_one(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x10 Jacobian with row-strided structure */ + /* CSR format for 3x10 Jacobian with row-strided structure */ double expected_Ax[9] = {28.0, 7.0, 4.0, 40.0, 16.0, 10.0, 54.0, 27.0, 18.0}; int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 9)); + mu_assert("vals fail", cmp_values(p->jacobian, 
expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 3, 9)); free_expr(p); return 0; @@ -80,14 +80,14 @@ const char *test_jacobian_prod_axis_one_one_zero(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x10 Jacobian with row-strided structure */ + /* CSR format for 3x10 Jacobian with row-strided structure */ double expected_Ax[9] = {28.0, 7.0, 4.0, 0.0, 16.0, 0.0, 54.0, 27.0, 18.0}; int expected_Ap[4] = {0, 3, 6, 9}; int expected_Ai[9] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 9)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 9)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 9)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 3, 9)); free_expr(p); return 0; diff --git a/tests/jacobian_tests/other/test_prod_axis_zero.h b/tests/jacobian_tests/other/test_prod_axis_zero.h index 42b4ad1..519a40c 100644 --- a/tests/jacobian_tests/other/test_prod_axis_zero.h +++ b/tests/jacobian_tests/other/test_prod_axis_zero.h @@ -31,14 +31,14 @@ const char *test_jacobian_prod_axis_zero(void) jacobian_init(p); p->eval_jacobian(p); - /* CSR format for 3x8 Jacobian with block diagonal structure */ + /* CSR format for 3x8 Jacobian with block diagonal structure */ double expected_Ax[6] = {2.0, 1.0, 4.0, 3.0, 6.0, 5.0}; int expected_Ap[4] = {0, 2, 4, 6}; int expected_Ai[6] = {1, 2, 3, 4, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->jacobian->x, expected_Ax, 6)); - mu_assert("rows fail", cmp_int_array(p->jacobian->p, expected_Ap, 4)); - mu_assert("cols fail", cmp_int_array(p->jacobian->i, expected_Ai, 6)); + mu_assert("vals fail", cmp_values(p->jacobian, expected_Ax, 6)); + mu_assert("sparsity fail", + cmp_sparsity(p->jacobian, expected_Ap, expected_Ai, 3, 6)); free_expr(p); return 0; diff --git 
a/tests/jacobian_tests/other/test_quad_form.h b/tests/jacobian_tests/other/test_quad_form.h index d832a83..22d33c0 100644 --- a/tests/jacobian_tests/other/test_quad_form.h +++ b/tests/jacobian_tests/other/test_quad_form.h @@ -14,7 +14,7 @@ const char *test_quad_form(void) // Q = [1 2 0; 2 3 0; 0 0 4] double u_vals[5] = {0, 0, 1, 2, 3}; expr *x = new_variable(3, 1, 2, 5); - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -31,11 +31,11 @@ const char *test_quad_form(void) int expected_Ap[2] = {0, 3}; int expected_Ai[3] = {2, 3, 4}; - mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 3)); - mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); - mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 3)); + mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 3)); + mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 3)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } @@ -51,7 +51,7 @@ src/other/quad_form.c. 
const char *test_quad_form2(void) // 1 0 0 2 0 1] double u_vals[6] = {1, 2, 3, 4, 5, 6}; expr *u = new_variable(6, 1, 0, 6); -CSR_Matrix *Q = new_csr_matrix(3, 3, 5); +CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -59,7 +59,7 @@ memcpy(Q->x, Qx, 5 * sizeof(double)); memcpy(Q->i, Qi, 5 * sizeof(int)); memcpy(Q->p, Qp, 4 * sizeof(int)); -CSR_Matrix *A = new_csr_matrix(3, 6, 10); +CSR_matrix *A = new_CSR_matrix(3, 6, 10); double Ax[10] = {1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6, 1.0, 2.0, 1.0}; int Ai[10] = {0, 2, 3, 4, 2, 3, 4, 0, 3, 5}; int Ap[4] = {0, 4, 7, 10}; @@ -77,13 +77,13 @@ double expected_Ax[5] = {422, 2222, 3244, 3786, 120}; int expected_Ap[2] = {0, 5}; int expected_Ai[5] = {0, 2, 3, 4, 5}; -mu_assert("vals fail", cmp_double_array(node->jacobian->x, expected_Ax, 5)); -mu_assert("rows fail", cmp_int_array(node->jacobian->p, expected_Ap, 2)); -mu_assert("cols fail", cmp_int_array(node->jacobian->i, expected_Ai, 5)); +mu_assert("vals fail", cmp_values(node->jacobian, expected_Ax, 5)); +mu_assert("sparsity fail", + cmp_sparsity(node->jacobian, expected_Ap, expected_Ai, 1, 5)); free_expr(node); free_expr(Au); -free_csr_matrix(Q); -free_csr_matrix(A); +free_CSR_matrix(Q); +free_CSR_matrix(A); return 0; } */ diff --git a/tests/numerical_diff.c b/tests/numerical_diff.c index 9c9812a..b79547d 100644 --- a/tests/numerical_diff.c +++ b/tests/numerical_diff.c @@ -13,7 +13,7 @@ static int is_close(double a, double b) return fabs(a - b) <= fmax(ABS_TOL, REL_TOL * fmax(fabs(a), fabs(b))); } -static void csr_to_dense(const CSR_Matrix *A, double *dense) +static void csr_to_dense(const CSR_matrix *A, double *dense) { for (int row = 0; row < A->m; row++) { @@ -76,7 +76,7 @@ int check_jacobian_num(expr *node, const double *u, double h) node->forward(node, u); double *J_analytical = calloc((size_t) m * n, sizeof(double)); - csr_to_dense(node->jacobian, J_analytical); + 
csr_to_dense(node->jacobian->to_csr(node->jacobian), J_analytical); int result = 1; for (int i = 0; i < m * n; i++) @@ -98,9 +98,9 @@ int check_jacobian_num(expr *node, const double *u, double h) return result; } -/* Compute g = J^T w where J is CSR (m x n) and w has m entries. +/* Compute g = J^T w where J is CSR_matrix (m x n) and w has m entries. * Result written into g (size n), which must be zero-initialized. */ -static void csr_transpose_mult_vec(const CSR_Matrix *J, const double *w, double *g) +static void csr_transpose_mult_vec(const CSR_matrix *J, const double *w, double *g) { for (int row = 0; row < J->m; row++) { @@ -127,6 +127,12 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double memcpy(u_work, u, n * sizeof(double)); + /* Hoist the CSR_matrix view once. For sparse_matrix (the only type used by tests + that reach here), csr->x aliases node->jacobian->x, so eval_jacobian + writes inside the loop update jac->x in place. A PD-backed Jacobian + would need a per-iteration to_csr refresh; not exercised today. 
*/ + CSR_matrix *jac = node->jacobian->to_csr(node->jacobian); + for (int j = 0; j < n; j++) { /* g(u + h*e_j) */ @@ -134,14 +140,14 @@ double *numerical_wsum_hess(expr *node, const double *u, const double *w, double node->forward(node, u_work); node->eval_jacobian(node); memset(g_plus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian, w, g_plus); + csr_transpose_mult_vec(jac, w, g_plus); /* g(u - h*e_j) */ u_work[j] = u[j] - h; node->forward(node, u_work); node->eval_jacobian(node); memset(g_minus, 0, n * sizeof(double)); - csr_transpose_mult_vec(node->jacobian, w, g_minus); + csr_transpose_mult_vec(jac, w, g_minus); u_work[j] = u[j]; @@ -171,7 +177,7 @@ int check_wsum_hess(expr *node, const double *u, const double *w, double h) node->eval_wsum_hess(node, w); double *H_ana = calloc((size_t) n * n, sizeof(double)); - csr_to_dense(node->wsum_hess, H_ana); + csr_to_dense(node->wsum_hess->to_csr(node->wsum_hess), H_ana); int result = 1; for (int i = 0; i < n * n; i++) diff --git a/tests/numerical_diff/test_numerical_diff.h b/tests/numerical_diff/test_numerical_diff.h index 8af633d..a43b718 100644 --- a/tests/numerical_diff/test_numerical_diff.h +++ b/tests/numerical_diff/test_numerical_diff.h @@ -9,7 +9,7 @@ const char *test_check_jacobian_composite_exp(void) { double u_vals[6] = {0, 0, 1, 2, 3, 0}; - CSR_Matrix *A = new_csr_matrix(2, 6, 6); + CSR_matrix *A = new_CSR_matrix(2, 6, 6); double Ax[6] = {3, 2, 1, 2, 1, 1}; int Ai[6] = {2, 3, 4, 2, 3, 4}; int Ap[3] = {0, 3, 6}; @@ -25,7 +25,7 @@ const char *test_check_jacobian_composite_exp(void) check_jacobian_num(exp_node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); free_expr(exp_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -36,7 +36,7 @@ const char *test_check_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_Matrix *A_csr = new_csr_matrix(3, 5, 15); + 
CSR_matrix *A_csr = new_CSR_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); @@ -49,6 +49,6 @@ const char *test_check_wsum_hess_exp_composite(void) check_wsum_hess(exp_node, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(exp_node); - free_csr_matrix(A_csr); + free_CSR_matrix(A_csr); return 0; } diff --git a/tests/old-code/test_old_permuted_dense.h b/tests/old-code/test_old_permuted_dense.h new file mode 100644 index 0000000..335e5ff --- /dev/null +++ b/tests/old-code/test_old_permuted_dense.h @@ -0,0 +1,349 @@ +#ifndef TEST_OLD_PERMUTED_DENSE_H +#define TEST_OLD_PERMUTED_DENSE_H + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSR_matrix.h" +#include "utils/permuted_dense.h" +#include +#include + +/* Direct unit tests for the legacy CSR-pd BTA kernels in old-code. They no + longer sit on a production path (matrix_BTA dispatcher hard-wires the + CSC variants), but the kernels remain as reference implementations and + as the CSR side of the cross-comparison test in test_permuted_dense.h. */ + +const char *test_BTA_pd_csr_basic(void) +{ + /* CSR_matrix A: m=4, n=5, with nonzeros: + row 0: cols {1, 4} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {1, 4} */ + CSR_matrix *A = new_CSR_matrix(4, 5, 7); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 4; + A->p[3] = 5; + A->p[4] = 7; + int Ai[7] = {1, 4, 0, 2, 2, 1, 4}; + double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + /* PD B: m=4, n=4, row_perm = [1, 3], col_perm = [0, 2], X = [[10, 20], [30, + * 40]]. 
*/ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + /* Expected col_active: union of A's columns in rows 1 and 3 + = {0, 2} ∪ {1, 4} = {0, 1, 2, 4}, size 4. */ + int expected_col_perm[4] = {0, 1, 2, 4}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 4); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 4)); + + BTA_pd_csr_fill_values(B, A, out); + + /* Reference: scatter A and B to dense 4x{5,4}, compute B^T A, extract + block at (col_perm_B × out->col_perm). Scatter inlined locally to + avoid coupling to the static helpers in tests/utils/test_permuted_dense.h. 
*/ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + for (int i = 0; i < A->m; i++) + for (int e = A->p[i]; e < A->p[i + 1]; e++) A_d[i * 5 + A->i[e]] = A->x[e]; + for (int kk = 0; kk < B->m0; kk++) + for (int jj = 0; jj < B->n0; jj++) + B_d[B->row_perm[kk] * 4 + B->col_perm[jj]] = B->X[kk * B->n0 + jj]; + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[8]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 4; jj++) + { + expected_X[ii * 4 + jj] = + C_ref[col_perm_B[ii] * 5 + expected_col_perm[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 8)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +/* BTA(CSR_matrix A, PD B) where A is a leaf-variable Jacobian (identity-in-block). + A is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: col_perm_out = {4+row_perm_B[kk]} = {4+1, 4+3} = {5, 7}, and X_C = + X_B^T. 
*/ +const char *test_BTA_pd_csr_leaf_variable(void) +{ + CSR_matrix *A = new_CSR_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + A->p[k] = k; + A->i[k] = 4 + k; + A->x[k] = 1.0; + } + A->p[4] = 4; + + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; /* row-major (2, 2) */ + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + int expected_col_perm[2] = {5, 7}; + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, expected_col_perm, 2)); + + BTA_pd_csr_fill_values(B, A, out); + + /* X_C should be X_B^T = [[10, 30], [20, 40]] row-major. */ + double expected_X[4] = {10.0, 30.0, 20.0, 40.0}; + mu_assert("values", cmp_double_array(out->X, expected_X, 4)); + + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +/* BTA(CSR_matrix A, PD B) where A has no entries in any row of row_perm_B. + Output dense block should have n0 = 0. */ +const char *test_BTA_pd_csr_no_overlap(void) +{ + /* A: rows 0 and 2 have entries; rows 1 and 3 (row_perm_B) are empty. 
*/ + CSR_matrix *A = new_CSR_matrix(4, 5, 3); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 2; + A->p[3] = 3; + A->p[4] = 3; + int Ai[3] = {1, 4, 2}; + double Ax[3] = {1.0, 2.0, 3.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + + int row_perm_B[2] = {1, 3}; /* rows that ARE empty in A */ + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + matrix *out_m = BTA_pd_csr_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 0); + + /* Fill should be a no-op (0-sized dense block). */ + BTA_pd_csr_fill_values(B, A, out); + + free_matrix(out_m); + free_matrix(B_m); + free_CSR_matrix(A); + return 0; +} + +/* Tests for the production CSR-pd kernel pair (B=CSR, A=PD). The BTA fill + variant lives here in old-code because production only calls the BTDA + path; the alloc is still in src/utils/permuted_dense.c. */ + +/* BTA(CSR_matrix B, PD A): basic correctness against a dense reference. + A is (4, 5) PD with row_perm = [1, 3], col_perm = [0, 2], dense block (2, 2). + B is (4, 4) CSR_matrix with arbitrary sparsity. */ +const char *test_BTA_csr_pd_basic(void) +{ + /* PD A: m=4, n=5, row_perm = [1, 3], col_perm = [0, 2]. + X = [[1, 2], [3, 4]] (2 x 2 row-major). */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + /* CSR_matrix B: m=4, n=4. 
+ row 0: cols {1, 3} + row 1: cols {0, 2} + row 2: cols {2} + row 3: cols {0, 3} */ + CSR_matrix *B = new_CSR_matrix(4, 4, 7); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 4; + B->p[3] = 5; + B->p[4] = 7; + int Bi[7] = {1, 3, 0, 2, 2, 0, 3}; + double Bx[7] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + /* row_active = union of B's cols in rows 1 and 3 + = {0, 2} ∪ {0, 3} = {0, 2, 3}, size 3. */ + int expected_row_perm[3] = {0, 2, 3}; + mu_assert("out m", out_m->m == 4); /* B.n */ + mu_assert("out n", out_m->n == 5); /* A.n */ + mu_assert("m0", out->m0 == 3); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 3)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_csr_pd_fill_values(B, A, out); + + /* Reference: dense B^T A, extract block at (row_active × col_perm_A). + Scatter inlined locally to avoid coupling to static helpers. 
*/ + double *A_d = (double *) calloc(4 * 5, sizeof(double)); + double *B_d = (double *) calloc(4 * 4, sizeof(double)); + for (int kk = 0; kk < A->m0; kk++) + for (int jj = 0; jj < A->n0; jj++) + A_d[A->row_perm[kk] * 5 + A->col_perm[jj]] = A->X[kk * A->n0 + jj]; + for (int i = 0; i < B->m; i++) + for (int e = B->p[i]; e < B->p[i + 1]; e++) B_d[i * 4 + B->i[e]] = B->x[e]; + + double C_ref[4 * 5]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 5; j++) + { + double s = 0.0; + for (int k = 0; k < 4; k++) + { + s += B_d[k * 4 + i] * A_d[k * 5 + j]; + } + C_ref[i * 5 + j] = s; + } + } + double expected_X[6]; + for (int ii = 0; ii < 3; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected_X[ii * 2 + jj] = + C_ref[expected_row_perm[ii] * 5 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(out->X, expected_X, 6)); + + free(A_d); + free(B_d); + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(CSR_matrix B, PD A) where B is a leaf-variable Jacobian (identity-in-block). + B is (4, 8): row k has a 1 at column 4+k (variable v of size 4 at var_id=4). + Expected: row_perm_out = {4+row_perm_A[kk]} = {4+1, 4+3} = {5, 7}, X_C = X_A. 
*/ +const char *test_BTA_csr_pd_leaf_variable(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + CSR_matrix *B = new_CSR_matrix(4, 8, 4); + for (int k = 0; k < 4; k++) + { + B->p[k] = k; + B->i[k] = 4 + k; + B->x[k] = 1.0; + } + B->p[4] = 4; + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + int expected_row_perm[2] = {5, 7}; + mu_assert("m0", out->m0 == 2); + mu_assert("n0", out->n0 == 2); + mu_assert("row_perm", cmp_int_array(out->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(out->col_perm, col_perm_A, 2)); + + BTA_csr_pd_fill_values(B, A, out); + + /* X_C should equal X_A. */ + mu_assert("values", cmp_double_array(out->X, XA, 4)); + + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + +/* BTA(CSR_matrix B, PD A) where B has no entries in any row of row_perm_A. + Output dense block should have m0 = 0. */ +const char *test_BTA_csr_pd_no_overlap(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A = (permuted_dense *) A_m; + + /* B: rows 0 and 2 have entries; rows 1 and 3 (row_perm_A) are empty. */ + CSR_matrix *B = new_CSR_matrix(4, 4, 3); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 2; + B->p[3] = 3; + B->p[4] = 3; + int Bi[3] = {0, 1, 2}; + double Bx[3] = {1.0, 2.0, 3.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + + matrix *out_m = BTA_csr_pd_alloc(B, A); + permuted_dense *out = (permuted_dense *) out_m; + + mu_assert("m0", out->m0 == 0); + mu_assert("n0", out->n0 == 2); + + /* Fill should be a no-op (0-sized dense block on the row axis). 
*/ + BTA_csr_pd_fill_values(B, A, out); + + free_matrix(out_m); + free_CSR_matrix(B); + free_matrix(A_m); + return 0; +} + +#endif /* TEST_OLD_PERMUTED_DENSE_H */ diff --git a/tests/problem/test_problem.h b/tests/problem/test_problem.h index fbb512f..d245cfa 100644 --- a/tests/problem/test_problem.h +++ b/tests/problem/test_problem.h @@ -175,7 +175,7 @@ const char *test_problem_jacobian(void) problem_constraint_forward(prob, u); problem_jacobian(prob); - CSR_Matrix *jac = prob->jacobian; + CSR_matrix *jac = prob->jacobian; /* Check dimensions */ mu_assert("jac rows wrong", jac->m == 2); @@ -239,7 +239,7 @@ const char *test_problem_jacobian_multi(void) problem_constraint_forward(prob, u); problem_jacobian(prob); - CSR_Matrix *jac = prob->jacobian; + CSR_matrix *jac = prob->jacobian; /* Check dimensions: 4 rows (2 + 2), 2 cols */ mu_assert("jac rows wrong", jac->m == 4); @@ -322,7 +322,7 @@ const char *test_problem_hessian(void) double w_obj = 2.0; problem_hessian(prob, w_obj, w); - CSR_Matrix *H = prob->lagrange_hessian; + CSR_matrix *H = prob->lagrange_hessian; /* Check dimensions: 3x3 symmetric */ mu_assert("H rows wrong", H->m == 3); diff --git a/tests/profiling/profile_BTA_pd_csr_vs_csc.h b/tests/profiling/profile_BTA_pd_csr_vs_csc.h new file mode 100644 index 0000000..df1cc46 --- /dev/null +++ b/tests/profiling/profile_BTA_pd_csr_vs_csc.h @@ -0,0 +1,142 @@ +#ifndef PROFILE_BTA_PD_CSR_VS_CSC_H +#define PROFILE_BTA_PD_CSR_VS_CSC_H + +#include +#include +#include +#include + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/Timer.h" +#include "utils/permuted_dense.h" +#include "utils/utils.h" + +/* Microbenchmark: compare BTA_csr_pd vs BTA_csc_pd on trimmed_log_reg-shaped + (m=2000, n0_B=785) inputs at two A densities. Output is one fill timing + pair per row of the table. 
*/ +static void run_bench_one_density(int m, int n0_B, int n_A, int nnz_per_row, + int N_ITERS, const char *label) +{ + /* B: PD with full m × n0_B dense block (row_perm = 0..m-1, col_perm = + 0..n0_B-1). Values arbitrary. */ + int *row_perm_B = (int *) malloc(m * sizeof(int)); + int *col_perm_B = (int *) malloc(n0_B * sizeof(int)); + double *XB = (double *) malloc(m * n0_B * sizeof(double)); + for (int i = 0; i < m; i++) row_perm_B[i] = i; + for (int j = 0; j < n0_B; j++) col_perm_B[j] = j; + for (int k = 0; k < m * n0_B; k++) XB[k] = (double) (k % 37) * 0.013 + 0.1; + + /* B's global shape: (m, n_B_global). Pick n_B_global = n0_B (no padding). */ + matrix *B_m = new_permuted_dense(m, n0_B, m, n0_B, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + /* A: (m × n_A) CSR with `nnz_per_row` evenly-spaced nonzeros per row. */ + int total_nnz = m * nnz_per_row; + CSR_matrix *A_csr = new_CSR_matrix(m, n_A, total_nnz); + for (int row = 0; row <= m; row++) A_csr->p[row] = row * nnz_per_row; + srand(42); + for (int row = 0; row < m; row++) + { + /* Pick nnz_per_row distinct columns by sorted random sampling. */ + int *cols = (int *) malloc(nnz_per_row * sizeof(int)); + int picked = 0; + while (picked < nnz_per_row) + { + int c = rand() % n_A; + int dup = 0; + for (int k = 0; k < picked; k++) + if (cols[k] == c) + { + dup = 1; + break; + } + if (!dup) cols[picked++] = c; + } + /* Insertion sort to keep CSR column-index invariant. */ + for (int a = 1; a < nnz_per_row; a++) + { + int v = cols[a]; + int b = a - 1; + while (b >= 0 && cols[b] > v) + { + cols[b + 1] = cols[b]; + b--; + } + cols[b + 1] = v; + } + for (int k = 0; k < nnz_per_row; k++) + { + int e = A_csr->p[row] + k; + A_csr->i[e] = cols[k]; + A_csr->x[e] = (double) ((row * 31 + cols[k]) % 53) * 0.027 + 0.05; + } + free(cols); + } + + /* CSC view of A. 
*/ + int *iwork = (int *) malloc(MAX(m, n_A) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + /* Allocate outputs once for each variant. */ + matrix *C_csr_m = BTA_pd_csr_alloc(B, A_csr); + permuted_dense *C_csr = (permuted_dense *) C_csr_m; + matrix *C_csc_m = BTA_pd_csc_alloc(B, A_csc); + permuted_dense *C_csc = (permuted_dense *) C_csc_m; + + /* d for BTDA: all ones, so C = B^T diag(d) A = B^T A. */ + double *d_ones = (double *) malloc(m * sizeof(double)); + for (int i = 0; i < m; i++) d_ones[i] = 1.0; + + /* Warm-up + time CSR fill. */ + Timer t1; + BTDA_pd_csr_fill_values(B, d_ones, A_csr, C_csr); + clock_gettime(CLOCK_MONOTONIC, &t1.start); + for (int it = 0; it < N_ITERS; it++) + BTDA_pd_csr_fill_values(B, d_ones, A_csr, C_csr); + clock_gettime(CLOCK_MONOTONIC, &t1.end); + double t_csr_ms = GET_ELAPSED_SECONDS(t1) * 1000.0 / N_ITERS; + + /* Warm-up + time CSC fill. */ + Timer t2; + BTDA_pd_csc_fill_values(B, d_ones, A_csc, C_csc); + clock_gettime(CLOCK_MONOTONIC, &t2.start); + for (int it = 0; it < N_ITERS; it++) + BTDA_pd_csc_fill_values(B, d_ones, A_csc, C_csc); + clock_gettime(CLOCK_MONOTONIC, &t2.end); + double t_csc_ms = GET_ELAPSED_SECONDS(t2) * 1000.0 / N_ITERS; + + printf(" %-22s CSR = %7.3f ms CSC = %7.3f ms ratio CSR/CSC = %.2fx\n", + label, t_csr_ms, t_csc_ms, t_csr_ms / t_csc_ms); + + free_matrix(C_csr_m); + free_matrix(C_csc_m); + free_matrix(B_m); + free_CSR_matrix(A_csr); + free_CSC_matrix(A_csc); + free(iwork); + free(row_perm_B); + free(col_perm_B); + free(XB); + free(d_ones); +} + +const char *profile_BTA_pd_csr_vs_csc(void) +{ + int m = 2000; + int n0_B = 785; + int n_A = 2000; + int N_ITERS = 50; + + printf("\nBTA pd × sparse fill benchmark (m=%d, n0_B=%d, n_A=%d, %d iters):\n", + m, n0_B, n_A, N_ITERS); + run_bench_one_density(m, n0_B, n_A, 1, N_ITERS, "leaf-var (1 nnz/row):"); + run_bench_one_density(m, n0_B, n_A, 50, N_ITERS, "dense-ish (50 nnz/row):"); + return 0; 
+} + +#endif /* PROFILE_BTA_PD_CSR_VS_CSC_H */ diff --git a/tests/profiling/profile_left_matmul.h b/tests/profiling/profile_left_matmul.h index 4644a41..db6b832 100644 --- a/tests/profiling/profile_left_matmul.h +++ b/tests/profiling/profile_left_matmul.h @@ -13,10 +13,11 @@ const char *profile_left_matmul(void) { - /* A @ X where A is 50 x 50 dense stored in CSR and X is 50 x 50 variable */ + /* A @ X where A is 50 x 50 dense stored in CSR_matrix and X is 50 x 50 variable + */ int n = 100; expr *X = new_variable(n, n, 0, n * n); - CSR_Matrix *A = new_csr_matrix(n, n, n * n); + CSR_matrix *A = new_CSR_matrix(n, n, n * n); for (int i = 0; i < n * n; i++) { A->x[i] = 1.0; /* dense matrix of all ones */ @@ -56,7 +57,7 @@ const char *profile_left_matmul(void) GET_ELAPSED_SECONDS(timer)); free(x_vals); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(AX); return 0; } diff --git a/tests/profiling/profile_log_reg.h b/tests/profiling/profile_log_reg.h new file mode 100644 index 0000000..31bc075 --- /dev/null +++ b/tests/profiling/profile_log_reg.h @@ -0,0 +1,177 @@ +#ifndef PROFILE_LOG_REG_H +#define PROFILE_LOG_REG_H + +#include +#include +#include +#include + +#include "atoms/affine.h" +#include "atoms/elementwise_full_dom.h" +#include "expr.h" +#include "minunit.h" +#include "utils/CSR_sum.h" +#include "utils/Timer.h" +#include "utils/permuted_dense.h" + +/* Profile and validate Jacobian + Hessian of obj = sum(logistic(A x)). + + Path A: the engine's expression DAG (CSR_matrix/CSC_matrix chain rule). + Path B: hardcoded chain rule using permuted_dense kernels for the dense + linear algebra (DA and ATDA), plus the engine's CSR_matrix row-sum + primitives for J_sum. + + Forward pass is excluded from timing. 
*/ +const char *profile_log_reg(void) +{ + int m = 2000; + int n = 785; + + /* ---- Random A and initial x ---- */ + double *A_data = (double *) malloc((size_t) m * n * sizeof(double)); + double *u = (double *) malloc(n * sizeof(double)); + srand(42); + for (int i = 0; i < m * n; i++) + { + A_data[i] = (double) rand() / RAND_MAX - 0.5; + } + for (int i = 0; i < n; i++) + { + u[i] = (double) rand() / RAND_MAX - 0.5; + } + + /* ---- Build expression DAG (shared by both paths) ---- */ + expr *x = new_variable(n, 1, 0, n); + expr *Ax = new_left_matmul_dense(NULL, x, m, n, A_data); + expr *log_obj = new_logistic(Ax); + expr *obj = new_sum(log_obj, -1); + jacobian_init(obj); + wsum_hess_init(obj); + + /* Forward (untimed). */ + obj->forward(obj, u); + + /* ---- Path A: time eval_jacobian and eval_wsum_hess separately ---- */ + Timer t_a_jac, t_a_hess; + double w_one = 1.0; + clock_gettime(CLOCK_MONOTONIC, &t_a_jac.start); + obj->eval_jacobian(obj); + clock_gettime(CLOCK_MONOTONIC, &t_a_jac.end); + clock_gettime(CLOCK_MONOTONIC, &t_a_hess.start); + obj->eval_wsum_hess(obj, &w_one); + clock_gettime(CLOCK_MONOTONIC, &t_a_hess.end); + double sec_a_jac = GET_ELAPSED_SECONDS(t_a_jac); + double sec_a_hess = GET_ELAPSED_SECONDS(t_a_hess); + + /* ---- Path B setup (untimed) ---- */ + int *full_rows = (int *) malloc(m * sizeof(int)); + int *full_cols = (int *) malloc(n * sizeof(int)); + for (int i = 0; i < m; i++) full_rows[i] = i; + for (int j = 0; j < n; j++) full_cols[j] = j; + + matrix *A_pd_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, A_data); + permuted_dense *A_pd = (permuted_dense *) A_pd_M; + matrix *Jlog_M = new_permuted_dense(m, n, m, n, full_rows, full_cols, NULL); + permuted_dense *Jlog_pd = (permuted_dense *) Jlog_M; + matrix *H_pd_M = ATA_pd_alloc(A_pd); + permuted_dense *H_pd = (permuted_dense *) H_pd_M; + + free(full_rows); + free(full_cols); + + /* CSR_matrix scaffolding for the row-sum step (PD owns the cached CSR_matrix + * view). 
*/ + CSR_matrix *Jlog_csr = Jlog_M->to_csr(Jlog_M); + CSR_matrix *Jobj_csr = new_CSR_matrix(1, n, n); + int *iwork = (int *) malloc((size_t) m * n * sizeof(int)); + int *idx_map = (int *) malloc((size_t) m * n * sizeof(int)); + sum_all_rows_csr_alloc(Jlog_csr, Jobj_csr, iwork, idx_map); + + double *d2 = (double *) malloc(m * sizeof(double)); + double *w_ones = (double *) malloc(m * sizeof(double)); + for (int i = 0; i < m; i++) w_ones[i] = 1.0; + + /* ---- Path B: time the manual chain rule, Jacobian and Hessian separately ---- + */ + Timer t_b_jac, t_b_hess; + /* dwork = sigmoid(z); used as the diagonal in DA below and (still in + dwork) as sigmas read by local_wsum_hess. */ + clock_gettime(CLOCK_MONOTONIC, &t_b_jac.start); + log_obj->local_jacobian(log_obj, log_obj->work->dwork); + DA_pd_fill_values(log_obj->work->dwork, A_pd, Jlog_pd); + memset(Jobj_csr->x, 0, Jobj_csr->nnz * sizeof(double)); + accumulator(Jlog_csr->x, Jlog_csr->nnz, idx_map, Jobj_csr->x); + clock_gettime(CLOCK_MONOTONIC, &t_b_jac.end); + clock_gettime(CLOCK_MONOTONIC, &t_b_hess.start); + log_obj->local_wsum_hess(log_obj, d2, w_ones); + ATDA_pd_fill_values(A_pd, d2, H_pd); + clock_gettime(CLOCK_MONOTONIC, &t_b_hess.end); + double sec_b_jac = GET_ELAPSED_SECONDS(t_b_jac); + double sec_b_hess = GET_ELAPSED_SECONDS(t_b_hess); + + printf("\n"); + printf(" Jacobian Hessian Total\n"); + printf(" Path A (engine CSR_matrix/CSC_matrix): %10.6fs %10.6fs %10.6fs\n", + sec_a_jac, sec_a_hess, sec_a_jac + sec_a_hess); + printf(" Path B (permuted_dense): %10.6fs %10.6fs %10.6fs\n", sec_b_jac, + sec_b_hess, sec_b_jac + sec_b_hess); + printf(" Speedup (A / B): %10.2fx %10.2fx %10.2fx\n", + sec_a_jac / sec_b_jac, sec_a_hess / sec_b_hess, + (sec_a_jac + sec_a_hess) / (sec_b_jac + sec_b_hess)); + + /* ---- Compare Jacobian (1 x n, both have full sparsity) ---- */ + CSR_matrix *J_a = obj->jacobian->to_csr(obj->jacobian); + mu_assert("J n mismatch", J_a->n == Jobj_csr->n); + mu_assert("J nnz mismatch", J_a->nnz == 
Jobj_csr->nnz); + double max_J_diff = 0.0; + for (int j = 0; j < J_a->nnz; j++) + { + double diff = fabs(J_a->x[j] - Jobj_csr->x[j]); + if (diff > max_J_diff) max_J_diff = diff; + } + printf(" Jacobian max abs diff: %10.3e\n", max_J_diff); + mu_assert("Jacobian mismatch", max_J_diff < 1e-10); + + /* ---- Compare Hessian (n x n): scatter Path A's CSR_matrix into a dense + n x n array, compare to H_pd->X (already dense row-major). + Extract the CSR_matrix view ONCE: PD's to_csr does an O(m0 * n0) + memcpy refresh per call, so calling it inside the inner loop is + quadratically expensive. ---- */ + CSR_matrix *H_a = obj->wsum_hess->to_csr(obj->wsum_hess); + double *H_a_dense = (double *) calloc((size_t) n * n, sizeof(double)); + for (int i = 0; i < n; i++) + { + for (int e = H_a->p[i]; e < H_a->p[i + 1]; e++) + { + int col = H_a->i[e]; + H_a_dense[i * n + col] = H_a->x[e]; + } + } + double max_H_diff = 0.0; + for (size_t k = 0; k < (size_t) n * n; k++) + { + double diff = fabs(H_a_dense[k] - H_pd->X[k]); + if (diff > max_H_diff) max_H_diff = diff; + } + printf(" Hessian max abs diff: %10.3e\n", max_H_diff); + mu_assert("Hessian mismatch", max_H_diff < 1e-10); + + /* ---- Cleanup ---- */ + free(H_a_dense); + free(d2); + free(w_ones); + free(iwork); + free(idx_map); + free_CSR_matrix(Jobj_csr); + /* Jlog_csr is owned by Jlog_M's cache; released by free_matrix below. 
*/ + free_matrix(H_pd_M); + free_matrix(Jlog_M); + free_matrix(A_pd_M); + free_expr(obj); + free(A_data); + free(u); + + return 0; +} + +#endif /* PROFILE_LOG_REG_H */ diff --git a/tests/profiling/profile_trimmed_log_reg.h b/tests/profiling/profile_trimmed_log_reg.h new file mode 100644 index 0000000..3fe2ae6 --- /dev/null +++ b/tests/profiling/profile_trimmed_log_reg.h @@ -0,0 +1,96 @@ +#ifndef PROFILE_TRIMMED_LOG_REG_H +#define PROFILE_TRIMMED_LOG_REG_H + +#include +#include + +#include "atoms/affine.h" +#include "atoms/bivariate_full_dom.h" +#include "atoms/elementwise_full_dom.h" +#include "expr.h" +#include "minunit.h" +#include "subexpr.h" +#include "utils/Timer.h" + +/* Profile Jacobian + Hessian of: + obj = sum( w ∘ logistic( -(y ∘ (A·theta)) ) ) + + theta (n x 1), w (m x 1) : variables (n_vars = n + m) + A (m x n) : dense constant + y (m x 1) : constant in {-1, +1}, wrapped as PARAM_FIXED + + Forward pass is excluded from timing. */ +const char *profile_trimmed_log_reg(void) +{ + int m = 2000; + int n = 785; + int N_HESS_ITERS = 10; + int n_vars = n + m; + + /* ---- Random inputs ---- */ + srand(42); + double *A_data = (double *) malloc((size_t) m * n * sizeof(double)); + double *y_data = (double *) malloc((size_t) m * sizeof(double)); + double *u = (double *) malloc((size_t) n_vars * sizeof(double)); + for (int i = 0; i < m * n; i++) + { + A_data[i] = (double) rand() / RAND_MAX - 0.5; + } + for (int i = 0; i < m; i++) + { + y_data[i] = (rand() % 2 == 0) ? 
1.0 : -1.0; + } + for (int i = 0; i < n_vars; i++) + { + u[i] = (double) rand() / RAND_MAX - 0.5; + } + + /* ---- Build expression DAG ---- */ + expr *theta = new_variable(n, 1, 0, n_vars); + expr *w = new_variable(m, 1, n, n_vars); + + expr *y_param = new_parameter(m, 1, PARAM_FIXED, n_vars, y_data); + + expr *A_theta = new_left_matmul_dense(NULL, theta, m, n, A_data); + expr *y_A_theta = new_vector_mult(y_param, A_theta); + expr *neg_node = new_neg(y_A_theta); + expr *sig = new_logistic(neg_node); + expr *w_sig = new_elementwise_mult(w, sig); + expr *obj = new_sum(w_sig, -1); + + jacobian_init(obj); + wsum_hess_init(obj); + + /* Forward (untimed). */ + obj->forward(obj, u); + + double w_one = 1.0; + Timer t_jac, t_hess; + clock_gettime(CLOCK_MONOTONIC, &t_jac.start); + obj->eval_jacobian(obj); + clock_gettime(CLOCK_MONOTONIC, &t_jac.end); + + obj->eval_wsum_hess(obj, &w_one); /* warm-up */ + clock_gettime(CLOCK_MONOTONIC, &t_hess.start); + for (int it = 0; it < N_HESS_ITERS; it++) + { + obj->eval_wsum_hess(obj, &w_one); + } + clock_gettime(CLOCK_MONOTONIC, &t_hess.end); + + double sec_jac = GET_ELAPSED_SECONDS(t_jac); + double sec_hess = GET_ELAPSED_SECONDS(t_hess) / N_HESS_ITERS; + + printf("\n"); + printf("trimmed_log_reg (m=%d, n=%d):\n", m, n); + printf(" jacobian = %10.6fs hessian = %10.6fs (avg over %d)\n", sec_jac, + sec_hess, N_HESS_ITERS); + + free_expr(obj); + free(A_data); + free(y_data); + free(u); + return 0; +} + +#endif /* PROFILE_TRIMMED_LOG_REG_H */ diff --git a/tests/test_helpers.c b/tests/test_helpers.c index 90d8446..7d5244a 100644 --- a/tests/test_helpers.c +++ b/tests/test_helpers.c @@ -4,7 +4,8 @@ #include #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/matrix.h" #define EPSILON 1e-7 @@ -44,6 +45,32 @@ int cmp_int_array(const int *actual, const int *expected, int size) return 1; } +int cmp_sparsity(matrix *M, const int *exp_p, const int *exp_i, int m, int nnz) +{ + if (M->m != m) + { + 
printf(" FAILED: M->m = %d, expected %d\n", M->m, m); + return 0; + } + if (M->nnz != nnz) + { + printf(" FAILED: M->nnz = %d, expected %d\n", M->nnz, nnz); + return 0; + } + CSR_matrix *csr = M->to_csr(M); + return cmp_int_array(csr->p, exp_p, m + 1) && cmp_int_array(csr->i, exp_i, nnz); +} + +int cmp_values(const matrix *M, const double *exp_x, int nnz) +{ + if (M->nnz != nnz) + { + printf(" FAILED: M->nnz = %d, expected %d\n", M->nnz, nnz); + return 0; + } + return cmp_double_array(M->x, exp_x, nnz); +} + #ifndef M_PI #define M_PI 3.14159265358979323846 #endif @@ -56,7 +83,7 @@ static double randn(void) return sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2); } -CSR_Matrix *new_csr_random(int m, int n, double density) +CSR_matrix *new_csr_random(int m, int n, double density) { /* Single pass: over-allocate, fill, then copy to exact size */ int cap = (int) ((double) m * (double) n * density * 1.5) + m; @@ -87,7 +114,7 @@ CSR_Matrix *new_csr_random(int m, int n, double density) } tmp_p[m] = nnz; - CSR_Matrix *A = new_csr_matrix(m, n, nnz); + CSR_matrix *A = new_CSR_matrix(m, n, nnz); memcpy(A->p, tmp_p, ((size_t) m + 1) * sizeof(int)); memcpy(A->i, tmp_i, (size_t) nnz * sizeof(int)); memcpy(A->x, tmp_x, (size_t) nnz * sizeof(double)); diff --git a/tests/test_helpers.h b/tests/test_helpers.h index fd47151..b3e96ce 100644 --- a/tests/test_helpers.h +++ b/tests/test_helpers.h @@ -2,7 +2,8 @@ #define TEST_HELPERS_H #include "expr.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/matrix.h" /* Compare two double arrays directly * Returns 1 if all values match, 0 otherwise */ @@ -12,8 +13,17 @@ int cmp_double_array(const double *actual, const double *expected, int size); * Returns 1 if all values match, 0 otherwise */ int cmp_int_array(const int *actual, const int *expected, int size); -/* Create a random m x n CSR matrix with approximate nonzero density +/* Verify M has shape (m, *) with exactly nnz entries, and that M's CSR_matrix + * row 
pointers and column indices match exp_p (length m+1) and exp_i + * (length nnz). Returns 1 on full match, 0 otherwise. */ +int cmp_sparsity(matrix *M, const int *exp_p, const int *exp_i, int m, int nnz); + +/* Verify M has nnz entries and that its value array matches exp_x of + * length nnz. Returns 1 on full match, 0 otherwise. */ +int cmp_values(const matrix *M, const double *exp_x, int nnz); + +/* Create a random m x n CSR_matrix matrix with approximate nonzero density * in [0, 1]. Nonzero values are standard Gaussian (Box-Muller). */ -CSR_Matrix *new_csr_random(int m, int n, double density); +CSR_matrix *new_csr_random(int m, int n, double density); #endif /* TEST_HELPERS_H */ diff --git a/tests/utils/test_coo_matrix.h b/tests/utils/test_COO_matrix.h similarity index 83% rename from tests/utils/test_coo_matrix.h rename to tests/utils/test_COO_matrix.h index 53fa307..523b94a 100644 --- a/tests/utils/test_coo_matrix.h +++ b/tests/utils/test_COO_matrix.h @@ -4,16 +4,16 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/COO_Matrix.h" +#include "utils/COO_matrix.h" const char *test_csr_to_coo(void) { - /* Create a 3x3 CSR matrix A: + /* Create a 3x3 CSR_matrix matrix A: * [1.0 2.0 0.0] * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_CSR_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -21,7 +21,7 @@ const char *test_csr_to_coo(void) memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - COO_Matrix *coo = new_coo_matrix(A); + COO_matrix *coo = new_COO_matrix(A); mu_assert("m incorrect", coo->m == 3); mu_assert("n incorrect", coo->n == 3); @@ -35,8 +35,8 @@ const char *test_csr_to_coo(void) mu_assert("cols incorrect", cmp_int_array(coo->cols, expected_cols, 6)); mu_assert("vals incorrect", cmp_double_array(coo->x, expected_x, 6)); - free_coo_matrix(coo); - free_csr_matrix(A); + free_COO_matrix(coo); + 
free_CSR_matrix(A); return 0; } @@ -48,7 +48,7 @@ const char *test_csr_to_coo_lower_triangular(void) * [2 5 6] * [3 6 9] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_CSR_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; @@ -56,7 +56,7 @@ const char *test_csr_to_coo_lower_triangular(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->x, Ax, 9 * sizeof(double)); - COO_Matrix *coo = new_coo_matrix_lower_triangular(A); + COO_matrix *coo = new_COO_matrix_lower_triangular(A); mu_assert("ltri m incorrect", coo->m == 3); mu_assert("ltri n incorrect", coo->n == 3); @@ -73,15 +73,15 @@ const char *test_csr_to_coo_lower_triangular(void) mu_assert("ltri value_map incorrect", cmp_int_array(coo->value_map, expected_map, 6)); - free_coo_matrix(coo); - free_csr_matrix(A); + free_COO_matrix(coo); + free_CSR_matrix(A); return 0; } const char *test_refresh_lower_triangular_coo(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 9); + CSR_matrix *A = new_CSR_matrix(3, 3, 9); int Ap[4] = {0, 3, 6, 9}; int Ai[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double Ax[9] = {1, 2, 3, 2, 5, 6, 3, 6, 9}; @@ -89,7 +89,7 @@ const char *test_refresh_lower_triangular_coo(void) memcpy(A->i, Ai, 9 * sizeof(int)); memcpy(A->x, Ax, 9 * sizeof(double)); - COO_Matrix *coo = new_coo_matrix_lower_triangular(A); + COO_matrix *coo = new_COO_matrix_lower_triangular(A); double vals2[9] = {10, 20, 30, 20, 50, 60, 30, 60, 90}; refresh_lower_triangular_coo(coo, vals2); @@ -97,8 +97,8 @@ const char *test_refresh_lower_triangular_coo(void) double expected_x[6] = {10, 20, 50, 30, 60, 90}; mu_assert("refresh vals incorrect", cmp_double_array(coo->x, expected_x, 6)); - free_coo_matrix(coo); - free_csr_matrix(A); + free_COO_matrix(coo); + free_CSR_matrix(A); return 0; } diff --git a/tests/utils/test_csc_matrix.h b/tests/utils/test_csc_matrix.h index 5459021..19d0e36 100644 --- a/tests/utils/test_csc_matrix.h +++ 
b/tests/utils/test_csc_matrix.h @@ -5,7 +5,7 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" +#include "utils/CSC_matrix.h" /* Test ATA_alloc with a simple 3x3 example * A is 4x3 (4 rows, 3 columns): @@ -21,14 +21,14 @@ */ const char *test_ATA_alloc_simple(void) { - CSC_Matrix *A = new_csc_matrix(4, 3, 6); + CSC_matrix *A = new_CSC_matrix(4, 3, 6); int Ap[4] = {0, 2, 3, 6}; int Ai[5] = {0, 2, 1, 2, 1}; memcpy(A->p, Ap, 4 * sizeof(int)); memcpy(A->i, Ai, 5 * sizeof(int)); /* Compute C = A^T A */ - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[4] = {0, 2, 3, 5}; int expected_i[5] = {0, 2, 1, 0, 2}; @@ -36,8 +36,8 @@ const char *test_ATA_alloc_simple(void) mu_assert("i incorrect", cmp_int_array(C->i, expected_i, C->nnz)); mu_assert("nnz incorrect", C->nnz == 5); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } @@ -57,13 +57,13 @@ const char *test_ATA_alloc_simple(void) */ const char *test_ATA_alloc_diagonal_like(void) { - /* Create A in CSC format (3 rows, 4 cols, 4 nonzeros) */ - CSC_Matrix *A = new_csc_matrix(3, 4, 4); + /* Create A in CSC_matrix format (3 rows, 4 cols, 4 nonzeros) */ + CSC_matrix *A = new_CSC_matrix(3, 4, 4); int Ap[5] = {0, 1, 2, 3, 4}; int Ai[4] = {0, 1, 2, 0}; memcpy(A->p, Ap, 5 * sizeof(int)); memcpy(A->i, Ai, 4 * sizeof(int)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[5] = {0, 2, 3, 4, 6}; int expected_i[6] = {0, 3, 1, 2, 0, 3}; @@ -72,23 +72,23 @@ const char *test_ATA_alloc_diagonal_like(void) mu_assert("i incorrect", cmp_int_array(C->i, expected_i, C->nnz)); mu_assert("nnz incorrect", C->nnz == 6); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } const char *test_ATA_alloc_random(void) { - /* Create A in CSC format */ - CSC_Matrix *A = new_csc_matrix(10, 15, 15); + /* Create A in CSC_matrix format */ + CSC_matrix *A = new_CSC_matrix(10, 15, 
15); int Ap[16] = {0, 1, 1, 1, 1, 4, 5, 6, 7, 8, 9, 11, 11, 11, 13, 15}; int Ai[15] = {5, 0, 6, 9, 0, 5, 1, 3, 6, 0, 6, 3, 6, 6, 8}; double Ax[15] = {7, 4, 8, 5, 7, 3, 7, 8, 5, 4, 8, 8, 3, 6, 5}; memcpy(A->p, Ap, 16 * sizeof(int)); memcpy(A->i, Ai, 15 * sizeof(int)); memcpy(A->x, Ax, 15 * sizeof(double)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[16] = {0, 2, 2, 2, 2, 8, 11, 13, 14, 16, 21, 27, 27, 27, 33, 38}; int expected_i[38] = {0, 6, 4, 5, 9, 10, 13, 14, 4, 5, 10, 0, 6, @@ -109,18 +109,18 @@ const char *test_ATA_alloc_random(void) 288., 144., 128., 90., 144., 182., 108., 288., 180., 288., 108., 241.}; mu_assert("x incorrect", cmp_double_array(C->x, Cx_correct, C->nnz)); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } const char *test_ATA_alloc_random2(void) { - /* Create A in CSC format */ + /* Create A in CSC_matrix format */ int m = 15; int n = 10; - CSC_Matrix *A = new_csc_matrix(m, n, 15); + CSC_matrix *A = new_CSC_matrix(m, n, 15); int Ap[11] = {0, 2, 4, 6, 6, 9, 12, 12, 14, 14, 15}; int Ai[15] = {9, 12, 3, 4, 1, 6, 4, 8, 13, 1, 3, 7, 5, 13, 6}; double Ax[15] = {0.99, 0.9, 0.51, 0.64, 0.39, 0.29, 0.26, 0.91, @@ -128,7 +128,7 @@ const char *test_ATA_alloc_random2(void) memcpy(A->p, Ap, 11 * sizeof(int)); memcpy(A->i, Ai, 15 * sizeof(int)); memcpy(A->x, Ax, 15 * sizeof(double)); - CSR_Matrix *C = ATA_alloc(A); + CSR_matrix *C = ATA_alloc(A); int expected_p[11] = {0, 1, 4, 7, 7, 10, 13, 13, 15, 15, 17}; int expected_i[17] = {0, 1, 4, 5, 2, 5, 9, 1, 4, 7, 1, 2, 5, 4, 7, 2, 9}; @@ -147,14 +147,14 @@ const char *test_ATA_alloc_random2(void) 0.032857, 0.116699}; mu_assert("x incorrect", cmp_double_array(C->x, Cx_correct, C->nnz)); - free_csr_matrix(C); - free_csc_matrix(A); + free_CSR_matrix(C); + free_CSC_matrix(A); return 0; } const char *test_BTA_alloc_and_BTDA_fill(void) { - /* Create A: 4x3 CSC matrix + /* Create A: 4x3 CSC_matrix matrix * [1.0 0.0 2.0] * [0.0 3.0 0.0] 
* [4.0 0.0 5.0] @@ -162,7 +162,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) */ int m = 4; int n = 3; - CSC_Matrix *A = new_csc_matrix(m, n, 6); + CSC_matrix *A = new_CSC_matrix(m, n, 6); int Ap_A[4] = {0, 2, 4, 6}; int Ai_A[6] = {0, 2, 1, 3, 0, 2}; double Ax_A[6] = {1.0, 4.0, 3.0, 6.0, 2.0, 5.0}; @@ -170,14 +170,14 @@ const char *test_BTA_alloc_and_BTDA_fill(void) memcpy(A->i, Ai_A, 6 * sizeof(int)); memcpy(A->x, Ax_A, 6 * sizeof(double)); - /* Create B: 4x2 CSC matrix + /* Create B: 4x2 CSC_matrix matrix * [1.0 0.0] * [0.0 2.0] * [3.0 0.0] * [0.0 4.0] */ int p = 2; - CSC_Matrix *B = new_csc_matrix(m, p, 4); + CSC_matrix *B = new_CSC_matrix(m, p, 4); int Bp[3] = {0, 2, 4}; int Bi[4] = {0, 2, 1, 3}; double Bx[4] = {1.0, 3.0, 2.0, 4.0}; @@ -186,7 +186,7 @@ const char *test_BTA_alloc_and_BTDA_fill(void) memcpy(B->x, Bx, 4 * sizeof(double)); /* Allocate C = B^T A (should be 2x3) */ - CSR_Matrix *C = BTA_alloc(A, B); + CSR_matrix *C = BTA_alloc(A, B); /* Sparsity pattern check before filling values */ int expected_p[3] = {0, 2, 3}; @@ -203,9 +203,9 @@ const char *test_BTA_alloc_and_BTDA_fill(void) double expected_x[3] = {37.0, 47.0, 108.0}; mu_assert("C values incorrect", cmp_double_array(C->x, expected_x, 3)); - free_csr_matrix(C); - free_csc_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + free_CSC_matrix(A); + free_CSC_matrix(B); return 0; } diff --git a/tests/utils/test_csr_csc_conversion.h b/tests/utils/test_csr_csc_conversion.h index efbdc9e..bb8189c 100644 --- a/tests/utils/test_csr_csc_conversion.h +++ b/tests/utils/test_csr_csc_conversion.h @@ -5,19 +5,19 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" -/* Test CSR to CSC conversion with fill_sparsity and fill_values */ +/* Test CSR_matrix to CSC_matrix conversion with fill_sparsity and fill_values */ const char *test_csr_to_csc_split(void) { - /* Create a 4x5 CSR matrix 
A: + /* Create a 4x5 CSR_matrix matrix A: * [1.0 0.0 0.0 0.0 1.0] * [0.0 0.0 3.0 0.0 0.0] * [0.0 2.0 0.0 0.0 0.0] * [0.0 0.0 0.0 4.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 2, 1, 3}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -29,7 +29,7 @@ const char *test_csr_to_csc_split(void) int *iwork = (int *) malloc(A->n * sizeof(int)); /* First, fill sparsity pattern */ - CSC_Matrix *C = csr_to_csc_alloc(A, iwork); + CSC_matrix *C = csr_to_csc_alloc(A, iwork); /* Check sparsity pattern */ int Cp_correct[6] = {0, 1, 2, 3, 4, 5}; @@ -47,22 +47,22 @@ const char *test_csr_to_csc_split(void) mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); free(iwork); - free_csr_matrix(A); - free_csc_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(C); return 0; } -/* Test CSC to CSR conversion with fill_sparsity */ +/* Test CSC_matrix to CSR_matrix conversion with fill_sparsity */ const char *test_csc_to_csr_sparsity(void) { - /* Create a 4x5 CSC matrix A: + /* Create a 4x5 CSC_matrix matrix A: * [1.0 0.0 0.0 0.0 2.0] * [0.0 0.0 3.0 0.0 0.0] * [0.0 4.0 0.0 0.0 0.0] * [0.0 0.0 0.0 5.0 0.0] */ - CSC_Matrix *A = new_csc_matrix(4, 5, 5); + CSC_matrix *A = new_CSC_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -74,9 +74,9 @@ const char *test_csc_to_csr_sparsity(void) int *iwork = (int *) malloc(A->m * sizeof(int)); /* Fill sparsity pattern */ - CSR_Matrix *C = csc_to_csr_alloc(A, iwork); + CSR_matrix *C = csc_to_csr_alloc(A, iwork); - /* Expected CSR format: + /* Expected CSR_matrix format: * Row 0: [1.0 at col 0, 2.0 at col 4] * Row 1: [3.0 at col 2] * Row 2: [4.0 at col 1] @@ -91,17 +91,17 @@ const char *test_csc_to_csr_sparsity(void) mu_assert("C nnz incorrect", C->nnz == 5); free(iwork); - free_csc_matrix(A); - free_csr_matrix(C); + free_CSC_matrix(A); + free_CSR_matrix(C); return 0; } -/* Test 
CSC to CSR conversion with fill_values */ +/* Test CSC_matrix to CSR_matrix conversion with fill_values */ const char *test_csc_to_csr_values(void) { - /* Create a 4x5 CSC matrix A */ - CSC_Matrix *A = new_csc_matrix(4, 5, 5); + /* Create a 4x5 CSC_matrix matrix A */ + CSC_matrix *A = new_CSC_matrix(4, 5, 5); double Ax[5] = {1.0, 4.0, 3.0, 5.0, 2.0}; int Ai[5] = {0, 2, 1, 3, 0}; int Ap[6] = {0, 1, 2, 3, 4, 5}; @@ -113,7 +113,7 @@ const char *test_csc_to_csr_values(void) int *iwork = (int *) malloc(A->m * sizeof(int)); /* Fill sparsity pattern */ - CSR_Matrix *C = csc_to_csr_alloc(A, iwork); + CSR_matrix *C = csc_to_csr_alloc(A, iwork); /* Fill values */ csc_to_csr_fill_values(A, C, iwork); @@ -124,21 +124,21 @@ const char *test_csc_to_csr_values(void) mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 5)); free(iwork); - free_csc_matrix(A); - free_csr_matrix(C); + free_CSC_matrix(A); + free_CSR_matrix(C); return 0; } -/* Test round-trip conversion: CSR -> CSC -> CSR */ +/* Test round-trip conversion: CSR_matrix -> CSC_matrix -> CSR_matrix */ const char *test_csr_csc_csr_roundtrip(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 2.0 0.0 3.0] * [0.0 4.0 5.0 0.0] * [6.0 0.0 7.0 8.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 8); + CSR_matrix *A = new_CSR_matrix(3, 4, 8); double Ax[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; int Ai[8] = {0, 1, 3, 1, 2, 0, 2, 3}; int Ap[4] = {0, 3, 5, 8}; @@ -146,14 +146,14 @@ const char *test_csr_csc_csr_roundtrip(void) memcpy(A->i, Ai, 8 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - /* Convert CSR to CSC */ + /* Convert CSR_matrix to CSC_matrix */ int *iwork_csc = (int *) malloc(A->n * sizeof(int)); - CSC_Matrix *B = csr_to_csc_alloc(A, iwork_csc); + CSC_matrix *B = csr_to_csc_alloc(A, iwork_csc); csr_to_csc_fill_values(A, B, iwork_csc); - /* Convert CSC back to CSR */ + /* Convert CSC_matrix back to CSR_matrix */ int *iwork_csr = (int *) malloc(B->m * sizeof(int)); - 
CSR_Matrix *C = csc_to_csr_alloc(B, iwork_csr); + CSR_matrix *C = csc_to_csr_alloc(B, iwork_csr); csc_to_csr_fill_values(B, C, iwork_csr); /* C should match A */ @@ -163,9 +163,9 @@ const char *test_csr_csc_csr_roundtrip(void) free(iwork_csc); free(iwork_csr); - free_csr_matrix(A); - free_csc_matrix(B); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(B); + free_CSR_matrix(C); return 0; } diff --git a/tests/utils/test_csr_matrix.h b/tests/utils/test_csr_matrix.h index 68e0128..33c11a0 100644 --- a/tests/utils/test_csr_matrix.h +++ b/tests/utils/test_csr_matrix.h @@ -6,18 +6,18 @@ #include "old-code/old_CSR.h" #include "old-code/old_CSR_sum.h" #include "test_helpers.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSR_matrix.h" #include "utils/CSR_sum.h" #include "utils/int_double_pair.h" const char *test_diag_csr_mult(void) { - /* Create a 3x3 CSR matrix A: + /* Create a 3x3 CSR_matrix matrix A: * [1.0 2.0 0.0] * [0.0 3.0 4.0] * [5.0 0.0 6.0] */ - CSR_Matrix *A = new_csr_matrix(3, 3, 6); + CSR_matrix *A = new_CSR_matrix(3, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 1, 1, 2, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -31,7 +31,7 @@ const char *test_diag_csr_mult(void) * [0.0 9.0 12.0] * [2.5 0.0 3.0] */ - CSR_Matrix *C = new_csr_matrix(3, 3, 6); + CSR_matrix *C = new_CSR_matrix(3, 3, 6); diag_csr_mult(d, A, C); double Ax_correct[6] = {2.0, 4.0, 9.0, 12.0, 2.5, 3.0}; @@ -42,8 +42,8 @@ const char *test_diag_csr_mult(void) mu_assert("cols incorrect", cmp_int_array(C->i, Ai_correct, 6)); mu_assert("rows incorrect", cmp_int_array(C->p, Ap_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); return 0; } @@ -55,7 +55,7 @@ const char *test_diag_csr_mult(void) */ const char *test_csr_sum(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_CSR_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[5] = {0, 2, 1, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -63,7 +63,7 @@ 
const char *test_csr_sum(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_CSR_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -71,7 +71,7 @@ const char *test_csr_sum(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(3, 3, 9); + CSR_matrix *C = new_CSR_matrix(3, 3, 9); sum_csr_matrices(A, B, C); double Cx_correct[9] = {1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 5.0}; @@ -83,9 +83,9 @@ const char *test_csr_sum(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 9)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(B); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(B); + free_CSR_matrix(C); return 0; } @@ -97,7 +97,7 @@ const char *test_csr_sum(void) */ const char *test_csr_sum2(void) { - CSR_Matrix *A = new_csr_matrix(3, 3, 5); + CSR_matrix *A = new_CSR_matrix(3, 3, 5); double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; int Ai[5] = {0, 2, 2, 0, 2}; int Ap[4] = {0, 2, 3, 5}; @@ -105,7 +105,7 @@ const char *test_csr_sum2(void) memcpy(A->i, Ai, 5 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *B = new_csr_matrix(3, 3, 4); + CSR_matrix *B = new_CSR_matrix(3, 3, 4); double Bx[4] = {1.0, 2.0, 3.0, 6.0}; int Bi[4] = {1, 0, 2, 1}; int Bp[4] = {0, 1, 3, 4}; @@ -113,7 +113,7 @@ const char *test_csr_sum2(void) memcpy(B->i, Bi, 4 * sizeof(int)); memcpy(B->p, Bp, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(3, 3, 8); + CSR_matrix *C = new_CSR_matrix(3, 3, 8); sum_csr_matrices(A, B, C); double Cx_correct[8] = {1, 1, 2, 2, 6, 4, 6, 5}; @@ -125,16 +125,16 @@ const char *test_csr_sum2(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 8)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(B); - 
free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(B); + free_CSR_matrix(C); return 0; } const char *test_transpose(void) { - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -143,7 +143,7 @@ const char *test_transpose(void) memcpy(A->p, Ap, 5 * sizeof(int)); int iwork[5]; - CSR_Matrix *AT = transpose(A, iwork); + CSR_matrix *AT = transpose(A, iwork); double ATx_correct[5] = {1.0, 2.0, 3.0, 4.0, 1.0}; int ATi_correct[5] = {0, 2, 1, 3, 0}; int ATp_correct[6] = {0, 2, 4, 4, 4, 5}; @@ -151,8 +151,8 @@ const char *test_transpose(void) mu_assert("AT cols incorrect", cmp_int_array(AT->i, ATi_correct, 5)); mu_assert("AT rows incorrect", cmp_int_array(AT->p, ATp_correct, 6)); - free_csr_matrix(A); - free_csr_matrix(AT); + free_CSR_matrix(A); + free_CSR_matrix(AT); return 0; } @@ -167,7 +167,7 @@ A = [1 0 0 0 1 */ const char *test_csr_vecmat_values_sparse(void) { - CSR_Matrix *A = new_csr_matrix(4, 5, 5); + CSR_matrix *A = new_CSR_matrix(4, 5, 5); double Ax[5] = {1.0, 1.0, 3.0, 2.0, 4.0}; int Ai[5] = {0, 4, 1, 0, 1}; int Ap[5] = {0, 2, 3, 4, 5}; @@ -177,7 +177,7 @@ const char *test_csr_vecmat_values_sparse(void) double z[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = new_csr_matrix(1, 3, 3); + CSR_matrix *C = new_CSR_matrix(1, 3, 3); double Cx[3] = {0.0, 0.0, 0.0}; int Ci[3] = {0, 1, 4}; int Cp[2] = {0, 3}; @@ -187,7 +187,7 @@ const char *test_csr_vecmat_values_sparse(void) int iwork[5]; - CSR_Matrix *AT = transpose(A, iwork); + CSR_matrix *AT = transpose(A, iwork); Ax_csr_fill_values(AT, z, C); @@ -196,15 +196,15 @@ const char *test_csr_vecmat_values_sparse(void) mu_assert("C nnz incorrect", C->nnz == 3); mu_assert("C vals incorrect", cmp_double_array(C->x, Cx_correct, 3)); - free_csr_matrix(A); - free_csr_matrix(AT); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(AT); + free_CSR_matrix(C); return 0; } const char 
*test_sum_all_rows_csr(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 2.0 0.0 0.0] * [0.0 3.0 4.0 0.0] * [5.0 0.0 6.0 7.0] @@ -212,14 +212,14 @@ const char *test_sum_all_rows_csr(void) * Sum all rows should give: * [6.0 5.0 10.0 7.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 7); + CSR_matrix *A = new_CSR_matrix(3, 4, 7); double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; int Ai[7] = {0, 1, 1, 2, 0, 2, 3}; int Ap[4] = {0, 2, 4, 7}; memcpy(A->x, Ax, 7 * sizeof(double)); memcpy(A->i, Ai, 7 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - CSR_Matrix *C = new_csr_matrix(1, 4, 4); + CSR_matrix *C = new_CSR_matrix(1, 4, 4); int_double_pair *pairs = new_int_double_pair_array(7); sum_all_rows_csr(A, C, pairs); double Cx_correct[4] = {6.0, 5.0, 10.0, 7.0}; @@ -231,15 +231,15 @@ const char *test_sum_all_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 4)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 2)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; } const char *test_sum_block_of_rows_csr(void) { - /* Create a 9x4 CSR matrix A and sum blocks of size 3 + /* Create a 9x4 CSR_matrix matrix A and sum blocks of size 3 * Block 0 (rows 0-2): * [1.0 2.0 0.0 0.0] * [0.0 3.0 1.0 0.0] @@ -260,7 +260,7 @@ const char *test_sum_block_of_rows_csr(void) * * Result C should be 3x4 matrix with the sums above */ - CSR_Matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_CSR_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -289,7 +289,7 @@ const char *test_sum_block_of_rows_csr(void) memcpy(A->p, Ap, 10 * sizeof(int)); /* Allocate C for 3 blocks and enough space for all nonzeros */ - CSR_Matrix *C = new_csr_matrix(3, 4, 12); + CSR_matrix *C = new_CSR_matrix(3, 4, 12); int_double_pair *pairs = new_int_double_pair_array(18); sum_block_of_rows_csr(A, C, pairs, 3); @@ -310,19 +310,20 
@@ const char *test_sum_block_of_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 12)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; } const char *test_sum_evenly_spaced_rows_csr(void) { - /* Create a 9x4 CSR matrix A (same as test_sum_block_of_rows_csr) and sum evenly + /* Create a 9x4 CSR_matrix matrix A (same as test_sum_block_of_rows_csr) and sum + evenly * spaced rows With row_spacing=3: - A = 9x4 CSR matrix: + A = 9x4 CSR_matrix matrix: 1 2 0 0 0 3 1 0 0 0 4 5 @@ -338,7 +339,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) row 1: sum of rows 1, 4, 7 = [1 4 6 0] row 2: sum of rows 2, 5, 8 = [3 2 4 11] */ - CSR_Matrix *A = new_csr_matrix(9, 4, 18); + CSR_matrix *A = new_CSR_matrix(9, 4, 18); double Ax[18] = {1.0, 2.0, /* row 0 */ 3.0, 1.0, /* row 1 */ @@ -367,7 +368,7 @@ const char *test_sum_evenly_spaced_rows_csr(void) memcpy(A->p, Ap, 10 * sizeof(int)); /* Allocate C for 3 rows (row_spacing=3) and enough space for all nonzeros */ - CSR_Matrix *C = new_csr_matrix(3, 4, 10); + CSR_matrix *C = new_CSR_matrix(3, 4, 10); int_double_pair *pairs = new_int_double_pair_array(18); sum_evenly_spaced_rows_csr(A, C, pairs, 3); @@ -388,20 +389,20 @@ const char *test_sum_evenly_spaced_rows_csr(void) mu_assert("C cols incorrect", cmp_int_array(C->i, Ci_correct, 10)); mu_assert("C rows incorrect", cmp_int_array(C->p, Cp_correct, 4)); - free_csr_matrix(A); - free_csr_matrix(C); + free_CSR_matrix(A); + free_CSR_matrix(C); free_int_double_pair_array(pairs); return 0; } const char *test_AT_alloc_and_fill(void) { - /* Create a 3x4 CSR matrix A: + /* Create a 3x4 CSR_matrix matrix A: * [1.0 0.0 2.0 0.0] * [0.0 3.0 0.0 4.0] * [5.0 0.0 6.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 6); + CSR_matrix *A = new_CSR_matrix(3, 4, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 
1, 3, 0, 2}; int Ap[4] = {0, 2, 4, 6}; @@ -411,7 +412,7 @@ const char *test_AT_alloc_and_fill(void) /* Allocate A^T (should be 4x3) */ int *iwork = (int *) malloc(A->n * sizeof(int)); - CSR_Matrix *AT = AT_alloc(A, iwork); + CSR_matrix *AT = AT_alloc(A, iwork); /* Fill values of A^T */ AT_fill_values(A, AT, iwork); @@ -432,8 +433,8 @@ const char *test_AT_alloc_and_fill(void) mu_assert("AT cols incorrect", cmp_int_array(AT->i, ATi_correct, 6)); mu_assert("AT rows incorrect", cmp_int_array(AT->p, ATp_correct, 5)); - free_csr_matrix(A); - free_csr_matrix(AT); + free_CSR_matrix(A); + free_CSR_matrix(AT); free(iwork); return 0; @@ -454,7 +455,7 @@ const char *test_kron_identity_csr(void) * [3 0 | 0 0 | 4 0] * [0 3 | 0 0 | 0 4] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 2, 0, 2}; int Ap[3] = {0, 2, 4}; @@ -462,7 +463,7 @@ const char *test_kron_identity_csr(void) memcpy(A->i, Ai, 4 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - CSR_Matrix *result = kron_identity_csr(A, 2); + CSR_matrix *result = kron_identity_csr(A, 2); /* Expected: 4x6 with 8 nonzeros * Row 0: [1, 0, 0, 0, 2, 0] -> cols {0, 4}, vals {1, 2} @@ -480,8 +481,8 @@ const char *test_kron_identity_csr(void) mu_assert("cols incorrect", cmp_int_array(result->i, expected_i, 8)); mu_assert("rows incorrect", cmp_int_array(result->p, expected_p, 5)); - free_csr_matrix(A); - free_csr_matrix(result); + free_CSR_matrix(A); + free_CSR_matrix(result); return 0; } diff --git a/tests/utils/test_linalg_sparse_matmuls.h b/tests/utils/test_linalg_sparse_matmuls.h index 120f99b..95c1c27 100644 --- a/tests/utils/test_linalg_sparse_matmuls.h +++ b/tests/utils/test_linalg_sparse_matmuls.h @@ -5,18 +5,18 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_sparse_matmuls.h" /* Test 
block_left_multiply_fill_sparsity with simple case: single block */ const char *test_block_left_multiply_single_block(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 0.0] * [0.0 1.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_CSR_matrix(2, 3, 3); double Ax[3] = {1.0, 1.0, 1.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 1, 3}; @@ -24,12 +24,12 @@ const char *test_block_left_multiply_single_block(void) memcpy(A->i, Ai, 3 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - /* J is 3x2 CSC (single block, so p=1): + /* J is 3x2 CSC_matrix (single block, so p=1): * [1.0 0.0] * [1.0 0.0] * [0.0 1.0] */ - CSC_Matrix *J = new_csc_matrix(3, 2, 3); + CSC_matrix *J = new_CSC_matrix(3, 2, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 1, 2}; int Jp[3] = {0, 2, 3}; @@ -38,7 +38,7 @@ const char *test_block_left_multiply_single_block(void) memcpy(J->p, Jp, 3 * sizeof(int)); /* Compute C = A @ J1 (p=1 means just one block) */ - CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 1); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 1); /* Expected C is 2x2: * C[0,0] = A[0,:] @ J[:,0] = 1.0 * 1.0 = 1.0 (row 0 has column 0, J col 0 has @@ -54,20 +54,20 @@ const char *test_block_left_multiply_single_block(void) mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p, 3)); mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i, 3)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } /* Test block_left_multiply_fill_sparsity with two blocks */ const char *test_block_left_multiply_two_blocks(void) { - /* A is 2x2 CSR: + /* A is 2x2 CSR_matrix: * [1.0 0.0] * [0.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 2, 2); + CSR_matrix *A = new_CSR_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -75,7 +75,7 @@ const char *test_block_left_multiply_two_blocks(void) memcpy(A->i, 
Ai, 2 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - /* J is 4x3 CSC (two blocks of 2 rows each): + /* J is 4x3 CSC_matrix (two blocks of 2 rows each): * Block 1 rows [0,1]: * [1.0 0.0 0.0] * [0.0 0.0 0.0] @@ -88,7 +88,7 @@ const char *test_block_left_multiply_two_blocks(void) * [0.0 1.0 0.0] * [0.0 0.0 1.0] */ - CSC_Matrix *J = new_csc_matrix(4, 3, 3); + CSC_matrix *J = new_CSC_matrix(4, 3, 3); double Jx[3] = {1.0, 1.0, 1.0}; int Ji[3] = {0, 2, 3}; int Jp[4] = {0, 1, 2, 3}; @@ -109,7 +109,7 @@ const char *test_block_left_multiply_two_blocks(void) * [0.0 0.0 0.0] * [0.0 1.0 1.0] */ - CSC_Matrix *C = block_left_multiply_fill_sparsity(A, J, 2); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 2); block_left_multiply_fill_values(A, J, C); int expected_p2[4] = {0, 1, 2, 3}; @@ -121,17 +121,17 @@ const char *test_block_left_multiply_two_blocks(void) mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i2, 3)); mu_assert("C values incorrect", cmp_double_array(C->x, expected_x2, 3)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } /* Test block_left_multiply_fill_sparsity with all zero column in J */ const char *test_block_left_multiply_zero_column(void) { - /* A is 2x2 CSR (identity) */ - CSR_Matrix *A = new_csr_matrix(2, 2, 2); + /* A is 2x2 CSR_matrix (identity) */ + CSR_matrix *A = new_CSR_matrix(2, 2, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 1}; int Ap[3] = {0, 1, 2}; @@ -143,7 +143,7 @@ const char *test_block_left_multiply_zero_column(void) * [1.0 0.0] * [0.0 0.0] */ - CSC_Matrix *J = new_csc_matrix(2, 2, 1); + CSC_matrix *J = new_CSC_matrix(2, 2, 1); double Jx[1] = {1.0}; int Ji[1] = {0}; int Jp[3] = {0, 1, 1}; /* Column 0 has one nonzero, column 1 is empty */ @@ -151,7 +151,7 @@ const char *test_block_left_multiply_zero_column(void) memcpy(J->i, Ji, 1 * sizeof(int)); memcpy(J->p, Jp, 3 * sizeof(int)); - CSC_Matrix *C = 
block_left_multiply_fill_sparsity(A, J, 1); + CSC_matrix *C = block_left_multiply_fill_sparsity(A, J, 1); int expected_p3[3] = {0, 1, 1}; int expected_i3[1] = {0}; @@ -160,21 +160,21 @@ const char *test_block_left_multiply_zero_column(void) mu_assert("C col pointers incorrect", cmp_int_array(C->p, expected_p3, 3)); mu_assert("C row indices incorrect", cmp_int_array(C->i, expected_i3, 1)); - free_csc_matrix(C); - free_csr_matrix(A); - free_csc_matrix(J); + free_CSC_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(J); return NULL; } -/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR and B is CSC */ +/* Test csr_csc_matmul_alloc: C = A @ B where A is CSR_matrix and B is CSC_matrix */ const char *test_csr_csc_matmul_alloc_basic(void) { - /* A is 3x2 CSR: + /* A is 3x2 CSR_matrix: * [1.0 0.0] * [0.0 1.0] * [1.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(3, 2, 4); + CSR_matrix *A = new_CSR_matrix(3, 2, 4); double Ax[4] = {1.0, 1.0, 1.0, 1.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[4] = {0, 1, 2, 4}; @@ -182,11 +182,11 @@ const char *test_csr_csc_matmul_alloc_basic(void) memcpy(A->i, Ai, 4 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); - /* B is 2x3 CSC: + /* B is 2x3 CSC_matrix: * [1.0 0.0 1.0] * [0.0 1.0 1.0] */ - CSC_Matrix *B = new_csc_matrix(2, 3, 4); + CSC_matrix *B = new_CSC_matrix(2, 3, 4); double Bx[4] = {1.0, 1.0, 1.0, 1.0}; int Bi[4] = {0, 1, 0, 1}; int Bp[4] = {0, 1, 2, 4}; @@ -199,7 +199,7 @@ const char *test_csr_csc_matmul_alloc_basic(void) * [0, 1, 1], * [1, 1, 2]] */ - CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + CSR_matrix *C = csr_csc_matmul_alloc(A, B); int expected_p4[4] = {0, 2, 4, 7}; int expected_i4[7] = {0, 2, 1, 2, 0, 1, 2}; @@ -208,20 +208,20 @@ const char *test_csr_csc_matmul_alloc_basic(void) mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p4, 4)); mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i4, 7)); - free_csr_matrix(C); - free_csr_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + 
free_CSR_matrix(A); + free_CSC_matrix(B); return NULL; } /* Test csr_csc_matmul_alloc with sparse result */ const char *test_csr_csc_matmul_alloc_sparse(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 0.0] * [0.0 0.0 1.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 2); + CSR_matrix *A = new_CSR_matrix(2, 3, 2); double Ax[2] = {1.0, 1.0}; int Ai[2] = {0, 2}; int Ap[3] = {0, 1, 2}; @@ -229,12 +229,12 @@ const char *test_csr_csc_matmul_alloc_sparse(void) memcpy(A->i, Ai, 2 * sizeof(int)); memcpy(A->p, Ap, 3 * sizeof(int)); - /* B is 3x2 CSC: + /* B is 3x2 CSC_matrix: * [1.0 0.0] * [0.0 0.0] * [0.0 1.0] */ - CSC_Matrix *B = new_csc_matrix(3, 2, 2); + CSC_matrix *B = new_CSC_matrix(3, 2, 2); double Bx[2] = {1.0, 1.0}; int Bi[2] = {0, 2}; int Bp[3] = {0, 1, 2}; @@ -246,7 +246,7 @@ const char *test_csr_csc_matmul_alloc_sparse(void) * C = [[1, 0], * [0, 1]] */ - CSR_Matrix *C = csr_csc_matmul_alloc(A, B); + CSR_matrix *C = csr_csc_matmul_alloc(A, B); int expected_p5[3] = {0, 1, 2}; int expected_i5[2] = {0, 1}; @@ -255,20 +255,20 @@ const char *test_csr_csc_matmul_alloc_sparse(void) mu_assert("C row pointers incorrect", cmp_int_array(C->p, expected_p5, 3)); mu_assert("C col indices incorrect", cmp_int_array(C->i, expected_i5, 2)); - free_csr_matrix(C); - free_csr_matrix(A); - free_csc_matrix(B); + free_CSR_matrix(C); + free_CSR_matrix(A); + free_CSC_matrix(B); return NULL; } /* Test block_left_multiply_vec with single block: y = A @ x */ const char *test_block_left_multiply_vec_single_block(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 0.0 2.0] * [0.0 3.0 0.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 3); + CSR_matrix *A = new_CSR_matrix(2, 3, 3); double Ax[3] = {1.0, 3.0, 2.0}; int Ai[3] = {0, 1, 2}; int Ap[3] = {0, 2, 3}; @@ -289,18 +289,18 @@ const char *test_block_left_multiply_vec_single_block(void) double expected_y[2] = {7.0, 6.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 2)); - free_csr_matrix(A); + 
free_CSR_matrix(A); return NULL; } /* Test block_left_multiply_vec with two blocks: y = [A @ x1; A @ x2] */ const char *test_block_left_multiply_vec_two_blocks(void) { - /* A is 2x3 CSR: + /* A is 2x3 CSR_matrix: * [1.0 2.0 0.0] * [0.0 3.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + CSR_matrix *A = new_CSR_matrix(2, 3, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 1, 2}; int Ap[3] = {0, 2, 4}; @@ -323,19 +323,19 @@ const char *test_block_left_multiply_vec_two_blocks(void) double expected_y[4] = {5.0, 18.0, 14.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } /* Test block_left_multiply_vec with sparse matrix and multiple blocks */ const char *test_block_left_multiply_vec_sparse(void) { - /* A is 3x4 CSR (very sparse): + /* A is 3x4 CSR_matrix (very sparse): * [2.0 0.0 0.0 0.0] * [0.0 0.0 3.0 0.0] * [0.0 0.0 0.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(3, 4, 3); + CSR_matrix *A = new_CSR_matrix(3, 4, 3); double Ax[3] = {2.0, 3.0, 4.0}; int Ai[3] = {0, 2, 3}; int Ap[4] = {0, 1, 2, 3}; @@ -358,18 +358,18 @@ const char *test_block_left_multiply_vec_sparse(void) double expected_y[6] = {2.0, 9.0, 16.0, 10.0, 21.0, 32.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } /* Test block_left_multiply_vec with three blocks */ const char *test_block_left_multiply_vec_three_blocks(void) { - /* A is 2x2 CSR: + /* A is 2x2 CSR_matrix: * [1.0 2.0] * [3.0 4.0] */ - CSR_Matrix *A = new_csr_matrix(2, 2, 4); + CSR_matrix *A = new_CSR_matrix(2, 2, 4); double Ax[4] = {1.0, 2.0, 3.0, 4.0}; int Ai[4] = {0, 1, 0, 1}; int Ap[3] = {0, 2, 4}; @@ -393,6 +393,6 @@ const char *test_block_left_multiply_vec_three_blocks(void) double expected_y[6] = {5.0, 11.0, 11.0, 25.0, 17.0, 39.0}; mu_assert("y values incorrect", cmp_double_array(y, expected_y, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); return NULL; } 
diff --git a/tests/utils/test_linalg_utils_matmul_chain_rule.h b/tests/utils/test_linalg_utils_matmul_chain_rule.h index 0c66589..86d7e3f 100644 --- a/tests/utils/test_linalg_utils_matmul_chain_rule.h +++ b/tests/utils/test_linalg_utils_matmul_chain_rule.h @@ -4,8 +4,8 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/CSC_Matrix.h" -#include "utils/CSR_Matrix.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" #include "utils/linalg_dense_sparse_matmuls.h" /* Test YT_kron_I_alloc and YT_kron_I_fill_values @@ -17,7 +17,7 @@ * [1 3] * [2 4] * - * J (mk=4 x p=3, CSC): + * J (mk=4 x p=3, CSC_matrix): * [1 0 2] * [0 1 0] * [3 0 0] @@ -33,8 +33,8 @@ const char *test_YT_kron_I(void) { int m = 2, k = 2, n = 2; - /* J is 4x3 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 3, 5); + /* J is 4x3 CSC_matrix */ + CSC_matrix *J = new_CSC_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -45,9 +45,9 @@ const char *test_YT_kron_I(void) /* Y col-major: Y[0,0]=1, Y[1,0]=2, Y[0,1]=3, Y[1,1]=4 */ double Y[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = YT_kron_I_alloc(m, k, n, J); + CSR_matrix *C = YT_kron_I_alloc(m, k, n, J); - /* Expected CSR (from scipy) */ + /* Expected CSR_matrix (from scipy) */ int exp_p[5] = {0, 2, 4, 6, 8}; int exp_i[8] = {0, 2, 1, 2, 0, 2, 1, 2}; double exp_x[8] = {7.0, 2.0, 1.0, 2.0, 15.0, 6.0, 3.0, 4.0}; @@ -60,8 +60,8 @@ const char *test_YT_kron_I(void) YT_kron_I_fill_values(m, k, n, Y, J, C); mu_assert("C values", cmp_double_array(C->x, exp_x, 8)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -71,7 +71,7 @@ const char *test_YT_kron_I(void) * [1.0 0.5 2.0] * [3.0 1.0 0.5] * - * J (mk=6 x p=4, CSC): + * J (mk=6 x p=4, CSC_matrix): * [1 0 0 2] * [0 0 1 0] * [0 3 0 0] @@ -85,8 +85,8 @@ const char *test_YT_kron_I_larger(void) { int m = 3, k = 2, n = 3; - /* J is 6x4 CSC */ - CSC_Matrix *J = new_csc_matrix(6, 4, 
8); + /* J is 6x4 CSC_matrix */ + CSC_matrix *J = new_CSC_matrix(6, 4, 8); int Jp[5] = {0, 2, 4, 6, 8}; int Ji[8] = {0, 3, 2, 4, 1, 5, 0, 3}; double Jx[8] = {1.0, 2.0, 3.0, 1.0, 1.0, 4.0, 2.0, 1.0}; @@ -97,9 +97,9 @@ const char *test_YT_kron_I_larger(void) /* Y col-major */ double Y[6] = {1.0, 3.0, 0.5, 1.0, 2.0, 0.5}; - CSR_Matrix *C = YT_kron_I_alloc(m, k, n, J); + CSR_matrix *C = YT_kron_I_alloc(m, k, n, J); - /* Expected CSR (from scipy) */ + /* Expected CSR_matrix (from scipy) */ int exp_p[10] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18}; int exp_i[18] = {0, 3, 1, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 1, 2, 1, 2}; double exp_x[18] = {7.0, 5.0, 3.0, 1.0, 3.0, 12.0, 2.5, 2.0, 1.0, @@ -113,8 +113,8 @@ const char *test_YT_kron_I_larger(void) YT_kron_I_fill_values(m, k, n, Y, J, C); mu_assert("C2 values", cmp_double_array(C->x, exp_x, 18)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -127,7 +127,7 @@ const char *test_YT_kron_I_larger(void) * [1 3] * [2 4] * - * J (kn=4 x p=3, CSC): + * J (kn=4 x p=3, CSC_matrix): * [1 0 2] * [0 1 0] * [3 0 0] @@ -143,8 +143,8 @@ const char *test_I_kron_X(void) { int m = 2, k = 2, n = 2; - /* J is 4x3 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 3, 5); + /* J is 4x3 CSC_matrix */ + CSC_matrix *J = new_CSC_matrix(4, 3, 5); int Jp[4] = {0, 2, 3, 5}; int Ji[5] = {0, 2, 1, 0, 3}; double Jx[5] = {1.0, 3.0, 1.0, 2.0, 1.0}; @@ -155,9 +155,9 @@ const char *test_I_kron_X(void) /* X col-major */ double X[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *C = I_kron_X_alloc(m, k, n, J); + CSR_matrix *C = I_kron_X_alloc(m, k, n, J); - /* Expected CSR */ + /* Expected CSR_matrix */ int exp_p[5] = {0, 3, 6, 8, 10}; int exp_i[10] = {0, 1, 2, 0, 1, 2, 0, 2, 0, 2}; double exp_x[10] = {1.0, 3.0, 2.0, 2.0, 4.0, 4.0, 3.0, 3.0, 6.0, 4.0}; @@ -170,8 +170,8 @@ const char *test_I_kron_X(void) I_kron_X_fill_values(m, k, n, X, J, C); mu_assert("C values", cmp_double_array(C->x, exp_x, 10)); - free_csr_matrix(C); - 
free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } @@ -182,7 +182,7 @@ const char *test_I_kron_X(void) * [2.0 1.0] * [3.0 0.5] * - * J (kn=4 x p=4, CSC): + * J (kn=4 x p=4, CSC_matrix): * [1 0 0 2] * [0 3 1 0] * [0 0 4 0] @@ -194,8 +194,8 @@ const char *test_I_kron_X_larger(void) { int m = 3, k = 2, n = 2; - /* J is 4x4 CSC */ - CSC_Matrix *J = new_csc_matrix(4, 4, 7); + /* J is 4x4 CSC_matrix */ + CSC_matrix *J = new_CSC_matrix(4, 4, 7); int Jp[5] = {0, 2, 3, 5, 7}; int Ji[7] = {0, 3, 1, 1, 2, 0, 3}; double Jx[7] = {1.0, 2.0, 3.0, 1.0, 4.0, 2.0, 1.0}; @@ -206,9 +206,9 @@ const char *test_I_kron_X_larger(void) /* X col-major */ double X[6] = {1.0, 2.0, 3.0, 0.5, 1.0, 0.5}; - CSR_Matrix *C = I_kron_X_alloc(m, k, n, J); + CSR_matrix *C = I_kron_X_alloc(m, k, n, J); - /* Expected CSR */ + /* Expected CSR_matrix */ int exp_p[7] = {0, 4, 8, 12, 15, 18, 21}; int exp_i[21] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 3, 0, 2, 3, 0, 2, 3}; double exp_x[21] = {1.0, 1.5, 0.5, 2.0, 2.0, 3.0, 1.0, 4.0, 3.0, 1.5, 0.5, @@ -222,7 +222,7 @@ const char *test_I_kron_X_larger(void) I_kron_X_fill_values(m, k, n, X, J, C); mu_assert("C2 values", cmp_double_array(C->x, exp_x, 21)); - free_csr_matrix(C); - free_csc_matrix(J); + free_CSR_matrix(C); + free_CSC_matrix(J); return NULL; } diff --git a/tests/utils/test_matrix.h b/tests/utils/test_matrix.h index c329a16..d14dfbc 100644 --- a/tests/utils/test_matrix.h +++ b/tests/utils/test_matrix.h @@ -3,17 +3,18 @@ #include "minunit.h" #include "test_helpers.h" -#include "utils/dense_matrix.h" +#include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include #include -/* Test dense block_left_mult_vec against known result. +/* Test full-block PD block_left_mult_vec against known result. 
A = [1 2; 3 4] (2x2), x = [1; 2], p = 1 y = A * x = [1*1+2*2; 3*1+4*2] = [5; 11] */ -const char *test_dense_matrix_mult_vec(void) +const char *test_pd_mult_vec_basic(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_permuted_dense_full(2, 2, data); double x[] = {1.0, 2.0}; double y[2] = {0.0, 0.0}; @@ -27,13 +28,13 @@ const char *test_dense_matrix_mult_vec(void) return 0; } -/* Test dense block_left_mult_vec with multiple blocks. +/* Test full-block PD block_left_mult_vec with multiple blocks. A = [1 2; 3 4] (2x2), x = [1; 2; 3; 4], p = 2 y = [A*[1;2]; A*[3;4]] = [5; 11; 11; 25] */ -const char *test_dense_matrix_mult_vec_blocks(void) +const char *test_pd_mult_vec_blocks(void) { double data[] = {1.0, 2.0, 3.0, 4.0}; - Matrix *A = new_dense_matrix(2, 2, data); + matrix *A = new_permuted_dense_full(2, 2, data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y[4] = {0}; @@ -47,12 +48,12 @@ const char *test_dense_matrix_mult_vec_blocks(void) return 0; } -/* Compare sparse vs dense block_left_mult_vec for a non-square matrix. +/* Compare sparse vs PD block_left_mult_vec for a non-square matrix. 
A = [1 2 3; 4 5 6] (2x3), x = [1; 2; 3], p = 1 */ -const char *test_sparse_vs_dense_mult_vec(void) +const char *test_sparse_vs_pd_mult_vec(void) { - /* Build CSR for A = [1 2 3; 4 5 6] */ - CSR_Matrix *csr = new_csr_matrix(2, 3, 6); + /* Build CSR_matrix for A = [1 2 3; 4 5 6] */ + CSR_matrix *csr = new_CSR_matrix(2, 3, 6); int Ap[3] = {0, 3, 6}; int Ai[6] = {0, 1, 2, 0, 1, 2}; double Ax[6] = {1, 2, 3, 4, 5, 6}; @@ -60,51 +61,50 @@ const char *test_sparse_vs_dense_mult_vec(void) memcpy(csr->i, Ai, 6 * sizeof(int)); memcpy(csr->x, Ax, 6 * sizeof(double)); - double dense_data[] = {1, 2, 3, 4, 5, 6}; + double pd_data[] = {1, 2, 3, 4, 5, 6}; - Matrix *sparse = new_sparse_matrix(csr); - Matrix *dense = new_dense_matrix(2, 3, dense_data); + matrix *sparse = new_sparse_matrix(csr); + matrix *pd = new_permuted_dense_full(2, 3, pd_data); double x[] = {1.0, 2.0, 3.0}; double y_sparse[2] = {0}; - double y_dense[2] = {0}; + double y_pd[2] = {0}; sparse->block_left_mult_vec(sparse, x, y_sparse, 1); - dense->block_left_mult_vec(dense, x, y_dense, 1); + pd->block_left_mult_vec(pd, x, y_pd, 1); - mu_assert("sparse vs dense mismatch", cmp_double_array(y_sparse, y_dense, 2)); + mu_assert("sparse vs pd mismatch", cmp_double_array(y_sparse, y_pd, 2)); free_matrix(sparse); - free_matrix(dense); - free_csr_matrix(csr); + free_matrix(pd); return 0; } -/* Test dense transpose */ -const char *test_dense_matrix_trans(void) +/* Test full-block PD transpose via vtable. 
*/ +const char *test_pd_trans_full_block(void) { double data[] = {1, 2, 3, 4, 5, 6}; /* 2x3 */ - Matrix *A = new_dense_matrix(2, 3, data); - Matrix *AT = dense_matrix_trans((const Dense_Matrix *) A); + matrix *A = new_permuted_dense_full(2, 3, data); + matrix *AT = A->transpose_alloc(A); + A->transpose_fill_values(A, AT); mu_assert("transpose m", AT->m == 3); mu_assert("transpose n", AT->n == 2); /* AT should be [1 4; 2 5; 3 6] stored row-major */ - Dense_Matrix *dm = (Dense_Matrix *) AT; double AT_expected[6] = {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}; - mu_assert("AT vals incorrect", cmp_double_array(dm->x, AT_expected, 6)); + mu_assert("AT vals incorrect", cmp_double_array(AT->x, AT_expected, 6)); free_matrix(A); free_matrix(AT); return 0; } -/* Compare sparse vs dense block_left_mult_vec with p=2 blocks. +/* Compare sparse vs PD block_left_mult_vec with p=2 blocks. A = [1 2; 3 4], x = [1; 2; 3; 4], p = 2 */ -const char *test_sparse_vs_dense_mult_vec_blocks(void) +const char *test_sparse_vs_pd_mult_vec_blocks(void) { - CSR_Matrix *csr = new_csr_matrix(2, 2, 4); + CSR_matrix *csr = new_CSR_matrix(2, 2, 4); int Ap[3] = {0, 2, 4}; int Ai[4] = {0, 1, 0, 1}; double Ax[4] = {1, 2, 3, 4}; @@ -112,24 +112,43 @@ const char *test_sparse_vs_dense_mult_vec_blocks(void) memcpy(csr->i, Ai, 4 * sizeof(int)); memcpy(csr->x, Ax, 4 * sizeof(double)); - double dense_data[] = {1, 2, 3, 4}; + double pd_data[] = {1, 2, 3, 4}; - Matrix *sparse = new_sparse_matrix(csr); - Matrix *dense = new_dense_matrix(2, 2, dense_data); + matrix *sparse = new_sparse_matrix(csr); + matrix *pd = new_permuted_dense_full(2, 2, pd_data); double x[] = {1.0, 2.0, 3.0, 4.0}; double y_sparse[4] = {0}; - double y_dense[4] = {0}; + double y_pd[4] = {0}; sparse->block_left_mult_vec(sparse, x, y_sparse, 2); - dense->block_left_mult_vec(dense, x, y_dense, 2); + pd->block_left_mult_vec(pd, x, y_pd, 2); - mu_assert("sparse vs dense blocks mismatch", - cmp_double_array(y_sparse, y_dense, 4)); + mu_assert("sparse vs pd blocks 
mismatch", cmp_double_array(y_sparse, y_pd, 4)); free_matrix(sparse); - free_matrix(dense); - free_csr_matrix(csr); + free_matrix(pd); + return 0; +} + +/* Full-block permuted_dense acting as operator: smoke test mirroring + test_pd_mult_vec_basic with an explicit row_perm/col_perm. */ +const char *test_pd_operator_block_left_mult_vec(void) +{ + double data[] = {1.0, 2.0, 3.0, 4.0}; + int row_perm[2] = {0, 1}; + int col_perm[2] = {0, 1}; + matrix *A = new_permuted_dense(2, 2, 2, 2, row_perm, col_perm, data); + + double x[] = {1.0, 2.0}; + double y[2] = {0.0, 0.0}; + + A->block_left_mult_vec(A, x, y, 1); + + double y_expected[2] = {5.0, 11.0}; + mu_assert("y incorrect", cmp_double_array(y, y_expected, 2)); + + free_matrix(A); return 0; } diff --git a/tests/utils/test_matrix_BTA.h b/tests/utils/test_matrix_BTA.h new file mode 100644 index 0000000..feb5fe9 --- /dev/null +++ b/tests/utils/test_matrix_BTA.h @@ -0,0 +1,173 @@ +#ifndef TEST_MATRIX_BTA_H +#define TEST_MATRIX_BTA_H + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSC_matrix.h" +#include "utils/CSR_matrix.h" +#include "utils/matrix_BTA.h" +#include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" +#include "utils/utils.h" +#include +#include + +/* Wrapper dispatch sanity: (PD, PD). Compare against direct + BTDA_pd_pd_fill_values. */ +const char *test_BTDA_matrices_pd_pd(void) +{ + int row_perm[2] = {0, 1}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + double XB[4] = {5.0, 6.0, 7.0, 8.0}; + double d[2] = {2.0, -1.5}; + + matrix *A_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + + /* Wrapper path. */ + matrix *C_m = BTA_matrices_alloc(A_m, B_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path on independent operands. 
*/ + matrix *A2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B2 = new_permuted_dense(2, 4, 2, 2, row_perm, col_perm_B, XB); + matrix *C2 = BTA_pd_pd_alloc((permuted_dense *) B2, (permuted_dense *) A2); + BTDA_pd_pd_fill_values((permuted_dense *) B2, d, (permuted_dense *) A2, + (permuted_dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_matrix(B2); + free_matrix(A2); + return 0; +} + +/* Wrapper dispatch sanity: (CSR_matrix, PD). Compare against direct + BTDA_pd_csr_fill_values. */ +const char *test_BTDA_matrices_csr_pd(void) +{ + /* A: 4x5 CSR_matrix */ + CSR_matrix *A = new_CSR_matrix(4, 5, 5); + A->p[0] = 0; + A->p[1] = 2; + A->p[2] = 3; + A->p[3] = 4; + A->p[4] = 5; + int Ai[5] = {0, 3, 2, 1, 4}; + double Ax[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + memcpy(A->i, Ai, sizeof Ai); + memcpy(A->x, Ax, sizeof Ax); + matrix *A_m = new_sparse_matrix(A); + + /* B: 4x4 PD, row_perm = [1, 3], col_perm = [0, 2]. */ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + + double d[4] = {1.0, -2.0, 0.5, 3.0}; + + /* Wrapper path. Dispatchers don't touch sparse_matrix internals — caller + owns csc_cache structure and values. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) A_m); + matrix *C_m = BTA_matrices_alloc(A_m, B_m); + A_m->refresh_csc_values(A_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path. 
*/ + CSR_matrix *A2 = new_CSR_matrix(4, 5, 5); + A2->p[0] = 0; + A2->p[1] = 2; + A2->p[2] = 3; + A2->p[3] = 4; + A2->p[4] = 5; + memcpy(A2->i, Ai, sizeof Ai); + memcpy(A2->x, Ax, sizeof Ax); + matrix *B2_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B2 = (permuted_dense *) B2_m; + matrix *C2 = BTA_pd_csr_alloc(B2, A2); + BTDA_pd_csr_fill_values(B2, d, A2, (permuted_dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_matrix(B2_m); + free_CSR_matrix(A2); + return 0; +} + +/* Wrapper dispatch sanity: (PD, CSR_matrix). Compare against direct + BTDA_csc_pd_fill_values. */ +const char *test_BTDA_matrices_pd_csr(void) +{ + /* A: 4x5 PD, row_perm = [1, 3], col_perm = [0, 2]. */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + + /* B: 4x4 CSR_matrix. */ + CSR_matrix *B = new_CSR_matrix(4, 4, 5); + B->p[0] = 0; + B->p[1] = 2; + B->p[2] = 3; + B->p[3] = 4; + B->p[4] = 5; + int Bi[5] = {0, 2, 1, 0, 3}; + double Bx[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; + memcpy(B->i, Bi, sizeof Bi); + memcpy(B->x, Bx, sizeof Bx); + matrix *B_m = new_sparse_matrix(B); + + double d[4] = {1.0, -2.0, 0.5, 3.0}; + + /* Wrapper path. Dispatchers don't touch sparse_matrix internals — caller + owns csc_cache structure and values. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) B_m); + matrix *C_m = BTA_matrices_alloc(A_m, B_m); + B_m->refresh_csc_values(B_m); + BTDA_matrices_fill_values(A_m, d, B_m, C_m); + + /* Direct primitive path: production now dispatches the (PD, Sparse) + branch through CSC-pd kernels. Build a CSC view of B and call + BTA_csc_pd_alloc + BTDA_csc_pd_fill_values to match. 
*/ + matrix *A2_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + permuted_dense *A2 = (permuted_dense *) A2_m; + CSR_matrix *B2_csr = new_CSR_matrix(4, 4, 5); + B2_csr->p[0] = 0; + B2_csr->p[1] = 2; + B2_csr->p[2] = 3; + B2_csr->p[3] = 4; + B2_csr->p[4] = 5; + memcpy(B2_csr->i, Bi, sizeof Bi); + memcpy(B2_csr->x, Bx, sizeof Bx); + int *iwork = (int *) malloc(MAX(B2_csr->m, B2_csr->n) * sizeof(int)); + CSC_matrix *B2_csc = csr_to_csc_alloc(B2_csr, iwork); + csr_to_csc_fill_values(B2_csr, B2_csc, iwork); + matrix *C2 = BTA_csc_pd_alloc(B2_csc, A2); + BTDA_csc_pd_fill_values(B2_csc, d, A2, (permuted_dense *) C2); + + mu_assert("values", cmp_double_array(C_m->x, C2->x, C_m->nnz)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + free_matrix(C2); + free_CSC_matrix(B2_csc); + free_CSR_matrix(B2_csr); + free(iwork); + free_matrix(A2_m); + return 0; +} + +#endif /* TEST_MATRIX_BTA_H */ diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h new file mode 100644 index 0000000..b5baca4 --- /dev/null +++ b/tests/utils/test_permuted_dense.h @@ -0,0 +1,1004 @@ +#ifndef TEST_PERMUTED_DENSE_H +#define TEST_PERMUTED_DENSE_H + +#include "minunit.h" +#include "old-code/old_permuted_dense.h" +#include "test_helpers.h" +#include "utils/CSC_matrix.h" +#include "utils/matrix_BTA.h" +#include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" +#include "utils/utils.h" +#include +#include + +/* 5x6 matrix with a 3x2 dense block at rows {1, 2, 4}, cols {0, 3}: + + global view: + [0 0 0 0 0 0] + [1 0 0 2 0 0] + [3 0 0 4 0 0] + [0 0 0 0 0 0] + [5 0 0 6 0 0] */ +const char *test_permuted_dense_to_csr_basic(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + + CSR_matrix *C = M->to_csr(M); + int Cp_expected[6] = {0, 0, 2, 4, 4, 6}; + int Ci_expected[6] = {0, 3, 0, 3, 0, 3}; + double 
Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + mu_assert("dim m", C->m == 5); + mu_assert("dim n", C->n == 6); + mu_assert("nnz", C->nnz == 6); + mu_assert("p", cmp_int_array(C->p, Cp_expected, 6)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); + + free_matrix(M); + return 0; +} + +/* Empty dense block (m0 = n0 = 0): result is an m x n CSR_matrix with + no nonzeros. */ +const char *test_permuted_dense_to_csr_empty(void) +{ + matrix *M = new_permuted_dense(4, 5, 0, 0, NULL, NULL, NULL); + + CSR_matrix *C = M->to_csr(M); + int Cp_expected[5] = {0, 0, 0, 0, 0}; + mu_assert("nnz", C->nnz == 0); + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + + free_matrix(M); + return 0; +} + +/* Full dense (row_perm = [0..m), col_perm = [0..n)): result is the dense + matrix in CSR_matrix. */ +const char *test_permuted_dense_to_csr_full(void) +{ + int row_perm[2] = {0, 1}; + int col_perm[3] = {0, 1, 2}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + matrix *M = new_permuted_dense(2, 3, 2, 3, row_perm, col_perm, X); + + CSR_matrix *C = M->to_csr(M); + int Cp_expected[3] = {0, 3, 6}; + int Ci_expected[6] = {0, 1, 2, 0, 1, 2}; + double Cx_expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 3)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 6)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 6)); + + free_matrix(M); + return 0; +} + +/* Single dense row, two dense cols. Tests rows with no entries before + and after the active row. 
*/ +const char *test_permuted_dense_to_csr_single_row(void) +{ + int row_perm[1] = {2}; + int col_perm[2] = {1, 4}; + double X[2] = {7.0, 9.0}; + + matrix *M = new_permuted_dense(4, 5, 1, 2, row_perm, col_perm, X); + + CSR_matrix *C = M->to_csr(M); + int Cp_expected[5] = {0, 0, 0, 2, 2}; + int Ci_expected[2] = {1, 4}; + double Cx_expected[2] = {7.0, 9.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 2)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 2)); + + free_matrix(M); + return 0; +} + +/* Single dense col across multiple rows. */ +const char *test_permuted_dense_to_csr_single_col(void) +{ + int row_perm[3] = {0, 2, 3}; + int col_perm[1] = {2}; + double X[3] = {1.0, 2.0, 3.0}; + + matrix *M = new_permuted_dense(4, 4, 3, 1, row_perm, col_perm, X); + + CSR_matrix *C = M->to_csr(M); + int Cp_expected[5] = {0, 1, 1, 2, 3}; + int Ci_expected[3] = {2, 2, 2}; + double Cx_expected[3] = {1.0, 2.0, 3.0}; + + mu_assert("p", cmp_int_array(C->p, Cp_expected, 5)); + mu_assert("i", cmp_int_array(C->i, Ci_expected, 3)); + mu_assert("x", cmp_double_array(C->x, Cx_expected, 3)); + + free_matrix(M); + return 0; +} + +/* DA_fill_values: compare against CSR_matrix DA_fill_values on the equivalent + CSR_matrix. + + PD is the 5x6 matrix from the basic to_csr test, with d a length-5 + global-row diagonal including a negative and zero entry. */ +const char *test_DA_pd_fill_values(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; + + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + matrix *M_out = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, NULL); + permuted_dense *pd = (permuted_dense *) M; + permuted_dense *pd_out = (permuted_dense *) M_out; + + DA_pd_fill_values(d, pd, pd_out); + + /* Ground truth: build CSR_matrix of self, run DA_fill_values, compare. 
*/ + CSR_matrix *csr = M->to_csr(M); + CSR_matrix *csr_expected = new_csr_copy_sparsity(csr); + DA_fill_values(d, csr, csr_expected); + + CSR_matrix *csr_out = M_out->to_csr(M_out); + mu_assert("x", cmp_double_array(csr_out->x, csr_expected->x, csr->nnz)); + + free_CSR_matrix(csr_expected); + free_matrix(M); + free_matrix(M_out); + return 0; +} + +/* ATA_alloc: structure-only check. Output is 6x6 with a 2x2 dense block at + perms {0, 3} (= self.col_perm on both sides). Values are uninitialized + here; ATDA_fill_values is the value-producing op. */ +const char *test_ATA_pd_alloc(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + matrix *M_ata = ATA_pd_alloc(pd); + permuted_dense *pd_ata = (permuted_dense *) M_ata; + + int perm_expected[2] = {0, 3}; + mu_assert("m", M_ata->m == 6); + mu_assert("n", M_ata->n == 6); + mu_assert("m0", pd_ata->m0 == 2); + mu_assert("n0", pd_ata->n0 == 2); + mu_assert("row_perm", cmp_int_array(pd_ata->row_perm, perm_expected, 2)); + mu_assert("col_perm", cmp_int_array(pd_ata->col_perm, perm_expected, 2)); + + free_matrix(M); + free_matrix(M_ata); + return 0; +} + +/* ATDA: same 5x6 PD, d with negative + zero entries to catch sign bugs. + Hand-computed: d_perm = [-1.5, 0, 2.5], Y = diag(d_perm) X gives + [[-1.5,-3],[0,0],[12.5,15]], and X^T Y = [[61,72],[72,84]]. 
*/ +const char *test_ATDA_pd_fill_values(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double d[5] = {7.0, -1.5, 0.0, 9.0, 2.5}; + + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + matrix *M_out = ATA_pd_alloc(pd); + permuted_dense *pd_out = (permuted_dense *) M_out; + ATDA_pd_fill_values(pd, d, pd_out); + + double X_expected[4] = {61.0, 72.0, 72.0, 84.0}; + mu_assert("X", cmp_double_array(pd_out->X, X_expected, 4)); + + free_matrix(M); + free_matrix(M_out); + return 0; +} + +/* PD x CSC_matrix: J is 6x4. col 0 empty; col 1 has rows {0,3} (vals 10, 20); + col 2 has row {2} (val 30, but row 2 not in col_perm_self = {0,3} so col 2 + is INACTIVE); col 3 has row {3} (val 40). Active cols: {1, 3}. + + Expected: m0=3, n0=2, row_perm={1,2,4}, col_perm={1,3}. + Values: out.X[:,0] = 10*[1,3,5] + 20*[2,4,6] = [50,110,170], + out.X[:,1] = 40*[2,4,6] = [80,160,240]. 
*/ +const char *test_permuted_dense_times_csc(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + CSC_matrix *J = new_CSC_matrix(6, 4, 4); + int Jp[5] = {0, 0, 2, 3, 4}; + int Ji[4] = {0, 3, 2, 3}; + double Jx[4] = {10.0, 20.0, 30.0, 40.0}; + memcpy(J->p, Jp, 5 * sizeof(int)); + memcpy(J->i, Ji, 4 * sizeof(int)); + memcpy(J->x, Jx, 4 * sizeof(double)); + + matrix *M_out = BA_pd_csc_alloc(pd, J); + permuted_dense *pd_out = (permuted_dense *) M_out; + BA_pd_csc_fill_values(pd->X, pd->n0, pd->col_inv, J, pd_out); + + int row_perm_expected[3] = {1, 2, 4}; + int col_perm_expected[2] = {1, 3}; + double X_expected[6] = {50.0, 80.0, 110.0, 160.0, 170.0, 240.0}; + + mu_assert("m", M_out->m == 5); + mu_assert("n", M_out->n == 4); + mu_assert("m0", pd_out->m0 == 3); + mu_assert("n0", pd_out->n0 == 2); + mu_assert("row_perm", cmp_int_array(pd_out->row_perm, row_perm_expected, 3)); + mu_assert("col_perm", cmp_int_array(pd_out->col_perm, col_perm_expected, 2)); + mu_assert("X", cmp_double_array(pd_out->X, X_expected, 6)); + + free_matrix(M); + free_matrix(M_out); + free_CSC_matrix(J); + return 0; +} + +/* PD x CSC_matrix edge case: every column of J has its only nonzero outside + col_perm_self, so col_perm_out is empty (n0 = 0). */ +const char *test_permuted_dense_times_csc_no_active(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + /* J: col 0 has row {1}, col 1 has row {5}. Neither in col_perm_self. 
*/ + CSC_matrix *J = new_CSC_matrix(6, 2, 2); + int Jp[3] = {0, 1, 2}; + int Ji[2] = {1, 5}; + double Jx[2] = {100.0, 200.0}; + memcpy(J->p, Jp, 3 * sizeof(int)); + memcpy(J->i, Ji, 2 * sizeof(int)); + memcpy(J->x, Jx, 2 * sizeof(double)); + + matrix *M_out = BA_pd_csc_alloc(pd, J); + permuted_dense *pd_out = (permuted_dense *) M_out; + BA_pd_csc_fill_values(pd->X, pd->n0, pd->col_inv, J, pd_out); + + mu_assert("m", M_out->m == 5); + mu_assert("n", M_out->n == 2); + mu_assert("m0", pd_out->m0 == 3); + mu_assert("n0", pd_out->n0 == 0); + + free_matrix(M); + free_matrix(M_out); + free_CSC_matrix(J); + return 0; +} + +/* to_csr vtable method: lazy CSR_matrix view. First call allocates pd->csr_cache; + subsequent calls refresh values to reflect the current pd->X. */ +const char *test_permuted_dense_to_csr_lazy(void) +{ + int row_perm[3] = {1, 2, 4}; + int col_perm[2] = {0, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + matrix *M = new_permuted_dense(5, 6, 3, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + mu_assert("csr_cache initially NULL", pd->csr_cache == NULL); + + CSR_matrix *csr = M->to_csr(M); + mu_assert("csr_cache populated", pd->csr_cache != NULL); + mu_assert("returns the cache", csr == pd->csr_cache); + + double expected[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + mu_assert("values match X", cmp_double_array(csr->x, expected, 6)); + + /* Mutate X and re-call to_csr: values must reflect the change. */ + pd->X[0] = 99.0; + csr = M->to_csr(M); + mu_assert("refresh picks up new value", csr->x[0] == 99.0); + + free_matrix(M); + return 0; +} + +/* Sanity check: col_inv is built correctly. col_perm = {0, 3} on n = 6 + should give col_inv = {0, -1, -1, 1, -1, -1}. 
*/ +const char *test_permuted_dense_col_inv(void) +{ + int row_perm[1] = {0}; + int col_perm[2] = {0, 3}; + double X[2] = {0.0, 0.0}; + + matrix *M = new_permuted_dense(1, 6, 1, 2, row_perm, col_perm, X); + permuted_dense *pd = (permuted_dense *) M; + + int expected[6] = {0, -1, -1, 1, -1, -1}; + mu_assert("col_inv", cmp_int_array(pd->col_inv, expected, 6)); + + free_matrix(M); + return 0; +} + +/* PD index_alloc / index_fill_values: select rows from a PD; output must be + another PD with row_perm equal to the output positions where indices[i] + hit the source row_perm. */ +const char *test_permuted_dense_index(void) +{ + /* Source PD, shape (6, 4), dense block at rows {1, 3, 4} x cols {0, 2}. */ + int row_perm[3] = {1, 3, 4}; + int col_perm[2] = {0, 2}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + + /* Index by [0, 3, 1, 5, 4]: + - position 0 -> source row 0 (not in row_perm, zero) + - position 1 -> source row 3 (in row_perm at ii=1, dense) + - position 2 -> source row 1 (in row_perm at ii=0, dense) + - position 3 -> source row 5 (not in row_perm, zero) + - position 4 -> source row 4 (in row_perm at ii=2, dense) */ + int indices[5] = {0, 3, 1, 5, 4}; + matrix *out = M->index_alloc(M, indices, 5); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 5); + mu_assert("out n", out->n == 4); + mu_assert("out nnz", out->nnz == 6); /* m0=3 * n0=2 */ + mu_assert("m0", out_pd->m0 == 3); + mu_assert("n0", out_pd->n0 == 2); + + int expected_row_perm[3] = {1, 2, 4}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 3)); + int expected_col_perm[2] = {0, 2}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_col_perm, 2)); + + M->index_fill_values(M, indices, 5, out); + + /* Row 0 of out (i=1) = source row 3 = X[1, :] = {3, 4}. + Row 1 of out (i=2) = source row 1 = X[0, :] = {1, 2}. 
+ Row 2 of out (i=4) = source row 4 = X[2, :] = {5, 6}. */ + double expected_X[6] = {3.0, 4.0, 1.0, 2.0, 5.0, 6.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 6)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD promote_alloc / promote_fill_values: tile a 1-row PD into a + `size`-row PD where every row is a copy of the source row. */ +const char *test_permuted_dense_promote(void) +{ + /* Source PD, shape (1, 5), single dense row at row 0, cols {1, 3}. */ + int row_perm[1] = {0}; + int col_perm[2] = {1, 3}; + double X[2] = {7.0, 9.0}; + matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + + matrix *out = M->promote_alloc(M, 4); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 4); + mu_assert("out n", out->n == 5); + mu_assert("out nnz", out->nnz == 8); /* m0=4 * n0=2 */ + mu_assert("m0", out_pd->m0 == 4); + mu_assert("n0", out_pd->n0 == 2); + + int expected_row_perm[4] = {0, 1, 2, 3}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_row_perm, 4)); + int expected_col_perm[2] = {1, 3}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_col_perm, 2)); + + M->promote_fill_values(M, out); + + double expected_X[8] = {7.0, 9.0, 7.0, 9.0, 7.0, 9.0, 7.0, 9.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, SCALAR variant. + (1, 5) PD with single dense row -> (d1*d2, 5) PD with that row tiled. 
*/ +const char *test_permuted_dense_broadcast_scalar(void) +{ + int row_perm[1] = {0}; + int col_perm[2] = {1, 3}; + double X[2] = {7.0, 9.0}; + matrix *M = new_permuted_dense(1, 5, 1, 2, row_perm, col_perm, X); + + int d1 = 2, d2 = 3; /* out shape (2, 3), m = 6 */ + matrix *out = M->broadcast_alloc(M, BROADCAST_SCALAR, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("out n", out->n == 5); + mu_assert("m0", out_pd->m0 == 6); + mu_assert("n0", out_pd->n0 == 2); + int expected_rp[6] = {0, 1, 2, 3, 4, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 6)); + + M->broadcast_fill_values(M, BROADCAST_SCALAR, d1, d2, out); + double expected_X[12] = {7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 12)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, ROW variant. + (1, d2) input has Jacobian of shape (d2, n_vars). Source PD: m=d2=3, + row_perm={0, 2} (rows 0 and 2 dense), col_perm={1, 4}, single dense row + per m0. Output (d1, d2) = (2, 3): each child row replicated d1=2 + times. 
*/ +const char *test_permuted_dense_broadcast_row(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, /* row corresponding to child row 0 */ + 3.0, 4.0}; /* row corresponding to child row 2 */ + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + int d1 = 2, d2 = 3; /* output (2, 3), out m = 6 */ + matrix *out = M->broadcast_alloc(M, BROADCAST_ROW, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("m0", out_pd->m0 == 4); /* d1 * 2 */ + mu_assert("n0", out_pd->n0 == 2); + /* row_perm = {child_row_perm[0]*d1, +1, child_row_perm[1]*d1, +1} + = {0, 1, 4, 5} */ + int expected_rp[4] = {0, 1, 4, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 4)); + + M->broadcast_fill_values(M, BROADCAST_ROW, d1, d2, out); + /* each child row replicated d1 times */ + double expected_X[8] = {1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD broadcast_alloc / broadcast_fill_values, COL variant. + (d1, 1) input has Jacobian of shape (d1, n_vars). Source PD: m=d1=3, + row_perm={0, 2}, col_perm={1, 4}, two dense rows. Output (d1, d2) = (3, 2), + out m = 6: each child row appears d2 times, shifted by j*d1. 
*/ +const char *test_permuted_dense_broadcast_col(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + int d1 = 3, d2 = 2; + matrix *out = M->broadcast_alloc(M, BROADCAST_COL, d1, d2); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 6); + mu_assert("m0", out_pd->m0 == 4); /* d2 * 2 */ + mu_assert("n0", out_pd->n0 == 2); + /* row_perm = {0+0, 0+2, 3+0, 3+2} = {0, 2, 3, 5} */ + int expected_rp[4] = {0, 2, 3, 5}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 4)); + + M->broadcast_fill_values(M, BROADCAST_COL, d1, d2, out); + /* X = d2 copies of full source X block */ + double expected_X[8] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 8)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* PD diag_vec_alloc / diag_vec_fill_values. + Source PD shape (3, 6) with m0=2 (rows 0 and 2) -> output PD shape + (9, 6) with the same 2 dense rows mapped to positions {0, 8} = {0*4, 2*4}. 
*/ +const char *test_permuted_dense_diag_vec(void) +{ + int row_perm[2] = {0, 2}; + int col_perm[2] = {1, 4}; + double X[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *M = new_permuted_dense(3, 6, 2, 2, row_perm, col_perm, X); + + matrix *out = M->diag_vec_alloc(M); + permuted_dense *out_pd = (permuted_dense *) out; + + mu_assert("out m", out->m == 9); + mu_assert("out n", out->n == 6); + mu_assert("m0", out_pd->m0 == 2); + mu_assert("n0", out_pd->n0 == 2); + /* row_perm = {0*(n+1), 2*(n+1)} = {0, 8} */ + int expected_rp[2] = {0, 8}; + mu_assert("row_perm", cmp_int_array(out_pd->row_perm, expected_rp, 2)); + int expected_cp[2] = {1, 4}; + mu_assert("col_perm", cmp_int_array(out_pd->col_perm, expected_cp, 2)); + + M->diag_vec_fill_values(M, out); + /* X is identical to the source X */ + double expected_X[4] = {1.0, 2.0, 3.0, 4.0}; + mu_assert("values", cmp_double_array(out_pd->X, expected_X, 4)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +/* ---- Helpers for BTA / BTDA tests ---- */ + +/* Scatter a PD into a dense m x n_global buffer (row-major), zero-filled. + Buffer is allocated by the caller. */ +static void scatter_pd_to_dense(const permuted_dense *pd, int n_global, + double *dense) +{ + int m = pd->base.m; + memset(dense, 0, (size_t) m * (size_t) n_global * sizeof(double)); + for (int ii = 0; ii < pd->m0; ii++) + { + int row = pd->row_perm[ii]; + for (int jj = 0; jj < pd->n0; jj++) + { + int col = pd->col_perm[jj]; + dense[row * n_global + col] = pd->X[ii * pd->n0 + jj]; + } + } +} + +/* BTA: A and B share row_perm = [1, 3]; both have m=4, distinct col_perms. + C = B^T A is computed via the primitive and compared against a hand + reference X_B^T X_A. */ +const char *test_permuted_dense_BTA_matching_row_perm(void) +{ + int row_perm[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + /* X_A is (2, 2), X_B is (2, 2), both row-major. 
*/ + double XA[4] = {1.0, 2.0, 3.0, 4.0}; /* rows: [1,2], [3,4] */ + double XB[4] = {5.0, 6.0, 7.0, 8.0}; /* rows: [5,6], [7,8] */ + matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; + + matrix *C_m = BTA_pd_pd_alloc(B, A); + permuted_dense *C = (permuted_dense *) C_m; + + mu_assert("out m", C_m->m == 4); /* B.n */ + mu_assert("out n", C_m->n == 4); /* A.n */ + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + mu_assert("row_perm", cmp_int_array(C->row_perm, col_perm_B, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, col_perm_A, 2)); + + BTA_pd_pd_fill_values(B, A, C); + + /* Reference: X_B^T X_A. With X_B = [[5,6],[7,8]], X_A = [[1,2],[3,4]]: + X_B^T = [[5,7],[6,8]]. X_B^T X_A = [[5*1+7*3, 5*2+7*4], [6*1+8*3, 6*2+8*4]] + = [[26, 38], [30, 44]]. */ + double expected[4] = {26.0, 38.0, 30.0, 44.0}; + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* BTA with empty row intersection: row_perm_A = [0, 2], row_perm_B = [1, 3]. + BTA_pd_pd_alloc should return an empty C (nnz = 0); the fill + kernels should short-circuit without crashing. 
*/ +const char *test_permuted_dense_BTA_empty_overlap(void) +{ + int row_perm_A[2] = {0, 2}; + int row_perm_B[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + double XB[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(4, 4, 2, 2, row_perm_A, col_perm_A, XA); + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; + + matrix *C_m = BTA_pd_pd_alloc(B, A); + permuted_dense *C = (permuted_dense *) C_m; + + mu_assert("out m", C_m->m == 4); /* B.n */ + mu_assert("out n", C_m->n == 4); /* A.n */ + mu_assert("m0", C->m0 == 0); + mu_assert("n0", C->n0 == 0); + mu_assert("nnz", C_m->nnz == 0); + + /* fill kernels should be safe no-ops on empty C. */ + BTA_pd_pd_fill_values(B, A, C); + double d[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_pd_pd_fill_values(B, d, A, C); + + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* BTA with partial overlap: row_perm_A = [1, 3, 5], row_perm_B = [3, 5, 7]. + Intersection = {3, 5}. */ +const char *test_permuted_dense_BTA_partial_overlap(void) +{ + int row_perm_A[3] = {1, 3, 5}; + int row_perm_B[3] = {3, 5, 7}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + /* X_A rows correspond to A row_perm order: row 0 -> source row 1, row 1 -> 3, + * row 2 -> 5. */ + double XA[6] = {1.0, 2.0, /* row 1 (NOT in B) */ + 3.0, 4.0, /* row 3 (in B at pos 0) */ + 5.0, 6.0}; /* row 5 (in B at pos 1) */ + /* X_B rows: row 0 -> source row 3, row 1 -> 5, row 2 -> 7. 
*/ + double XB[6] = {10.0, 20.0, /* row 3 (in A at pos 1) */ + 30.0, 40.0, /* row 5 (in A at pos 2) */ + 50.0, 60.0}; /* row 7 (NOT in A) */ + matrix *A_m = new_permuted_dense(8, 4, 3, 2, row_perm_A, col_perm_A, XA); + matrix *B_m = new_permuted_dense(8, 4, 3, 2, row_perm_B, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; + + matrix *C_m = BTA_pd_pd_alloc(B, A); + permuted_dense *C = (permuted_dense *) C_m; + BTA_pd_pd_fill_values(B, A, C); + + /* Reference: scatter A, B to dense 8x4, compute B^T A, compare block at + (col_perm_B, col_perm_A). */ + double *A_d = (double *) calloc((size_t) 8 * 4, sizeof(double)); + double *B_d = (double *) calloc((size_t) 8 * 4, sizeof(double)); + scatter_pd_to_dense(A, 4, A_d); + scatter_pd_to_dense(B, 4, B_d); + + /* Reference C_ref is 4x4 = B_d^T (4x8) * A_d (8x4). */ + double C_ref[16]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + double s = 0.0; + for (int k = 0; k < 8; k++) + { + s += B_d[k * 4 + i] * A_d[k * 4 + j]; + } + C_ref[i * 4 + j] = s; + } + } + + /* Extract reference block at (col_perm_B, col_perm_A) and compare to C->X. */ + double expected[4]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected[ii * 2 + jj] = C_ref[col_perm_B[ii] * 4 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free(A_d); + free(B_d); + free_matrix(C_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* Full BTDA decomposition: tmp = diag(w) A; C = B^T tmp. Compare against a + dense triple product B_d^T diag(w) A_d. 
*/ +const char *test_permuted_dense_BTDA_decomposition(void) +{ + int row_perm[3] = {0, 1, 2}; + int col_perm_A[2] = {0, 2}; + int col_perm_B[2] = {1, 3}; + double XA[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double XB[6] = {7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double w[3] = {2.0, -1.0, 3.0}; + + matrix *A_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_A, XA); + matrix *B_m = new_permuted_dense(3, 4, 3, 2, row_perm, col_perm_B, XB); + permuted_dense *A = (permuted_dense *) A_m; + permuted_dense *B = (permuted_dense *) B_m; + + /* tmp has the same sparsity as A. */ + matrix *tmp_m = A_m->copy_sparsity(A_m); + permuted_dense *tmp = (permuted_dense *) tmp_m; + DA_pd_fill_values(w, A, tmp); + + matrix *C_m = BTA_pd_pd_alloc(B, tmp); + permuted_dense *C = (permuted_dense *) C_m; + BTA_pd_pd_fill_values(B, tmp, C); + + /* Reference: dense B_d^T diag(w) A_d, extract (col_perm_B, col_perm_A) block. */ + double *A_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); + double *B_d = (double *) calloc((size_t) 3 * 4, sizeof(double)); + scatter_pd_to_dense(A, 4, A_d); + scatter_pd_to_dense(B, 4, B_d); + + double C_ref[16]; + memset(C_ref, 0, sizeof C_ref); + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + double s = 0.0; + for (int k = 0; k < 3; k++) + { + s += B_d[k * 4 + i] * w[k] * A_d[k * 4 + j]; + } + C_ref[i * 4 + j] = s; + } + } + double expected[4]; + for (int ii = 0; ii < 2; ii++) + { + for (int jj = 0; jj < 2; jj++) + { + expected[ii * 2 + jj] = C_ref[col_perm_B[ii] * 4 + col_perm_A[jj]]; + } + } + mu_assert("values", cmp_double_array(C->X, expected, 4)); + + free(A_d); + free(B_d); + free_matrix(C_m); + free_matrix(tmp_m); + free_matrix(B_m); + free_matrix(A_m); + return 0; +} + +/* BTA(CSR_matrix A, PD B): basic correctness against a dense reference. + A is (4, 5) CSR_matrix with mixed sparsity; B is (4, 4) PD with row_perm = [1, 3], + col_perm = [0, 2], dense block (2, 2). 
*/ +/* BTA_pd_csc_alloc + BTDA_pd_csc_fill_values should match the legacy + CSR-pd kernels in old-code on both alloc structure and BTDA values. + Uses a d with negative + zero entries to exercise sign / drop paths. */ +const char *test_BTA_pd_csc_matches_csr(void) +{ + /* Same A and B as test_BTA_pd_csr_basic. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 5, 7); + A_csr->p[0] = 0; + A_csr->p[1] = 2; + A_csr->p[2] = 4; + A_csr->p[3] = 5; + A_csr->p[4] = 7; + int Ai[7] = {1, 4, 0, 2, 2, 1, 4}; + double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + memcpy(A_csr->i, Ai, sizeof Ai); + memcpy(A_csr->x, Ax, sizeof Ax); + + int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {10.0, 20.0, 30.0, 40.0}; + matrix *B_m = new_permuted_dense(4, 4, 2, 2, row_perm_B, col_perm_B, XB); + permuted_dense *B = (permuted_dense *) B_m; + + double d[4] = {1.5, -2.0, 0.0, 3.5}; + + /* CSR variant (baseline, from old-code). */ + matrix *C_csr_m = BTA_pd_csr_alloc(B, A_csr); + permuted_dense *C_csr = (permuted_dense *) C_csr_m; + BTDA_pd_csr_fill_values(B, d, A_csr, C_csr); + + /* CSC variant (under test). */ + matrix *C_csc_m = BTA_pd_csc_alloc(B, A_csc); + permuted_dense *C_csc = (permuted_dense *) C_csc_m; + BTDA_pd_csc_fill_values(B, d, A_csc, C_csc); + + /* Structural equality. */ + mu_assert("m matches", C_csc_m->m == C_csr_m->m); + mu_assert("n matches", C_csc_m->n == C_csr_m->n); + mu_assert("m0 matches", C_csc->m0 == C_csr->m0); + mu_assert("n0 matches", C_csc->n0 == C_csr->n0); + mu_assert("row_perm matches", + cmp_int_array(C_csc->row_perm, C_csr->row_perm, C_csr->m0)); + mu_assert("col_perm matches", + cmp_int_array(C_csc->col_perm, C_csr->col_perm, C_csr->n0)); + + /* Value equality (tolerance-based; dot ordering differs vs dgemm). 
*/ + mu_assert("BTDA values match", + cmp_double_array(C_csc->X, C_csr->X, C_csr->m0 * C_csr->n0)); + + free_matrix(C_csr_m); + free_matrix(C_csc_m); + free_matrix(B_m); + free_CSC_matrix(A_csc); + free_CSR_matrix(A_csr); + free(iwork); + return 0; +} + +/* BA_pd_matrices: C = B @ A where B is full-block PD (the production + shape gated by left_matmul.c) and A is PD with non-trivial perms. + B (2x3) row_perm=[0,1], col_perm=[0,1,2], X_B=[[1,2,3],[4,5,6]]. + A (3x5) row_perm=[0,2], col_perm=[1,4], X_A=[[7,8],[9,10]]. + Hand-computed C (2x5) nonzero at cols {1,4}: X_C=[[34,38],[82,92]]. */ +const char *test_BA_pd_matrices_pd_pd_full_block_B(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[3] = {0, 1, 2}; + double XB[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *B_m = new_permuted_dense(2, 3, 2, 3, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {0, 2}; + int col_perm_A[2] = {1, 4}; + double XA[4] = {7.0, 8.0, 9.0, 10.0}; + matrix *A_m = new_permuted_dense(3, 5, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 5); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {1, 4}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {34.0, 38.0, 82.0, 92.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +/* BA_pd_matrices with general (non-full-block) B. B->col_perm and + A->row_perm only partially overlap, exercising the + sorted_intersect_indices gather path. + B (2x5) row_perm=[0,1], col_perm=[1,3], X_B=[[1,2],[3,4]]. 
+ A (5x4) row_perm=[1,2], col_perm=[0,3], X_A=[[5,6],[7,8]]. + Intersection K = {1,3} ∩ {1,2} = {1}, s=1. + Hand-computed C (2x4) nonzero at cols {0,3}: X_C=[[5,6],[15,18]]. */ +const char *test_BA_pd_matrices_pd_pd_general_B(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 5, 2, 2, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {1, 2}; + int col_perm_A[2] = {0, 3}; + double XA[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(5, 4, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 4); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 3}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {5.0, 6.0, 15.0, 18.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +/* BA_pd_matrices with sparse A. Same B and same global A content as the + pd_pd_general_B test — the dispatcher routes through BA_pd_csc_* + and should yield byte-identical output. */ +const char *test_BA_pd_matrices_pd_csc(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 5, 2, 2, row_perm_B, col_perm_B, XB); + + /* A as 5x4 sparse_matrix, same nonzero values as the PD case: + (1,0)=5, (1,3)=6, (2,0)=7, (2,3)=8. 
*/ + CSR_matrix *csr = new_CSR_matrix(5, 4, 4); + int Ap[6] = {0, 0, 2, 4, 4, 4}; + int Ai[4] = {0, 3, 0, 3}; + double Ax[4] = {5.0, 6.0, 7.0, 8.0}; + memcpy(csr->p, Ap, 6 * sizeof(int)); + memcpy(csr->i, Ai, 4 * sizeof(int)); + memcpy(csr->x, Ax, 4 * sizeof(double)); + matrix *A_m = new_sparse_matrix(csr); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + A_m->refresh_csc_values(A_m); /* values must be fresh before fill */ + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 4); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 3}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {5.0, 6.0, 15.0, 18.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +/* BA_pd_matrices fast path: B->col_perm == A->row_perm exactly, so the + slow-path gather is skipped and one cblas_dgemm runs directly on + B->X and A->X. + B (2x4) row_perm=[0,1], col_perm=[1,3], X_B=[[1,2],[3,4]]. + A (4x3) row_perm=[1,3], col_perm=[0,2], X_A=[[5,6],[7,8]]. + Matching col_perm_B == row_perm_A == [1,3] triggers the fast path. + Hand-computed C (2x3) nonzero at cols {0,2}: X_C=[[19,22],[43,50]]. 
*/ +const char *test_BA_pd_matrices_fast_path(void) +{ + int row_perm_B[2] = {0, 1}; + int col_perm_B[2] = {1, 3}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(2, 4, 2, 2, row_perm_B, col_perm_B, XB); + + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {5.0, 6.0, 7.0, 8.0}; + matrix *A_m = new_permuted_dense(4, 3, 2, 2, row_perm_A, col_perm_A, XA); + + matrix *C_m = BA_pd_matrices_alloc((permuted_dense *) B_m, A_m); + BA_pd_matrices_fill_values((permuted_dense *) B_m, A_m, (permuted_dense *) C_m); + + permuted_dense *C = (permuted_dense *) C_m; + mu_assert("dim m", C_m->m == 2); + mu_assert("dim n", C_m->n == 3); + mu_assert("m0", C->m0 == 2); + mu_assert("n0", C->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {0, 2}; + mu_assert("row_perm", cmp_int_array(C->row_perm, expected_row_perm, 2)); + mu_assert("col_perm", cmp_int_array(C->col_perm, expected_col_perm, 2)); + double expected_X[4] = {19.0, 22.0, 43.0, 50.0}; + mu_assert("X", cmp_double_array(C->X, expected_X, 4)); + + free_matrix(C_m); + free_matrix(A_m); + free_matrix(B_m); + return 0; +} + +#endif /* TEST_PERMUTED_DENSE_H */ diff --git a/tests/wsum_hess/affine/test_broadcast.h b/tests/wsum_hess/affine/test_broadcast.h index 3d36cb4..6635a7d 100644 --- a/tests/wsum_hess/affine/test_broadcast.h +++ b/tests/wsum_hess/affine/test_broadcast.h @@ -56,12 +56,9 @@ const char *test_wsum_hess_broadcast_row(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("broadcast row wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); - mu_assert("broadcast row wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 4)); - mu_assert("broadcast row wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + 
cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(bcast); return 0; @@ -111,12 +108,9 @@ const char *test_wsum_hess_broadcast_col(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("broadcast col wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 3)); - mu_assert("broadcast col wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 4)); - mu_assert("broadcast col wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(bcast); return 0; @@ -159,12 +153,9 @@ const char *test_wsum_hess_broadcast_scalar_to_matrix(void) int expected_p[2] = {0, 1}; int expected_i[1] = {0}; - mu_assert("broadcast scalar wsum_hess: x values fail", - cmp_double_array(bcast->wsum_hess->x, expected_x, 1)); - mu_assert("broadcast scalar wsum_hess: row pointers fail", - cmp_int_array(bcast->wsum_hess->p, expected_p, 2)); - mu_assert("broadcast scalar wsum_hess: column indices fail", - cmp_int_array(bcast->wsum_hess->i, expected_i, 1)); + mu_assert("vals fail", cmp_values(bcast->wsum_hess, expected_x, 1)); + mu_assert("sparsity fail", + cmp_sparsity(bcast->wsum_hess, expected_p, expected_i, 1, 1)); free_expr(bcast); return 0; diff --git a/tests/wsum_hess/affine/test_diag_mat.h b/tests/wsum_hess/affine/test_diag_mat.h index aec571a..2774d5f 100644 --- a/tests/wsum_hess/affine/test_diag_mat.h +++ b/tests/wsum_hess/affine/test_diag_mat.h @@ -36,10 +36,9 @@ const char *test_wsum_hess_diag_mat_log(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("diag_mat log hess vals", - cmp_double_array(dm->wsum_hess->x, expected_x, 4)); - mu_assert("diag_mat log hess p", cmp_int_array(dm->wsum_hess->p, expected_p, 5)); - mu_assert("diag_mat log hess i", 
cmp_int_array(dm->wsum_hess->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(dm->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(dm->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(dm); return 0; diff --git a/tests/wsum_hess/affine/test_hstack.h b/tests/wsum_hess/affine/test_hstack.h index 78c8160..49809a9 100644 --- a/tests/wsum_hess/affine/test_hstack.h +++ b/tests/wsum_hess/affine/test_hstack.h @@ -90,12 +90,9 @@ const char *test_wsum_hess_hstack(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 9)); - mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->p, expected_p, 10)); - mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->i, expected_i, 9)); + mu_assert("vals fail", cmp_values(hstack_node->wsum_hess, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(hstack_node->wsum_hess, expected_p, expected_i, 9, 9)); free_expr(hstack_node); return 0; @@ -200,12 +197,9 @@ const char *test_wsum_hess_hstack_matrix(void) int expected_i[18] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; - mu_assert("vals incorrect", - cmp_double_array(hstack_node->wsum_hess->x, expected_x, 18)); - mu_assert("rows incorrect", - cmp_int_array(hstack_node->wsum_hess->p, expected_p, 19)); - mu_assert("cols incorrect", - cmp_int_array(hstack_node->wsum_hess->i, expected_i, 18)); + mu_assert("vals fail", cmp_values(hstack_node->wsum_hess, expected_x, 18)); + mu_assert("sparsity fail", + cmp_sparsity(hstack_node->wsum_hess, expected_p, expected_i, 18, 18)); free_expr(hstack_node); return 0; diff --git a/tests/wsum_hess/affine/test_index.h b/tests/wsum_hess/affine/test_index.h index 3aa2030..6d77a39 100644 --- a/tests/wsum_hess/affine/test_index.h +++ b/tests/wsum_hess/affine/test_index.h @@ -37,10 +37,9 @@ const char *test_wsum_hess_index_log(void) int 
expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("index log hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); - mu_assert("index log hess p", cmp_int_array(idx->wsum_hess->p, expected_p, 4)); - mu_assert("index log hess i", cmp_int_array(idx->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(idx->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(idx->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(idx); return 0; @@ -70,12 +69,9 @@ const char *test_wsum_hess_index_repeated(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("index repeated hess vals", - cmp_double_array(idx->wsum_hess->x, expected_x, 3)); - mu_assert("index repeated hess p", - cmp_int_array(idx->wsum_hess->p, expected_p, 4)); - mu_assert("index repeated hess i", - cmp_int_array(idx->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(idx->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(idx->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(idx); return 0; @@ -109,12 +105,9 @@ const char *test_wsum_hess_sum_index_log(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("sum index log hess vals", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 3)); - mu_assert("sum index log hess p", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 4)); - mu_assert("sum index log hess i", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sum_node); return 0; diff --git a/tests/wsum_hess/affine/test_left_matmul.h b/tests/wsum_hess/affine/test_left_matmul.h index 313624a..4869dba 100644 --- a/tests/wsum_hess/affine/test_left_matmul.h +++ b/tests/wsum_hess/affine/test_left_matmul.h @@ -53,8 +53,8 @@ const char 
*test_wsum_hess_left_matmul(void) expr *x = new_variable(3, 1, 0, 3); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format */ + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -80,12 +80,11 @@ const char *test_wsum_hess_left_matmul(void) int expected_i[3] = {0, 1, 2}; int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 4)); + mu_assert("vals fail", cmp_values(A_log_x->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 3, 3)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } @@ -99,7 +98,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) expr *x = new_variable(3, 1, 0, 3); /* Create B matrix (3x3 all ones) */ - CSR_Matrix *B = new_csr_matrix(3, 3, 9); + CSR_matrix *B = new_CSR_matrix(3, 3, 9); int B_p[4] = {0, 3, 6, 9}; int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; @@ -108,7 +107,7 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) memcpy(B->x, B_x, 9 * sizeof(double)); /* Create A matrix */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -123,8 +122,8 @@ const char *test_wsum_hess_left_matmul_exp_composite(void) mu_assert("check_wsum_hess failed", check_wsum_hess(A_exp_Bx, x_vals, w, NUMERICAL_DIFF_DEFAULT_H)); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); 
+ free_CSR_matrix(B); free_expr(A_exp_Bx); return 0; } @@ -160,8 +159,8 @@ const char *test_wsum_hess_left_matmul_matrix(void) expr *x = new_variable(3, 2, 0, 6); - /* Create sparse matrix A in CSR format */ - CSR_Matrix *A = new_csr_matrix(4, 3, 7); + /* Create sparse matrix A in CSR_matrix format */ + CSR_matrix *A = new_CSR_matrix(4, 3, 7); int A_p[5] = {0, 2, 4, 6, 7}; int A_i[7] = {0, 2, 0, 2, 0, 2, 0}; double A_x[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; @@ -189,12 +188,34 @@ const char *test_wsum_hess_left_matmul_matrix(void) int expected_i[6] = {0, 1, 2, 3, 4, 5}; int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); - mu_assert("cols incorrect", cmp_int_array(A_log_x->wsum_hess->i, expected_i, 6)); - mu_assert("rows incorrect", cmp_int_array(A_log_x->wsum_hess->p, expected_p, 7)); + mu_assert("vals fail", cmp_values(A_log_x->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(A_log_x->wsum_hess, expected_p, expected_i, 6, 6)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(A_log_x); return 0; } + +/* Regression test for the Phase 3 transpose_fill_values omission in + new_left_matmul_dense. Mirrors the Python failure + cp.sum(A @ cp.exp(X)) with X a (2,2) Variable and A a 2x2 numpy array. + eval_wsum_hess reads lnode->AT->X via AT->block_left_mult_vec; before the + fix, AT->X was uninitialized memory (transpose_alloc allocates without + filling) and the analytic Hessian disagreed with finite differences. 
*/ +const char *test_wsum_hess_left_matmul_dense_matrix_exp(void) +{ + double x_vals[4] = {0.5, -0.3, 0.7, -0.2}; + double w[4] = {1.0, 1.0, 1.0, 1.0}; /* cp.sum: unit weight everywhere */ + double A_data[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *X = new_variable(2, 2, 0, 4); + expr *exp_X = new_exp(X); + expr *A_exp_X = new_left_matmul_dense(NULL, exp_X, 2, 2, A_data); + + mu_assert("check_wsum_hess failed", + check_wsum_hess(A_exp_X, x_vals, w, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(A_exp_X); + return 0; +} diff --git a/tests/wsum_hess/affine/test_right_matmul.h b/tests/wsum_hess/affine/test_right_matmul.h index 2f51b9b..661e8e0 100644 --- a/tests/wsum_hess/affine/test_right_matmul.h +++ b/tests/wsum_hess/affine/test_right_matmul.h @@ -23,8 +23,8 @@ const char *test_wsum_hess_right_matmul(void) expr *x = new_variable(2, 2, 0, 4); - /* Create sparse matrix A in CSR format (2x3) */ - CSR_Matrix *A = new_csr_matrix(2, 3, 4); + /* Create sparse matrix A in CSR_matrix format (2x3) */ + CSR_matrix *A = new_CSR_matrix(2, 3, 4); int A_p[3] = {0, 2, 4}; int A_i[4] = {0, 2, 0, 2}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -50,12 +50,11 @@ const char *test_wsum_hess_right_matmul(void) int expected_i[4] = {0, 1, 2, 3}; int expected_p[5] = {0, 1, 2, 3, 4}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 4)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 4)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 5)); + mu_assert("vals fail", cmp_values(log_x_A->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 4, 4)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } @@ -73,8 +72,8 @@ const char *test_wsum_hess_right_matmul_vector(void) expr *x = new_variable(1, 3, 0, 3); - /* Create sparse matrix A in CSR format (3x2) */ - CSR_Matrix *A = new_csr_matrix(3, 2, 
4); + /* Create sparse matrix A in CSR_matrix format (3x2) */ + CSR_matrix *A = new_CSR_matrix(3, 2, 4); int A_p[4] = {0, 1, 3, 4}; int A_i[4] = {0, 0, 1, 1}; double A_x[4] = {1.0, 2.0, 3.0, 4.0}; @@ -99,12 +98,11 @@ const char *test_wsum_hess_right_matmul_vector(void) int expected_i[3] = {0, 1, 2}; int expected_p[4] = {0, 1, 2, 3}; /* each row has 1 diagonal entry */ - mu_assert("vals incorrect", - cmp_double_array(log_x_A->wsum_hess->x, expected_x, 3)); - mu_assert("cols incorrect", cmp_int_array(log_x_A->wsum_hess->i, expected_i, 3)); - mu_assert("rows incorrect", cmp_int_array(log_x_A->wsum_hess->p, expected_p, 4)); + mu_assert("vals fail", cmp_values(log_x_A->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(log_x_A->wsum_hess, expected_p, expected_i, 3, 3)); - free_csr_matrix(A); + free_CSR_matrix(A); free_expr(log_x_A); return 0; } diff --git a/tests/wsum_hess/affine/test_scalar_mult.h b/tests/wsum_hess/affine/test_scalar_mult.h index ba6bae9..d08e134 100644 --- a/tests/wsum_hess/affine/test_scalar_mult.h +++ b/tests/wsum_hess/affine/test_scalar_mult.h @@ -38,12 +38,9 @@ const char *test_wsum_hess_scalar_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("scalar mult log hess: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 3)); - mu_assert("scalar mult log hess: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 4)); - mu_assert("scalar mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -76,12 +73,9 @@ const char *test_wsum_hess_scalar_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("scalar mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); - 
mu_assert("scalar mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 5)); - mu_assert("scalar mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_sum.h b/tests/wsum_hess/affine/test_sum.h index 8ade78b..2c739c9 100644 --- a/tests/wsum_hess/affine/test_sum.h +++ b/tests/wsum_hess/affine/test_sum.h @@ -16,7 +16,7 @@ const char *test_wsum_hess_sum_exp_linear(void) double Ax[6] = {1, 1, 2, 3, 1, -1}; int Ai[6] = {0, 1, 0, 1, 0, 1}; int Ap[4] = {0, 2, 4, 6}; - CSR_Matrix *A = new_csr_matrix(3, 2, 6); + CSR_matrix *A = new_CSR_matrix(3, 2, 6); memcpy(A->x, Ax, 6 * sizeof(double)); memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 4 * sizeof(int)); @@ -32,7 +32,7 @@ const char *test_wsum_hess_sum_exp_linear(void) check_wsum_hess(sum_node, x_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(sum_node); - free_csr_matrix(A); + free_CSR_matrix(A); return 0; } @@ -63,12 +63,9 @@ const char *test_wsum_hess_sum_log_axis0(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 3, 4, 5}; - mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, expected_x, 6)); - mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 7)); - mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 6, 6)); free_expr(sum_node); @@ -101,12 +98,9 @@ const char *test_wsum_hess_sum_log_axis1(void) int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; int expected_i[6] = {0, 1, 2, 3, 4, 5}; - mu_assert("vals incorrect", - cmp_double_array(sum_node->wsum_hess->x, 
expected_x, 6)); - mu_assert("rows incorrect", - cmp_int_array(sum_node->wsum_hess->p, expected_p, 7)); - mu_assert("cols incorrect", - cmp_int_array(sum_node->wsum_hess->i, expected_i, 6)); + mu_assert("vals fail", cmp_values(sum_node->wsum_hess, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(sum_node->wsum_hess, expected_p, expected_i, 6, 6)); free_expr(sum_node); diff --git a/tests/wsum_hess/affine/test_trace.h b/tests/wsum_hess/affine/test_trace.h index bb3f98c..bfbb5e0 100644 --- a/tests/wsum_hess/affine/test_trace.h +++ b/tests/wsum_hess/affine/test_trace.h @@ -70,11 +70,9 @@ const char *test_wsum_hess_trace_log_variable(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - CSR_Matrix *H = trace_node->wsum_hess; - mu_assert("nnz wrong", H->nnz == 9); - mu_assert("vals match", cmp_double_array(H->x, expected_Ax, 9)); - mu_assert("cols match", cmp_int_array(H->i, expected_Ai, 9)); - mu_assert("rows fail", cmp_int_array(H->p, expected_Ap, 14)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->wsum_hess, expected_Ap, expected_Ai, 13, 9)); + mu_assert("vals fail", cmp_values(trace_node->wsum_hess, expected_Ax, 9)); free_expr(trace_node); return 0; @@ -135,11 +133,9 @@ const char *test_wsum_hess_trace_composite(void) int expected_Ap[14] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9}; int expected_Ai[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - mu_assert("nnz wrong", trace_node->wsum_hess->nnz == 9); - mu_assert("rows fail", cmp_int_array(trace_node->wsum_hess->p, expected_Ap, 14)); - mu_assert("vals match", - cmp_double_array(trace_node->wsum_hess->x, expected_Ax, 9)); - mu_assert("cols match", cmp_int_array(trace_node->wsum_hess->i, expected_Ai, 9)); + mu_assert("sparsity fail", + cmp_sparsity(trace_node->wsum_hess, expected_Ap, expected_Ai, 13, 9)); + mu_assert("vals fail", cmp_values(trace_node->wsum_hess, expected_Ax, 9)); free_expr(trace_node); return 0; } diff --git 
a/tests/wsum_hess/affine/test_transpose.h b/tests/wsum_hess/affine/test_transpose.h index be91541..ad4e93f 100644 --- a/tests/wsum_hess/affine/test_transpose.h +++ b/tests/wsum_hess/affine/test_transpose.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_transpose(void) int expected_p[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16}; int expected_i[16] = {4, 6, 4, 6, 5, 7, 5, 7, 0, 1, 2, 3, 0, 1, 2, 3}; - mu_assert("hess values fail", - cmp_double_array(XYT->wsum_hess->x, expected_x, 8)); - mu_assert("jacobian row ptr fail", - cmp_int_array(XYT->wsum_hess->p, expected_p, 5)); - mu_assert("jacobian col idx fail", - cmp_int_array(XYT->wsum_hess->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(XYT->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(XYT->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(XYT); return 0; diff --git a/tests/wsum_hess/affine/test_upper_tri.h b/tests/wsum_hess/affine/test_upper_tri.h index 418b809..0cff5b6 100644 --- a/tests/wsum_hess/affine/test_upper_tri.h +++ b/tests/wsum_hess/affine/test_upper_tri.h @@ -49,12 +49,9 @@ const char *test_wsum_hess_upper_tri_log(void) int expected_p[17] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; int expected_i[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - mu_assert("upper_tri log hess vals", - cmp_double_array(ut->wsum_hess->x, expected_x, 16)); - mu_assert("upper_tri log hess p", - cmp_int_array(ut->wsum_hess->p, expected_p, 17)); - mu_assert("upper_tri log hess i", - cmp_int_array(ut->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(ut->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(ut->wsum_hess, expected_p, expected_i, 16, 16)); free_expr(ut); return 0; diff --git a/tests/wsum_hess/affine/test_vector_mult.h b/tests/wsum_hess/affine/test_vector_mult.h index 1d75ad3..5d0d140 100644 --- a/tests/wsum_hess/affine/test_vector_mult.h +++ b/tests/wsum_hess/affine/test_vector_mult.h @@ -38,12 +38,9 @@ const char 
*test_wsum_hess_vector_mult_log_vector(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vector mult log hess: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 3)); - mu_assert("vector mult log hess: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 4)); - mu_assert("vector mult log hess: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(y); return 0; @@ -76,12 +73,9 @@ const char *test_wsum_hess_vector_mult_log_matrix(void) int expected_p[5] = {0, 1, 2, 3, 4}; int expected_i[4] = {0, 1, 2, 3}; - mu_assert("vector mult log hess matrix: x values fail", - cmp_double_array(y->wsum_hess->x, expected_x, 4)); - mu_assert("vector mult log hess matrix: row pointers fail", - cmp_int_array(y->wsum_hess->p, expected_p, 5)); - mu_assert("vector mult log hess matrix: column indices fail", - cmp_int_array(y->wsum_hess->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(y->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(y->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(y); return 0; diff --git a/tests/wsum_hess/affine/test_vstack.h b/tests/wsum_hess/affine/test_vstack.h index f003431..cb473a1 100644 --- a/tests/wsum_hess/affine/test_vstack.h +++ b/tests/wsum_hess/affine/test_vstack.h @@ -39,12 +39,9 @@ const char *test_wsum_hess_vstack_vectors(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vstack hess vectors: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 3)); - mu_assert("vstack hess vectors: rows", - cmp_int_array(stack->wsum_hess->p, expected_p, 4)); - mu_assert("vstack hess vectors: cols", - cmp_int_array(stack->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(stack->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + 
cmp_sparsity(stack->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(stack); return 0; @@ -102,12 +99,9 @@ const char *test_wsum_hess_vstack_matrix(void) int expected_p[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; int expected_i[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - mu_assert("vstack hess matrix: vals", - cmp_double_array(stack->wsum_hess->x, expected_x, 9)); - mu_assert("vstack hess matrix: rows", - cmp_int_array(stack->wsum_hess->p, expected_p, 10)); - mu_assert("vstack hess matrix: cols", - cmp_int_array(stack->wsum_hess->i, expected_i, 9)); + mu_assert("vals fail", cmp_values(stack->wsum_hess, expected_x, 9)); + mu_assert("sparsity fail", + cmp_sparsity(stack->wsum_hess, expected_p, expected_i, 9, 9)); free_expr(stack); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_matmul.h b/tests/wsum_hess/bivariate_full_dom/test_matmul.h index 61a2d2d..9241609 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_matmul.h +++ b/tests/wsum_hess/bivariate_full_dom/test_matmul.h @@ -49,16 +49,10 @@ const char *test_wsum_hess_matmul(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->nnz == 48); + mu_assert("Hessian should be 18 cols", Z->wsum_hess->n == n_vars); int expected_p[19] = {0, 4, 8, 12, 16, 20, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48}; - - mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->p, expected_p, 19)); - int expected_i[48] = {6, 9, 12, 15, /* row 0 */ 6, 9, 12, 15, /* row 1 */ 7, 10, 13, 16, /* row 2 */ @@ -78,8 +72,8 @@ const char *test_wsum_hess_matmul(void) 2, 3, /* row 16*/ 4, 5}; - mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->wsum_hess, expected_p, expected_i, n_vars, 48)); double expected_x[48] = {1.0, 3.0, 5.0, 
7.0, /* row 0 */ 2.0, 4.0, 6.0, 8.0, /* row 1 */ @@ -100,8 +94,7 @@ const char *test_wsum_hess_matmul(void) 7.0, 8.0, /* row 16 */ 7.0, 8.0}; /* row 17 */ - mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + mu_assert("vals fail", cmp_values(Z->wsum_hess, expected_x, 48)); free_expr(Z); return 0; @@ -150,9 +143,7 @@ const char *test_wsum_hess_matmul_yx(void) Z->eval_wsum_hess(Z, w); /* Verify Hessian dimensions and sparsity */ - mu_assert("Hessian should be 18x18", Z->wsum_hess->m == n_vars); - mu_assert("Hessian should be 18x18", Z->wsum_hess->n == n_vars); - mu_assert("Hessian should have 48 nonzeros", Z->wsum_hess->nnz == 48); + mu_assert("Hessian should be 18 cols", Z->wsum_hess->n == n_vars); /* Row pointers when Y < X: * Rows 0-11 (Y variables): each couples with m=2 X variables @@ -161,9 +152,6 @@ const char *test_wsum_hess_matmul_yx(void) int expected_p[19] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, 40, 44, 48}; - mu_assert("Row pointers incorrect", - cmp_int_array(Z->wsum_hess->p, expected_p, 19)); - /* Column indices when Y < X: * Y[k_idx, col] couples with X[row, k_idx] for all row * X variable index = 12 + row + k_idx*m @@ -189,8 +177,8 @@ const char *test_wsum_hess_matmul_yx(void) 2, 5, 8, 11, /* row 16: X[0,2] */ 2, 5, 8, 11}; /* row 17: X[1,2] */ - mu_assert("Column indices incorrect", - cmp_int_array(Z->wsum_hess->i, expected_i, 48)); + mu_assert("sparsity fail", + cmp_sparsity(Z->wsum_hess, expected_p, expected_i, n_vars, 48)); double expected_x[48] = {1.0, 2.0, /* row 0 */ 1.0, 2.0, /* row 1 */ @@ -211,8 +199,7 @@ const char *test_wsum_hess_matmul_yx(void) 1.0, 3.0, 5.0, 7.0, /* row 16 */ 2.0, 4.0, 6.0, 8.0}; /* row 17 */ - mu_assert("Hessian values incorrect", - cmp_double_array(Z->wsum_hess->x, expected_x, 48)); + mu_assert("vals fail", cmp_values(Z->wsum_hess, expected_x, 48)); free_expr(Z); return 0; diff --git a/tests/wsum_hess/bivariate_full_dom/test_multiply.h 
b/tests/wsum_hess/bivariate_full_dom/test_multiply.h index e85c7a1..a7c01fc 100644 --- a/tests/wsum_hess/bivariate_full_dom/test_multiply.h +++ b/tests/wsum_hess/bivariate_full_dom/test_multiply.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_multiply_1(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 6)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 6)); free_expr(node); return 0; @@ -41,14 +41,14 @@ const char *test_wsum_hess_multiply_1(void) const char *test_wsum_hess_multiply_sparse_random(void) { /* Test with larger random sparse matrices - * A: 5x10 CSR matrix - * B: 5x10 CSR matrix + * A: 5x10 CSR_matrix matrix + * B: 5x10 CSR_matrix matrix * x: 10-dimensional variable with var_id = 0, n_vars = 10 * Expected Hessian: 10x10 sparse matrix */ /* Create A matrix (5x10) */ - CSR_Matrix *A = new_csr_matrix(5, 10, 10); + CSR_matrix *A = new_CSR_matrix(5, 10, 10); double Ax[10] = {-1.44165273, -1.13687223, 0.55892257, 0.24912193, 0.84959744, -0.23998915, 0.5913356, -1.21627912, -0.50379166, 0.41531801}; int Ai[10] = {1, 2, 4, 8, 2, 3, 8, 9, 1, 2}; @@ -58,7 +58,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) memcpy(A->p, Ap, 6 * sizeof(int)); /* Create B matrix (5x10) */ - CSR_Matrix *B = new_csr_matrix(5, 10, 10); + CSR_matrix *B = new_CSR_matrix(5, 10, 10); double Bx[10] = {1.27549062, 0.04194731, -0.4356034, 0.405574, 1.34670487, -0.57738638, 0.9411464, -0.31563179, 1.90831766, -0.89802958}; int Bi[10] = {0, 3, 5, 7, 0, 5, 0, 3, 7, 9}; @@ -85,7 +85,7 @@ const char *test_wsum_hess_multiply_sparse_random(void) double w[5] = {0.50646339, 0.44756224, 
0.67295241, 0.16424956, 0.03031469}; mult_node->eval_wsum_hess(mult_node, w); - /* Expected Hessian in CSR format (10x10) */ + /* Expected Hessian in CSR_matrix format (10x10) */ int expected_p[11] = {0, 6, 9, 13, 18, 19, 20, 20, 22, 25, 29}; int expected_i[29] = {1, 2, 3, 4, 8, 9, 0, 7, 9, 0, 3, 7, 9, 0, 2, 3, 8, 9, 0, 8, 1, 2, 0, 3, 5, 0, 1, 2, 3}; @@ -96,17 +96,14 @@ const char *test_wsum_hess_multiply_sparse_random(void) 0.361058, -0.09679721, -0.02914438, 0.02402617, 0.31718166, -0.03065625, -0.09679721, -0.18801593, 0.01371497, -0.01130641, 0.06305481}; - mu_assert("p array fails", - cmp_int_array(mult_node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->i, expected_i, 29)); - mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 29)); + mu_assert("sparsity fail", + cmp_sparsity(mult_node->wsum_hess, expected_p, expected_i, 10, 29)); + mu_assert("vals fail", cmp_values(mult_node->wsum_hess, expected_x, 29)); /* Cleanup */ free_expr(mult_node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -131,8 +128,8 @@ const char *test_wsum_hess_multiply_linear_ops(void) * [ 75. 18. 
76.]] */ - /* Create CSR matrix A */ - CSR_Matrix *A = new_csr_matrix(4, 3, 6); + /* Create CSR_matrix matrix A */ + CSR_matrix *A = new_CSR_matrix(4, 3, 6); double Ax[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; int Ai[6] = {0, 2, 1, 0, 2, 1}; int Ap[5] = {0, 2, 3, 5, 6}; @@ -140,8 +137,8 @@ const char *test_wsum_hess_multiply_linear_ops(void) memcpy(A->i, Ai, 6 * sizeof(int)); memcpy(A->p, Ap, 5 * sizeof(int)); - /* Create CSR matrix B */ - CSR_Matrix *B = new_csr_matrix(4, 3, 8); + /* Create CSR_matrix matrix B */ + CSR_matrix *B = new_CSR_matrix(4, 3, 8); double Bx[8] = {1.0, 4.0, 2.0, 7.0, 3.0, 2.0, 4.0, -1.0}; int Bi[8] = {0, 2, 1, 2, 0, 2, 1, 2}; int Bp[5] = {0, 2, 4, 6, 8}; @@ -170,7 +167,7 @@ const char *test_wsum_hess_multiply_linear_ops(void) mult_node->eval_wsum_hess(mult_node, w); /* Check sparsity pattern and values */ - /* Expected CSR format: + /* Expected CSR_matrix format: * indptr: [0, 2, 4, 7] * indices: [0, 2, 1, 2, 0, 1, 2] * data: [74.0, 75.0, 216.0, 18.0, 75.0, 18.0, 76.0] @@ -179,17 +176,14 @@ const char *test_wsum_hess_multiply_linear_ops(void) int expected_i[7] = {0, 2, 1, 2, 0, 1, 2}; double expected_x[7] = {74.0, 75.0, 216.0, 18.0, 75.0, 18.0, 76.0}; - mu_assert("p array fails", - cmp_int_array(mult_node->wsum_hess->p, expected_p, 4)); - mu_assert("i array fails", - cmp_int_array(mult_node->wsum_hess->i, expected_i, 7)); - mu_assert("x array fails", - cmp_double_array(mult_node->wsum_hess->x, expected_x, 7)); + mu_assert("sparsity fail", + cmp_sparsity(mult_node->wsum_hess, expected_p, expected_i, 3, 7)); + mu_assert("vals fail", cmp_values(mult_node->wsum_hess, expected_x, 7)); /* Cleanup */ free_expr(mult_node); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -219,9 +213,9 @@ const char *test_wsum_hess_multiply_2(void) int expected_i[6] = {8, 9, 10, 3, 4, 5}; double expected_x[6] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 
13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 6)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 6)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 6)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 6)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h index f1053e2..2cb5dc4 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_quad_over_lin.h @@ -26,9 +26,9 @@ const char *test_wsum_hess_quad_over_lin_xy(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 9, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; @@ -56,9 +56,9 @@ const char *test_wsum_hess_quad_over_lin_yx(void) double expected_x[10] = {0.875, -0.25, -0.5, -0.75, -0.25, 1.0, -0.5, 1.0, -0.75, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 10)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 9, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h index 
ce2137a..5c3adde 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr.h @@ -30,9 +30,9 @@ const char *test_wsum_hess_rel_entr_1(void) double expected_x[12] = {1.0, -0.25, 1.0, -0.4, 1.0, -0.5, -0.25, 0.0625, -0.4, 0.16, -0.5, 0.25}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 12)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 12)); free_expr(node); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_rel_entr_2(void) double expected_x[12] = {0.0625, -0.25, 0.16, -0.4, 0.25, -0.5, -0.25, 1.0, -0.4, 1.0, -0.5, 1.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 12)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 12)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 12)); free_expr(node); return 0; @@ -98,9 +98,9 @@ const char *test_wsum_hess_rel_entr_matrix(void) -1.0 / 6.0, 1.0 / 36.0, -0.4, 0.16, -0.75, 0.5625, -4.0 / 3.0, 1.7777777777777777, -2.5, 6.25, -6.0, 36.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 13)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 24)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 24)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 12, 24)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 24)); free_expr(node); return 0; diff --git 
a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h index a431b96..74ba8b7 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_scalar_vector.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_scalar_vector(void) double expected_x[10] = {15.0, -2.0, -5.0 / 3.0, -1.5, -2.0, 1.0, -5.0 / 3.0, 5.0 / 9.0, -1.5, 0.375}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 4, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h index 60b9436..d74e92b 100644 --- a/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h +++ b/tests/wsum_hess/bivariate_restricted_dom/test_rel_entr_vector_scalar.h @@ -25,9 +25,9 @@ const char *test_wsum_hess_rel_entr_vector_scalar(void) double expected_x[10] = {1.0, -0.25, 1.0, -0.5, 1.0, -0.75, -0.25, -0.5, -0.75, 0.875}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 5)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 4, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); return 0; diff --git a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h 
b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h index c5c1a60..6fbff2d 100644 --- a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h +++ b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h @@ -123,8 +123,8 @@ const char *test_wsum_hess_Ax_Bx_multiply(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[2] = {1.33, 2.1}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *x = new_variable(2, 1, 1, 4); expr *Ax = new_left_matmul(NULL, x, A); expr *Bx = new_left_matmul(NULL, x, B); @@ -134,8 +134,8 @@ const char *test_wsum_hess_Ax_Bx_multiply(void) check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -159,8 +159,8 @@ const char *test_wsum_hess_AX_BX_multiply(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[4] = {1.1, 2.2, 3.3, 4.4}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 4); expr *AX = new_left_matmul(NULL, X, A); expr *BX = new_left_matmul(NULL, X, B); @@ -170,8 +170,8 @@ const char *test_wsum_hess_AX_BX_multiply(void) check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -180,8 +180,8 @@ const char *test_wsum_hess_multiply_deep_composite(void) double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; double w[4] = {1.1, 2.2, 3.3, 4.4}; - CSR_Matrix *A = new_csr_random(2, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 2, 1.0); + CSR_matrix *A = new_csr_random(2, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 8); expr *Y = new_variable(2, 2, 0, 8); expr 
*AX = new_left_matmul(NULL, X, A); @@ -195,8 +195,8 @@ const char *test_wsum_hess_multiply_deep_composite(void) check_wsum_hess(multiply, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(multiply); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -205,10 +205,10 @@ const char *test_wsum_hess_quad_form_Ax(void) double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double w = 1.0; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -224,8 +224,8 @@ const char *test_wsum_hess_quad_form_Ax(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -234,10 +234,10 @@ const char *test_wsum_hess_quad_form_sin_Ax(void) double u_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; double w = 2.0; - CSR_Matrix *A = new_csr_random(3, 4, 1.0); + CSR_matrix *A = new_csr_random(3, 4, 1.0); /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -254,8 +254,8 @@ const char *test_wsum_hess_quad_form_sin_Ax(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(A); - free_csr_matrix(Q); + free_CSR_matrix(A); + free_CSR_matrix(Q); return 0; } @@ -303,8 +303,8 @@ const char *test_wsum_hess_matmul_Ax_By(void) double u_vals[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; double w[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - CSR_Matrix *A = new_csr_random(3, 2, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = 
new_csr_random(3, 2, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(2, 2, 0, 10); expr *Y = new_variable(3, 2, 4, 10); @@ -316,8 +316,8 @@ const char *test_wsum_hess_matmul_Ax_By(void) check_wsum_hess(Z, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -327,8 +327,8 @@ const char *test_wsum_hess_matmul_sin_Ax_cos_Bx(void) double u_vals[6] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0}; double w[4] = {1.0, 2.0, 3.0, 4.0}; - CSR_Matrix *A = new_csr_random(2, 3, 1.0); - CSR_Matrix *B = new_csr_random(2, 3, 1.0); + CSR_matrix *A = new_csr_random(2, 3, 1.0); + CSR_matrix *B = new_csr_random(2, 3, 1.0); expr *X = new_variable(3, 2, 0, 6); expr *AX = new_left_matmul(NULL, X, A); /* 2x2 */ @@ -341,8 +341,8 @@ const char *test_wsum_hess_matmul_sin_Ax_cos_Bx(void) check_wsum_hess(Z, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(Z); - free_csr_matrix(A); - free_csr_matrix(B); + free_CSR_matrix(A); + free_CSR_matrix(B); return 0; } @@ -368,7 +368,7 @@ const char *test_wsum_hess_quad_form_exp(void) double w = 3.0; /* Q = [1 2 0; 2 3 0; 0 0 4] (symmetric) */ - CSR_Matrix *Q = new_csr_matrix(3, 3, 5); + CSR_matrix *Q = new_CSR_matrix(3, 3, 5); double Qx[5] = {1.0, 2.0, 2.0, 3.0, 4.0}; int Qi[5] = {0, 1, 0, 1, 2}; int Qp[4] = {0, 2, 4, 5}; @@ -384,6 +384,6 @@ const char *test_wsum_hess_quad_form_exp(void) check_wsum_hess(node, u_vals, &w, NUMERICAL_DIFF_DEFAULT_H)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; } diff --git a/tests/wsum_hess/elementwise_full_dom/test_exp.h b/tests/wsum_hess/elementwise_full_dom/test_exp.h index be00553..891eae0 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_exp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_exp.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_exp(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - 
cmp_double_array(exp_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(exp_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(exp_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(exp_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(exp_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(exp_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h index 88a655f..cf543a7 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_hyperbolic.h @@ -34,12 +34,9 @@ const char *test_wsum_hess_sinh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(sinh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(sinh_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(sinh_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sinh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sinh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sinh_node); @@ -72,12 +69,9 @@ const char *test_wsum_hess_tanh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(tanh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(tanh_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(tanh_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(tanh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(tanh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(tanh_node); @@ -110,12 +104,9 @@ const char *test_wsum_hess_asinh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - 
mu_assert("vals incorrect", - cmp_double_array(asinh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(asinh_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(asinh_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(asinh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(asinh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(asinh_node); @@ -149,12 +140,9 @@ const char *test_wsum_hess_atanh(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(atanh_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(atanh_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(atanh_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(atanh_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(atanh_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(atanh_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_logistic.h b/tests/wsum_hess/elementwise_full_dom/test_logistic.h index 2de6070..025ec8c 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_logistic.h +++ b/tests/wsum_hess/elementwise_full_dom/test_logistic.h @@ -42,12 +42,9 @@ const char *test_wsum_hess_logistic(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(logistic_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(logistic_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(logistic_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(logistic_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(logistic_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(logistic_node); diff --git 
a/tests/wsum_hess/elementwise_full_dom/test_power.h b/tests/wsum_hess/elementwise_full_dom/test_power.h index c21021e..1456bef 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_power.h +++ b/tests/wsum_hess/elementwise_full_dom/test_power.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_power(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(power_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(power_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(power_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(power_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(power_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(power_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_trig.h b/tests/wsum_hess/elementwise_full_dom/test_trig.h index 51b1422..550c272 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_trig.h +++ b/tests/wsum_hess/elementwise_full_dom/test_trig.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_sin(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(sin_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(sin_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(sin_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(sin_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(sin_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(sin_node); @@ -56,12 +53,9 @@ const char *test_wsum_hess_cos(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(cos_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(cos_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - 
cmp_int_array(cos_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(cos_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(cos_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(cos_node); @@ -87,12 +81,9 @@ const char *test_wsum_hess_tan(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(tan_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(tan_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(tan_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(tan_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(tan_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(tan_node); diff --git a/tests/wsum_hess/elementwise_full_dom/test_xexp.h b/tests/wsum_hess/elementwise_full_dom/test_xexp.h index c4c6f94..55e80ef 100644 --- a/tests/wsum_hess/elementwise_full_dom/test_xexp.h +++ b/tests/wsum_hess/elementwise_full_dom/test_xexp.h @@ -28,12 +28,9 @@ const char *test_wsum_hess_xexp(void) int expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(xexp_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(xexp_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(xexp_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(xexp_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(xexp_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(xexp_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h index 1f4fb14..f7c83ca 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_entr.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_entr.h @@ -27,12 +27,9 @@ const char *test_wsum_hess_entr(void) int 
expected_p[4] = {0, 1, 2, 3}; int expected_i[3] = {0, 1, 2}; - mu_assert("vals incorrect", - cmp_double_array(entr_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(entr_node->wsum_hess->p, expected_p, 4)); - mu_assert("cols incorrect", - cmp_int_array(entr_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(entr_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(entr_node->wsum_hess, expected_p, expected_i, 3, 3)); free_expr(entr_node); diff --git a/tests/wsum_hess/elementwise_restricted_dom/test_log.h b/tests/wsum_hess/elementwise_restricted_dom/test_log.h index 024ea35..c8acff5 100644 --- a/tests/wsum_hess/elementwise_restricted_dom/test_log.h +++ b/tests/wsum_hess/elementwise_restricted_dom/test_log.h @@ -39,12 +39,9 @@ const char *test_wsum_hess_log(void) int expected_p[8] = {0, 0, 0, 1, 2, 3, 3, 3}; int expected_i[3] = {2, 3, 4}; - mu_assert("vals incorrect", - cmp_double_array(log_node->wsum_hess->x, expected_x, 3)); - mu_assert("rows incorrect", - cmp_int_array(log_node->wsum_hess->p, expected_p, 8)); - mu_assert("cols incorrect", - cmp_int_array(log_node->wsum_hess->i, expected_i, 3)); + mu_assert("vals fail", cmp_values(log_node->wsum_hess, expected_x, 3)); + mu_assert("sparsity fail", + cmp_sparsity(log_node->wsum_hess, expected_p, expected_i, 7, 3)); free_expr(log_node); @@ -58,7 +55,7 @@ const char *test_wsum_hess_exp_composite(void) double Ax[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; int Ai[] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}; int Ap[] = {0, 5, 10, 15}; - CSR_Matrix *A_csr = new_csr_matrix(3, 5, 15); + CSR_matrix *A_csr = new_CSR_matrix(3, 5, 15); memcpy(A_csr->x, Ax, 15 * sizeof(double)); memcpy(A_csr->i, Ai, 15 * sizeof(int)); memcpy(A_csr->p, Ap, 4 * sizeof(int)); @@ -70,7 +67,7 @@ const char *test_wsum_hess_exp_composite(void) mu_assert("check_wsum_hess failed", check_wsum_hess(exp_node, u_vals, w, NUMERICAL_DIFF_DEFAULT_H)); - 
free_csr_matrix(A_csr); + free_CSR_matrix(A_csr); free_expr(exp_node); return 0; diff --git a/tests/wsum_hess/other/test_prod.h b/tests/wsum_hess/other/test_prod.h index 753abcf..28184c1 100644 --- a/tests/wsum_hess/other/test_prod.h +++ b/tests/wsum_hess/other/test_prod.h @@ -28,9 +28,9 @@ const char *test_wsum_hess_prod_no_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -62,9 +62,9 @@ const char *test_wsum_hess_prod_one_zero(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -91,9 +91,9 @@ const char *test_wsum_hess_prod_two_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + 
cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; @@ -118,9 +118,9 @@ const char *test_wsum_hess_prod_many_zeros(void) int expected_p[9] = {0, 0, 0, 4, 8, 12, 16, 16, 16}; int expected_i[16] = {2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5, 2, 3, 4, 5}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 16)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 16)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 16)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 16)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_one.h b/tests/wsum_hess/other/test_prod_axis_one.h index e6915a0..21b810d 100644 --- a/tests/wsum_hess/other/test_prod_axis_one.h +++ b/tests/wsum_hess/other/test_prod_axis_one.h @@ -47,7 +47,7 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): [8, 4] (excludes col 2) */ 8.0, 4.0}; - /* Row pointers (monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR_matrix format) */ int expected_p[9] = {0, 0, 2, 4, 6, 8, 10, 12, 12}; /* Column indices (each row of the matrix interacts with its own columns, @@ -65,9 +65,9 @@ const char *test_wsum_hess_prod_axis_one_no_zeros(void) /* Var 6 (row 1, col 2): cols 2,4 (excludes 6) */ 2, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 12)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 12)); free_expr(p); return 0; @@ -141,7 +141,7 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) expected_x[16] = 18.0; expected_x[17] = 9.0; - /* Row pointers 
(monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR_matrix format) */ int expected_p[11] = {0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18}; /* Column indices (each row of the matrix interacts with its own columns, @@ -165,9 +165,9 @@ const char *test_wsum_hess_prod_axis_one_one_zero(void) /* Var 9 (row 2, col 2): cols 3,6 (excludes 9) */ 3, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 18)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 11)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 18)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 18)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 10, 18)); free_expr(p); return 0; @@ -210,7 +210,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) memset(expected_x, 0, sizeof(expected_x)); /* For a 5x3 matrix with var_id=1, each row has 2 nnz (d2-1): - * CSR row pointers: p[i] = (i-1)*2 for i in [1,15] + * CSR_matrix row pointers: p[i] = (i-1)*2 for i in [1,15] * Var 1 (matrix [0,0]): p[1]=0 * Var 2 (matrix [1,0]): p[2]=2 * Var 3 (matrix [2,0]): p[3]=4 @@ -293,7 +293,7 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) expected_x[28] = 25.0; /* 75/(3*1) */ expected_x[29] = 5.0; /* 75/(3*5) */ - /* Row pointers (monotonically increasing for valid CSR format) */ + /* Row pointers (monotonically increasing for valid CSR_matrix format) */ int expected_p[17] = {0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; @@ -317,9 +317,9 @@ const char *test_wsum_hess_prod_axis_one_mixed_zeros(void) } } - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 30)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 17)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 30)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 30)); + mu_assert("sparsity fail", + 
cmp_sparsity(p->wsum_hess, expected_p, expected_i, 16, 30)); free_expr(p); return 0; @@ -373,9 +373,9 @@ const char *test_wsum_hess_prod_axis_one_2x2(void) 0, /* Var 2 (row 0, col 1): only col 0 */ 1}; /* Var 3 (row 1, col 1): only col 0 */ - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 4)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 5)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 4)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 4)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 4, 4)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_prod_axis_zero.h b/tests/wsum_hess/other/test_prod_axis_zero.h index 6a54e0d..e057e47 100644 --- a/tests/wsum_hess/other/test_prod_axis_zero.h +++ b/tests/wsum_hess/other/test_prod_axis_zero.h @@ -63,9 +63,9 @@ const char *test_wsum_hess_prod_axis_zero_no_zeros(void) */ int expected_i[12] = {1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 12)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 9)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 12)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 12)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 8, 12)); free_expr(p); return 0; @@ -75,7 +75,7 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) { /* x is 5x3 variable, global index 1, total 16 vars * x = [1, 1, 1, 1, 1, 2, 0, 3, 4, 5, 1, 0, 0, 2, 3] (column-major) - * Matrix (column-major): + * matrix (column-major): * [1, 2, 1] * [1, 0, 0] * [1, 3, 0] @@ -183,9 +183,9 @@ const char *test_wsum_hess_prod_axis_zero_mixed_zeros(void) } } - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 17)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 75)); - mu_assert("vals fail", 
cmp_double_array(p->wsum_hess->x, expected_x, 75)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 16, 75)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 75)); free_expr(p); return 0; @@ -196,7 +196,7 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) /* Test with a column that has exactly 1 zero * x is 2x2 variable, global index 1, total 5 vars * x = [1.0, 1.0, 2.0, 0.0] (column-major) - * Matrix (column-major): + * matrix (column-major): * [1, 2] * [1, 0] * @@ -245,9 +245,9 @@ const char *test_wsum_hess_prod_axis_zero_one_zero(void) */ int expected_i[8] = {1, 2, 1, 2, 3, 4, 3, 4}; - mu_assert("vals fail", cmp_double_array(p->wsum_hess->x, expected_x, 8)); - mu_assert("rows fail", cmp_int_array(p->wsum_hess->p, expected_p, 6)); - mu_assert("cols fail", cmp_int_array(p->wsum_hess->i, expected_i, 8)); + mu_assert("vals fail", cmp_values(p->wsum_hess, expected_x, 8)); + mu_assert("sparsity fail", + cmp_sparsity(p->wsum_hess, expected_p, expected_i, 5, 8)); free_expr(p); return 0; diff --git a/tests/wsum_hess/other/test_quad_form.h b/tests/wsum_hess/other/test_quad_form.h index 73f8a1d..bd62d5a 100644 --- a/tests/wsum_hess/other/test_quad_form.h +++ b/tests/wsum_hess/other/test_quad_form.h @@ -17,7 +17,7 @@ const char *test_wsum_hess_quad_form(void) * [0 3 4 1] * [0 0 1 6] */ - CSR_Matrix *Q = new_csr_matrix(4, 4, 10); + CSR_matrix *Q = new_CSR_matrix(4, 4, 10); double Qx[10] = {1.0, 2.0, 2.0, 5.0, 3.0, 3.0, 4.0, 1.0, 1.0, 6.0}; int Qi[10] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; int Qp[5] = {0, 2, 5, 8, 10}; @@ -38,11 +38,11 @@ const char *test_wsum_hess_quad_form(void) int expected_i[10] = {3, 4, 3, 4, 5, 4, 5, 6, 5, 6}; double expected_x[10] = {4.0, 8.0, 8.0, 20.0, 12.0, 12.0, 16.0, 4.0, 4.0, 24.0}; - mu_assert("p array fails", cmp_int_array(node->wsum_hess->p, expected_p, 11)); - mu_assert("i array fails", cmp_int_array(node->wsum_hess->i, expected_i, 10)); - mu_assert("x array fails", 
cmp_double_array(node->wsum_hess->x, expected_x, 10)); + mu_assert("sparsity fail", + cmp_sparsity(node->wsum_hess, expected_p, expected_i, 10, 10)); + mu_assert("vals fail", cmp_values(node->wsum_hess, expected_x, 10)); free_expr(node); - free_csr_matrix(Q); + free_CSR_matrix(Q); return 0; }