Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
7fccc10
Merge pull request #91 from BioAnalyticResource/dev
asherpasha Oct 14, 2021
5b9265e
Merge pull request #93 from BioAnalyticResource/dev
asherpasha Nov 5, 2021
7bcdbdf
Merge pull request #97 from BioAnalyticResource/dev
asherpasha Nov 25, 2021
a05f196
Merge pull request #101 from BioAnalyticResource/dev
asherpasha Nov 25, 2021
29e97ab
Merge pull request #119 from BioAnalyticResource/dev
asherpasha Jan 14, 2022
b1fa5fe
Merge pull request #123 from BioAnalyticResource/dev
asherpasha Jan 24, 2022
21b07f5
Merge pull request #132 from BioAnalyticResource/dev
asherpasha Apr 1, 2022
cf5fe5d
Merge pull request #140 from BioAnalyticResource/dev
asherpasha Jun 7, 2022
a184ca5
Merge pull request #145 from BioAnalyticResource/dev
asherpasha Aug 22, 2022
87259b8
Merge pull request #150 from BioAnalyticResource/dev
asherpasha Oct 6, 2022
952e648
Merge pull request #154 from BioAnalyticResource/dev
asherpasha Dec 22, 2022
5c98688
Merge pull request #156 from BioAnalyticResource/dev
asherpasha Jan 1, 2023
b0f3a50
Merge pull request #161 from BioAnalyticResource/dev
asherpasha Feb 17, 2023
5cd3240
Merge pull request #169 from BioAnalyticResource/dev
asherpasha Apr 6, 2023
43fbe2a
Merge pull request #179 from BioAnalyticResource/dev
asherpasha Jul 8, 2023
8b2fdc5
Merge pull request #185 from BioAnalyticResource/dev
asherpasha Jul 26, 2023
804f4aa
Merge pull request #187 from BioAnalyticResource/dev
asherpasha Jul 31, 2023
bf73dcc
Merge pull request #197 from BioAnalyticResource/dev
asherpasha Oct 23, 2023
6cd621b
Merge pull request #202 from BioAnalyticResource/dev
asherpasha Nov 2, 2023
9e7d9b0
Merge pull request #204 from BioAnalyticResource/dev
asherpasha Jan 19, 2024
24a2290
Merge pull request #206 from BioAnalyticResource/dev
asherpasha Mar 4, 2024
20c1679
Merge pull request #208 from BioAnalyticResource/dev
asherpasha Mar 18, 2024
a995369
Merge pull request #215 from BioAnalyticResource/dev
asherpasha Apr 26, 2024
16ce80c
Merge pull request #227 from BioAnalyticResource/dev
asherpasha Jul 8, 2024
3131d5d
Merge pull request #231 from BioAnalyticResource/dev
asherpasha Jul 30, 2024
acf7770
Merge pull request #237 from BioAnalyticResource/dev
asherpasha Aug 21, 2024
a195141
Merge pull request #241 from BioAnalyticResource/dev
asherpasha Sep 27, 2024
9d6a957
Merge pull request #243 from BioAnalyticResource/dev
asherpasha Oct 2, 2024
2885de8
Merge pull request #246 from BioAnalyticResource/dev
asherpasha Dec 1, 2024
9c33aa2
Merge pull request #248 from BioAnalyticResource/dev
asherpasha Dec 4, 2024
a4da029
Merge pull request #250 from BioAnalyticResource/dev
asherpasha Dec 23, 2024
36c52f6
Merge pull request #253 from BioAnalyticResource/dev
asherpasha Jan 21, 2025
0168dae
Merge pull request #256 from BioAnalyticResource/dev
asherpasha Feb 8, 2025
9c31979
Merge pull request #258 from BioAnalyticResource/dev
asherpasha Feb 8, 2025
79ef834
Merge pull request #264 from BioAnalyticResource/dev
asherpasha Apr 19, 2025
2f612af
Merge pull request #274 from BioAnalyticResource/dev
asherpasha Jul 25, 2025
30c03f2
Merge pull request #277 from BioAnalyticResource/dev
asherpasha Aug 7, 2025
e719beb
Merge pull request #279 from BioAnalyticResource/dev
asherpasha Sep 25, 2025
78071ca
Merge pull request #281 from BioAnalyticResource/dev
asherpasha Sep 29, 2025
5aac55a
Merge pull request #283 from BioAnalyticResource/dev
asherpasha Oct 29, 2025
f039563
Merge pull request #298 from BioAnalyticResource/dev
asherpasha Jan 22, 2026
a320300
Merge pull request #311 from BioAnalyticResource/dev
asherpasha Apr 3, 2026
2f87b46
resolve merge conflicts and add remaining EFP pipeline files
rmobmina Apr 7, 2026
a51755c
standardise docstrings and add attribution headers across all EFP files
rmobmina Apr 7, 2026
086115c
fix flake8 E226 whitespace around arithmetic operator
rmobmina Apr 7, 2026
013d2c9
lots of docs added and enhanced comments and notes
rmobmina Apr 13, 2026
708bcf5
cleaned entire repo one last time and ran flake8
rmobmina Apr 13, 2026
4549a93
tests should now all pass
rmobmina Apr 13, 2026
e94514d
tests should pass now
rmobmina Apr 13, 2026
ae0ad34
converted gene_query endpoint from POST to GET
rmobmina Apr 16, 2026
da3530f
added auto complete endpoint
rmobmina Apr 22, 2026
5a6430c
Added autocomplete and genedensity endpoints + tests
rmobmina Apr 30, 2026
4afa6b7
add namespaces to init
rmobmina Apr 30, 2026
999bdbf
Merge remote-tracking branch 'upstream/master'
rmobmina Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,6 @@ output/*

# Local sqlite mirrors generated from MySQL dumps
config/databases/*.db

# Archive test data — large DB dumps, exports, and CSVs; not for version control
archive/test_data/
1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv

This file was deleted.

1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.sql

This file was deleted.

79 changes: 0 additions & 79 deletions api/Archive/embryo_efp_feb_6_2025_dump.sql

This file was deleted.

956 changes: 0 additions & 956 deletions api/Archive/sample_data_export_feb_4.csv

This file was deleted.

1,729 changes: 0 additions & 1,729 deletions api/Archive/sample_data_results.csv

This file was deleted.

23,169 changes: 0 additions & 23,169 deletions api/Archive/schema.sql

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.csv

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.sql

This file was deleted.

39 changes: 39 additions & 0 deletions api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,52 @@
import re as _re
import sqlite3
import statistics as _statistics

from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_restx import Api
from flask_cors import CORS
from flask_caching import Cache
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from sqlalchemy import event
from sqlalchemy.engine import Engine
import os
from pathlib import Path
import tempfile


@event.listens_for(Engine, "connect")
def _register_sqlite_functions(dbapi_conn, connection_record):
    """Register MySQL-compatible SQL functions on SQLite connections.

    Used in CI and local tests where a SQLite mirror stands in for MySQL;
    registers a `std` aggregate and a `regexp_replace` scalar function.
    """
    if not isinstance(dbapi_conn, sqlite3.Connection):
        return

    class _PopStdDev:
        """Aggregate mirroring MySQL's STD(): population standard deviation."""

        def __init__(self):
            self._samples = []

        def step(self, value):
            # NULL inputs are skipped, matching SQL aggregate semantics.
            if value is not None:
                self._samples.append(float(value))

        def finalize(self):
            # Fewer than two values: report NULL rather than a spread of 0.
            return _statistics.pstdev(self._samples) if len(self._samples) > 1 else None

    def _regexp_replace(string, pattern, replacement):
        # SQLite lacks REGEXP_REPLACE; NULL input propagates as NULL output.
        return None if string is None else _re.sub(pattern, replacement, string)

    dbapi_conn.create_aggregate("std", 1, _PopStdDev)
    dbapi_conn.create_function("regexp_replace", 3, _regexp_replace)


def create_app():
"""Initialize the app factory based on the official Flask documentation"""
bar_app = Flask(__name__)
Expand Down Expand Up @@ -149,6 +186,7 @@ def create_app():
from api.resources.fastpheno import fastpheno
from api.resources.llama3 import llama3
from api.resources.gene_expression import gene_expression
from api.resources.gene_density import gene_density

bar_api.add_namespace(gene_information)
bar_api.add_namespace(gaia)
Expand All @@ -165,6 +203,7 @@ def create_app():
bar_api.add_namespace(fastpheno)
bar_api.add_namespace(llama3)
bar_api.add_namespace(gene_expression)
bar_api.add_namespace(gene_density)
bar_api.init_app(bar_app)
return bar_app

Expand Down
48 changes: 48 additions & 0 deletions api/genedensity.cgi
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/python3
"""Legacy CGI endpoint: per-bin gene density for Arabidopsis thaliana.

Query parameters:
    species -- only 'Arabidopsis_thaliana' is handled
    binSize -- bin width in base pairs

Prints a JSON list of {'name': 'Chr N', 'density': [count, ...]} objects,
one per chromosome, or '{}' on any failure.
"""

import cgi
import json
import math

import MySQLdb

# Retrieve parameters
arguments = cgi.FieldStorage()
species = arguments['species'].value
binSize = arguments['binSize'].value

# Print header before any body output, as CGI requires
print('Content-Type: application/json\n')

try:
    if species == 'Arabidopsis_thaliana':
        con = MySQLdb.connect('localhost', 'SAMPLE_USER', 'SAMPLE_PW', 'eplant2')
        cur = con.cursor()
        cur.execute('SELECT geneId,start,end FROM tair10_gff3 WHERE type="gene";')

        # Create and initialize bins for each chromosome (TAIR10 lengths in bp)
        chrLengths = [30427671, 19698289, 23459830, 18585056, 26975502, 154478, 366924]
        chrNames = ['1', '2', '3', '4', '5', 'C', 'M']
        bins = [[0] * int(math.ceil(float(chrLengths[n]) / float(binSize)))
                for n in range(7)]

        for row in cur:
            # Chromosome character is the 3rd character of the gene ID (e.g. AT1G...).
            index = chrNames.index(row[0][2])
            startBin = int(math.floor(float(row[1]) / float(binSize)))
            endBin = int(math.floor(float(row[2]) / float(binSize)))
            # Clamp to the last bin: a gene ending exactly on the chromosome
            # boundary would otherwise index past the list and abort the
            # entire response via the except below.
            endBin = min(endBin, len(bins[index]) - 1)
            for n in range(startBin, endBin + 1):
                bins[index][n] += 1

        chrNames = ['Chr 1', 'Chr 2', 'Chr 3', 'Chr 4', 'Chr 5', 'Chr C', 'Chr M']
        output = [{'name': chrNames[n], 'density': bins[n]} for n in range(7)]

        print(json.dumps(output))
except Exception:
    # Any failure (missing params, bad binSize, DB down) yields an empty object.
    print('{}')
47 changes: 47 additions & 0 deletions api/idautocomplete.cgi
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/python3
"""Legacy CGI endpoint: gene/alias/name autocomplete for Arabidopsis.

Query parameters:
    species -- only 'Arabidopsis_thaliana' is handled
    term    -- substring matched against AGIs, aliases, and gene names

Prints a JSON list of suggestion strings ("AGI/alias", "AGI/name", or a
bare gene ID), or '{}' on any failure.
"""

import cgi
import json

import MySQLdb

# Retrieve parameters
arguments = cgi.FieldStorage()
species = arguments['species'].value
term = arguments['term'].value

# Print header
print('Content-Type: application/json')
print('Access-Control-Allow-Origin: *\n')

try:
    if species == 'Arabidopsis_thaliana':
        con = MySQLdb.connect('localhost', 'SAMPLE_USER', 'SAMPLE_PW', 'eplant2')
        cur = con.cursor()

        # Parameterized LIKE pattern: previously the raw term was concatenated
        # straight into the SQL string, which allowed SQL injection. Note '%'
        # and '_' inside the term still act as LIKE wildcards, as before.
        pattern = '%' + term + '%'
        output = []

        # First add aliases
        cur.execute(
            'SELECT agi,alias FROM agi_alias WHERE agi LIKE %s OR alias LIKE %s LIMIT 15;',
            (pattern, pattern),
        )
        for row in cur:
            output.append(row[0] + "/" + row[1])

        # Now add names
        cur.execute(
            'SELECT agi,name FROM agi_names WHERE agi LIKE %s OR name LIKE %s LIMIT 15;',
            (pattern, pattern),
        )
        for row in cur:
            output.append(row[0] + "/" + row[1])

        # Top up with bare gene IDs not already covered by an alias/name hit
        if len(output) < 15:
            cur.execute(
                'SELECT geneId FROM tair10_gff3 WHERE type="gene" AND geneId LIKE %s LIMIT %s;',
                (pattern, 15 - len(output)),
            )
            for row in cur:
                duplicate = any(identifier.upper().startswith(row[0].upper())
                                for identifier in output)
                if not duplicate:
                    output.append(row[0])
        print(json.dumps(output))
except Exception:
    # Any failure (missing params, DB down) yields an empty JSON object.
    print("{}")
9 changes: 6 additions & 3 deletions api/models/efp_dynamic.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""
Dynamic SQLAlchemy model generation for simple eFP databases.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

This module provides runtime generation of SQLAlchemy ORM models from schema
definitions, enabling dynamic database access without hardcoded model classes.
Dynamic SQLAlchemy model generation for all eFP databases.

At import time, one ORM model class is generated per database entry in
SIMPLE_EFP_DATABASE_SCHEMAS and stored in SIMPLE_EFP_SAMPLE_MODELS.
This replaces ~1,984 lines of hand-written boilerplate with a single registry.
"""

from __future__ import annotations
Expand Down
25 changes: 14 additions & 11 deletions api/models/efp_schemas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""
Compact schema definitions for eFP databases exposing a sample_data table.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

Each database only needs 3 columns: data_probeset_id, data_signal, data_bot_id.
All databases share the same column structure (VARCHAR(255) for string columns).
Schema definitions for all eFP databases that expose a sample_data table.

Every database shares the same three-column structure:
data_probeset_id (VARCHAR 255), data_signal (FLOAT), data_bot_id (VARCHAR 255).

To add a new database, append one tuple to _SPECS — no other changes needed.
"""

from __future__ import annotations
Expand All @@ -25,11 +29,12 @@


def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"""Build a schema for one eFP database.
"""Build a schema entry for one eFP database.

:param species: Species name for metadata.
:param charset: MySQL character set ('latin1' or 'utf8mb4').
:return: Full database schema specification.
:param species: Species name stored in metadata (e.g., 'arabidopsis').
:param charset: MySQL character set — 'latin1' for most, 'utf8mb4' for non-Latin labels.
:returns: Full database schema dict ready for model generation.
:rtype: DatabaseSpec
"""
return {
**_SCHEMA_TEMPLATE,
Expand Down Expand Up @@ -234,9 +239,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
("willow", "willow"),
]

# databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
# For Arabidopsis databases in this set, the API will auto-convert AGI → probeset
# via the at_agi_lookup service before querying expression data.
# Databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
_PROBESET_DBS = {
# Arabidopsis microarray databases (Affymetrix ATH1 chip, need AGI→probeset lookup)
"affydb",
Expand Down Expand Up @@ -272,7 +275,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"triticale_mas",
}

# databases that use utf8mb4 charset (all others default to latin1)
# Databases that use utf8mb4 charset (all others default to latin1)
_UTF8MB4 = {
"actinidia_bud_development", "actinidia_flower_fruit_development",
"actinidia_postharvest", "actinidia_vegetative_growth", "apple",
Expand Down
8 changes: 8 additions & 0 deletions api/models/eplant2.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@ class AgiAlias(db.Model):

agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
alias: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)


class AgiNames(db.Model):
    """Model for the agi_names table in the eplant2 database."""

    __bind_key__ = "eplant2"
    __tablename__ = "agi_names"

    # Composite primary key spans both columns.
    agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
    name: db.Mapped[str] = db.mapped_column(db.String(255), nullable=False, primary_key=True)
107 changes: 107 additions & 0 deletions api/resources/gene_density.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Reena Obmina | UTEA Project 2026 | University of Toronto

Gene density endpoint for the BAR API.

Returns per-bin gene density across all Arabidopsis thaliana chromosomes for a
given bin size (in base pairs), as used by Eplant's ChromosomeView to colour
chromosomes by gene density.

Reads: eplant2.tair10_gff3
Writes: JSON — list of chromosomes with density arrays

Usage::

GET /gene_density?species=Arabidopsis_thaliana&bin_size=143061.51645207437
"""

from flask import request
from flask_restx import Namespace, Resource
from markupsafe import escape
from sqlalchemy import func
from api import db
from api.models.eplant2 import TAIR10GFF3
from api.utils.bar_utils import BARUtils
import math

# Flask-RESTX namespace registered in api/__init__.py under /gene_density.
gene_density = Namespace("Gene Density", description="Gene density API", path="/gene_density")

# Arabidopsis thaliana chromosome lengths (bp) and display order
_CHR_LENGTHS = {
    "1": 30427671,
    "2": 19698289,
    "3": 23459830,
    "4": 18585056,
    "5": 26975502,
    "C": 154478,  # chloroplast
    "M": 366924,  # mitochondrion
}
# Chromosomes appear in the JSON response in this fixed order.
_CHR_ORDER = ["1", "2", "3", "4", "5", "C", "M"]


@gene_density.route("")
class GeneDensity(Resource):
    @gene_density.param("species", description="Species name", default="Arabidopsis_thaliana")
    @gene_density.param("bin_size", description="Bin size in base pairs", default="143061.51645207437")
    def get(self):
        """Returns gene density per chromosome bin for the given species and bin size.

        :returns: JSON list of {"name": "Chr X", "density": [...]} in _CHR_ORDER,
            or an error payload with HTTP 400 for invalid parameters.
        """
        species = escape(request.args.get("species", ""))
        bin_size_str = request.args.get("bin_size", "")

        if not species:
            return BARUtils.error_exit("Missing species parameter"), 400
        if not bin_size_str:
            return BARUtils.error_exit("Missing bin_size parameter"), 400

        try:
            bin_size = float(bin_size_str)
        except ValueError:
            return BARUtils.error_exit("Invalid bin_size"), 400
        # Reject NaN/inf as well as non-positive values: NaN would crash
        # math.ceil below (HTTP 500) and inf would silently yield zero bins.
        if not math.isfinite(bin_size) or bin_size <= 0:
            return BARUtils.error_exit("bin_size must be a positive number"), 400

        if species != "Arabidopsis_thaliana":
            return BARUtils.error_exit("Invalid species"), 400

        # One zeroed counter list per chromosome, sized to cover its full length.
        bins = {c: [0] * math.ceil(_CHR_LENGTHS[c] / bin_size) for c in _CHR_ORDER}

        # Chromosome character is position 3 of the gene ID (e.g. AT1G... -> '1').
        chr_expr = func.substr(TAIR10GFF3.geneId, 3, 1)
        start_bin_expr = func.floor(TAIR10GFF3.Start / bin_size)
        end_bin_expr = func.floor(TAIR10GFF3.End / bin_size)

        # Aggregated query for single-bin genes (~98%+ of all genes at typical zoom levels).
        # FLOOR(start/binSize) == FLOOR(end/binSize) means the gene fits within one bin,
        # so GROUP BY is safe and avoids fetching one row per gene.
        single_bin_rows = db.session.execute(
            db.select(chr_expr, start_bin_expr, func.count())
            .where(
                TAIR10GFF3.Type == "gene",
                start_bin_expr == end_bin_expr,
            )
            .group_by(chr_expr, start_bin_expr)
        ).all()

        for chr_char, bin_idx, cnt in single_bin_rows:
            if chr_char in bins:
                idx = int(bin_idx)
                if 0 <= idx < len(bins[chr_char]):
                    bins[chr_char][idx] += cnt

        # Individual rows for genes that span multiple bins (rare — typically <2% of genes).
        # Each such gene is counted once in every bin it spans, matching the original behaviour.
        multi_bin_rows = db.session.execute(
            db.select(chr_expr, start_bin_expr, end_bin_expr)
            .where(
                TAIR10GFF3.Type == "gene",
                start_bin_expr != end_bin_expr,
            )
        ).all()

        for chr_char, start_bin, end_bin in multi_bin_rows:
            if chr_char in bins:
                for n in range(int(start_bin), int(end_bin) + 1):
                    if 0 <= n < len(bins[chr_char]):
                        bins[chr_char][n] += 1

        output = [{"name": f"Chr {c}", "density": bins[c]} for c in _CHR_ORDER]
        return BARUtils.success_exit(output)
Loading
Loading