Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
7fccc10
Merge pull request #91 from BioAnalyticResource/dev
asherpasha Oct 14, 2021
5b9265e
Merge pull request #93 from BioAnalyticResource/dev
asherpasha Nov 5, 2021
7bcdbdf
Merge pull request #97 from BioAnalyticResource/dev
asherpasha Nov 25, 2021
a05f196
Merge pull request #101 from BioAnalyticResource/dev
asherpasha Nov 25, 2021
29e97ab
Merge pull request #119 from BioAnalyticResource/dev
asherpasha Jan 14, 2022
b1fa5fe
Merge pull request #123 from BioAnalyticResource/dev
asherpasha Jan 24, 2022
21b07f5
Merge pull request #132 from BioAnalyticResource/dev
asherpasha Apr 1, 2022
cf5fe5d
Merge pull request #140 from BioAnalyticResource/dev
asherpasha Jun 7, 2022
a184ca5
Merge pull request #145 from BioAnalyticResource/dev
asherpasha Aug 22, 2022
87259b8
Merge pull request #150 from BioAnalyticResource/dev
asherpasha Oct 6, 2022
952e648
Merge pull request #154 from BioAnalyticResource/dev
asherpasha Dec 22, 2022
5c98688
Merge pull request #156 from BioAnalyticResource/dev
asherpasha Jan 1, 2023
b0f3a50
Merge pull request #161 from BioAnalyticResource/dev
asherpasha Feb 17, 2023
5cd3240
Merge pull request #169 from BioAnalyticResource/dev
asherpasha Apr 6, 2023
43fbe2a
Merge pull request #179 from BioAnalyticResource/dev
asherpasha Jul 8, 2023
8b2fdc5
Merge pull request #185 from BioAnalyticResource/dev
asherpasha Jul 26, 2023
804f4aa
Merge pull request #187 from BioAnalyticResource/dev
asherpasha Jul 31, 2023
bf73dcc
Merge pull request #197 from BioAnalyticResource/dev
asherpasha Oct 23, 2023
6cd621b
Merge pull request #202 from BioAnalyticResource/dev
asherpasha Nov 2, 2023
9e7d9b0
Merge pull request #204 from BioAnalyticResource/dev
asherpasha Jan 19, 2024
24a2290
Merge pull request #206 from BioAnalyticResource/dev
asherpasha Mar 4, 2024
20c1679
Merge pull request #208 from BioAnalyticResource/dev
asherpasha Mar 18, 2024
a995369
Merge pull request #215 from BioAnalyticResource/dev
asherpasha Apr 26, 2024
16ce80c
Merge pull request #227 from BioAnalyticResource/dev
asherpasha Jul 8, 2024
3131d5d
Merge pull request #231 from BioAnalyticResource/dev
asherpasha Jul 30, 2024
acf7770
Merge pull request #237 from BioAnalyticResource/dev
asherpasha Aug 21, 2024
a195141
Merge pull request #241 from BioAnalyticResource/dev
asherpasha Sep 27, 2024
9d6a957
Merge pull request #243 from BioAnalyticResource/dev
asherpasha Oct 2, 2024
2885de8
Merge pull request #246 from BioAnalyticResource/dev
asherpasha Dec 1, 2024
9c33aa2
Merge pull request #248 from BioAnalyticResource/dev
asherpasha Dec 4, 2024
a4da029
Merge pull request #250 from BioAnalyticResource/dev
asherpasha Dec 23, 2024
36c52f6
Merge pull request #253 from BioAnalyticResource/dev
asherpasha Jan 21, 2025
0168dae
Merge pull request #256 from BioAnalyticResource/dev
asherpasha Feb 8, 2025
9c31979
Merge pull request #258 from BioAnalyticResource/dev
asherpasha Feb 8, 2025
79ef834
Merge pull request #264 from BioAnalyticResource/dev
asherpasha Apr 19, 2025
2f612af
Merge pull request #274 from BioAnalyticResource/dev
asherpasha Jul 25, 2025
30c03f2
Merge pull request #277 from BioAnalyticResource/dev
asherpasha Aug 7, 2025
e719beb
Merge pull request #279 from BioAnalyticResource/dev
asherpasha Sep 25, 2025
78071ca
Merge pull request #281 from BioAnalyticResource/dev
asherpasha Sep 29, 2025
5aac55a
Merge pull request #283 from BioAnalyticResource/dev
asherpasha Oct 29, 2025
f039563
Merge pull request #298 from BioAnalyticResource/dev
asherpasha Jan 22, 2026
a320300
Merge pull request #311 from BioAnalyticResource/dev
asherpasha Apr 3, 2026
2f87b46
resolve merge conflicts and add remaining EFP pipeline files
rmobmina Apr 7, 2026
a51755c
standardise docstrings and add attribution headers across all EFP files
rmobmina Apr 7, 2026
086115c
fix flake8 E226 whitespace around arithmetic operator
rmobmina Apr 7, 2026
013d2c9
lots of docs added and enhanced comments and notes
rmobmina Apr 13, 2026
708bcf5
cleaned entire repo one last time and ran flake8
rmobmina Apr 13, 2026
4549a93
tests should now all pass
rmobmina Apr 13, 2026
e94514d
tests should pass now
rmobmina Apr 13, 2026
ae0ad34
converted gene_query endpoint from POST to GET
rmobmina Apr 16, 2026
da3530f
added auto complete endpoint
rmobmina Apr 22, 2026
5a6430c
Added autocomplete and genedensity endpoints + tests
rmobmina Apr 30, 2026
4afa6b7
add namespaces to init
rmobmina Apr 30, 2026
999bdbf
Merge remote-tracking branch 'upstream/master'
rmobmina Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,6 @@ output/*

# Local sqlite mirrors generated from MySQL dumps
config/databases/*.db

# Archive test data — large DB dumps, exports, and CSVs; not for version control
archive/test_data/
1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv

This file was deleted.

1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.sql

This file was deleted.

79 changes: 0 additions & 79 deletions api/Archive/embryo_efp_feb_6_2025_dump.sql

This file was deleted.

956 changes: 0 additions & 956 deletions api/Archive/sample_data_export_feb_4.csv

This file was deleted.

1,729 changes: 0 additions & 1,729 deletions api/Archive/sample_data_results.csv

This file was deleted.

23,169 changes: 0 additions & 23,169 deletions api/Archive/schema.sql

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.csv

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.sql

This file was deleted.

39 changes: 39 additions & 0 deletions api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,52 @@
import re as _re
import sqlite3
import statistics as _statistics

from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_restx import Api
from flask_cors import CORS
from flask_caching import Cache
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from sqlalchemy import event
from sqlalchemy.engine import Engine
import os
from pathlib import Path
import tempfile


@event.listens_for(Engine, "connect")
def _register_sqlite_functions(dbapi_conn, connection_record):
    """Register MySQL-compatible SQL functions on SQLite connections.

    Used in CI and local tests where a SQLite mirror stands in for MySQL;
    registers a `std` aggregate and a `regexp_replace` scalar function.
    """
    if not isinstance(dbapi_conn, sqlite3.Connection):
        return

    class _PopStdDev:
        """Aggregate mirroring MySQL's STD(): population standard deviation."""

        def __init__(self):
            self._samples = []

        def step(self, value):
            # NULL inputs are skipped, matching SQL aggregate semantics.
            if value is not None:
                self._samples.append(float(value))

        def finalize(self):
            # Fewer than two values: report NULL rather than a spread of 0.
            return _statistics.pstdev(self._samples) if len(self._samples) > 1 else None

    def _regexp_replace(string, pattern, replacement):
        # SQLite lacks REGEXP_REPLACE; NULL input propagates as NULL output.
        return None if string is None else _re.sub(pattern, replacement, string)

    dbapi_conn.create_aggregate("std", 1, _PopStdDev)
    dbapi_conn.create_function("regexp_replace", 3, _regexp_replace)


def create_app():
"""Initialize the app factory based on the official Flask documentation"""
bar_app = Flask(__name__)
Expand Down Expand Up @@ -149,6 +186,7 @@ def create_app():
from api.resources.fastpheno import fastpheno
from api.resources.llama3 import llama3
from api.resources.gene_expression import gene_expression
from api.resources.gene_density import gene_density

bar_api.add_namespace(gene_information)
bar_api.add_namespace(gaia)
Expand All @@ -165,6 +203,7 @@ def create_app():
bar_api.add_namespace(fastpheno)
bar_api.add_namespace(llama3)
bar_api.add_namespace(gene_expression)
bar_api.add_namespace(gene_density)
bar_api.init_app(bar_app)
return bar_app

Expand Down
48 changes: 48 additions & 0 deletions api/genedensity.cgi
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/python3
"""Legacy CGI endpoint: per-bin gene density for Arabidopsis thaliana.

Query parameters:
    species -- only 'Arabidopsis_thaliana' is handled
    binSize -- bin width in base pairs

Prints a JSON list of {'name': 'Chr N', 'density': [count, ...]} objects,
one per chromosome, or '{}' on any failure.
"""

import cgi
import json
import math

import MySQLdb

# Retrieve parameters
arguments = cgi.FieldStorage()
species = arguments['species'].value
binSize = arguments['binSize'].value

# Print header before any body output, as CGI requires
print('Content-Type: application/json\n')

try:
    if species == 'Arabidopsis_thaliana':
        con = MySQLdb.connect('localhost', 'SAMPLE_USER', 'SAMPLE_PW', 'eplant2')
        cur = con.cursor()
        cur.execute('SELECT geneId,start,end FROM tair10_gff3 WHERE type="gene";')

        # Create and initialize bins for each chromosome (TAIR10 lengths in bp)
        chrLengths = [30427671, 19698289, 23459830, 18585056, 26975502, 154478, 366924]
        chrNames = ['1', '2', '3', '4', '5', 'C', 'M']
        bins = [[0] * int(math.ceil(float(chrLengths[n]) / float(binSize)))
                for n in range(7)]

        for row in cur:
            # Chromosome character is the 3rd character of the gene ID (e.g. AT1G...).
            index = chrNames.index(row[0][2])
            startBin = int(math.floor(float(row[1]) / float(binSize)))
            endBin = int(math.floor(float(row[2]) / float(binSize)))
            # Clamp to the last bin: a gene ending exactly on the chromosome
            # boundary would otherwise index past the list and abort the
            # entire response via the except below.
            endBin = min(endBin, len(bins[index]) - 1)
            for n in range(startBin, endBin + 1):
                bins[index][n] += 1

        chrNames = ['Chr 1', 'Chr 2', 'Chr 3', 'Chr 4', 'Chr 5', 'Chr C', 'Chr M']
        output = [{'name': chrNames[n], 'density': bins[n]} for n in range(7)]

        print(json.dumps(output))
except Exception:
    # Any failure (missing params, bad binSize, DB down) yields an empty object.
    print('{}')
47 changes: 47 additions & 0 deletions api/idautocomplete.cgi
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/python3
"""Legacy CGI endpoint: gene/alias/name autocomplete for Arabidopsis.

Query parameters:
    species -- only 'Arabidopsis_thaliana' is handled
    term    -- substring matched against AGIs, aliases, and gene names

Prints a JSON list of suggestion strings ("AGI/alias", "AGI/name", or a
bare gene ID), or '{}' on any failure.
"""

import cgi
import json

import MySQLdb

# Retrieve parameters
arguments = cgi.FieldStorage()
species = arguments['species'].value
term = arguments['term'].value

# Print header
print('Content-Type: application/json')
print('Access-Control-Allow-Origin: *\n')

try:
    if species == 'Arabidopsis_thaliana':
        con = MySQLdb.connect('localhost', 'SAMPLE_USER', 'SAMPLE_PW', 'eplant2')
        cur = con.cursor()

        # Parameterized LIKE pattern: previously the raw term was concatenated
        # straight into the SQL string, which allowed SQL injection. Note '%'
        # and '_' inside the term still act as LIKE wildcards, as before.
        pattern = '%' + term + '%'
        output = []

        # First add aliases
        cur.execute(
            'SELECT agi,alias FROM agi_alias WHERE agi LIKE %s OR alias LIKE %s LIMIT 15;',
            (pattern, pattern),
        )
        for row in cur:
            output.append(row[0] + "/" + row[1])

        # Now add names
        cur.execute(
            'SELECT agi,name FROM agi_names WHERE agi LIKE %s OR name LIKE %s LIMIT 15;',
            (pattern, pattern),
        )
        for row in cur:
            output.append(row[0] + "/" + row[1])

        # Top up with bare gene IDs not already covered by an alias/name hit
        if len(output) < 15:
            cur.execute(
                'SELECT geneId FROM tair10_gff3 WHERE type="gene" AND geneId LIKE %s LIMIT %s;',
                (pattern, 15 - len(output)),
            )
            for row in cur:
                duplicate = any(identifier.upper().startswith(row[0].upper())
                                for identifier in output)
                if not duplicate:
                    output.append(row[0])
        print(json.dumps(output))
except Exception:
    # Any failure (missing params, DB down) yields an empty JSON object.
    print("{}")
9 changes: 6 additions & 3 deletions api/models/efp_dynamic.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""
Dynamic SQLAlchemy model generation for simple eFP databases.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

This module provides runtime generation of SQLAlchemy ORM models from schema
definitions, enabling dynamic database access without hardcoded model classes.
Dynamic SQLAlchemy model generation for all eFP databases.

At import time, one ORM model class is generated per database entry in
SIMPLE_EFP_DATABASE_SCHEMAS and stored in SIMPLE_EFP_SAMPLE_MODELS.
This replaces ~1,984 lines of hand-written boilerplate with a single registry.
"""

from __future__ import annotations
Expand Down
25 changes: 14 additions & 11 deletions api/models/efp_schemas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""
Compact schema definitions for eFP databases exposing a sample_data table.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

Each database only needs 3 columns: data_probeset_id, data_signal, data_bot_id.
All databases share the same column structure (VARCHAR(255) for string columns).
Schema definitions for all eFP databases that expose a sample_data table.

Every database shares the same three-column structure:
data_probeset_id (VARCHAR 255), data_signal (FLOAT), data_bot_id (VARCHAR 255).

To add a new database, append one tuple to _SPECS — no other changes needed.
"""

from __future__ import annotations
Expand All @@ -25,11 +29,12 @@


def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"""Build a schema for one eFP database.
"""Build a schema entry for one eFP database.

:param species: Species name for metadata.
:param charset: MySQL character set ('latin1' or 'utf8mb4').
:return: Full database schema specification.
:param species: Species name stored in metadata (e.g., 'arabidopsis').
:param charset: MySQL character set — 'latin1' for most, 'utf8mb4' for non-Latin labels.
:returns: Full database schema dict ready for model generation.
:rtype: DatabaseSpec
"""
return {
**_SCHEMA_TEMPLATE,
Expand Down Expand Up @@ -234,9 +239,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
("willow", "willow"),
]

# databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
# For Arabidopsis databases in this set, the API will auto-convert AGI → probeset
# via the at_agi_lookup service before querying expression data.
# Databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
_PROBESET_DBS = {
# Arabidopsis microarray databases (Affymetrix ATH1 chip, need AGI→probeset lookup)
"affydb",
Expand Down Expand Up @@ -272,7 +275,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"triticale_mas",
}

# databases that use utf8mb4 charset (all others default to latin1)
# Databases that use utf8mb4 charset (all others default to latin1)
_UTF8MB4 = {
"actinidia_bud_development", "actinidia_flower_fruit_development",
"actinidia_postharvest", "actinidia_vegetative_growth", "apple",
Expand Down
8 changes: 8 additions & 0 deletions api/models/eplant2.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@ class AgiAlias(db.Model):

agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
alias: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)


class AgiNames(db.Model):
    """Model for the agi_names table in the eplant2 database."""

    __bind_key__ = "eplant2"
    __tablename__ = "agi_names"

    # Composite primary key spans both columns.
    agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
    name: db.Mapped[str] = db.mapped_column(db.String(255), nullable=False, primary_key=True)
107 changes: 107 additions & 0 deletions api/resources/gene_density.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Reena Obmina | UTEA Project 2026 | University of Toronto

Gene density endpoint for the BAR API.

Returns per-bin gene density across all Arabidopsis thaliana chromosomes for a
given bin size (in base pairs), as used by Eplant's ChromosomeView to colour
chromosomes by gene density.

Reads: eplant2.tair10_gff3
Writes: JSON — list of chromosomes with density arrays

Usage::

GET /gene_density?species=Arabidopsis_thaliana&bin_size=143061.51645207437
"""

from flask import request
from flask_restx import Namespace, Resource
from markupsafe import escape
from sqlalchemy import func
from api import db
from api.models.eplant2 import TAIR10GFF3
from api.utils.bar_utils import BARUtils
import math

# Flask-RESTX namespace registered in api/__init__.py under /gene_density.
gene_density = Namespace("Gene Density", description="Gene density API", path="/gene_density")

# Arabidopsis thaliana chromosome lengths (bp) and display order
_CHR_LENGTHS = {
    "1": 30427671,
    "2": 19698289,
    "3": 23459830,
    "4": 18585056,
    "5": 26975502,
    "C": 154478,  # chloroplast
    "M": 366924,  # mitochondrion
}
# Chromosomes appear in the JSON response in this fixed order.
_CHR_ORDER = ["1", "2", "3", "4", "5", "C", "M"]


@gene_density.route("")
class GeneDensity(Resource):
    @gene_density.param("species", description="Species name", default="Arabidopsis_thaliana")
    @gene_density.param("bin_size", description="Bin size in base pairs", default="143061.51645207437")
    def get(self):
        """Returns gene density per chromosome bin for the given species and bin size.

        :returns: JSON list of {"name": "Chr X", "density": [...]} in _CHR_ORDER,
            or an error payload with HTTP 400 for invalid parameters.
        """
        species = escape(request.args.get("species", ""))
        bin_size_str = request.args.get("bin_size", "")

        if not species:
            return BARUtils.error_exit("Missing species parameter"), 400
        if not bin_size_str:
            return BARUtils.error_exit("Missing bin_size parameter"), 400

        try:
            bin_size = float(bin_size_str)
        except ValueError:
            return BARUtils.error_exit("Invalid bin_size"), 400
        # Reject NaN/inf as well as non-positive values: NaN would crash
        # math.ceil below (HTTP 500) and inf would silently yield zero bins.
        if not math.isfinite(bin_size) or bin_size <= 0:
            return BARUtils.error_exit("bin_size must be a positive number"), 400

        if species != "Arabidopsis_thaliana":
            return BARUtils.error_exit("Invalid species"), 400

        # One zeroed counter list per chromosome, sized to cover its full length.
        bins = {c: [0] * math.ceil(_CHR_LENGTHS[c] / bin_size) for c in _CHR_ORDER}

        # Chromosome character is position 3 of the gene ID (e.g. AT1G... -> '1').
        chr_expr = func.substr(TAIR10GFF3.geneId, 3, 1)
        start_bin_expr = func.floor(TAIR10GFF3.Start / bin_size)
        end_bin_expr = func.floor(TAIR10GFF3.End / bin_size)

        # Aggregated query for single-bin genes (~98%+ of all genes at typical zoom levels).
        # FLOOR(start/binSize) == FLOOR(end/binSize) means the gene fits within one bin,
        # so GROUP BY is safe and avoids fetching one row per gene.
        single_bin_rows = db.session.execute(
            db.select(chr_expr, start_bin_expr, func.count())
            .where(
                TAIR10GFF3.Type == "gene",
                start_bin_expr == end_bin_expr,
            )
            .group_by(chr_expr, start_bin_expr)
        ).all()

        for chr_char, bin_idx, cnt in single_bin_rows:
            if chr_char in bins:
                idx = int(bin_idx)
                if 0 <= idx < len(bins[chr_char]):
                    bins[chr_char][idx] += cnt

        # Individual rows for genes that span multiple bins (rare — typically <2% of genes).
        # Each such gene is counted once in every bin it spans, matching the original behaviour.
        multi_bin_rows = db.session.execute(
            db.select(chr_expr, start_bin_expr, end_bin_expr)
            .where(
                TAIR10GFF3.Type == "gene",
                start_bin_expr != end_bin_expr,
            )
        ).all()

        for chr_char, start_bin, end_bin in multi_bin_rows:
            if chr_char in bins:
                for n in range(int(start_bin), int(end_bin) + 1):
                    if 0 <= n < len(bins[chr_char]):
                        bins[chr_char][n] += 1

        output = [{"name": f"Chr {c}", "density": bins[c]} for c in _CHR_ORDER]
        return BARUtils.success_exit(output)
Loading
Loading