From 2447feb4c3bc67ced6a5e0d4d3656d0c604c3446 Mon Sep 17 00:00:00 2001 From: Rita Silva Date: Wed, 17 Sep 2025 16:47:55 +0100 Subject: [PATCH 1/7] Revert "Merge pull request #75 from DiseaseTranscriptomicsLab/main" This reverts commit 4c58b64de448bec1fd7bc5f78ff33bc25093bb59, reversing changes made to f6335ca6817d5e7ef3a17844e7c5b997e310283f. --- DESCRIPTION | 6 +- NAMESPACE | 1 - NEWS.md | 17 - R/FPR_Simulation.R | 2 +- R/PlotScores.R | 32 +- R/data.R | 25 +- R/geneset_similarity.R | 237 +++---------- R/zzz.R | 13 - README.Rmd | 101 +++--- README.md | 195 +++++------ man/FPR_Simulation.Rd | 2 +- man/counts_example.Rd | 10 +- man/geneset_similarity.Rd | 41 +-- man/genesets_example.Rd | 4 +- man/metadata_example.Rd | 2 +- tests/testthat/test-geneset_similarity.R | 44 ++- .../articles/Article_BenchmarkingMode.Rmd | 2 +- vignettes/articles/Article_DiscoveryMode.Rmd | 2 +- .../articles/Article_GeneSetSimilarity.Rmd | 12 +- vignettes/markeR.Rmd | 317 +++++++++--------- 20 files changed, 443 insertions(+), 622 deletions(-) delete mode 100644 R/zzz.R diff --git a/DESCRIPTION b/DESCRIPTION index 0c8e339..375b4a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 0.99.5 +Version: 0.99.3 Authors@R: c( person("Rita", "Martins-Silva", @@ -58,9 +58,7 @@ Suggests: rmarkdown, roxygen2, mockery, - covr, - magick, - BiocStyle + covr Config/testthat/edition: 3 Depends: R (>= 4.5.0) diff --git a/NAMESPACE b/NAMESPACE index 45c11f7..5339a3d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,7 +63,6 @@ importFrom(pROC,roc) importFrom(reshape2,melt) importFrom(rstatix,t_test) importFrom(scales,hue_pal) -importFrom(scales,rescale) importFrom(scales,squish) importFrom(stats,TukeyHSD) importFrom(stats,anova) diff --git a/NEWS.md b/NEWS.md index 538bb1e..96e6b5c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,20 +1,3 @@ -# markeR 0.99.5 (17 Sep, 2025) - -- Minor fix in `.onAttach()` to avoid errors when checking `ggplot2` version and ensure the startup warning works correctly. - -# markeR 0.99.4 (17 Sep, 2025) - -## General -- Addressed feedback from the Bioconductor review process with updates to documentation and vignette style. - -## Documentation and vignette -- Updated vignette style to **Bioconductor’s BiocStyle** with automatic table of contents. -- Improved vignette content with small corrections. -- Revised dataset documentation by adding explicit `usage: data(object)` entries. - -## Functions -- Updated `geneset_similarity()` color handling: replaced the single `color_values` parameter with three new parameters — `color`, `neutral_color`, and `cold_color`, for more interpretable visualization. - # markeR 0.99.3 (21 Aug, 2025) ## Package size and structure diff --git a/R/FPR_Simulation.R b/R/FPR_Simulation.R index 02c760b..1fb9d29 100644 --- a/R/FPR_Simulation.R +++ b/R/FPR_Simulation.R @@ -108,7 +108,7 @@ utils::globalVariables(c( "cohen", "method", "contrast" )) #' @export #' FPR_Simulation <- function(data, metadata, original_signatures, Variable, - gene_list = NULL, number_of_sims=100, title=NULL, + gene_list = NULL, number_of_sims=10, title=NULL, widthTitle = 30, titlesize = 12, pointSize = 2, labsize = 10,mode = c( "none","simple","medium","extensive"), ColorValues=NULL, ncol=NULL, nrow=NULL) { diff --git a/R/PlotScores.R b/R/PlotScores.R index 8ab0ad4..5a3a94c 100644 --- a/R/PlotScores.R +++ b/R/PlotScores.R @@ -786,16 +786,16 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - # if (method == "ssGSEA") { - # ylab <- "ssGSEA Enrichment Score" - # } else if (method == "logmedian") { - # ylab <- "Normalised Signature Score" - # } else if (method == "ranking") { - # ylab <- "Signature Genes' Ranking" - # } + if (method == "ssGSEA") { + ylab <- "ssGSEA Enrichment Score" + } else if (method == "logmedian") { + ylab <- "Normalised Signature Score" + } else if (method == "ranking") { + ylab <- "Signature Genes' Ranking" + } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), + left = grid::textGrob(ylab, rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, fontsize = labsize)), @@ -1093,16 +1093,16 @@ PlotScores_Numeric <- function(data, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - # if (method == "ssGSEA") { - # ylab <- "ssGSEA Enrichment Score" - # } else if (method == "logmedian") { - # ylab <- "Normalised Signature Score" - # } else if (method == "ranking") { - # ylab <- "Signature Genes' Ranking" - # } + if (method == "ssGSEA") { + ylab <- "ssGSEA Enrichment Score" + } else if (method == "logmedian") { + ylab <- "Normalised Signature Score" + } else if (method == "ranking") { + ylab <- "Signature Genes' Ranking" + } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), + left = grid::textGrob(ylab, rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, diff --git a/R/data.R b/R/data.R index 2c8557a..748e00a 100644 --- a/R/data.R +++ b/R/data.R @@ -15,7 +15,6 @@ #' for senescent samples, "young" for proliferative).} #' } #' -#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -25,15 +24,14 @@ #' @references Marthandan S, Baumgart M, Priebe S, Groth M et al. Conserved #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. PLoS One 2016;11(5):e0154531. PMID: 27140416 -#' -#' @usage data(metadata_example) +#' +#' @keywords datasets "metadata_example" #' Gene Expression Counts for Marthandan et al. (2016) RNA-Seq Data #' -#' A numeric matrix containing filtered and normalized (non log-transformed) -#' gene expression data from the Marthandan et al. (2016) study (GEO accession -#' GSE63577). +#' A numeric matrix containing filtered and normalized gene expression data from +#' the Marthandan et al. (2016) study (GEO accession GSE63577). #' #' Raw FASTQ files were downloaded using `fasterq-dump` (v2.11.0) and processed #' in a reproducible conda environment (Python v3.11.5). Quality control was @@ -41,7 +39,8 @@ #' Pseudo-alignment to the RefSeq transcriptome (NCBI release 109) was performed #' using kallisto (v0.44.0). Genes with low expression (mean count < 70 in all #' conditions) were filtered out. Count normalization factors were calculated -#' with `edgeR::calcNormFactors`. +#' with `edgeR::calcNormFactors`, and log2-transformed values were obtained via +#' `limma::voom`. #' #' Intermediate time points for HFF and MRC5 cell lines were excluded, resulting #' in a final dataset with 45 high-quality samples across proliferative, @@ -53,7 +52,7 @@ #' #' @format A numeric matrix with rows as genes (gene symbols) and columns as #' samples (sample IDs). -#' +#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -64,8 +63,8 @@ #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. #' *PLoS One* 2016;11(5):e0154531. PMID: 27140416 -#' -#' @usage data(counts_example) +#' +#' @keywords datasets "counts_example" #' Example Gene Sets for Cellular Senescence @@ -76,15 +75,15 @@ #' curated gene set of commonly reported senescence markers, #' with directionality (+1 or -1).} #' \item{REACTOME_Senescence}{Character vector of gene symbols. The -#' REACTOME_CELLULAR_SENESCENCE from MSigDB database No directionality.} +#' REACTOME_CELLULAR_SENESCENCE from MSigDB pathway. No directionality.} #' \item{HernandezSegura}{A data frame with columns `gene` and `direction`. #' A gene set from Hernandez-Segura et al. (2017), with directionality (+1 or -1).} #' } -#' +#' #' @references Hernandez-Segura A, de Jong TV, Melov S, Guryev V, Campisi J, #' Demaria M. Unmasking Transcriptional Heterogeneity in Senescent Cells. #' *Curr Biol.* 2017 Sep 11;27(17):2652-2660.e4. doi: 10.1016/j.cub.2017.07.033. #' Epub 2017 Aug 30. PMID: 28844647; PMCID: PMC5788810. -#' @usage data(genesets_example) +#' @keywords datasets "genesets_example" diff --git a/R/geneset_similarity.R b/R/geneset_similarity.R index 2079ee6..f92c833 100644 --- a/R/geneset_similarity.R +++ b/R/geneset_similarity.R @@ -21,31 +21,12 @@ #' Odds Ratio required for a gene set to be included in the plot. Default is #' 1. #' @param pval_threshold (only if method == "odds_ratio" only) Numeric. Maximum -#' adjusted p-value required for a gene set to be included in the plot. -#' Default is 0.05. -#' @param limits Numeric vector of length 2. Limits for color scale. If `NULL`, -#' is automatically set to c(0,1) for Jaccard or the range of OR for odds -#' ratio. +#' adjusted p-value to show a label. Default is 0.05. +#' @param limits Numeric vector of length 2. Limits for color scale. #' @param title_size Integer specifying the font size for the plot title. #' Default is `12`. -#' @param color Character. The color for the maximum of the scale. Default is -#' `red.` -#' - If `method = "jaccard"`, the scale goes from `neutral_color` to `color`. -#' - If `method = "odds_ratio"` and any OR >= 1, the scale ends at `color`. -#' - If `method = "odds_ratio"` and all OR <= 1, `color` is not used; instead, the scale -#' runs from `cold_color` (minimum) to `neutral_color` (OR = 1, if present; -#' otherwise `neutral_color` is the maximum). -#' @param neutral_color Character. The neutral reference color. Default is -#' `white`. -#' - If `method = "jaccard"`, this is the minimum of the scale. -#' - If `method = "odds_ratio"` and any OR >= 1, this corresponds to OR = 1 if such values exist; otherwise it is the minimum of the scale. -#' - If `method = "odds_ratio"` and all OR <= 1, this corresponds to OR = 1 if such values exist; otherwise it is the maximum of the scale (with `cold_color` as the minimum). -#' @param cold_color Character. The color for values below OR = 1 (only used -#' when `method = "odds_ratio"`). Default is `blue`. -#' - If `method = "odds_ratio"` and any OR < 1, the scale runs from `cold_color` -#' (minimum) to `neutral_color` (OR = 1 if present; otherwise `neutral_color` -#' is the maximum). -#' - Ignored if `method = "jaccard"` or if all OR >= 1. +#' @param color_values Character vector of colors used for the fill gradient. +#' Default is `c("#F9F4AE", "#B44141")`. #' @param title Optional. Custom title for the plot. If `NULL`, the title #' defaults to `"Signature Overlap"`. #' @param jaccard_threshold (only if method == "jaccard" only) Numeric. Minimum @@ -64,13 +45,13 @@ #' \describe{ #' \item{\code{plot}}{The \pkg{ggplot2} object of the similarity heatmap.} #' \item{\code{data}}{The data frame object containing the similarity -#' scores per pair of gene sets.} +#' scores aper pair of gene sets.} #' } #' #' @import ggplot2 #' @importFrom tibble tibble #' @importFrom msigdbr msigdbr -#' @importFrom scales squish rescale +#' @importFrom scales squish #' #' @examples #' # Create two simple gene signatures @@ -116,9 +97,7 @@ geneset_similarity <- function( pval_threshold = 0.05, limits = NULL, title_size = 12, - color = "#B44141", # color for the maximum of the scale - neutral_color = "white", # neutral reference color - cold_color = "#4173B4", # color for OR < 1 when applicable + color_values = c("#F9F4AE", "#B44141"), title = NULL, jaccard_threshold = 0, msig_subset = NULL, @@ -156,15 +135,14 @@ geneset_similarity <- function( if (!is.null(limits) && (!is.numeric(limits) || length(limits) != 2)) { stop("limits must be a numeric vector of length 2.") } - - if (!is.null(limits) && any(limits < 0 | !is.finite(limits))) { - warning("Limits contain negative, or non-finite values. Ensure limits are positive finite numbers.") - } if (!is.numeric(title_size) || title_size <= 0) { stop("title_size must be a positive numeric value.") } - + + if (!is.character(color_values) || length(color_values) < 2) { + stop("color_values must be a character vector with two colors.") + } if (!is.null(title) && !is.character(title)) { stop("title must be a character string or NULL.") @@ -254,12 +232,11 @@ geneset_similarity <- function( ft <- stats::fisher.test(cont_tbl) score <- log10(ft$estimate) - # if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { - # label <- sprintf("%.1f", 10^score) # to show non log values in heatmap - # #label <- sprintf("%.1f", score) - # } else { - # label <- "" - # } + if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { + label <- sprintf("%.1f", score) + } else { + label <- "" + } pval <- ft$p.value } @@ -267,7 +244,7 @@ geneset_similarity <- function( Reference_Signature = ref_name, Compared_Signature = comp_name, Score = score, - #Label = label, + Label = label, Pval = pval, stringsAsFactors = FALSE ) @@ -282,24 +259,19 @@ geneset_similarity <- function( if (metric == "odds_ratio") { # Filter groups where any 10^Score >= threshold - # keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { - # any(10^group$Score >= or_threshold, na.rm = TRUE) - # }) - keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { - any(10^group$Score >= or_threshold & group$Pval <= pval_threshold, na.rm = TRUE) + any(10^group$Score >= or_threshold, na.rm = TRUE) }) - - + kept_signatures <- names(keep_rows[keep_rows]) similarity_df <- similarity_df[similarity_df$Compared_Signature %in% kept_signatures, , drop = FALSE] # Add Label column - # similarity_df$Label <- ifelse( - # similarity_df$Pval <= pval_threshold, - # sprintf("%.1f", similarity_df$Score), - # "" - # ) + similarity_df$Label <- ifelse( + similarity_df$Pval <= pval_threshold, + sprintf("%.1f", similarity_df$Score), + "" + ) } if (metric == "jaccard" && jaccard_threshold > 0) { @@ -315,168 +287,43 @@ geneset_similarity <- function( data <- similarity_df - - if (nrow(similarity_df) == 0) { - stop("No signatures passed the filtering criteria.") - } - similarity_df$Reference_Signature <- vapply(similarity_df$Reference_Signature, function(x) wrap_title(x, width_text), character(1)) similarity_df$Compared_Signature <- vapply(similarity_df$Compared_Signature, function(x) wrap_title(x, width_text), character(1)) -# -# if (is.null(limits)) { -# if (metric == "jaccard") { -# limits <- c(0, 1) -# } else { -# # For odds ratio, we set limits based on the data -# limits <- c(min(similarity_df$Score[is.finite(similarity_df$Score)], na.rm = TRUE), max(similarity_df$Score, na.rm = TRUE)) -# -# } -# } -# -# -# -# plt <- ggplot(similarity_df, aes(x = .data$Reference_Signature, -# y = .data$Compared_Signature, fill = .data$Score)) + -# geom_tile(color = "white") + -# #geom_text(aes(label = .data$Label), color = "black") + -# scale_fill_gradientn(colors = color_values, limits = limits, -# oob = scales::squish, na.value = na_color) + -# labs( -# x = "", -# y = "Compared Signature", -# fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), -# title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) -# ) + -# theme_minimal() + -# theme( -# axis.text.x = element_text(angle = 45, hjust = 1), -# plot.title = element_text(hjust = 0.5, size = title_size) -# ) -# -# plt -# -# invisible(list(plot=plt, -# data=data)) - - - # ---------------------------- - # Safe handling of limits (user provides OR) - # ---------------------------- + if (is.null(limits)) { if (metric == "jaccard") { limits <- c(0, 1) - } else { # odds_ratio - - # Extract finite scores - finite_scores <- similarity_df$Score[is.finite(similarity_df$Score)] - - # Identify if there are any OR < 1 (logOR < 0) - has_below1 <- any(finite_scores < 0) - - # Compute padding for -Inf (original OR = 0) - if (has_below1) { - # Place -Inf one log unit below the minimum finite score < 0 - min_below <- min(finite_scores[finite_scores < 0]) - pad_value <- min_below - 1 - } else { - # All OR >= 1 → map -Inf slightly above the maximum score, then invert sign - max_score <- max(finite_scores) - pad_value <- -(max_score + 1) - } - - # Replace -Inf in Score with computed padding - similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- pad_value - - # Convert back from log - OR_values <- 10^similarity_df$Score - - # Replace any zero or negative OR with a small number - OR_values[OR_values <= 0] <- 1e-6 - #similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- log10(1e-6) - - # Compute limits - limits <- c(min(OR_values, na.rm = TRUE), max(OR_values, na.rm = TRUE)) - } - } - - - # Convert OR limits to log space (Score already log10 OR) - log_limits <- if (metric == "odds_ratio") log10(limits) else limits - if (min(log_limits) == max(log_limits)) { - log_limits <- log_limits + c(-0.01, 0.01) # small padding - } - zero <- 0 # neutral color at OR = 1 → log10(1) = 0 - - # ---------------------------- - # Define fill colors - # ---------------------------- - if (metric == "jaccard") { - fill_colors <- c(neutral_color, color) - fill_values <- c(log_limits[1], log_limits[2]) - } else if (metric == "odds_ratio") { - min_lim <- log_limits[1] - max_lim <- log_limits[2] - - if (min_lim >= zero) { - fill_colors <- c(neutral_color, color) - fill_values <- c(min_lim, max_lim) - } else if (max_lim <= zero) { - fill_colors <- c(cold_color, neutral_color) - fill_values <- c(min_lim, max_lim) } else { - fill_colors <- c(cold_color, neutral_color, color) - fill_values <- c(min_lim, zero, max_lim) - } - - # ---------------------------- - # Safe legend breaks in OR space - # ---------------------------- - valid_OR <- limits[limits > 0 & is.finite(limits)] - if (length(valid_OR) == 0) valid_OR <- 1 # fallback - - log_breaks <- 10^seq(floor(log10(min(valid_OR))), ceiling(log10(max(valid_OR)))) - log_breaks <- log_breaks[log_breaks >= min(valid_OR) & log_breaks <= max(valid_OR)] + # For odds ratio, we set limits based on the data + # but ensure they are at least 0 to avoid negative log10 values + limits <- c(0, max(similarity_df$Score, na.rm = TRUE)) } - - # ---------------------------- - # Build plot - # ---------------------------- - plt <- ggplot(similarity_df, aes( - x = .data$Reference_Signature, - y = .data$Compared_Signature, - fill = .data$Score)) + + } + + plt <- ggplot(similarity_df, aes(x = .data$Reference_Signature, + y = .data$Compared_Signature, fill = .data$Score)) + geom_tile(color = "white") + - scale_fill_gradientn( - colors = fill_colors, - values = scales::rescale(fill_values), - limits = log_limits, - oob = scales::squish, - na.value = na_color, - trans = "identity", # already logged - breaks = if (metric == "odds_ratio") log10(log_breaks) else waiver(), - labels = if (metric == "odds_ratio") log_breaks else waiver() - ) + + geom_text(aes(label = .data$Label), color = "black") + + scale_fill_gradientn(colors = color_values, limits = limits, + oob = scales::squish, na.value = na_color) + labs( x = "", y = "Compared Signature", - fill = ifelse(metric == "jaccard", "Jaccard Index", "Odds Ratio"), - title = ifelse(is.null(title), "Signature Overlap", title) + fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), + title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = title_size) ) - - if (metric == "odds_ratio") { - data$Score <- 10^data$Score # convert back to OR for data output - } - - invisible(list(plot = plt, data = data)) - - + + plt + + invisible(list(plot=plt, + data=data)) } diff --git a/R/zzz.R b/R/zzz.R deleted file mode 100644 index d34382c..0000000 --- a/R/zzz.R +++ /dev/null @@ -1,13 +0,0 @@ -.onAttach <- function(libname, pkgname) { - gg_version <- tryCatch( - utils::packageVersion("ggplot2"), - error = function(e) NULL - ) - - if (!is.null(gg_version) && gg_version > "3.5.2") { - packageStartupMessage( - "Warning: markeR has been tested with ggplot2 <= 3.5.2. ", - "Using newer versions may cause incompatibilities." - ) - } -} diff --git a/README.Rmd b/README.Rmd index 3a38d03..00431d1 100644 --- a/README.Rmd +++ b/README.Rmd @@ -26,13 +26,13 @@ knitr::opts_chunk$set( -**`markeR`** is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of gene set behaviour across experimental and clinical phenotypes. +**markeR** provides a suite of methods for using gene sets to quantify and evaluate the extent to which a given gene signature marks a specific phenotype from gene expression data. The package implements various scoring, enrichment and classification approaches, along with tools to compute performance metrics and visualize results. -> **To cite `markeR` please use:** +> **To cite markeR please use:** > -> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). _markeR: an R Toolkit for Evaluating Gene Sets as Phenotypic Markers_. Gulbenkian Institute for Molecular Medicine, Faculdade de Medicina, Universidade de Lisboa, Lisbon, Portugal. R package version 0.99.5, https://github.com/DiseaseTranscriptomicsLab/markeR. +> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). _markeR: an R Toolkit for Evaluating Gene Sets as Phenotypic Markers_. Gulbenkian Institute for Molecular Medicine, Faculdade de Medicina, Universidade de Lisboa, Lisbon, Portugal. R package version 0.99.3, https://github.com/DiseaseTranscriptomicsLab/markeR. -The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original `markeR` paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). +The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original markeR paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). ![](man/figures/Workflow.png) @@ -57,8 +57,9 @@ The folder `inst/Paper/` is in the **paper** branch and contains all scripts and ## Installation -Install the latest development release of markeR from [GitHub](https://github.com/) with: +Install the latest development release of markeR from [GitHub](https://github.com/) with: + ``` r # install.packages("devtools") devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") @@ -79,6 +80,8 @@ This package is officially supported on `R > 4.5.0`. ⚠️ Older versions of `R ## Common Workflow +`markeR` provides a modular pipeline to quantify transcriptomic signatures and assess their association with phenotypic or clinical variables. The typical workflow includes the following steps: + ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -115,10 +118,7 @@ gene_sets ``` * **Expression Data Frame**: - A filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. - - **Warning:** If you are using microarray data or outputs from common RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may already be log2-normalised. The input to `markeR` must necessarily be **non-log-transformed**. If your data are log2-transformed, you can revert them by applying `2^data`. - + A filtered and normalised gene expression data frame (genes × samples). Row names must be gene identifiers, and column names must match the sample IDs in the metadata. ```{r example-expression-matrix, echo=FALSE} # Simulate expression matrix: 10 genes × 5 samples @@ -134,7 +134,7 @@ head(expr_df) ``` * **Sample Metadata**: - A data frame with samples as rows and annotations as columns. The first column should contain sample IDs matching the expression matrix column names. + A data frame with annotations for each sample, with the sample ID in the first column. The row names must match the column names of the expression matrix. ```{r example-metadata, echo=FALSE} # Simulate sample metadata @@ -151,79 +151,82 @@ metadata ### 2. Select Mode of Analysis -`markeR` provides two modes of operation: - -* **Benchmarking**: -evaluates gene sets' performance in marking a metadata variable, *i.e.*, a phenotype, returning comparative visualisations across scoring and enrichment methods. +* **Discovery Mode**: + Explore how a single, well-characterised gene set relates to a specific variable of interest. Suitable for hypothesis generation or signature projection. -* **Discovery**: -examines the relationship between a gene set and one or more variables of interest, suitable for exploratory or hypothesis-generating analyses. +* **Benchmarking Mode**: + Evaluate one or more gene sets against multiple metadata variables using a standardised scoring and effect size framework. This mode provides comprehensive visualisations and comparisons across methods. ### 3. Choose a Quantification Approach -Two complementary strategies are implemented for quantifying associations between gene sets and phenotypes: +`markeR` supports two complementary strategies for quantifying the association between gene sets and phenotypes: #### 3.1 Score-Based Approach +This strategy generates a **single numeric score per sample**, reflecting the activity of a gene set. It enables flexible downstream analyses, including comparisons across phenotypic groups. -A score summarising the collective expression of a gene set therein is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). +Three scoring methods are available: -Available methods: +* **Log2-median**: Calculates the median log2 expression of the genes in the set. Sensitive to absolute shifts in expression. -* **Log2-median**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. +* **Ranking**: Ranks all genes within each sample and averages the ranks of gene set members. Captures relative ordering rather than magnitude. -* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. +* **ssGSEA**: Computes a single-sample gene set enrichment score using the ssGSEA algorithm. Reflects the coordinated up- or down-regulation of the set in each sample. -* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. - -Gene sets that are robust phenotypic markers are expected to yield consistently high scores across methods. +These methods vary in assumptions and sensitivity. Robust gene sets are expected to perform consistently across all three. #### 3.2 Enrichment-Based Approach -Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. +This approach uses a classical **gene set enrichment analysis (GSEA)** framework to evaluate whether the gene set is significantly overrepresented at the top or bottom of a ranked list of genes (e.g., ranked by fold change or correlation with phenotype). -### 4. Visualisation and Evaluation +* **GSEA**: Computes a Normalised Enrichment Score (NES) for each contrast or variable of interest, adjusting for gene set size and multiple testing. + +Use this approach when interested in collective behaviour of gene sets in relation to ranked differential signals. -In **Benchmarking Mode**, `markeR` offers a range of visual summaries: +### 4. Visualisation and Evaluation -* Violin plots of score distributions by categorical phenotype; -* Scatter plots of association between scores and numerical phenotypes; -* Volcano plots and heatmaps of scores or differential gene set expression based on effect sizes (Cohen’s *d* or *f*); -* ROC curves and respective AUC values of gene sets' phenotypic classification performance; -* Violin plots of effect size distributions (Cohen’s *d*) for pairwise group differences in scores, for original and simulated gene sets; -* Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop plots); -* GSEA plots showing running enrichment scores across ranked gene lists. +In **Benchmarking Mode**, `markeR` offers a range of visual summaries: +* Violin or scatter plots showing score distributions by phenotype +* Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) +* ROC curves and AUC values +* Null distribution testing using random gene sets matched for size and directionality +* Lollipop plots summarising enrichment scores (NES) with adjusted p-values +* Enrichment plots showing running enrichment scores across ranked gene lists +* Volcano plots In **Discovery Mode**, the output focuses on a single gene set: -* Score distributions stratified by variable; -* Effect sizes for pairwise and multiple-group differences (Cohen's *d* and *f*, respectively); -* Cross-variable summaries of NES and adjusted p-values (*e.g.*, lollipop plots). +* Score distributions by phenotype +* Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s *f*) +* Enrichment score summaries (NES) with adjusted p-values (e.g., lollipop plots) -The Benchmarking Mode offers the most comprehensive set of features. Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is required. Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery focuses on the performance of a single gene set. +Benchmarking mode offers the most comprehensive set of features and allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is required. The main difference from Discovery mode is that Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery mode focuses on quantifying a single, robust gene set. ### 5. Individual Gene Exploration -To better understand the contribution of individual genes within a gene set, and identify whether specific genes drive the set's collective signal, `markeR` provides `VisualiseIndividualGenes.` Available options include: +To better understand the contribution of individual genes within a gene set and identify whether specific genes drive the overall signal, `markeR` offers a suite of gene-level exploratory analyses, including: -* Expression heatmaps of genes across samples or groups of samples; -* Violin plots showing cross-sample expression distributions of individual genes; -* Heatmaps of pairwise cross-sample expression correlation between genes in the set; -* ROC curves and AUC values to evaluate single genes' performance as phenotypic markers; -* Effect size estimation (Cohen’s *d*) of expression differences between groups of samples; -* Principal Component Analysis (PCA) of expression of genes in the set, to evaluate which genes dominate collective variance and how samples separate according to the gene set's expression. +* Expression heatmaps of genes across samples and groups +* Violin plots showing expression distributions of individual genes +* Correlation heatmaps to reveal co-expression patterns among genes in the set +* ROC curves and AUC values for individual genes to evaluate their discriminatory power +* Effect size calculations (Cohen’s *d*) per gene to quantify differential expression +* Principal Component Analysis (PCA) on gene set genes to assess variance explained and sample clustering ### 6. Compare with Reference Gene Sets -`markeR` also supports comparison of user-defined gene sets against reference collections (e.g., MSigDB). Two complementary similarity metrics are implemented: +`markeR` allows comparison of user-defined gene sets to reference sets (e.g., from MSigDB) using: * **Jaccard Index**: -the ratio of the number of genes in common over the total number of genes in the two sets. + Measures gene overlap relative to union size. + +* **Log Odds Ratio (logOR)**: + Computes enrichment using a user-defined gene universe and Fisher’s exact test. -* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. +Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or p-value). + -Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or Fisher's test p-value). ## Contact diff --git a/README.md b/README.md index 1a9a9b2..bf5f7b0 100644 --- a/README.md +++ b/README.md @@ -16,22 +16,22 @@ Check](https://github.com/DiseaseTranscriptomicsLab/markeR/actions/workflows/bio -**`markeR`** is an R package that provides a modular and extensible -framework for the systematic evaluation of gene sets as phenotypic -markers using transcriptomic data. The package is designed to support -both quantitative analyses and visual exploration of gene set behaviour -across experimental and clinical phenotypes. +**markeR** provides a suite of methods for using gene sets to quantify +and evaluate the extent to which a given gene signature marks a specific +phenotype from gene expression data. The package implements various +scoring, enrichment and classification approaches, along with tools to +compute performance metrics and visualize results. -> **To cite `markeR` please use:** +> **To cite markeR please use:** > > Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). *markeR: an R > Toolkit for Evaluating Gene Sets as Phenotypic Markers*. Gulbenkian > Institute for Molecular Medicine, Faculdade de Medicina, Universidade -> de Lisboa, Lisbon, Portugal. R package version 0.99.5, +> de Lisboa, Lisbon, Portugal. R package version 0.99.3, > . The folder `inst/Paper/` is in the **paper** branch and contains all -scripts and materials used in the original `markeR` paper to reproduce +scripts and materials used in the original markeR paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). @@ -58,7 +58,8 @@ analyses and figures. You can browse it ## Installation -Install the latest development release of markeR from [GitHub](https://github.com/) with: +Install the latest development release of markeR from +[GitHub](https://github.com/) with: ``` r # install.packages("devtools") @@ -88,6 +89,10 @@ restore compatibility. ## Common Workflow +`markeR` provides a modular pipeline to quantify transcriptomic +signatures and assess their association with phenotypic or clinical +variables. The typical workflow includes the following steps: + ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -117,15 +122,9 @@ gene_sets ``` - **Expression Data Frame**: - A filtered and normalised, non log-transformed, gene expression matrix - (genes × samples). Row names must be gene identifiers; column names - must match sample IDs in the metadata. - - **Warning:** If you are using microarray data or outputs from common - RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may - already be log2-normalised. The input to `markeR` must necessarily be - **non-log-transformed**. If your data are log2-transformed, you can - revert them by applying `2^data`. + A filtered and normalised gene expression data frame (genes × + samples). Row names must be gene identifiers, and column names must + match the sample IDs in the metadata. ``` r head(expr_df) @@ -139,9 +138,9 @@ head(expr_df) ``` - **Sample Metadata**: - A data frame with samples as rows and annotations as columns. The - first column should contain sample IDs matching the expression matrix - column names. + A data frame with annotations for each sample, with the sample ID in + the first column. The row names must match the column names of the + expression matrix. ``` r metadata @@ -155,124 +154,114 @@ metadata ### 2. Select Mode of Analysis -`markeR` provides two modes of operation: - -- **Benchmarking**: evaluates gene sets’ performance in marking a - metadata variable, *i.e.*, a phenotype, returning comparative - visualisations across scoring and enrichment methods. +- **Discovery Mode**: Explore how a single, well-characterised gene set + relates to a specific variable of interest. Suitable for hypothesis + generation or signature projection. -- **Discovery**: examines the relationship between a gene set and one or - more variables of interest, suitable for exploratory or - hypothesis-generating analyses. +- **Benchmarking Mode**: Evaluate one or more gene sets against multiple + metadata variables using a standardised scoring and effect size + framework. This mode provides comprehensive visualisations and + comparisons across methods. ### 3. Choose a Quantification Approach -Two complementary strategies are implemented for quantifying -associations between gene sets and phenotypes: +`markeR` supports two complementary strategies for quantifying the +association between gene sets and phenotypes: #### 3.1 Score-Based Approach -A score summarising the collective expression of a gene set therein is -assigned **to each sample**. Scores can be visualised using built-in -functions, or used directly in downstream analyses (*e.g.*, comparisons -between phenotypic groups of samples, correlations with numerical -phenotypes). +This strategy generates a **single numeric score per sample**, +reflecting the activity of a gene set. It enables flexible downstream +analyses, including comparisons across phenotypic groups. -Available methods: +Three scoring methods are available: -- **Log2-median**: mean of the across-sample normalised log2 - median-centred expression levels of the genes in the set; for - bidirectional gene sets, the sample score is the partial score for the - subset of putatively upregulated genes minus that of the downregulated - subset. +- **Log2-median**: Calculates the median log2 expression of the genes in + the set. Sensitive to absolute shifts in expression. -- **Ranking**: mean expression rank of gene set members in each sample; - for bidirectional gene sets, the sample score is the partial score for - the subset of putatively upregulated genes minus that of the - downregulated subset, and normalised by the number of genes in the - set. +- **Ranking**: Ranks all genes within each sample and averages the ranks + of gene set members. Captures relative ordering rather than magnitude. -- **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for - bidirectional gene sets, the sample score is the partial score for the - subset of putatively upregulated genes minus that of the downregulated - subset. +- **ssGSEA**: Computes a single-sample gene set enrichment score using + the ssGSEA algorithm. Reflects the coordinated up- or down-regulation + of the set in each sample. -Gene sets that are robust phenotypic markers are expected to yield -consistently high scores across methods. +These methods vary in assumptions and sensitivity. Robust gene sets are +expected to perform consistently across all three. #### 3.2 Enrichment-Based Approach -Enrichment-based methods implement **Gene Set Enrichment Analysis -(GSEA)**. Genes are ranked according to differential expression -statistics, and a Normalised Enrichment Score (NES) per variable of -interest is computed, accompanied by a p-value adjusted for multiple -hypothesis testing. +This approach uses a classical **gene set enrichment analysis (GSEA)** +framework to evaluate whether the gene set is significantly +overrepresented at the top or bottom of a ranked list of genes (e.g., +ranked by fold change or correlation with phenotype). + +- **GSEA**: Computes a Normalised Enrichment Score (NES) for each + contrast or variable of interest, adjusting for gene set size and + multiple testing. + +Use this approach when interested in collective behaviour of gene sets +in relation to ranked differential signals. ### 4. Visualisation and Evaluation In **Benchmarking Mode**, `markeR` offers a range of visual summaries: -- Violin plots of score distributions by categorical phenotype; -- Scatter plots of association between scores and numerical phenotypes; -- Volcano plots and heatmaps of scores or differential gene set - expression based on effect sizes (Cohen’s *d* or *f*); -- ROC curves and respective AUC values of gene sets’ phenotypic - classification performance; -- Violin plots of effect size distributions (Cohen’s *d*) for pairwise - group differences in scores, for original and simulated gene sets; -- Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop - plots); -- GSEA plots showing running enrichment scores across ranked gene lists. +- Violin or scatter plots showing score distributions by phenotype +- Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) +- ROC curves and AUC values +- Null distribution testing using random gene sets matched for size and + directionality +- Lollipop plots summarising enrichment scores (NES) with adjusted + p-values +- Enrichment plots showing running enrichment scores across ranked gene + lists +- Volcano plots In **Discovery Mode**, the output focuses on a single gene set: -- Score distributions stratified by variable; -- Effect sizes for pairwise and multiple-group differences (Cohen’s *d* - and *f*, respectively); -- Cross-variable summaries of NES and adjusted p-values (*e.g.*, - lollipop plots). +- Score distributions by phenotype +- Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s + *f*) +- Enrichment score summaries (NES) with adjusted p-values (e.g., + lollipop plots) -The Benchmarking Mode offers the most comprehensive set of features. -Users are allowed to seamlessly move from Discovery to Benchmarking once +Benchmarking mode offers the most comprehensive set of features and +allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is -required. Benchmarking is designed to evaluate multiple gene sets -simultaneously, whereas Discovery focuses on the performance of a single -gene set. +required. The main difference from Discovery mode is that Benchmarking +is designed to evaluate multiple gene sets simultaneously, whereas +Discovery mode focuses on quantifying a single, robust gene set. ### 5. Individual Gene Exploration To better understand the contribution of individual genes within a gene -set, and identify whether specific genes drive the set’s collective -signal, `markeR` provides `VisualiseIndividualGenes.` Available options -include: - -- Expression heatmaps of genes across samples or groups of samples; -- Violin plots showing cross-sample expression distributions of - individual genes; -- Heatmaps of pairwise cross-sample expression correlation between genes - in the set; -- ROC curves and AUC values to evaluate single genes’ performance as - phenotypic markers; -- Effect size estimation (Cohen’s *d*) of expression differences between - groups of samples; -- Principal Component Analysis (PCA) of expression of genes in the set, - to evaluate which genes dominate collective variance and how samples - separate according to the gene set’s expression. +set and identify whether specific genes drive the overall signal, +`markeR` offers a suite of gene-level exploratory analyses, including: + +- Expression heatmaps of genes across samples and groups +- Violin plots showing expression distributions of individual genes +- Correlation heatmaps to reveal co-expression patterns among genes in + the set +- ROC curves and AUC values for individual genes to evaluate their + discriminatory power +- Effect size calculations (Cohen’s *d*) per gene to quantify + differential expression +- Principal Component Analysis (PCA) on gene set genes to assess + variance explained and sample clustering ### 6. Compare with Reference Gene Sets -`markeR` also supports comparison of user-defined gene sets against -reference collections (e.g., MSigDB). Two complementary similarity -metrics are implemented: +`markeR` allows comparison of user-defined gene sets to reference sets +(e.g., from MSigDB) using: -- **Jaccard Index**: the ratio of the number of genes in common over the - total number of genes in the two sets. +- **Jaccard Index**: Measures gene overlap relative to union size. -- **Log Odds Ratio (logOR)** from Fisher’s exact test of association - between gene sets, given a specified gene universe. +- **Log Odds Ratio (logOR)**: Computes enrichment using a user-defined + gene universe and Fisher’s exact test. Filters can be applied based on similarity thresholds (e.g., minimum -Jaccard, OR, or Fisher’s test p-value). +Jaccard, OR, or p-value). ## Contact diff --git a/man/FPR_Simulation.Rd b/man/FPR_Simulation.Rd index b9fd5bf..ece45bd 100644 --- a/man/FPR_Simulation.Rd +++ b/man/FPR_Simulation.Rd @@ -10,7 +10,7 @@ FPR_Simulation( original_signatures, Variable, gene_list = NULL, - number_of_sims = 100, + number_of_sims = 10, title = NULL, widthTitle = 30, titlesize = 12, diff --git a/man/counts_example.Rd b/man/counts_example.Rd index 4998e47..25658d2 100644 --- a/man/counts_example.Rd +++ b/man/counts_example.Rd @@ -12,12 +12,11 @@ samples (sample IDs). \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} } \usage{ -data(counts_example) +counts_example } \description{ -A numeric matrix containing filtered and normalized (non log-transformed) -gene expression data from the Marthandan et al. (2016) study (GEO accession -GSE63577). +A numeric matrix containing filtered and normalized gene expression data from +the Marthandan et al. (2016) study (GEO accession GSE63577). } \details{ Raw FASTQ files were downloaded using \code{fasterq-dump} (v2.11.0) and processed @@ -26,7 +25,8 @@ conducted using FastQC (v0.12.1) and summarised with MultiQC (v1.14). Pseudo-alignment to the RefSeq transcriptome (NCBI release 109) was performed using kallisto (v0.44.0). Genes with low expression (mean count < 70 in all conditions) were filtered out. Count normalization factors were calculated -with \code{edgeR::calcNormFactors}. +with \code{edgeR::calcNormFactors}, and log2-transformed values were obtained via +\code{limma::voom}. Intermediate time points for HFF and MRC5 cell lines were excluded, resulting in a final dataset with 45 high-quality samples across proliferative, diff --git a/man/geneset_similarity.Rd b/man/geneset_similarity.Rd index c4c5991..35dd72f 100644 --- a/man/geneset_similarity.Rd +++ b/man/geneset_similarity.Rd @@ -15,9 +15,7 @@ geneset_similarity( pval_threshold = 0.05, limits = NULL, title_size = 12, - color = "#B44141", - neutral_color = "white", - cold_color = "#4173B4", + color_values = c("#F9F4AE", "#B44141"), title = NULL, jaccard_threshold = 0, msig_subset = NULL, @@ -50,42 +48,15 @@ Odds Ratio required for a gene set to be included in the plot. Default is 1.} \item{pval_threshold}{(only if method == "odds_ratio" only) Numeric. Maximum -adjusted p-value required for a gene set to be included in the plot. -Default is 0.05.} +adjusted p-value to show a label. Default is 0.05.} -\item{limits}{Numeric vector of length 2. Limits for color scale. If \code{NULL}, -is automatically set to c(0,1) for Jaccard or the range of OR for odds -ratio.} +\item{limits}{Numeric vector of length 2. Limits for color scale.} \item{title_size}{Integer specifying the font size for the plot title. Default is \code{12}.} -\item{color}{Character. The color for the maximum of the scale. Default is -\code{red.} -\itemize{ -\item If \code{method = "jaccard"}, the scale goes from \code{neutral_color} to \code{color}. -\item If \code{method = "odds_ratio"} and any OR >= 1, the scale ends at \code{color}. -\item If \code{method = "odds_ratio"} and all OR <= 1, \code{color} is not used; instead, the scale -runs from \code{cold_color} (minimum) to \code{neutral_color} (OR = 1, if present; -otherwise \code{neutral_color} is the maximum). -}} - -\item{neutral_color}{Character. The neutral reference color. Default is -\code{white}. -\itemize{ -\item If \code{method = "jaccard"}, this is the minimum of the scale. -\item If \code{method = "odds_ratio"} and any OR >= 1, this corresponds to OR = 1 if such values exist; otherwise it is the minimum of the scale. -\item If \code{method = "odds_ratio"} and all OR <= 1, this corresponds to OR = 1 if such values exist; otherwise it is the maximum of the scale (with \code{cold_color} as the minimum). -}} - -\item{cold_color}{Character. The color for values below OR = 1 (only used -when \code{method = "odds_ratio"}). Default is \code{blue}. -\itemize{ -\item If \code{method = "odds_ratio"} and any OR < 1, the scale runs from \code{cold_color} -(minimum) to \code{neutral_color} (OR = 1 if present; otherwise \code{neutral_color} -is the maximum). -\item Ignored if \code{method = "jaccard"} or if all OR >= 1. -}} +\item{color_values}{Character vector of colors used for the fill gradient. +Default is \code{c("#F9F4AE", "#B44141")}.} \item{title}{Optional. Custom title for the plot. If \code{NULL}, the title defaults to \code{"Signature Overlap"}.} @@ -110,7 +81,7 @@ Invisibly returns a list containing: \describe{ \item{\code{plot}}{The \pkg{ggplot2} object of the similarity heatmap.} \item{\code{data}}{The data frame object containing the similarity -scores per pair of gene sets.} +scores aper pair of gene sets.} } } \description{ diff --git a/man/genesets_example.Rd b/man/genesets_example.Rd index 274f199..a63a58d 100644 --- a/man/genesets_example.Rd +++ b/man/genesets_example.Rd @@ -11,13 +11,13 @@ A named list of length 3: curated gene set of commonly reported senescence markers, with directionality (+1 or -1).} \item{REACTOME_Senescence}{Character vector of gene symbols. The -REACTOME_CELLULAR_SENESCENCE from MSigDB database No directionality.} +REACTOME_CELLULAR_SENESCENCE from MSigDB pathway. No directionality.} \item{HernandezSegura}{A data frame with columns \code{gene} and \code{direction}. A gene set from Hernandez-Segura et al. (2017), with directionality (+1 or -1).} } } \usage{ -data(genesets_example) +genesets_example } \description{ Example Gene Sets for Cellular Senescence diff --git a/man/metadata_example.Rd b/man/metadata_example.Rd index f30d8d2..05b3ca1 100644 --- a/man/metadata_example.Rd +++ b/man/metadata_example.Rd @@ -21,7 +21,7 @@ for senescent samples, "young" for proliferative).} \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} } \usage{ -data(metadata_example) +metadata_example } \description{ A data frame containing metadata for samples from the Marthandan et al. diff --git a/tests/testthat/test-geneset_similarity.R b/tests/testthat/test-geneset_similarity.R index 71739ec..62c311b 100644 --- a/tests/testthat/test-geneset_similarity.R +++ b/tests/testthat/test-geneset_similarity.R @@ -16,6 +16,39 @@ test_that("geneset_similarity returns jaccard 1 for identical sets", { expect_equal(sim, 1) }) +test_that("geneset_similarity pval_threshold filters labels as expected for odds_ratio", { + # Fake simple signatures to ensure overlap + sig1 <- c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5") + sig2 <- c("GENE1", "GENE2", "GENE6", "GENE7", "GENE8") + signatures <- list(A = sig1) + others <- list(B = sig2) + # Large universe to ensure non-perfect overlap + universe <- toupper(paste0("GENE", 1:50)) + + # Run with a permissive pval_threshold (should label if pval < threshold) + res <- geneset_similarity( + signatures = signatures, + other_user_signatures = others, + metric = "odds_ratio", + universe = universe, + pval_threshold = 1 # allow all p-values + ) + d <- res$data + expect_true(any(d$Label != "")) # Some label should be shown + + # Run with a restrictive pval_threshold (should blank out most labels) + res2 <- geneset_similarity( + signatures = signatures, + other_user_signatures = others, + metric = "odds_ratio", + universe = universe, + pval_threshold = 1e-10 # extremely small, almost nothing will label + ) + d2 <- res2$data + expect_true(all(d2$Label == "")) # All labels should be blank +}) + + test_that("geneset_similarity returns expected odds ratio values", { # Simple signatures with known overlap sig1 <- c("GENE1", "GENE2", "GENE3", "GENE4") @@ -34,19 +67,19 @@ test_that("geneset_similarity returns expected odds ratio values", { cont_tbl <- matrix(c(2, 2, 2, 4), nrow = 2) fisher_res <- fisher.test(cont_tbl) expected_or <- as.numeric(fisher_res$estimate) - + expected_log10or <- log10(expected_or) + # Run the function res <- geneset_similarity( signatures = signatures, other_user_signatures = others, metric = "odds_ratio", - universe = universe, - pval_threshold=1 + universe = universe ) d <- res$data # Check the actual odds ratio (on log10 scale) is close to expected - expect_equal(d$Score, expected_or, tolerance = 1e-6) + expect_equal(d$Score, expected_log10or, tolerance = 1e-6) }) test_that("geneset_similarity returns expected Jaccard index values with H collection", { @@ -71,8 +104,7 @@ test_that("geneset_similarity returns expected Jaccard index values with H colle signatures = signatures, metric = "jaccard", collection = "H", - msig_subset = "HALLMARK_MYC_TARGETS_V1", - pval_threshold=1 + msig_subset = "HALLMARK_MYC_TARGETS_V1" ) d <- res$data # Find the row for this comparison diff --git a/vignettes/articles/Article_BenchmarkingMode.Rmd b/vignettes/articles/Article_BenchmarkingMode.Rmd index 065d3c6..71ac98b 100644 --- a/vignettes/articles/Article_BenchmarkingMode.Rmd +++ b/vignettes/articles/Article_BenchmarkingMode.Rmd @@ -384,7 +384,7 @@ FPR_Simulation(data = counts_example, metadata = metadata_example, original_signatures = genesets_example, gene_list = row.names(counts_example), - number_of_sims = 100, + number_of_sims = 10, title = "Marthandan et al. 2016", widthTitle = 30, Variable = "Condition", diff --git a/vignettes/articles/Article_DiscoveryMode.Rmd b/vignettes/articles/Article_DiscoveryMode.Rmd index f7148ef..d430b47 100644 --- a/vignettes/articles/Article_DiscoveryMode.Rmd +++ b/vignettes/articles/Article_DiscoveryMode.Rmd @@ -12,7 +12,7 @@ knitr::opts_chunk$set( ) ``` -This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in quantifying a gene set in a given dataset to explore associations with phenotypic or clinical variables. This approach is particularly suited to hypothesis generation where the phenotype marked by the gene set is of known biological or clinical relevance. +This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in quantifying a known, robust gene set in a given dataset to explore associations with other phenotypic or clinical variables. This approach is particularly suited to hypothesis generation where the phenotype marked by the gene set is of known biological or clinical relevance. # Case-study: Senescence diff --git a/vignettes/articles/Article_GeneSetSimilarity.Rmd b/vignettes/articles/Article_GeneSetSimilarity.Rmd index 59c854e..7c94f52 100644 --- a/vignettes/articles/Article_GeneSetSimilarity.Rmd +++ b/vignettes/articles/Article_GeneSetSimilarity.Rmd @@ -75,8 +75,7 @@ geneset_similarity( subcollection = "CP:KEGG_LEGACY", jaccard_threshold = 0, msig_subset = c("KEGG_MTOR_SIGNALING_PATHWAY", "KEGG_APOPTOSIS", "NON_EXISTENT_PATHWAY"), - metric = "jaccard", - limits=c(0,0.1) + metric = "jaccard" )$plot ``` @@ -85,14 +84,14 @@ geneset_similarity( # Similarity via Log Odds Ratio -The log odds ratio (logOR) provides a statistically grounded alternative for assessing gene set similarity. It measures **enrichment of one set within another**, relative to a defined background or **gene universe**, using a 2×2 contingency table. +The log odds ratio (logOR) provides a statistically grounded alternative for assessing gene set similarity. It measures **enrichment of one set within another**, relative to a defined background or **gene universe**, using a 2×2 contingency table and a one-sided **Fisher’s exact test**. - **Log odds ratio (logOR)**: Derived from contingency tables using: - Genes in both sets - Genes in one but not the other - Gene universe as background - Log-transformed odds ratios are visualized. + Log-transformed odds ratios are visualized; statistical significance is assessed via the adjusted p-value. > **Note**: When using `metric = "odds_ratio"`, the `universe` parameter **must** be supplied. @@ -115,14 +114,13 @@ geneset_similarity( msigdbr::msigdbr(species = "Homo sapiens", category = "C2")$gene_symbol )), or_threshold = 100, #log10OR = 2 - width_text=50, - pval_threshold = 0.05 + pval_threshold = 0.05, + width_text=50 )$plot ``` - # Session Information ```{r} diff --git a/vignettes/markeR.Rmd b/vignettes/markeR.Rmd index f4839c1..d345210 100644 --- a/vignettes/markeR.Rmd +++ b/vignettes/markeR.Rmd @@ -2,10 +2,7 @@ title: "Introduction to markeR" author: "Rita M. Silva" date: "`r Sys.Date()`" -output: - BiocStyle::html_document: - toc: true - toc_depth: 3 +output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to markeR} %\VignetteEngine{knitr::rmarkdown} @@ -24,36 +21,59 @@ dpi = 72, fig.retina = 1 ) ``` - -# Introduction -**`markeR`** is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of gene set behaviour across experimental and clinical phenotypes. +# Table of Contents + +1. [Introduction](#introduction) + - [1.1. Primary Objectives](#primary-objectives) + - [1.2. Features and Capabilities](#features-and-capabilities) +2. [Installation](#installation) +3. [Common Workflow](#common-workflow) + - [3.1. Input Requirements](#input-requirements) + - [Gene sets](#gene-sets) + - [Expression data frame](#expression-data-frame) + - [Sample metadata](#sample-metadata) + - [3.2. Select Mode of Analysis](#select-mode-of-analysis) + - [3.3. Choose a Quantification Approach](#choose-a-quantification-approach) + - [3.3.1 Score-Based Approach](#score-based-approach) + - [3.3.2 Enrichment-Based Approach](#enrichment-based-approach) + - [3.4. Visualisation and Evaluation](#visualisation-and-evaluation) + - [3.4.1 Benchmarking Mode](#benchmarking-mode) + - [3.4.2 Discovery Mode](#discovery-mode) + - [3.5. Individual Gene Exploration (Optional)](#individual-gene-exploration-optional) + - [3.6. Compare with Reference Gene Sets (Optional)](#compare-with-reference-gene-sets-optional) +4. [Further Reading](#further-reading) +5. [Contact](#contact) +6. [Session Information](#session-information) + +# 1. Introduction -In this vignette, we demonstrate the core functionalities of `markeR` using a pre-processed gene expression dataset and associated metadata from Marthandan et al. (2016) (GSE63577). This dataset comprises human fibroblast samples cultured under two experimental conditions: replicative senescence and proliferative control. +**markeR** is an R package designed to provide a modular, flexible framework for evaluating gene sets as phenotypic markers using transcriptomic data. It supports researchers in both quantitative analysis and visual exploration of gene set behaviour across experimental or clinical phenotypes. -## Primary Objectives +In this vignette, we will illustrate markeR functionalities using a pre-processed gene expression dataset and metadata derived from the Marthandan et al. (2016) study (GSE63577). This dataset contains human fibroblast samples cultured under two conditions: replicative senescence and proliferative control. -The primary objectives of **`markeR`** are to: +## 1.1. Primary Objectives -- Establish a reproducible and standardised framework for quantifying gene sets as phenotypic markers across transcriptomic datasets. -- Provide a unified interface for complementary analytical strategies, including score-based and enrichment-based methods. -- Enable systematic comparison of user-defined gene sets with curated reference collections (e.g., MSigDB) to enhance biological interpretability. -- Deliver modular tools for visualisation and evaluation, allowing exploration of both gene set behaviour and individual gene-level contributions. +The primary objectives of **markeR** are to: -## Features and Capabilities +- Facilitate reproducible and standardized quantification of gene set activity. +- Provide flexible options for both score-based and enrichment-based analyses. +- Enable comparison of user-defined signatures against reference gene sets (e.g., MSigDB) for biological interpretation. +- Offer modular visualization and evaluation tools to explore gene set behaviour at both the set and individual gene level. -The package integrates multiple analytical strategies for quantifying phenotypes using gene sets, supporting both unidirectional and bidirectional (*i.e.*, respectively without and with information on the expected direction of change in expression of genes upon phenotype) gene set definitions: +## 1.2. Features and Capabilities -- **Score-based methods**: log2-median expression, ranking approaches, and single-sample gene set enrichment analysis (ssGSEA) to quantify coordinated expression within a gene set. -- **Enrichment-based methods**: GSEA using moderated t- or B-statistics. -- **Gene-level exploration**: expression heatmaps, violin plots, ROC curve analysis, area under the curve (AUC) calculations, effect size estimation, and principal component analysis (PCA) to characterise the contribution of individual genes. -- **Gene set similarity assessment**: Jaccard index and log odds ratio (logOR) calculations to compare user-defined gene sets with reference or user-defined collections. +The package integrates multiple approaches for characterizing phenotypes: -The package is implemented with flexibility and extensibility as core design principles. Visualisation leverages `ggplot2`, `ComplexHeatmap`, `ggpubr`, `cowplot`, and `grid`, enabling a wide range of customisation options. The modular structure ensures compatibility with additional methods in future versions of the package. +- **Score-based methods**: log2-median, ranking, and single-sample GSEA (ssGSEA) to quantify the coordinated expression of genes in a set. +- **Enrichment-based methods**: GSEA using moderated t- or B-statistics, with options to handle unidirectional and bidirectional gene sets. +- **Gene-level exploration**: expression heatmaps, violin plots, ROC curves, AUC calculations, effect size measures, and PCA analysis to assess individual gene contributions. -# Installation +The package is designed to be fully customizable, supporting diverse visualization strategies via `ggplot2`, `ComplexHeatmap`, `ggpubr`, `cowplot`, and `grid.` Its modular structure allows easy integration of new functionalities while providing a robust framework for reproducible, standardized phenotypic characterization of gene sets. -Install the latest development release of `markeR` from [GitHub](https://github.com/) with: +# 2. Installation + +Install the latest development release of markeR from [GitHub](https://github.com/) with: ```r devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") @@ -62,16 +82,18 @@ devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") ```{r, echo=FALSE} library(markeR) ``` -# Common Workflow +# 3. Common Workflow + +`markeR` provides a modular pipeline to quantify transcriptomic signatures and assess their association with phenotypic or clinical variables. -## Input Requirements +## 3.1. Input Requirements -### Gene Sets +### Gene sets A named list where each element is a gene set. -- Use a **character vector** when putative direction of change in expression of genes upon phenotype is unknown (unidirectional). -- Use a **data frame** with columns `gene` and `direction` (values `+1` for up-, `-1` for down-regulation) when direction is known (bidirectional). +- Use a **character vector** when direction is unknown (unidirectional). +- Use a **data frame** with columns `gene` and `direction` (values `+1` for up, `-1` for down) when direction is known (bidirectional). ```{r example-gene-sets-vector, echo=FALSE} # Gene set without direction @@ -96,25 +118,18 @@ gene_sets <- list( gene_sets ``` -For this vignette, we will use three senescence-related gene sets that ship with the package (see `?genesets_example` for more information): - -* **LiteratureMarkers** (bidirectional; gene set of commonly reported senescence markers in the literature) -* **REACTOME_CELLULAR_SENESCENCE** (unidirectional; from the MSigDB database) -* **HernandezSegura** (bidirectional; from [Hernandez-Segura et al. (2017)](https://pubmed.ncbi.nlm.nih.gov/28844647/)) +For this vignette, we will use three senescence-related gene sets that ship with the package: **LiteratureMarkers** (bidirectional), **REACTOME_CELLULAR_SENESCENCE** (undirected), and **HernandezSegura** (bidirectional). See `?genesets_example`. ```{r loadsig} # Load example gene sets data(genesets_example) ``` -### Expression Data Frame +### Expression data frame -A filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. +A filtered and normalised gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. -**Warning:** If you are using microarray data or outputs from common RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may already be log2-normalised. The input to `markeR` must necessarily be **non-log-transformed**. If your data are log2-transformed, you can revert them by applying `2^data`. - - -In this vignette, we use a pre‑processed RNA-seq dataset from Marthandan et al. (2016, GSE63577), with normalised read counts for human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for structure. +In this vignette, we use a pre‑processed dataset from Marthandan et al. (2016, GSE63577) with human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for structure. ```{r loaddata} # Load example expression data @@ -122,9 +137,9 @@ data(counts_example) counts_example[1:5, 1:5] ``` -### Sample Metadata +### Sample metadata -A data frame with samples as rows and annotations as columns. The first column should contain sample IDs matching the expression matrix column names. +A data frame with annotations per sample. Row names must match the expression matrix column names; the first column contains sample IDs. We use the accompanying metadata from the Marthandan et al. (2016) study. See `?metadata_example`. @@ -134,60 +149,64 @@ data(metadata_example) head(metadata_example) ``` -## Select Mode of Analysis - -`markeR` provides two modes of operation: +## 3.2. Select Mode of Analysis -* **Benchmarking**: -evaluates gene sets' performance in marking a metadata variable, *i.e.*, a phenotype, returning comparative visualisations across scoring and enrichment methods. +* **Benchmarking Mode**: +Evaluate one or more gene sets against a metadata variable using a standardised scoring and effect size framework. This mode provides comprehensive visualisations and comparisons across methods. -* **Discovery**: -examines the relationship between a gene set and one or more variables of interest, suitable for exploratory or hypothesis-generating analyses. +* **Discovery Mode**: +Explore how a single, well-characterised gene set relates to a specific variable of interest. Suitable for hypothesis generation or signature projection. -## Choose a Quantification Approach +## 3.3. Choose a Quantification Approach + +`markeR` supports two complementary strategies for quantifying the association between gene sets and phenotypes: + +### 3.3.1 Score-Based Approach -Two complementary strategies are implemented for quantifying associations between gene sets and phenotypes: +This strategy generates a **single numeric score per sample**, reflecting the activity of a gene set. It enables flexible downstream analyses, including comparisons across phenotypic groups. -### Score-Based Approach +Three scoring methods are available: -A score summarising the collective expression of a gene set therein is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). +* **Log2-median**: Calculates the median log2 expression of the genes in the set. Sensitive to absolute shifts in expression. -Available methods: +* **Ranking**: Ranks all genes within each sample and averages the ranks of gene set members. Captures relative ordering rather than magnitude. -* **Log2-median**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. +* **ssGSEA**: Computes a single-sample gene set enrichment score using the ssGSEA algorithm. Reflects the coordinated up- or down-regulation of the set in each sample. -* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. +Robust gene sets are expected to perform consistently across all three. -* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. +### 3.3.2 Enrichment-Based Approach -Gene sets that are robust phenotypic markers are expected to yield consistently high scores across methods. +This approach uses a classical **gene set enrichment analysis (GSEA)** framework to evaluate whether the gene set is significantly overrepresented at the top or bottom of a ranked list of genes (e.g., ranked by fold change or correlation with phenotype). -### Enrichment-Based Approach +* **GSEA**: Computes a Normalised Enrichment Score (NES) for each contrast or variable of interest, adjusting for gene set size and multiple testing. -Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. +Use this approach when interested in collective behaviour of gene sets in relation to ranked differential signals. -## Visualisation and Evaluation +## 3.4. Visualisation and Evaluation -### Benchmarking Mode +### 3.4.1 Benchmarking Mode -`markeR` offers a range of visual summaries for Benchmarking: +In **Benchmarking Mode**, `markeR` offers a range of visual summaries: -* Violin plots of score distributions by categorical phenotype; -* Scatter plots of association between scores and numerical phenotypes; -* Volcano plots and heatmaps of scores or differential gene set expression based on effect sizes (Cohen’s *d* or *f*); -* ROC curves and respective AUC values of gene sets' phenotypic classification performance; -* Violin plots of effect size distributions (Cohen’s *d*) for pairwise group differences in scores, for original and simulated gene sets; -* Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop plots); -* GSEA plots showing running enrichment scores across ranked gene lists. +* Violin or scatter plots showing score distributions by phenotype +* Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) +* ROC curves and AUC values +* Null distribution testing using random gene sets matched for size and directionality +* Lollipop plots summarising enrichment scores (NES) with adjusted p-values +* Enrichment plots showing running enrichment scores across ranked gene lists +* Volcano plots -Example of common workflows in Benchmarking Mode (full tutorial [here][tutorial-benchmarking]): +Example of common workflows in Benchmarking mode (full tutorial [here][tutorial-benchmarking]): -#### Score-based approaches +#### Score-based approach -Score-based methods facilitate direct comparisons of gene set activity between phenotypic groups (*e.g.*, being senescent) and can be used in downstream analyses, including for example correlation with other metadata variables of interest (*e.g.*, day of sequencing) . -We first compute log2-median scores. In this dataset, the HernandezSegura and LiteratureMarkers gene sets yield large effect sizes (|Cohen’s d|), whereas REACTOME_CELLULAR_SENESCENCE shows more modest separation between senescent and proliferative fibroblasts. +We begin by quantifying gene set activity using score-based methods. These approaches generate numeric scores per sample that reflect the coordinated expression of genes in each set. This allows easy comparison of gene set behavior across phenotypic groups, while giving a single score per sample, which can be useful in certain contexts. -```{r fig.width=8, fig.height=4, warning=FALSE, message=FALSE} +First, we compute and plot scores using the log2-median method, which calculates median-centered expression for each gene and averages across the gene set to provide a robust summary of coordinated activity. We then assess whether the available gene sets distinguish between Senescent and Proliferative conditions. These results suggest that the HernandezSegura and LiteratureMarkers gene sets show very strong effect sizes (|Cohen's d|), while the REACTOME_CELLULAR_SENESCENCE gene set shows a more modest effect. The distributions of scores also overlap more for the latter. + + +```{r fig.width=8, fig.height=4, out.width="100%", warning=FALSE, message=FALSE} # Quantification approach: scores # Method: log2-median PlotScores(data = counts_example, @@ -204,9 +223,7 @@ PlotScores(data = counts_example, ``` - -Next, scores are calculated using multiple methods (log2-median, ranking, ssGSEA) to assess consistency across strategies. Outputs include heatmaps and volcano plots of effect sizes (|Cohen’s d|). HernandezSegura and LiteratureMarkers consistently discriminate Senescent from Proliferative samples, while REACTOME_CELLULAR_SENESCENCE shows weaker and less consistent separation. - +Next, we calculate scores using multiple methods (log2-median, ranking, and ssGSEA) to compare results across scoring strategies. The output includes heatmaps and volcano plots to visualize effect sizes (|Cohen's d|) between conditions. These analyses confirm that the HernandezSegura and LiteratureMarkers gene sets consistently discriminate Senescence from Proliferative samples across all methods, whereas REACTOME_CELLULAR_SENESCENCE does not. ```{r warning=FALSE, message=FALSE} Overall_Scores <- PlotScores(data = counts_example, @@ -235,17 +252,17 @@ Overall_Scores <- PlotScores(data = counts_example, colorPalette="Paired") ``` -```{r Overall_Scores_heatmap, fig.width=10, fig.height=2, warning=FALSE, message=FALSE} +```{r Overall_Scores_heatmap, fig.width=10, fig.height=2, out.width="100%", warning=FALSE, message=FALSE} Overall_Scores$heatmap ``` -```{r Overall_Scores_volcano, fig.width=6, fig.height=3, warning=FALSE, message=FALSE} +```{r Overall_Scores_volcano, fig.width=6, fig.height=3, out.width="70%", warning=FALSE, message=FALSE} Overall_Scores$volcano ``` -The discriminatory capacity of gene set scores can also be assessed using ROC curves and corresponding AUC values. HernandezSegura and LiteratureMarkers exhibit consistently high AUCs across scoring methods, while REACTOME_CELLULAR_SENESCENCE shows more heterogeneous performance. +ROC curves assess the discriminatory power of gene set scores, indicating how well a signature separates different experimental or clinical groups. When discriminating between Senescent and Proliferative samples, the REACTOME_CELLULAR_SENESCENCE gene set shows more heterogeneous ROC curves and AUC values across scoring methods, reflecting less consistent performance compared to the HernandezSegura and LiteratureMarkers gene sets. -```{r roc_scores, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} +```{r roc_scores, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} ROC_Scores(data = counts_example, metadata = metadata_example, gene_sets=genesets_example, @@ -263,15 +280,16 @@ ROC_Scores(data = counts_example, ``` -Finally, false positive rates for effect sizes are estimated by simulating random gene sets of equal size. This step provides a null distribution against which observed scores can be compared. In this example, the LiteratureMarkers signature demonstrates the strongest performance. Increasing the number of simulations would yield finer resolution at the cost of additional computational time. +Finally, we perform simulations using random gene sets to estimate the false positive rate. This step informs whether observed gene set signals exceed what would be expected by chance, improving confidence in the results. By simulating 10 random gene sets of the same size, we see that the LiteratureMarkers gene set performs best, with only two of the random sets achieving higher effect sizes than the original one using the ranking method. While increasing the number of simulated sets would yield finer resolution, this comes at the cost of additional computational time. + + +```{r FDRSim, fig.width=12, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} -```{r FDRSim, fig.width=12, fig.height=3,warning=FALSE, message=FALSE} -set.seed("123456") FPR_Simulation(data = counts_example, metadata = metadata_example, original_signatures = genesets_example, gene_list = row.names(counts_example), - number_of_sims = 100, + number_of_sims = 10, title = "Marthandan et al. 2016", widthTitle = 30, Variable = "Condition", @@ -286,11 +304,13 @@ FPR_Simulation(data = counts_example, ``` -#### Enrichment-based approaches - -The first step is the quantification of differential expression. Here, a design matrix (as defined by the `limma` package) is automatically constructed from the `Condition` variable in the metadata, defining the contrast `Senescent - Proliferative`. Internally, this fits a linear model without an intercept, enabling quick comparison between levels of a categorical variable. +#### Enrichment-based approach + +We now shift focus to enrichment-based methods, which rely on differential expression statistics to evaluate whether gene sets show coordinated changes between conditions. This approach complements score-based methods by explicitly leveraging contrasts between groups and quantifying statistical significance. + +The first step is to compute differential expression. In this example, we build the design matrix automatically from the `Condition` variable in the metadata and define the contrast `Senescent - Proliferative`. Internally, this fits a linear model without an intercept, enabling quick comparison between levels of a categorical variable. -```{r DEGs, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} +```{r DEGs, fig.width=10, fig.height=3, out.width="90%", warning=FALSE, message=FALSE} # Build design matrix from variables (Condition) and apply a contrast. # The design matrix is constructed automatically using the variable "Condition". DEGs <- calculateDE(data = counts_example, @@ -300,15 +320,11 @@ DEGs <- calculateDE(data = counts_example, DEGs$`Senescent-Proliferative`[1:5,] ``` -Differential expression results can be visualised with volcano plots. These highlight the magnitude of expression changes (log2 fold change) against statistical significance (adjusted p-value for the moderated t-statistic), with genes from predefined sets emphasised (up- in green and downregulated in red). In this example: +Once differential expression is calculated, we can visualize the results with a volcano plot. This plot highlights genes in our predefined gene sets, showing the magnitude of differential expression (log fold-change) versus statistical significance (adjusted p-value). Up- (green) and downregulated (red) genes are color-coded. For the HernandezSegura gene set, green genes mostly appear in the positive logFC range and red genes in the negative range, though these are not the genes with the most extreme logFC values. In the LiteratureMarkers gene set, two red genes (LMNB1 and MKI67) show very negative logFC, which is consistent with their roles as proliferation markers since senescent cells are non-proliferative. Genes from the REACTOME_CELLULAR_SENESCENCE gene set are shown in blue, as this set lacks information on directionality; these genes are scattered across the full range of logFC values. -* Genes in the HernandezSegura set annotated as upregulated (green) display positive log2 fold changes, whereas those annotated as downregulated (red) show negative log2 fold changes. This pattern is consistent with the annotation of the set, although these genes are not necessarily those exhibiting the largest absolute fold changes. -* In the LiteratureMarkers set, *LMNB1* and *MKI67* exhibit strongly negative log2 fold change, consistent with their roles as proliferation markers absent in senescent cells. -* Genes from the REACTOME_CELLULAR_SENESCENCE set are undirected and appear across the full range of log2 fold change values, diluting discriminatory power. - -```{r DEGsvolcano, fig.width=10, fig.height=3,warning=FALSE, message=FALSE} -# Change order: gene sets in columns, contrast in rows +```{r DEGsvolcano, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} +# Change order: signatures in columns, contrast in rows plotVolcano(DEGs, genes = genesets_example, N = NULL, x = "logFC", y = "-log10(adj.P.Val)", pointSize = 2, @@ -318,8 +334,7 @@ plotVolcano(DEGs, genes = genesets_example, labsize = 10, widthlabs = 24, invert = TRUE) ``` -We next apply GSEA to the differential expression results. This evaluates whether genes from each set are non-randomly concentrated at the top or bottom of the ranked list. Results are reported as NES with multiple-testing adjusted p-values. Gene ranking can be based on alternative statistics (*e.g.*, moderated t or B-statistic), which influence whether results are interpreted as enrichment/depletion or as altered directionality. For a full example and explanation, see the online tutorial [here][tutorial-benchmarking]. - +Next, we perform Gene Set Enrichment Analysis (GSEA) using the differential expression results. This approach evaluates whether members of each gene set are non-randomly distributed toward the top or bottom of the ranked gene list, providing normalized enrichment scores (NES) and adjusted p-values. The ranking of genes can be based on different metrics, reflecting the expected directionality of the gene set. This is also reflected in the plot labels below: gene sets are marked as altered when genes are ordered by the B statistic, or enriched/depleted when ordered by the t-statistic. For a full example and explanation, see the tutorial [here][tutorial-benchmarking]. ```{r GSEA, warning=FALSE, message=FALSE} GSEAresults <- runGSEA(DEGList = DEGs, @@ -329,19 +344,19 @@ GSEAresults <- runGSEA(DEGList = DEGs, GSEAresults ``` - -Enrichment curves from `fgsea` show the running enrichment score across the ranked list, indicating the distribution of set members therein. +To visualize the enrichment of each gene set along the ranked gene list, we use enrichment curves from `fgsea.` These plots show the running enrichment score across the ranked list, highlighting where members of each gene set are concentrated. -```{r GSEA_plotenrichment, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} + +```{r GSEA_plotenrichment, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} plotGSEAenrichment(GSEA_results=GSEAresults, DEGList=DEGs, gene_sets=genesets_example, widthTitle=40, grid = TRUE, titlesize = 10, nrow=1, ncol=3) ``` -Enrichment results can also be summarised with a lollipop plot, which compactly shows the NES and highlights statistically significant gene sets. Here, we can see that the HernandezSegura gene set clearly exhibits the strongest enrichment signal. When many gene sets and contrasts are evaluated (which is not the case in this vignette), NES and adjusted p-values can be summarized in a scatter (volcano-style) plot for a simpler visualisation, using `plotCombinedGSEA`. - +We can also summarize enrichment results in a lollipop plot, which compactly shows the normalized enrichment score and highlights statistically significant gene sets. This provides a quick overview of which pathways are most strongly altered. Here, we can see that the HernandezSegura gene set clearly exhibits the strongest enrichment signal. + ```{r GSEA_lollypop, fig.width=5, fig.height=4, out.width="60%", warning=FALSE, message=FALSE} plotNESlollipop(GSEA_results=GSEAresults, saturation_value=NULL, @@ -353,33 +368,34 @@ plotNESlollipop(GSEA_results=GSEAresults, widthlabels=13, title=NULL, titlesize=12) ``` - -From this exercise in Benchmarking Mode, we can see that two gene sets clearly perform best at discriminating between Senescent and Proliferative conditions: HernandezSegura and LiteratureMarkers. The REACTOME_CELLULAR_SENESCENCE gene set does not show a strong signal; the absense of information on putative gene up- or downregulation upon phenotype potentially dilutes the signal, highlighting the importance of providing directionality when available. +Finally, a volcano-style scatter plot combines NES and significance for all gene sets, making it easy to identify which sets show the strongest and most statistically robust alteration. -Scoring and enrichment approaches provide complementary insights. Score-based methods offer sample-level resolution, capturing strong contributions from individual genes, while enrichment-based methods evaluate coordinated behaviour across the set. +```{r GSEA_volcano, fig.width=8, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} +plotCombinedGSEA(GSEAresults, sig_threshold = 0.05, PointSize=6, widthlegend = 26 ) +``` + +From this exercise in benchmarking mode, we can see that two gene sets clearly perform best at discriminating between Senescent and Proliferative conditions: HernandezSegura and LiteratureMarkers. The REACTOME_CELLULAR_SENESCENCE gene set does not show a strong signal; since it is undirected and lacks information on up- or downregulation, this also dilutes the signal, highlighting the importance of providing directionality when available. + +Across methods, scoring and enrichment approaches provide complementary insights. Score-based methods offer sample-level resolution, capturing strong contributions from individual genes, while enrichment-based methods evaluate coordinated behaviour across the set, typically at the group level. Even among scoring methods, rank-based approaches (e.g., ssGSEA or rank scoring) are generally more robust to technical noise, whereas magnitude-based methods (e.g., log2-median) better detect shifts in well-controlled data. Performance also depends on sample size and gene set size, explaining why HernandezSegura may appear stronger in enrichment analyses and LiteratureMarkers in scoring analyses. Integrating both approaches provides the most comprehensive view of gene set behaviour. -### Discovery Mode +### 3.4.2 Discovery Mode -In **Discovery** (full tutorial [here][tutorial-discovery]), analyses focus on a single gene set of interest, providing a targeted view of its behaviour across experimental variables (*i.e.*, phenotypes). The output includes: +In **Discovery Mode** (full tutorial [here][tutorial-discovery]), the output focuses on a single gene set: -* Score distributions stratified by variable; -* Effect sizes for pairwise and multiple-group differences (Cohen's *d* and *f*, respectively); -* Cross-variable summaries of NES and adjusted p-values (*e.g.*, lollipop plots). +* Score distributions by phenotype +* Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s *f*) +* Enrichment score summaries (NES) with adjusted p-values (e.g., lollipop plots) -We illustrate this using the HernandezSegura gene set from our example collection: +We start by selecting the gene set of interest. Here, we use the HernandezSegura gene set from our example collection. ```{r} HernandezSegura_GeneSet <- list(HernandezSegura=genesets_example$HernandezSegura) ``` -For illustration purposes, two synthetic variables were added to the data: +For illustration, we add two hypothetical variables to the metadata: `DaysToSequencing`, representing the number of days between sample preparation and sequencing, and `Researcher`, identifying the person who processed each sample. This allows us to explore associations between gene set activity and different types of variables (categorical or continuous). -* `DaysToSequencing`, the number of days between sample preparation and sequencing; -* `Researcher`, identifying the person who processed each sample. - -This enables exploration of associations between gene set activity and both categorical and continuous variables. ```{r load_metadata_discovery} data(metadata_example) @@ -389,11 +405,9 @@ metadata_example$DaysToSequencing <- sample(c(1:20),39, replace = TRUE) head(metadata_example) ``` -We can then examine how the gene set associates with these variables using enrichment-based approaches in Discovery Mode. The resulting plot highlights significant associations across variables and visually summarises the direction and strength of the effects. The “simple” mode provides comparison of effect sizes across pairwise contrasts between only two levels of the variable, but can be changed to more levels of comparison (see `?VariableAssociation`). - -In this example, the HernandezSegura gene set shows significant enrichment in samples sequenced by Francisca relative to those processed by Ana or John, which would suggest a differentiated researcher-associated technical impact on the samples' biological phenotype. This gene set shows also a strong depletion in proliferative samples, which is expected given its annotation as senescence-associated and results from the Benchmarking Mode. +We can then examine how the selected gene set associates with these variables using enrichment-based approaches in Discovery Mode. The resulting plot highlights significant associations across variables and visually summarizes the direction and strength of the effect. The “simple” mode provides comparison of effect sizes for pairwise contrasts between only two levels of the variable, but can be changed to more levels of comparison (see `?VariableAssociation`). Here, we can see that the HernandezSegura gene set is significantly enriched in samples sequenced by Francisca, when compared to those processed by Ana or John, suggesting that this gene set may be particularly relevant to her experimental conditions or sample processing. This gene set has also a strong enrichment for the comparison between proliferative and senescent, which is expected given the nature of the gene set and the results from the Benchmarking mode. -```{r GSEA_varassoc, fig.width=6, fig.height=6,warning=FALSE, message=FALSE , out.width="70%"} +```{r GSEA_varassoc, fig.width=6, fig.height=6, out.width="60%", warning=FALSE, message=FALSE} VariableAssociation( data = counts_example, metadata = metadata_example, @@ -412,11 +426,12 @@ VariableAssociation( ) $plot ``` - -Alternatively, **score-based methods** provide a per-sample metric of gene set activity, which can then be summarised across variables. Here, we compute log2-median scores. The `Condition` variable shows a large effect size (Cohen’s f), confirming strong discrimination between Senescent and Proliferative samples. In contrast, `Researcher` does not show a detectable association, in contrast to enrichment-based results. This divergence illustrates the value of applying both enrichment- and score-based approaches in a complementary manner. -```{r variableassoc_score_sen, fig.width=7, fig.height=7,warning=FALSE, message=FALSE , out.width="70%" } +Next, we evaluate the association using a score-based method (here, log2-median). This approach calculates per-sample scores for the gene set and summarizes them across variables. In this analysis, Condition (representing whether the sample is senescent or proliferative) shows a strong effect, reflected by a large Cohen’s f. This effect size metric is directly comparable across different variable types (categorical, numeric, etc.), making it particularly versatile. In contrast, the variable Researcher does not show a significant effect here, unlike in the enrichment analysis. This divergence illustrates the value of applying both enrichment- and score-based approaches in a complementary manner. + + +```{r variableassoc_score_sen, fig.width=7, fig.height=7, out.width="100%", warning=FALSE, message=FALSE} VariableAssociation(data = counts_example, metadata = metadata_example, method = "logmedian", @@ -430,37 +445,36 @@ VariableAssociation(data = counts_example, ``` -The Benchmarking Mode offers the most comprehensive set of features. Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is required. Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery focuses on the performance of a single gene set. - +Benchmarking mode offers the most comprehensive set of features and allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is required. The main difference from Discovery mode is that Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery mode focuses on quantifying a single, robust gene set. -## Individual Gene Exploration +## 3.5. Individual Gene Exploration -To better understand the contribution of individual genes within a gene set, and identify whether specific genes drive the set's collective signal, `markeR` provides `VisualiseIndividualGenes.` Available options include: +To better understand the contribution of individual genes within a gene set and identify whether specific genes drive the overall signal, `markeR` offers a suite of gene-level exploratory analyses (via `VisualiseIndividualGenes`), including: -* Expression heatmaps of genes across samples or groups of samples; -* Violin plots showing cross-sample expression distributions of individual genes; -* Heatmaps of pairwise cross-sample expression correlation between genes in the set; -* ROC curves and AUC values to evaluate single genes' performance as phenotypic markers; -* Effect size estimation (Cohen’s *d*) of expression differences between groups of samples; -* Principal Component Analysis (PCA) of expression of genes in the set, to evaluate which genes dominate collective variance and how samples separate according to the gene set's expression. +* Expression heatmaps of genes across samples and groups +* Violin plots showing expression distributions of individual genes +* Correlation heatmaps to reveal co-expression patterns among genes in the set +* ROC curves and AUC values for individual genes to evaluate their discriminatory power +* Effect size calculations (Cohen’s *d*) per gene to quantify differential expression +* Principal Component Analysis (PCA) on gene set genes to assess variance explained and sample clustering -For a complete overview, see `?VisualiseIndividualGenes` and the extended online tutorial [here](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html#visualise-individual-gene-behaviour). - -## Compare with Reference Gene Sets +See `?VisualiseIndividualGenes` for more details on the available options and the extended online tutorial [here](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html#visualise-individual-gene-behaviour). -`markeR` also supports comparison of user-defined gene sets against reference collections (e.g., MSigDB). Two complementary similarity metrics are implemented: +## 3.6. Compare with Reference Gene Sets -* **Jaccard Index**: -the ratio of the number of genes in common over the total number of genes in the two sets. +`markeR` allows comparison of user-defined gene sets to reference sets (e.g., from MSigDB) using: -* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. +* **Jaccard Index**: +Measures gene overlap relative to union size. -Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or Fisher's test p-value). +* **Log Odds Ratio (logOR)**: +Computes enrichment using a user-defined gene universe and Fisher’s exact test. -Example of Gene Set Similarity (full tutorial [here][tutorial-signaturesimilarity]): We compare two user-defined gene sets against other user gene set and the MSigDB C2:CP:KEGG_LEGACY collection. Similarity is summarised in a heatmap of log odds ratios, highlighting associations above a defined threshold (e.g., OR > 100 for at least one of the signatures being compared). The underlying data for the heatmap can be accessed via `$data` for further analysis. +Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or p-value). +Example of Gene Set Similarity (full tutorial [here][tutorial-signaturesimilarity]). Here, we are seeing how two user-defined signatures compare to a set of other user-defined signatures, as well as to a collection of reference gene sets from MSigDB (C2:CP:KEGG_LEGACY). The results are visualized in a heatmap, showing the similarity between the signatures based on log odds ratio (at least one with log10OR > 2). -```{r, fig.width=6, fig.height=8} +```{r, fig.width=6, fig.height=8, out.width="60%"} # Example data signature1 <- c("TP53", "BRCA1", "MYC", "EGFR", "CDK2") @@ -485,14 +499,15 @@ geneset_similarity( unlist(signature_list), msigdbr::msigdbr(species = "Homo sapiens", category = "C2")$gene_symbol )), - or_threshold = 100, - width_text=50, - pval_threshold = 0.05 + or_threshold = 100, #log10OR = 2 + pval_threshold = 0.05, + width_text=50 )$plot + ``` - -# Further Reading + +# 4. Further Reading Full tutorials with extended examples: @@ -500,7 +515,7 @@ Full tutorials with extended examples: - [Discovery Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_DiscoveryMode.html) - [Signature Similarity](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html) -# Contact +# 5. Contact 📩 For questions, contact: @@ -514,7 +529,7 @@ Email: [rita.silva@medicina.ulisboa.pt](mailto:rita.silva@medicina.ulisboa.pt) [tutorial-signaturesimilarity]: https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html -# Session Information +# 6. Session Information ```{r} sessionInfo() From 20e32c33f97a72573b7d99bb79c10569b70ea9d4 Mon Sep 17 00:00:00 2001 From: Rita Silva Date: Wed, 17 Sep 2025 16:48:39 +0100 Subject: [PATCH 2/7] Reapply "Merge pull request #75 from DiseaseTranscriptomicsLab/main" This reverts commit 2447feb4c3bc67ced6a5e0d4d3656d0c604c3446. --- DESCRIPTION | 6 +- NAMESPACE | 1 + NEWS.md | 17 + R/FPR_Simulation.R | 2 +- R/PlotScores.R | 32 +- R/data.R | 25 +- R/geneset_similarity.R | 237 ++++++++++--- R/zzz.R | 13 + README.Rmd | 101 +++--- README.md | 195 ++++++----- man/FPR_Simulation.Rd | 2 +- man/counts_example.Rd | 10 +- man/geneset_similarity.Rd | 41 ++- man/genesets_example.Rd | 4 +- man/metadata_example.Rd | 2 +- tests/testthat/test-geneset_similarity.R | 44 +-- .../articles/Article_BenchmarkingMode.Rmd | 2 +- vignettes/articles/Article_DiscoveryMode.Rmd | 2 +- .../articles/Article_GeneSetSimilarity.Rmd | 12 +- vignettes/markeR.Rmd | 317 +++++++++--------- 20 files changed, 622 insertions(+), 443 deletions(-) create mode 100644 R/zzz.R diff --git a/DESCRIPTION b/DESCRIPTION index 375b4a3..0c8e339 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 0.99.3 +Version: 0.99.5 Authors@R: c( person("Rita", "Martins-Silva", @@ -58,7 +58,9 @@ Suggests: rmarkdown, roxygen2, mockery, - covr + covr, + magick, + BiocStyle Config/testthat/edition: 3 Depends: R (>= 4.5.0) diff --git a/NAMESPACE b/NAMESPACE index 5339a3d..45c11f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -63,6 +63,7 @@ importFrom(pROC,roc) importFrom(reshape2,melt) importFrom(rstatix,t_test) importFrom(scales,hue_pal) +importFrom(scales,rescale) importFrom(scales,squish) importFrom(stats,TukeyHSD) importFrom(stats,anova) diff --git a/NEWS.md b/NEWS.md index 96e6b5c..538bb1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# markeR 0.99.5 (17 Sep, 2025) + +- Minor fix in `.onAttach()` to avoid errors when checking `ggplot2` version and ensure the startup warning works correctly. + +# markeR 0.99.4 (17 Sep, 2025) + +## General +- Addressed feedback from the Bioconductor review process with updates to documentation and vignette style. + +## Documentation and vignette +- Updated vignette style to **Bioconductor’s BiocStyle** with automatic table of contents. +- Improved vignette content with small corrections. +- Revised dataset documentation by adding explicit `usage: data(object)` entries. + +## Functions +- Updated `geneset_similarity()` color handling: replaced the single `color_values` parameter with three new parameters — `color`, `neutral_color`, and `cold_color`, for more interpretable visualization. + # markeR 0.99.3 (21 Aug, 2025) ## Package size and structure diff --git a/R/FPR_Simulation.R b/R/FPR_Simulation.R index 1fb9d29..02c760b 100644 --- a/R/FPR_Simulation.R +++ b/R/FPR_Simulation.R @@ -108,7 +108,7 @@ utils::globalVariables(c( "cohen", "method", "contrast" )) #' @export #' FPR_Simulation <- function(data, metadata, original_signatures, Variable, - gene_list = NULL, number_of_sims=10, title=NULL, + gene_list = NULL, number_of_sims=100, title=NULL, widthTitle = 30, titlesize = 12, pointSize = 2, labsize = 10,mode = c( "none","simple","medium","extensive"), ColorValues=NULL, ncol=NULL, nrow=NULL) { diff --git a/R/PlotScores.R b/R/PlotScores.R index 5a3a94c..8ab0ad4 100644 --- a/R/PlotScores.R +++ b/R/PlotScores.R @@ -786,16 +786,16 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - if (method == "ssGSEA") { - ylab <- "ssGSEA Enrichment Score" - } else if (method == "logmedian") { - ylab <- "Normalised Signature Score" - } else if (method == "ranking") { - ylab <- "Signature Genes' Ranking" - } + # if (method == "ssGSEA") { + # ylab <- "ssGSEA Enrichment Score" + # } else if (method == "logmedian") { + # ylab <- "Normalised Signature Score" + # } else if (method == "ranking") { + # ylab <- "Signature Genes' Ranking" + # } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(ylab, + left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, fontsize = labsize)), @@ -1093,16 +1093,16 @@ PlotScores_Numeric <- function(data, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - if (method == "ssGSEA") { - ylab <- "ssGSEA Enrichment Score" - } else if (method == "logmedian") { - ylab <- "Normalised Signature Score" - } else if (method == "ranking") { - ylab <- "Signature Genes' Ranking" - } + # if (method == "ssGSEA") { + # ylab <- "ssGSEA Enrichment Score" + # } else if (method == "logmedian") { + # ylab <- "Normalised Signature Score" + # } else if (method == "ranking") { + # ylab <- "Signature Genes' Ranking" + # } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(ylab, + left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, diff --git a/R/data.R b/R/data.R index 748e00a..2c8557a 100644 --- a/R/data.R +++ b/R/data.R @@ -15,6 +15,7 @@ #' for senescent samples, "young" for proliferative).} #' } #' +#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -24,14 +25,15 @@ #' @references Marthandan S, Baumgart M, Priebe S, Groth M et al. Conserved #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. PLoS One 2016;11(5):e0154531. PMID: 27140416 -#' -#' @keywords datasets +#' +#' @usage data(metadata_example) "metadata_example" #' Gene Expression Counts for Marthandan et al. (2016) RNA-Seq Data #' -#' A numeric matrix containing filtered and normalized gene expression data from -#' the Marthandan et al. (2016) study (GEO accession GSE63577). +#' A numeric matrix containing filtered and normalized (non log-transformed) +#' gene expression data from the Marthandan et al. (2016) study (GEO accession +#' GSE63577). #' #' Raw FASTQ files were downloaded using `fasterq-dump` (v2.11.0) and processed #' in a reproducible conda environment (Python v3.11.5). Quality control was @@ -39,8 +41,7 @@ #' Pseudo-alignment to the RefSeq transcriptome (NCBI release 109) was performed #' using kallisto (v0.44.0). Genes with low expression (mean count < 70 in all #' conditions) were filtered out. Count normalization factors were calculated -#' with `edgeR::calcNormFactors`, and log2-transformed values were obtained via -#' `limma::voom`. +#' with `edgeR::calcNormFactors`. #' #' Intermediate time points for HFF and MRC5 cell lines were excluded, resulting #' in a final dataset with 45 high-quality samples across proliferative, @@ -52,7 +53,7 @@ #' #' @format A numeric matrix with rows as genes (gene symbols) and columns as #' samples (sample IDs). -#' +#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -63,8 +64,8 @@ #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. #' *PLoS One* 2016;11(5):e0154531. PMID: 27140416 -#' -#' @keywords datasets +#' +#' @usage data(counts_example) "counts_example" #' Example Gene Sets for Cellular Senescence @@ -75,15 +76,15 @@ #' curated gene set of commonly reported senescence markers, #' with directionality (+1 or -1).} #' \item{REACTOME_Senescence}{Character vector of gene symbols. The -#' REACTOME_CELLULAR_SENESCENCE from MSigDB pathway. No directionality.} +#' REACTOME_CELLULAR_SENESCENCE from MSigDB database No directionality.} #' \item{HernandezSegura}{A data frame with columns `gene` and `direction`. #' A gene set from Hernandez-Segura et al. (2017), with directionality (+1 or -1).} #' } -#' +#' #' @references Hernandez-Segura A, de Jong TV, Melov S, Guryev V, Campisi J, #' Demaria M. Unmasking Transcriptional Heterogeneity in Senescent Cells. #' *Curr Biol.* 2017 Sep 11;27(17):2652-2660.e4. doi: 10.1016/j.cub.2017.07.033. #' Epub 2017 Aug 30. PMID: 28844647; PMCID: PMC5788810. -#' @keywords datasets +#' @usage data(genesets_example) "genesets_example" diff --git a/R/geneset_similarity.R b/R/geneset_similarity.R index f92c833..2079ee6 100644 --- a/R/geneset_similarity.R +++ b/R/geneset_similarity.R @@ -21,12 +21,31 @@ #' Odds Ratio required for a gene set to be included in the plot. Default is #' 1. #' @param pval_threshold (only if method == "odds_ratio" only) Numeric. Maximum -#' adjusted p-value to show a label. Default is 0.05. -#' @param limits Numeric vector of length 2. Limits for color scale. +#' adjusted p-value required for a gene set to be included in the plot. +#' Default is 0.05. +#' @param limits Numeric vector of length 2. Limits for color scale. If `NULL`, +#' is automatically set to c(0,1) for Jaccard or the range of OR for odds +#' ratio. #' @param title_size Integer specifying the font size for the plot title. #' Default is `12`. -#' @param color_values Character vector of colors used for the fill gradient. -#' Default is `c("#F9F4AE", "#B44141")`. +#' @param color Character. The color for the maximum of the scale. Default is +#' `red.` +#' - If `method = "jaccard"`, the scale goes from `neutral_color` to `color`. +#' - If `method = "odds_ratio"` and any OR >= 1, the scale ends at `color`. +#' - If `method = "odds_ratio"` and all OR <= 1, `color` is not used; instead, the scale +#' runs from `cold_color` (minimum) to `neutral_color` (OR = 1, if present; +#' otherwise `neutral_color` is the maximum). +#' @param neutral_color Character. The neutral reference color. Default is +#' `white`. +#' - If `method = "jaccard"`, this is the minimum of the scale. +#' - If `method = "odds_ratio"` and any OR >= 1, this corresponds to OR = 1 if such values exist; otherwise it is the minimum of the scale. +#' - If `method = "odds_ratio"` and all OR <= 1, this corresponds to OR = 1 if such values exist; otherwise it is the maximum of the scale (with `cold_color` as the minimum). +#' @param cold_color Character. The color for values below OR = 1 (only used +#' when `method = "odds_ratio"`). Default is `blue`. +#' - If `method = "odds_ratio"` and any OR < 1, the scale runs from `cold_color` +#' (minimum) to `neutral_color` (OR = 1 if present; otherwise `neutral_color` +#' is the maximum). +#' - Ignored if `method = "jaccard"` or if all OR >= 1. #' @param title Optional. Custom title for the plot. If `NULL`, the title #' defaults to `"Signature Overlap"`. #' @param jaccard_threshold (only if method == "jaccard" only) Numeric. Minimum @@ -45,13 +64,13 @@ #' \describe{ #' \item{\code{plot}}{The \pkg{ggplot2} object of the similarity heatmap.} #' \item{\code{data}}{The data frame object containing the similarity -#' scores aper pair of gene sets.} +#' scores per pair of gene sets.} #' } #' #' @import ggplot2 #' @importFrom tibble tibble #' @importFrom msigdbr msigdbr -#' @importFrom scales squish +#' @importFrom scales squish rescale #' #' @examples #' # Create two simple gene signatures @@ -97,7 +116,9 @@ geneset_similarity <- function( pval_threshold = 0.05, limits = NULL, title_size = 12, - color_values = c("#F9F4AE", "#B44141"), + color = "#B44141", # color for the maximum of the scale + neutral_color = "white", # neutral reference color + cold_color = "#4173B4", # color for OR < 1 when applicable title = NULL, jaccard_threshold = 0, msig_subset = NULL, @@ -135,14 +156,15 @@ geneset_similarity <- function( if (!is.null(limits) && (!is.numeric(limits) || length(limits) != 2)) { stop("limits must be a numeric vector of length 2.") } + + if (!is.null(limits) && any(limits < 0 | !is.finite(limits))) { + warning("Limits contain negative, or non-finite values. Ensure limits are positive finite numbers.") + } if (!is.numeric(title_size) || title_size <= 0) { stop("title_size must be a positive numeric value.") } - - if (!is.character(color_values) || length(color_values) < 2) { - stop("color_values must be a character vector with two colors.") - } + if (!is.null(title) && !is.character(title)) { stop("title must be a character string or NULL.") @@ -232,11 +254,12 @@ geneset_similarity <- function( ft <- stats::fisher.test(cont_tbl) score <- log10(ft$estimate) - if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { - label <- sprintf("%.1f", score) - } else { - label <- "" - } + # if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { + # label <- sprintf("%.1f", 10^score) # to show non log values in heatmap + # #label <- sprintf("%.1f", score) + # } else { + # label <- "" + # } pval <- ft$p.value } @@ -244,7 +267,7 @@ geneset_similarity <- function( Reference_Signature = ref_name, Compared_Signature = comp_name, Score = score, - Label = label, + #Label = label, Pval = pval, stringsAsFactors = FALSE ) @@ -259,19 +282,24 @@ geneset_similarity <- function( if (metric == "odds_ratio") { # Filter groups where any 10^Score >= threshold + # keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { + # any(10^group$Score >= or_threshold, na.rm = TRUE) + # }) + keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { - any(10^group$Score >= or_threshold, na.rm = TRUE) + any(10^group$Score >= or_threshold & group$Pval <= pval_threshold, na.rm = TRUE) }) - + + kept_signatures <- names(keep_rows[keep_rows]) similarity_df <- similarity_df[similarity_df$Compared_Signature %in% kept_signatures, , drop = FALSE] # Add Label column - similarity_df$Label <- ifelse( - similarity_df$Pval <= pval_threshold, - sprintf("%.1f", similarity_df$Score), - "" - ) + # similarity_df$Label <- ifelse( + # similarity_df$Pval <= pval_threshold, + # sprintf("%.1f", similarity_df$Score), + # "" + # ) } if (metric == "jaccard" && jaccard_threshold > 0) { @@ -287,43 +315,168 @@ geneset_similarity <- function( data <- similarity_df + + if (nrow(similarity_df) == 0) { + stop("No signatures passed the filtering criteria.") + } + similarity_df$Reference_Signature <- vapply(similarity_df$Reference_Signature, function(x) wrap_title(x, width_text), character(1)) similarity_df$Compared_Signature <- vapply(similarity_df$Compared_Signature, function(x) wrap_title(x, width_text), character(1)) - +# +# if (is.null(limits)) { +# if (metric == "jaccard") { +# limits <- c(0, 1) +# } else { +# # For odds ratio, we set limits based on the data +# limits <- c(min(similarity_df$Score[is.finite(similarity_df$Score)], na.rm = TRUE), max(similarity_df$Score, na.rm = TRUE)) +# +# } +# } +# +# +# +# plt <- ggplot(similarity_df, aes(x = .data$Reference_Signature, +# y = .data$Compared_Signature, fill = .data$Score)) + +# geom_tile(color = "white") + +# #geom_text(aes(label = .data$Label), color = "black") + +# scale_fill_gradientn(colors = color_values, limits = limits, +# oob = scales::squish, na.value = na_color) + +# labs( +# x = "", +# y = "Compared Signature", +# fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), +# title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) +# ) + +# theme_minimal() + +# theme( +# axis.text.x = element_text(angle = 45, hjust = 1), +# plot.title = element_text(hjust = 0.5, size = title_size) +# ) +# +# plt +# +# invisible(list(plot=plt, +# data=data)) + + + # ---------------------------- + # Safe handling of limits (user provides OR) + # ---------------------------- if (is.null(limits)) { if (metric == "jaccard") { limits <- c(0, 1) - } else { - # For odds ratio, we set limits based on the data - # but ensure they are at least 0 to avoid negative log10 values - limits <- c(0, max(similarity_df$Score, na.rm = TRUE)) - } + } else { # odds_ratio + + # Extract finite scores + finite_scores <- similarity_df$Score[is.finite(similarity_df$Score)] + + # Identify if there are any OR < 1 (logOR < 0) + has_below1 <- any(finite_scores < 0) + + # Compute padding for -Inf (original OR = 0) + if (has_below1) { + # Place -Inf one log unit below the minimum finite score < 0 + min_below <- min(finite_scores[finite_scores < 0]) + pad_value <- min_below - 1 + } else { + # All OR >= 1 → map -Inf slightly above the maximum score, then invert sign + max_score <- max(finite_scores) + pad_value <- -(max_score + 1) + } + + # Replace -Inf in Score with computed padding + similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- pad_value + + # Convert back from log + OR_values <- 10^similarity_df$Score + + # Replace any zero or negative OR with a small number + OR_values[OR_values <= 0] <- 1e-6 + #similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- log10(1e-6) + + # Compute limits + limits <- c(min(OR_values, na.rm = TRUE), max(OR_values, na.rm = TRUE)) + } } - plt <- ggplot(similarity_df, aes(x = .data$Reference_Signature, - y = .data$Compared_Signature, fill = .data$Score)) + + + # Convert OR limits to log space (Score already log10 OR) + log_limits <- if (metric == "odds_ratio") log10(limits) else limits + if (min(log_limits) == max(log_limits)) { + log_limits <- log_limits + c(-0.01, 0.01) # small padding + } + zero <- 0 # neutral color at OR = 1 → log10(1) = 0 + + # ---------------------------- + # Define fill colors + # ---------------------------- + if (metric == "jaccard") { + fill_colors <- c(neutral_color, color) + fill_values <- c(log_limits[1], log_limits[2]) + } else if (metric == "odds_ratio") { + min_lim <- log_limits[1] + max_lim <- log_limits[2] + + if (min_lim >= zero) { + fill_colors <- c(neutral_color, color) + fill_values <- c(min_lim, max_lim) + } else if (max_lim <= zero) { + fill_colors <- c(cold_color, neutral_color) + fill_values <- c(min_lim, max_lim) + } else { + fill_colors <- c(cold_color, neutral_color, color) + fill_values <- c(min_lim, zero, max_lim) + } + + # ---------------------------- + # Safe legend breaks in OR space + # ---------------------------- + valid_OR <- limits[limits > 0 & is.finite(limits)] + if (length(valid_OR) == 0) valid_OR <- 1 # fallback + + log_breaks <- 10^seq(floor(log10(min(valid_OR))), ceiling(log10(max(valid_OR)))) + log_breaks <- log_breaks[log_breaks >= min(valid_OR) & log_breaks <= max(valid_OR)] + } + + # ---------------------------- + # Build plot + # ---------------------------- + plt <- ggplot(similarity_df, aes( + x = .data$Reference_Signature, + y = .data$Compared_Signature, + fill = .data$Score)) + geom_tile(color = "white") + - geom_text(aes(label = .data$Label), color = "black") + - scale_fill_gradientn(colors = color_values, limits = limits, - oob = scales::squish, na.value = na_color) + + scale_fill_gradientn( + colors = fill_colors, + values = scales::rescale(fill_values), + limits = log_limits, + oob = scales::squish, + na.value = na_color, + trans = "identity", # already logged + breaks = if (metric == "odds_ratio") log10(log_breaks) else waiver(), + labels = if (metric == "odds_ratio") log_breaks else waiver() + ) + labs( x = "", y = "Compared Signature", - fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), - title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) + fill = ifelse(metric == "jaccard", "Jaccard Index", "Odds Ratio"), + title = ifelse(is.null(title), "Signature Overlap", title) ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = title_size) ) - - plt - - invisible(list(plot=plt, - data=data)) + + if (metric == "odds_ratio") { + data$Score <- 10^data$Score # convert back to OR for data output + } + + invisible(list(plot = plt, data = data)) + + } diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..d34382c --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,13 @@ +.onAttach <- function(libname, pkgname) { + gg_version <- tryCatch( + utils::packageVersion("ggplot2"), + error = function(e) NULL + ) + + if (!is.null(gg_version) && gg_version > "3.5.2") { + packageStartupMessage( + "Warning: markeR has been tested with ggplot2 <= 3.5.2. ", + "Using newer versions may cause incompatibilities." + ) + } +} diff --git a/README.Rmd b/README.Rmd index 00431d1..3a38d03 100644 --- a/README.Rmd +++ b/README.Rmd @@ -26,13 +26,13 @@ knitr::opts_chunk$set( -**markeR** provides a suite of methods for using gene sets to quantify and evaluate the extent to which a given gene signature marks a specific phenotype from gene expression data. The package implements various scoring, enrichment and classification approaches, along with tools to compute performance metrics and visualize results. +**`markeR`** is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of gene set behaviour across experimental and clinical phenotypes. -> **To cite markeR please use:** +> **To cite `markeR` please use:** > -> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). _markeR: an R Toolkit for Evaluating Gene Sets as Phenotypic Markers_. Gulbenkian Institute for Molecular Medicine, Faculdade de Medicina, Universidade de Lisboa, Lisbon, Portugal. R package version 0.99.3, https://github.com/DiseaseTranscriptomicsLab/markeR. +> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). _markeR: an R Toolkit for Evaluating Gene Sets as Phenotypic Markers_. Gulbenkian Institute for Molecular Medicine, Faculdade de Medicina, Universidade de Lisboa, Lisbon, Portugal. R package version 0.99.5, https://github.com/DiseaseTranscriptomicsLab/markeR. -The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original markeR paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). +The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original `markeR` paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). ![](man/figures/Workflow.png) @@ -57,9 +57,8 @@ The folder `inst/Paper/` is in the **paper** branch and contains all scripts and ## Installation - Install the latest development release of markeR from [GitHub](https://github.com/) with: - + ``` r # install.packages("devtools") devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") @@ -80,8 +79,6 @@ This package is officially supported on `R > 4.5.0`. ⚠️ Older versions of `R ## Common Workflow -`markeR` provides a modular pipeline to quantify transcriptomic signatures and assess their association with phenotypic or clinical variables. The typical workflow includes the following steps: - ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -118,7 +115,10 @@ gene_sets ``` * **Expression Data Frame**: - A filtered and normalised gene expression data frame (genes × samples). Row names must be gene identifiers, and column names must match the sample IDs in the metadata. + A filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. + + **Warning:** If you are using microarray data or outputs from common RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may already be log2-normalised. The input to `markeR` must necessarily be **non-log-transformed**. If your data are log2-transformed, you can revert them by applying `2^data`. + ```{r example-expression-matrix, echo=FALSE} # Simulate expression matrix: 10 genes × 5 samples @@ -134,7 +134,7 @@ head(expr_df) ``` * **Sample Metadata**: - A data frame with annotations for each sample, with the sample ID in the first column. The row names must match the column names of the expression matrix. + A data frame with samples as rows and annotations as columns. The first column should contain sample IDs matching the expression matrix column names. ```{r example-metadata, echo=FALSE} # Simulate sample metadata @@ -151,82 +151,79 @@ metadata ### 2. Select Mode of Analysis -* **Discovery Mode**: - Explore how a single, well-characterised gene set relates to a specific variable of interest. Suitable for hypothesis generation or signature projection. +`markeR` provides two modes of operation: + +* **Benchmarking**: +evaluates gene sets' performance in marking a metadata variable, *i.e.*, a phenotype, returning comparative visualisations across scoring and enrichment methods. -* **Benchmarking Mode**: - Evaluate one or more gene sets against multiple metadata variables using a standardised scoring and effect size framework. This mode provides comprehensive visualisations and comparisons across methods. +* **Discovery**: +examines the relationship between a gene set and one or more variables of interest, suitable for exploratory or hypothesis-generating analyses. ### 3. Choose a Quantification Approach -`markeR` supports two complementary strategies for quantifying the association between gene sets and phenotypes: +Two complementary strategies are implemented for quantifying associations between gene sets and phenotypes: #### 3.1 Score-Based Approach -This strategy generates a **single numeric score per sample**, reflecting the activity of a gene set. It enables flexible downstream analyses, including comparisons across phenotypic groups. -Three scoring methods are available: +A score summarising the collective expression of a gene set therein is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). -* **Log2-median**: Calculates the median log2 expression of the genes in the set. Sensitive to absolute shifts in expression. +Available methods: -* **Ranking**: Ranks all genes within each sample and averages the ranks of gene set members. Captures relative ordering rather than magnitude. +* **Log2-median**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -* **ssGSEA**: Computes a single-sample gene set enrichment score using the ssGSEA algorithm. Reflects the coordinated up- or down-regulation of the set in each sample. +* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. -These methods vary in assumptions and sensitivity. Robust gene sets are expected to perform consistently across all three. +* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -#### 3.2 Enrichment-Based Approach - -This approach uses a classical **gene set enrichment analysis (GSEA)** framework to evaluate whether the gene set is significantly overrepresented at the top or bottom of a ranked list of genes (e.g., ranked by fold change or correlation with phenotype). +Gene sets that are robust phenotypic markers are expected to yield consistently high scores across methods. -* **GSEA**: Computes a Normalised Enrichment Score (NES) for each contrast or variable of interest, adjusting for gene set size and multiple testing. +#### 3.2 Enrichment-Based Approach -Use this approach when interested in collective behaviour of gene sets in relation to ranked differential signals. +Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. ### 4. Visualisation and Evaluation -In **Benchmarking Mode**, `markeR` offers a range of visual summaries: +In **Benchmarking Mode**, `markeR` offers a range of visual summaries: + +* Violin plots of score distributions by categorical phenotype; +* Scatter plots of association between scores and numerical phenotypes; +* Volcano plots and heatmaps of scores or differential gene set expression based on effect sizes (Cohen’s *d* or *f*); +* ROC curves and respective AUC values of gene sets' phenotypic classification performance; +* Violin plots of effect size distributions (Cohen’s *d*) for pairwise group differences in scores, for original and simulated gene sets; +* Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop plots); +* GSEA plots showing running enrichment scores across ranked gene lists. -* Violin or scatter plots showing score distributions by phenotype -* Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) -* ROC curves and AUC values -* Null distribution testing using random gene sets matched for size and directionality -* Lollipop plots summarising enrichment scores (NES) with adjusted p-values -* Enrichment plots showing running enrichment scores across ranked gene lists -* Volcano plots In **Discovery Mode**, the output focuses on a single gene set: -* Score distributions by phenotype -* Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s *f*) -* Enrichment score summaries (NES) with adjusted p-values (e.g., lollipop plots) +* Score distributions stratified by variable; +* Effect sizes for pairwise and multiple-group differences (Cohen's *d* and *f*, respectively); +* Cross-variable summaries of NES and adjusted p-values (*e.g.*, lollipop plots). -Benchmarking mode offers the most comprehensive set of features and allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is required. The main difference from Discovery mode is that Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery mode focuses on quantifying a single, robust gene set. +The Benchmarking Mode offers the most comprehensive set of features. Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is required. Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery focuses on the performance of a single gene set. ### 5. Individual Gene Exploration -To better understand the contribution of individual genes within a gene set and identify whether specific genes drive the overall signal, `markeR` offers a suite of gene-level exploratory analyses, including: +To better understand the contribution of individual genes within a gene set, and identify whether specific genes drive the set's collective signal, `markeR` provides `VisualiseIndividualGenes.` Available options include: -* Expression heatmaps of genes across samples and groups -* Violin plots showing expression distributions of individual genes -* Correlation heatmaps to reveal co-expression patterns among genes in the set -* ROC curves and AUC values for individual genes to evaluate their discriminatory power -* Effect size calculations (Cohen’s *d*) per gene to quantify differential expression -* Principal Component Analysis (PCA) on gene set genes to assess variance explained and sample clustering +* Expression heatmaps of genes across samples or groups of samples; +* Violin plots showing cross-sample expression distributions of individual genes; +* Heatmaps of pairwise cross-sample expression correlation between genes in the set; +* ROC curves and AUC values to evaluate single genes' performance as phenotypic markers; +* Effect size estimation (Cohen’s *d*) of expression differences between groups of samples; +* Principal Component Analysis (PCA) of expression of genes in the set, to evaluate which genes dominate collective variance and how samples separate according to the gene set's expression. ### 6. Compare with Reference Gene Sets -`markeR` allows comparison of user-defined gene sets to reference sets (e.g., from MSigDB) using: +`markeR` also supports comparison of user-defined gene sets against reference collections (e.g., MSigDB). Two complementary similarity metrics are implemented: * **Jaccard Index**: - Measures gene overlap relative to union size. - -* **Log Odds Ratio (logOR)**: - Computes enrichment using a user-defined gene universe and Fisher’s exact test. +the ratio of the number of genes in common over the total number of genes in the two sets. -Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or p-value). - +* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. +Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or Fisher's test p-value). ## Contact diff --git a/README.md b/README.md index bf5f7b0..1a9a9b2 100644 --- a/README.md +++ b/README.md @@ -16,22 +16,22 @@ Check](https://github.com/DiseaseTranscriptomicsLab/markeR/actions/workflows/bio -**markeR** provides a suite of methods for using gene sets to quantify -and evaluate the extent to which a given gene signature marks a specific -phenotype from gene expression data. The package implements various -scoring, enrichment and classification approaches, along with tools to -compute performance metrics and visualize results. +**`markeR`** is an R package that provides a modular and extensible +framework for the systematic evaluation of gene sets as phenotypic +markers using transcriptomic data. The package is designed to support +both quantitative analyses and visual exploration of gene set behaviour +across experimental and clinical phenotypes. -> **To cite markeR please use:** +> **To cite `markeR` please use:** > > Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). *markeR: an R > Toolkit for Evaluating Gene Sets as Phenotypic Markers*. Gulbenkian > Institute for Molecular Medicine, Faculdade de Medicina, Universidade -> de Lisboa, Lisbon, Portugal. R package version 0.99.3, +> de Lisboa, Lisbon, Portugal. R package version 0.99.5, > . The folder `inst/Paper/` is in the **paper** branch and contains all -scripts and materials used in the original markeR paper to reproduce +scripts and materials used in the original `markeR` paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). @@ -58,8 +58,7 @@ analyses and figures. You can browse it ## Installation -Install the latest development release of markeR from -[GitHub](https://github.com/) with: +Install the latest development release of markeR from [GitHub](https://github.com/) with: ``` r # install.packages("devtools") @@ -89,10 +88,6 @@ restore compatibility. ## Common Workflow -`markeR` provides a modular pipeline to quantify transcriptomic -signatures and assess their association with phenotypic or clinical -variables. The typical workflow includes the following steps: - ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -122,9 +117,15 @@ gene_sets ``` - **Expression Data Frame**: - A filtered and normalised gene expression data frame (genes × - samples). Row names must be gene identifiers, and column names must - match the sample IDs in the metadata. + A filtered and normalised, non log-transformed, gene expression matrix + (genes × samples). Row names must be gene identifiers; column names + must match sample IDs in the metadata. + + **Warning:** If you are using microarray data or outputs from common + RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may + already be log2-normalised. The input to `markeR` must necessarily be + **non-log-transformed**. If your data are log2-transformed, you can + revert them by applying `2^data`. ``` r head(expr_df) @@ -138,9 +139,9 @@ head(expr_df) ``` - **Sample Metadata**: - A data frame with annotations for each sample, with the sample ID in - the first column. The row names must match the column names of the - expression matrix. + A data frame with samples as rows and annotations as columns. The + first column should contain sample IDs matching the expression matrix + column names. ``` r metadata @@ -154,114 +155,124 @@ metadata ### 2. Select Mode of Analysis -- **Discovery Mode**: Explore how a single, well-characterised gene set - relates to a specific variable of interest. Suitable for hypothesis - generation or signature projection. +`markeR` provides two modes of operation: + +- **Benchmarking**: evaluates gene sets’ performance in marking a + metadata variable, *i.e.*, a phenotype, returning comparative + visualisations across scoring and enrichment methods. -- **Benchmarking Mode**: Evaluate one or more gene sets against multiple - metadata variables using a standardised scoring and effect size - framework. This mode provides comprehensive visualisations and - comparisons across methods. +- **Discovery**: examines the relationship between a gene set and one or + more variables of interest, suitable for exploratory or + hypothesis-generating analyses. ### 3. Choose a Quantification Approach -`markeR` supports two complementary strategies for quantifying the -association between gene sets and phenotypes: +Two complementary strategies are implemented for quantifying +associations between gene sets and phenotypes: #### 3.1 Score-Based Approach -This strategy generates a **single numeric score per sample**, -reflecting the activity of a gene set. It enables flexible downstream -analyses, including comparisons across phenotypic groups. +A score summarising the collective expression of a gene set therein is +assigned **to each sample**. Scores can be visualised using built-in +functions, or used directly in downstream analyses (*e.g.*, comparisons +between phenotypic groups of samples, correlations with numerical +phenotypes). -Three scoring methods are available: +Available methods: -- **Log2-median**: Calculates the median log2 expression of the genes in - the set. Sensitive to absolute shifts in expression. +- **Log2-median**: mean of the across-sample normalised log2 + median-centred expression levels of the genes in the set; for + bidirectional gene sets, the sample score is the partial score for the + subset of putatively upregulated genes minus that of the downregulated + subset. -- **Ranking**: Ranks all genes within each sample and averages the ranks - of gene set members. Captures relative ordering rather than magnitude. +- **Ranking**: mean expression rank of gene set members in each sample; + for bidirectional gene sets, the sample score is the partial score for + the subset of putatively upregulated genes minus that of the + downregulated subset, and normalised by the number of genes in the + set. -- **ssGSEA**: Computes a single-sample gene set enrichment score using - the ssGSEA algorithm. Reflects the coordinated up- or down-regulation - of the set in each sample. +- **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for + bidirectional gene sets, the sample score is the partial score for the + subset of putatively upregulated genes minus that of the downregulated + subset. -These methods vary in assumptions and sensitivity. Robust gene sets are -expected to perform consistently across all three. +Gene sets that are robust phenotypic markers are expected to yield +consistently high scores across methods. #### 3.2 Enrichment-Based Approach -This approach uses a classical **gene set enrichment analysis (GSEA)** -framework to evaluate whether the gene set is significantly -overrepresented at the top or bottom of a ranked list of genes (e.g., -ranked by fold change or correlation with phenotype). - -- **GSEA**: Computes a Normalised Enrichment Score (NES) for each - contrast or variable of interest, adjusting for gene set size and - multiple testing. - -Use this approach when interested in collective behaviour of gene sets -in relation to ranked differential signals. +Enrichment-based methods implement **Gene Set Enrichment Analysis +(GSEA)**. Genes are ranked according to differential expression +statistics, and a Normalised Enrichment Score (NES) per variable of +interest is computed, accompanied by a p-value adjusted for multiple +hypothesis testing. ### 4. Visualisation and Evaluation In **Benchmarking Mode**, `markeR` offers a range of visual summaries: -- Violin or scatter plots showing score distributions by phenotype -- Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) -- ROC curves and AUC values -- Null distribution testing using random gene sets matched for size and - directionality -- Lollipop plots summarising enrichment scores (NES) with adjusted - p-values -- Enrichment plots showing running enrichment scores across ranked gene - lists -- Volcano plots +- Violin plots of score distributions by categorical phenotype; +- Scatter plots of association between scores and numerical phenotypes; +- Volcano plots and heatmaps of scores or differential gene set + expression based on effect sizes (Cohen’s *d* or *f*); +- ROC curves and respective AUC values of gene sets’ phenotypic + classification performance; +- Violin plots of effect size distributions (Cohen’s *d*) for pairwise + group differences in scores, for original and simulated gene sets; +- Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop + plots); +- GSEA plots showing running enrichment scores across ranked gene lists. In **Discovery Mode**, the output focuses on a single gene set: -- Score distributions by phenotype -- Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s - *f*) -- Enrichment score summaries (NES) with adjusted p-values (e.g., - lollipop plots) +- Score distributions stratified by variable; +- Effect sizes for pairwise and multiple-group differences (Cohen’s *d* + and *f*, respectively); +- Cross-variable summaries of NES and adjusted p-values (*e.g.*, + lollipop plots). -Benchmarking mode offers the most comprehensive set of features and -allows users to seamlessly move from discovery to benchmarking mode once +The Benchmarking Mode offers the most comprehensive set of features. +Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is -required. The main difference from Discovery mode is that Benchmarking -is designed to evaluate multiple gene sets simultaneously, whereas -Discovery mode focuses on quantifying a single, robust gene set. +required. Benchmarking is designed to evaluate multiple gene sets +simultaneously, whereas Discovery focuses on the performance of a single +gene set. ### 5. Individual Gene Exploration To better understand the contribution of individual genes within a gene -set and identify whether specific genes drive the overall signal, -`markeR` offers a suite of gene-level exploratory analyses, including: - -- Expression heatmaps of genes across samples and groups -- Violin plots showing expression distributions of individual genes -- Correlation heatmaps to reveal co-expression patterns among genes in - the set -- ROC curves and AUC values for individual genes to evaluate their - discriminatory power -- Effect size calculations (Cohen’s *d*) per gene to quantify - differential expression -- Principal Component Analysis (PCA) on gene set genes to assess - variance explained and sample clustering +set, and identify whether specific genes drive the set’s collective +signal, `markeR` provides `VisualiseIndividualGenes.` Available options +include: + +- Expression heatmaps of genes across samples or groups of samples; +- Violin plots showing cross-sample expression distributions of + individual genes; +- Heatmaps of pairwise cross-sample expression correlation between genes + in the set; +- ROC curves and AUC values to evaluate single genes’ performance as + phenotypic markers; +- Effect size estimation (Cohen’s *d*) of expression differences between + groups of samples; +- Principal Component Analysis (PCA) of expression of genes in the set, + to evaluate which genes dominate collective variance and how samples + separate according to the gene set’s expression. ### 6. Compare with Reference Gene Sets -`markeR` allows comparison of user-defined gene sets to reference sets -(e.g., from MSigDB) using: +`markeR` also supports comparison of user-defined gene sets against +reference collections (e.g., MSigDB). Two complementary similarity +metrics are implemented: -- **Jaccard Index**: Measures gene overlap relative to union size. +- **Jaccard Index**: the ratio of the number of genes in common over the + total number of genes in the two sets. -- **Log Odds Ratio (logOR)**: Computes enrichment using a user-defined - gene universe and Fisher’s exact test. +- **Log Odds Ratio (logOR)** from Fisher’s exact test of association + between gene sets, given a specified gene universe. Filters can be applied based on similarity thresholds (e.g., minimum -Jaccard, OR, or p-value). +Jaccard, OR, or Fisher’s test p-value). ## Contact diff --git a/man/FPR_Simulation.Rd b/man/FPR_Simulation.Rd index ece45bd..b9fd5bf 100644 --- a/man/FPR_Simulation.Rd +++ b/man/FPR_Simulation.Rd @@ -10,7 +10,7 @@ FPR_Simulation( original_signatures, Variable, gene_list = NULL, - number_of_sims = 10, + number_of_sims = 100, title = NULL, widthTitle = 30, titlesize = 12, diff --git a/man/counts_example.Rd b/man/counts_example.Rd index 25658d2..4998e47 100644 --- a/man/counts_example.Rd +++ b/man/counts_example.Rd @@ -12,11 +12,12 @@ samples (sample IDs). \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} } \usage{ -counts_example +data(counts_example) } \description{ -A numeric matrix containing filtered and normalized gene expression data from -the Marthandan et al. (2016) study (GEO accession GSE63577). +A numeric matrix containing filtered and normalized (non log-transformed) +gene expression data from the Marthandan et al. (2016) study (GEO accession +GSE63577). } \details{ Raw FASTQ files were downloaded using \code{fasterq-dump} (v2.11.0) and processed @@ -25,8 +26,7 @@ conducted using FastQC (v0.12.1) and summarised with MultiQC (v1.14). Pseudo-alignment to the RefSeq transcriptome (NCBI release 109) was performed using kallisto (v0.44.0). Genes with low expression (mean count < 70 in all conditions) were filtered out. Count normalization factors were calculated -with \code{edgeR::calcNormFactors}, and log2-transformed values were obtained via -\code{limma::voom}. +with \code{edgeR::calcNormFactors}. Intermediate time points for HFF and MRC5 cell lines were excluded, resulting in a final dataset with 45 high-quality samples across proliferative, diff --git a/man/geneset_similarity.Rd b/man/geneset_similarity.Rd index 35dd72f..c4c5991 100644 --- a/man/geneset_similarity.Rd +++ b/man/geneset_similarity.Rd @@ -15,7 +15,9 @@ geneset_similarity( pval_threshold = 0.05, limits = NULL, title_size = 12, - color_values = c("#F9F4AE", "#B44141"), + color = "#B44141", + neutral_color = "white", + cold_color = "#4173B4", title = NULL, jaccard_threshold = 0, msig_subset = NULL, @@ -48,15 +50,42 @@ Odds Ratio required for a gene set to be included in the plot. Default is 1.} \item{pval_threshold}{(only if method == "odds_ratio" only) Numeric. Maximum -adjusted p-value to show a label. Default is 0.05.} +adjusted p-value required for a gene set to be included in the plot. +Default is 0.05.} -\item{limits}{Numeric vector of length 2. Limits for color scale.} +\item{limits}{Numeric vector of length 2. Limits for color scale. If \code{NULL}, +is automatically set to c(0,1) for Jaccard or the range of OR for odds +ratio.} \item{title_size}{Integer specifying the font size for the plot title. Default is \code{12}.} -\item{color_values}{Character vector of colors used for the fill gradient. -Default is \code{c("#F9F4AE", "#B44141")}.} +\item{color}{Character. The color for the maximum of the scale. Default is +\code{red.} +\itemize{ +\item If \code{method = "jaccard"}, the scale goes from \code{neutral_color} to \code{color}. +\item If \code{method = "odds_ratio"} and any OR >= 1, the scale ends at \code{color}. +\item If \code{method = "odds_ratio"} and all OR <= 1, \code{color} is not used; instead, the scale +runs from \code{cold_color} (minimum) to \code{neutral_color} (OR = 1, if present; +otherwise \code{neutral_color} is the maximum). +}} + +\item{neutral_color}{Character. The neutral reference color. Default is +\code{white}. +\itemize{ +\item If \code{method = "jaccard"}, this is the minimum of the scale. +\item If \code{method = "odds_ratio"} and any OR >= 1, this corresponds to OR = 1 if such values exist; otherwise it is the minimum of the scale. +\item If \code{method = "odds_ratio"} and all OR <= 1, this corresponds to OR = 1 if such values exist; otherwise it is the maximum of the scale (with \code{cold_color} as the minimum). +}} + +\item{cold_color}{Character. The color for values below OR = 1 (only used +when \code{method = "odds_ratio"}). Default is \code{blue}. +\itemize{ +\item If \code{method = "odds_ratio"} and any OR < 1, the scale runs from \code{cold_color} +(minimum) to \code{neutral_color} (OR = 1 if present; otherwise \code{neutral_color} +is the maximum). +\item Ignored if \code{method = "jaccard"} or if all OR >= 1. +}} \item{title}{Optional. Custom title for the plot. If \code{NULL}, the title defaults to \code{"Signature Overlap"}.} @@ -81,7 +110,7 @@ Invisibly returns a list containing: \describe{ \item{\code{plot}}{The \pkg{ggplot2} object of the similarity heatmap.} \item{\code{data}}{The data frame object containing the similarity -scores aper pair of gene sets.} +scores per pair of gene sets.} } } \description{ diff --git a/man/genesets_example.Rd b/man/genesets_example.Rd index a63a58d..274f199 100644 --- a/man/genesets_example.Rd +++ b/man/genesets_example.Rd @@ -11,13 +11,13 @@ A named list of length 3: curated gene set of commonly reported senescence markers, with directionality (+1 or -1).} \item{REACTOME_Senescence}{Character vector of gene symbols. The -REACTOME_CELLULAR_SENESCENCE from MSigDB pathway. No directionality.} +REACTOME_CELLULAR_SENESCENCE from MSigDB database No directionality.} \item{HernandezSegura}{A data frame with columns \code{gene} and \code{direction}. A gene set from Hernandez-Segura et al. (2017), with directionality (+1 or -1).} } } \usage{ -genesets_example +data(genesets_example) } \description{ Example Gene Sets for Cellular Senescence diff --git a/man/metadata_example.Rd b/man/metadata_example.Rd index 05b3ca1..f30d8d2 100644 --- a/man/metadata_example.Rd +++ b/man/metadata_example.Rd @@ -21,7 +21,7 @@ for senescent samples, "young" for proliferative).} \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} } \usage{ -metadata_example +data(metadata_example) } \description{ A data frame containing metadata for samples from the Marthandan et al. diff --git a/tests/testthat/test-geneset_similarity.R b/tests/testthat/test-geneset_similarity.R index 62c311b..71739ec 100644 --- a/tests/testthat/test-geneset_similarity.R +++ b/tests/testthat/test-geneset_similarity.R @@ -16,39 +16,6 @@ test_that("geneset_similarity returns jaccard 1 for identical sets", { expect_equal(sim, 1) }) -test_that("geneset_similarity pval_threshold filters labels as expected for odds_ratio", { - # Fake simple signatures to ensure overlap - sig1 <- c("GENE1", "GENE2", "GENE3", "GENE4", "GENE5") - sig2 <- c("GENE1", "GENE2", "GENE6", "GENE7", "GENE8") - signatures <- list(A = sig1) - others <- list(B = sig2) - # Large universe to ensure non-perfect overlap - universe <- toupper(paste0("GENE", 1:50)) - - # Run with a permissive pval_threshold (should label if pval < threshold) - res <- geneset_similarity( - signatures = signatures, - other_user_signatures = others, - metric = "odds_ratio", - universe = universe, - pval_threshold = 1 # allow all p-values - ) - d <- res$data - expect_true(any(d$Label != "")) # Some label should be shown - - # Run with a restrictive pval_threshold (should blank out most labels) - res2 <- geneset_similarity( - signatures = signatures, - other_user_signatures = others, - metric = "odds_ratio", - universe = universe, - pval_threshold = 1e-10 # extremely small, almost nothing will label - ) - d2 <- res2$data - expect_true(all(d2$Label == "")) # All labels should be blank -}) - - test_that("geneset_similarity returns expected odds ratio values", { # Simple signatures with known overlap sig1 <- c("GENE1", "GENE2", "GENE3", "GENE4") @@ -67,19 +34,19 @@ test_that("geneset_similarity returns expected odds ratio values", { cont_tbl <- matrix(c(2, 2, 2, 4), nrow = 2) fisher_res <- fisher.test(cont_tbl) expected_or <- as.numeric(fisher_res$estimate) - expected_log10or <- log10(expected_or) - + # Run the function res <- geneset_similarity( signatures = signatures, other_user_signatures = others, metric = "odds_ratio", - universe = universe + universe = universe, + pval_threshold=1 ) d <- res$data # Check the actual odds ratio (on log10 scale) is close to expected - expect_equal(d$Score, expected_log10or, tolerance = 1e-6) + expect_equal(d$Score, expected_or, tolerance = 1e-6) }) test_that("geneset_similarity returns expected Jaccard index values with H collection", { @@ -104,7 +71,8 @@ test_that("geneset_similarity returns expected Jaccard index values with H colle signatures = signatures, metric = "jaccard", collection = "H", - msig_subset = "HALLMARK_MYC_TARGETS_V1" + msig_subset = "HALLMARK_MYC_TARGETS_V1", + pval_threshold=1 ) d <- res$data # Find the row for this comparison diff --git a/vignettes/articles/Article_BenchmarkingMode.Rmd b/vignettes/articles/Article_BenchmarkingMode.Rmd index 71ac98b..065d3c6 100644 --- a/vignettes/articles/Article_BenchmarkingMode.Rmd +++ b/vignettes/articles/Article_BenchmarkingMode.Rmd @@ -384,7 +384,7 @@ FPR_Simulation(data = counts_example, metadata = metadata_example, original_signatures = genesets_example, gene_list = row.names(counts_example), - number_of_sims = 10, + number_of_sims = 100, title = "Marthandan et al. 2016", widthTitle = 30, Variable = "Condition", diff --git a/vignettes/articles/Article_DiscoveryMode.Rmd b/vignettes/articles/Article_DiscoveryMode.Rmd index d430b47..f7148ef 100644 --- a/vignettes/articles/Article_DiscoveryMode.Rmd +++ b/vignettes/articles/Article_DiscoveryMode.Rmd @@ -12,7 +12,7 @@ knitr::opts_chunk$set( ) ``` -This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in quantifying a known, robust gene set in a given dataset to explore associations with other phenotypic or clinical variables. This approach is particularly suited to hypothesis generation where the phenotype marked by the gene set is of known biological or clinical relevance. +This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in quantifying a gene set in a given dataset to explore associations with phenotypic or clinical variables. This approach is particularly suited to hypothesis generation where the phenotype marked by the gene set is of known biological or clinical relevance. # Case-study: Senescence diff --git a/vignettes/articles/Article_GeneSetSimilarity.Rmd b/vignettes/articles/Article_GeneSetSimilarity.Rmd index 7c94f52..59c854e 100644 --- a/vignettes/articles/Article_GeneSetSimilarity.Rmd +++ b/vignettes/articles/Article_GeneSetSimilarity.Rmd @@ -75,7 +75,8 @@ geneset_similarity( subcollection = "CP:KEGG_LEGACY", jaccard_threshold = 0, msig_subset = c("KEGG_MTOR_SIGNALING_PATHWAY", "KEGG_APOPTOSIS", "NON_EXISTENT_PATHWAY"), - metric = "jaccard" + metric = "jaccard", + limits=c(0,0.1) )$plot ``` @@ -84,14 +85,14 @@ geneset_similarity( # Similarity via Log Odds Ratio -The log odds ratio (logOR) provides a statistically grounded alternative for assessing gene set similarity. It measures **enrichment of one set within another**, relative to a defined background or **gene universe**, using a 2×2 contingency table and a one-sided **Fisher’s exact test**. +The log odds ratio (logOR) provides a statistically grounded alternative for assessing gene set similarity. It measures **enrichment of one set within another**, relative to a defined background or **gene universe**, using a 2×2 contingency table. - **Log odds ratio (logOR)**: Derived from contingency tables using: - Genes in both sets - Genes in one but not the other - Gene universe as background - Log-transformed odds ratios are visualized; statistical significance is assessed via the adjusted p-value. + Log-transformed odds ratios are visualized. > **Note**: When using `metric = "odds_ratio"`, the `universe` parameter **must** be supplied. @@ -114,13 +115,14 @@ geneset_similarity( msigdbr::msigdbr(species = "Homo sapiens", category = "C2")$gene_symbol )), or_threshold = 100, #log10OR = 2 - pval_threshold = 0.05, - width_text=50 + width_text=50, + pval_threshold = 0.05 )$plot ``` + # Session Information ```{r} diff --git a/vignettes/markeR.Rmd b/vignettes/markeR.Rmd index d345210..f4839c1 100644 --- a/vignettes/markeR.Rmd +++ b/vignettes/markeR.Rmd @@ -2,7 +2,10 @@ title: "Introduction to markeR" author: "Rita M. Silva" date: "`r Sys.Date()`" -output: rmarkdown::html_vignette +output: + BiocStyle::html_document: + toc: true + toc_depth: 3 vignette: > %\VignetteIndexEntry{Introduction to markeR} %\VignetteEngine{knitr::rmarkdown} @@ -21,59 +24,36 @@ dpi = 72, fig.retina = 1 ) ``` + +# Introduction -# Table of Contents - -1. [Introduction](#introduction) - - [1.1. Primary Objectives](#primary-objectives) - - [1.2. Features and Capabilities](#features-and-capabilities) -2. [Installation](#installation) -3. [Common Workflow](#common-workflow) - - [3.1. Input Requirements](#input-requirements) - - [Gene sets](#gene-sets) - - [Expression data frame](#expression-data-frame) - - [Sample metadata](#sample-metadata) - - [3.2. Select Mode of Analysis](#select-mode-of-analysis) - - [3.3. Choose a Quantification Approach](#choose-a-quantification-approach) - - [3.3.1 Score-Based Approach](#score-based-approach) - - [3.3.2 Enrichment-Based Approach](#enrichment-based-approach) - - [3.4. Visualisation and Evaluation](#visualisation-and-evaluation) - - [3.4.1 Benchmarking Mode](#benchmarking-mode) - - [3.4.2 Discovery Mode](#discovery-mode) - - [3.5. Individual Gene Exploration (Optional)](#individual-gene-exploration-optional) - - [3.6. Compare with Reference Gene Sets (Optional)](#compare-with-reference-gene-sets-optional) -4. [Further Reading](#further-reading) -5. [Contact](#contact) -6. [Session Information](#session-information) - -# 1. Introduction - -**markeR** is an R package designed to provide a modular, flexible framework for evaluating gene sets as phenotypic markers using transcriptomic data. It supports researchers in both quantitative analysis and visual exploration of gene set behaviour across experimental or clinical phenotypes. +**`markeR`** is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of gene set behaviour across experimental and clinical phenotypes. -In this vignette, we will illustrate markeR functionalities using a pre-processed gene expression dataset and metadata derived from the Marthandan et al. (2016) study (GSE63577). This dataset contains human fibroblast samples cultured under two conditions: replicative senescence and proliferative control. +In this vignette, we demonstrate the core functionalities of `markeR` using a pre-processed gene expression dataset and associated metadata from Marthandan et al. (2016) (GSE63577). This dataset comprises human fibroblast samples cultured under two experimental conditions: replicative senescence and proliferative control. -## 1.1. Primary Objectives +## Primary Objectives -The primary objectives of **markeR** are to: +The primary objectives of **`markeR`** are to: -- Facilitate reproducible and standardized quantification of gene set activity. -- Provide flexible options for both score-based and enrichment-based analyses. -- Enable comparison of user-defined signatures against reference gene sets (e.g., MSigDB) for biological interpretation. -- Offer modular visualization and evaluation tools to explore gene set behaviour at both the set and individual gene level. +- Establish a reproducible and standardised framework for quantifying gene sets as phenotypic markers across transcriptomic datasets. +- Provide a unified interface for complementary analytical strategies, including score-based and enrichment-based methods. +- Enable systematic comparison of user-defined gene sets with curated reference collections (e.g., MSigDB) to enhance biological interpretability. +- Deliver modular tools for visualisation and evaluation, allowing exploration of both gene set behaviour and individual gene-level contributions. -## 1.2. Features and Capabilities +## Features and Capabilities -The package integrates multiple approaches for characterizing phenotypes: +The package integrates multiple analytical strategies for quantifying phenotypes using gene sets, supporting both unidirectional and bidirectional (*i.e.*, respectively without and with information on the expected direction of change in expression of genes upon phenotype) gene set definitions: -- **Score-based methods**: log2-median, ranking, and single-sample GSEA (ssGSEA) to quantify the coordinated expression of genes in a set. -- **Enrichment-based methods**: GSEA using moderated t- or B-statistics, with options to handle unidirectional and bidirectional gene sets. -- **Gene-level exploration**: expression heatmaps, violin plots, ROC curves, AUC calculations, effect size measures, and PCA analysis to assess individual gene contributions. +- **Score-based methods**: log2-median expression, ranking approaches, and single-sample gene set enrichment analysis (ssGSEA) to quantify coordinated expression within a gene set. +- **Enrichment-based methods**: GSEA using moderated t- or B-statistics. +- **Gene-level exploration**: expression heatmaps, violin plots, ROC curve analysis, area under the curve (AUC) calculations, effect size estimation, and principal component analysis (PCA) to characterise the contribution of individual genes. +- **Gene set similarity assessment**: Jaccard index and log odds ratio (logOR) calculations to compare user-defined gene sets with reference or user-defined collections. -The package is designed to be fully customizable, supporting diverse visualization strategies via `ggplot2`, `ComplexHeatmap`, `ggpubr`, `cowplot`, and `grid.` Its modular structure allows easy integration of new functionalities while providing a robust framework for reproducible, standardized phenotypic characterization of gene sets. +The package is implemented with flexibility and extensibility as core design principles. Visualisation leverages `ggplot2`, `ComplexHeatmap`, `ggpubr`, `cowplot`, and `grid`, enabling a wide range of customisation options. The modular structure ensures compatibility with additional methods in future versions of the package. -# 2. Installation +# Installation -Install the latest development release of markeR from [GitHub](https://github.com/) with: +Install the latest development release of `markeR` from [GitHub](https://github.com/) with: ```r devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") @@ -82,18 +62,16 @@ devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") ```{r, echo=FALSE} library(markeR) ``` -# 3. Common Workflow - -`markeR` provides a modular pipeline to quantify transcriptomic signatures and assess their association with phenotypic or clinical variables. +# Common Workflow -## 3.1. Input Requirements +## Input Requirements -### Gene sets +### Gene Sets A named list where each element is a gene set. -- Use a **character vector** when direction is unknown (unidirectional). -- Use a **data frame** with columns `gene` and `direction` (values `+1` for up, `-1` for down) when direction is known (bidirectional). +- Use a **character vector** when putative direction of change in expression of genes upon phenotype is unknown (unidirectional). +- Use a **data frame** with columns `gene` and `direction` (values `+1` for up-, `-1` for down-regulation) when direction is known (bidirectional). ```{r example-gene-sets-vector, echo=FALSE} # Gene set without direction @@ -118,18 +96,25 @@ gene_sets <- list( gene_sets ``` -For this vignette, we will use three senescence-related gene sets that ship with the package: **LiteratureMarkers** (bidirectional), **REACTOME_CELLULAR_SENESCENCE** (undirected), and **HernandezSegura** (bidirectional). See `?genesets_example`. +For this vignette, we will use three senescence-related gene sets that ship with the package (see `?genesets_example` for more information): + +* **LiteratureMarkers** (bidirectional; gene set of commonly reported senescence markers in the literature) +* **REACTOME_CELLULAR_SENESCENCE** (unidirectional; from the MSigDB database) +* **HernandezSegura** (bidirectional; from [Hernandez-Segura et al. (2017)](https://pubmed.ncbi.nlm.nih.gov/28844647/)) ```{r loadsig} # Load example gene sets data(genesets_example) ``` -### Expression data frame +### Expression Data Frame -A filtered and normalised gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. +A filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. -In this vignette, we use a pre‑processed dataset from Marthandan et al. (2016, GSE63577) with human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for structure. +**Warning:** If you are using microarray data or outputs from common RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may already be log2-normalised. The input to `markeR` must necessarily be **non-log-transformed**. If your data are log2-transformed, you can revert them by applying `2^data`. + + +In this vignette, we use a pre‑processed RNA-seq dataset from Marthandan et al. (2016, GSE63577), with normalised read counts for human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for structure. ```{r loaddata} # Load example expression data @@ -137,9 +122,9 @@ data(counts_example) counts_example[1:5, 1:5] ``` -### Sample metadata +### Sample Metadata -A data frame with annotations per sample. Row names must match the expression matrix column names; the first column contains sample IDs. +A data frame with samples as rows and annotations as columns. The first column should contain sample IDs matching the expression matrix column names. We use the accompanying metadata from the Marthandan et al. (2016) study. See `?metadata_example`. @@ -149,64 +134,60 @@ data(metadata_example) head(metadata_example) ``` -## 3.2. Select Mode of Analysis - -* **Benchmarking Mode**: -Evaluate one or more gene sets against a metadata variable using a standardised scoring and effect size framework. This mode provides comprehensive visualisations and comparisons across methods. - -* **Discovery Mode**: -Explore how a single, well-characterised gene set relates to a specific variable of interest. Suitable for hypothesis generation or signature projection. - -## 3.3. Choose a Quantification Approach +## Select Mode of Analysis -`markeR` supports two complementary strategies for quantifying the association between gene sets and phenotypes: +`markeR` provides two modes of operation: -### 3.3.1 Score-Based Approach +* **Benchmarking**: +evaluates gene sets' performance in marking a metadata variable, *i.e.*, a phenotype, returning comparative visualisations across scoring and enrichment methods. -This strategy generates a **single numeric score per sample**, reflecting the activity of a gene set. It enables flexible downstream analyses, including comparisons across phenotypic groups. - -Three scoring methods are available: +* **Discovery**: +examines the relationship between a gene set and one or more variables of interest, suitable for exploratory or hypothesis-generating analyses. + +## Choose a Quantification Approach -* **Log2-median**: Calculates the median log2 expression of the genes in the set. Sensitive to absolute shifts in expression. +Two complementary strategies are implemented for quantifying associations between gene sets and phenotypes: -* **Ranking**: Ranks all genes within each sample and averages the ranks of gene set members. Captures relative ordering rather than magnitude. +### Score-Based Approach -* **ssGSEA**: Computes a single-sample gene set enrichment score using the ssGSEA algorithm. Reflects the coordinated up- or down-regulation of the set in each sample. +A score summarising the collective expression of a gene set therein is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). -Robust gene sets are expected to perform consistently across all three. +Available methods: -### 3.3.2 Enrichment-Based Approach +* **Log2-median**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -This approach uses a classical **gene set enrichment analysis (GSEA)** framework to evaluate whether the gene set is significantly overrepresented at the top or bottom of a ranked list of genes (e.g., ranked by fold change or correlation with phenotype). +* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. -* **GSEA**: Computes a Normalised Enrichment Score (NES) for each contrast or variable of interest, adjusting for gene set size and multiple testing. +* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -Use this approach when interested in collective behaviour of gene sets in relation to ranked differential signals. +Gene sets that are robust phenotypic markers are expected to yield consistently high scores across methods. -## 3.4. Visualisation and Evaluation +### Enrichment-Based Approach -### 3.4.1 Benchmarking Mode +Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. -In **Benchmarking Mode**, `markeR` offers a range of visual summaries: +## Visualisation and Evaluation -* Violin or scatter plots showing score distributions by phenotype -* Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) -* ROC curves and AUC values -* Null distribution testing using random gene sets matched for size and directionality -* Lollipop plots summarising enrichment scores (NES) with adjusted p-values -* Enrichment plots showing running enrichment scores across ranked gene lists -* Volcano plots +### Benchmarking Mode -Example of common workflows in Benchmarking mode (full tutorial [here][tutorial-benchmarking]): +`markeR` offers a range of visual summaries for Benchmarking: -#### Score-based approach +* Violin plots of score distributions by categorical phenotype; +* Scatter plots of association between scores and numerical phenotypes; +* Volcano plots and heatmaps of scores or differential gene set expression based on effect sizes (Cohen’s *d* or *f*); +* ROC curves and respective AUC values of gene sets' phenotypic classification performance; +* Violin plots of effect size distributions (Cohen’s *d*) for pairwise group differences in scores, for original and simulated gene sets; +* Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop plots); +* GSEA plots showing running enrichment scores across ranked gene lists. -We begin by quantifying gene set activity using score-based methods. These approaches generate numeric scores per sample that reflect the coordinated expression of genes in each set. This allows easy comparison of gene set behavior across phenotypic groups, while giving a single score per sample, which can be useful in certain contexts. +Example of common workflows in Benchmarking Mode (full tutorial [here][tutorial-benchmarking]): -First, we compute and plot scores using the log2-median method, which calculates median-centered expression for each gene and averages across the gene set to provide a robust summary of coordinated activity. We then assess whether the available gene sets distinguish between Senescent and Proliferative conditions. These results suggest that the HernandezSegura and LiteratureMarkers gene sets show very strong effect sizes (|Cohen's d|), while the REACTOME_CELLULAR_SENESCENCE gene set shows a more modest effect. The distributions of scores also overlap more for the latter. +#### Score-based approaches +Score-based methods facilitate direct comparisons of gene set activity between phenotypic groups (*e.g.*, being senescent) and can be used in downstream analyses, including for example correlation with other metadata variables of interest (*e.g.*, day of sequencing) . +We first compute log2-median scores. In this dataset, the HernandezSegura and LiteratureMarkers gene sets yield large effect sizes (|Cohen’s d|), whereas REACTOME_CELLULAR_SENESCENCE shows more modest separation between senescent and proliferative fibroblasts. -```{r fig.width=8, fig.height=4, out.width="100%", warning=FALSE, message=FALSE} +```{r fig.width=8, fig.height=4, warning=FALSE, message=FALSE} # Quantification approach: scores # Method: log2-median PlotScores(data = counts_example, @@ -223,7 +204,9 @@ PlotScores(data = counts_example, ``` -Next, we calculate scores using multiple methods (log2-median, ranking, and ssGSEA) to compare results across scoring strategies. The output includes heatmaps and volcano plots to visualize effect sizes (|Cohen's d|) between conditions. These analyses confirm that the HernandezSegura and LiteratureMarkers gene sets consistently discriminate Senescence from Proliferative samples across all methods, whereas REACTOME_CELLULAR_SENESCENCE does not. + +Next, scores are calculated using multiple methods (log2-median, ranking, ssGSEA) to assess consistency across strategies. Outputs include heatmaps and volcano plots of effect sizes (|Cohen’s d|). HernandezSegura and LiteratureMarkers consistently discriminate Senescent from Proliferative samples, while REACTOME_CELLULAR_SENESCENCE shows weaker and less consistent separation. + ```{r warning=FALSE, message=FALSE} Overall_Scores <- PlotScores(data = counts_example, @@ -252,17 +235,17 @@ Overall_Scores <- PlotScores(data = counts_example, colorPalette="Paired") ``` -```{r Overall_Scores_heatmap, fig.width=10, fig.height=2, out.width="100%", warning=FALSE, message=FALSE} +```{r Overall_Scores_heatmap, fig.width=10, fig.height=2, warning=FALSE, message=FALSE} Overall_Scores$heatmap ``` -```{r Overall_Scores_volcano, fig.width=6, fig.height=3, out.width="70%", warning=FALSE, message=FALSE} +```{r Overall_Scores_volcano, fig.width=6, fig.height=3, warning=FALSE, message=FALSE} Overall_Scores$volcano ``` -ROC curves assess the discriminatory power of gene set scores, indicating how well a signature separates different experimental or clinical groups. When discriminating between Senescent and Proliferative samples, the REACTOME_CELLULAR_SENESCENCE gene set shows more heterogeneous ROC curves and AUC values across scoring methods, reflecting less consistent performance compared to the HernandezSegura and LiteratureMarkers gene sets. +The discriminatory capacity of gene set scores can also be assessed using ROC curves and corresponding AUC values. HernandezSegura and LiteratureMarkers exhibit consistently high AUCs across scoring methods, while REACTOME_CELLULAR_SENESCENCE shows more heterogeneous performance. -```{r roc_scores, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} +```{r roc_scores, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} ROC_Scores(data = counts_example, metadata = metadata_example, gene_sets=genesets_example, @@ -280,16 +263,15 @@ ROC_Scores(data = counts_example, ``` -Finally, we perform simulations using random gene sets to estimate the false positive rate. This step informs whether observed gene set signals exceed what would be expected by chance, improving confidence in the results. By simulating 10 random gene sets of the same size, we see that the LiteratureMarkers gene set performs best, with only two of the random sets achieving higher effect sizes than the original one using the ranking method. While increasing the number of simulated sets would yield finer resolution, this comes at the cost of additional computational time. - - -```{r FDRSim, fig.width=12, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} +Finally, false positive rates for effect sizes are estimated by simulating random gene sets of equal size. This step provides a null distribution against which observed scores can be compared. In this example, the LiteratureMarkers signature demonstrates the strongest performance. Increasing the number of simulations would yield finer resolution at the cost of additional computational time. +```{r FDRSim, fig.width=12, fig.height=3,warning=FALSE, message=FALSE} +set.seed("123456") FPR_Simulation(data = counts_example, metadata = metadata_example, original_signatures = genesets_example, gene_list = row.names(counts_example), - number_of_sims = 10, + number_of_sims = 100, title = "Marthandan et al. 2016", widthTitle = 30, Variable = "Condition", @@ -304,13 +286,11 @@ FPR_Simulation(data = counts_example, ``` -#### Enrichment-based approach - -We now shift focus to enrichment-based methods, which rely on differential expression statistics to evaluate whether gene sets show coordinated changes between conditions. This approach complements score-based methods by explicitly leveraging contrasts between groups and quantifying statistical significance. - -The first step is to compute differential expression. In this example, we build the design matrix automatically from the `Condition` variable in the metadata and define the contrast `Senescent - Proliferative`. Internally, this fits a linear model without an intercept, enabling quick comparison between levels of a categorical variable. +#### Enrichment-based approaches + +The first step is the quantification of differential expression. Here, a design matrix (as defined by the `limma` package) is automatically constructed from the `Condition` variable in the metadata, defining the contrast `Senescent - Proliferative`. Internally, this fits a linear model without an intercept, enabling quick comparison between levels of a categorical variable. -```{r DEGs, fig.width=10, fig.height=3, out.width="90%", warning=FALSE, message=FALSE} +```{r DEGs, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} # Build design matrix from variables (Condition) and apply a contrast. # The design matrix is constructed automatically using the variable "Condition". DEGs <- calculateDE(data = counts_example, @@ -320,11 +300,15 @@ DEGs <- calculateDE(data = counts_example, DEGs$`Senescent-Proliferative`[1:5,] ``` -Once differential expression is calculated, we can visualize the results with a volcano plot. This plot highlights genes in our predefined gene sets, showing the magnitude of differential expression (log fold-change) versus statistical significance (adjusted p-value). Up- (green) and downregulated (red) genes are color-coded. For the HernandezSegura gene set, green genes mostly appear in the positive logFC range and red genes in the negative range, though these are not the genes with the most extreme logFC values. In the LiteratureMarkers gene set, two red genes (LMNB1 and MKI67) show very negative logFC, which is consistent with their roles as proliferation markers since senescent cells are non-proliferative. Genes from the REACTOME_CELLULAR_SENESCENCE gene set are shown in blue, as this set lacks information on directionality; these genes are scattered across the full range of logFC values. +Differential expression results can be visualised with volcano plots. These highlight the magnitude of expression changes (log2 fold change) against statistical significance (adjusted p-value for the moderated t-statistic), with genes from predefined sets emphasised (up- in green and downregulated in red). In this example: +* Genes in the HernandezSegura set annotated as upregulated (green) display positive log2 fold changes, whereas those annotated as downregulated (red) show negative log2 fold changes. This pattern is consistent with the annotation of the set, although these genes are not necessarily those exhibiting the largest absolute fold changes. +* In the LiteratureMarkers set, *LMNB1* and *MKI67* exhibit strongly negative log2 fold change, consistent with their roles as proliferation markers absent in senescent cells. +* Genes from the REACTOME_CELLULAR_SENESCENCE set are undirected and appear across the full range of log2 fold change values, diluting discriminatory power. -```{r DEGsvolcano, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} -# Change order: signatures in columns, contrast in rows + +```{r DEGsvolcano, fig.width=10, fig.height=3,warning=FALSE, message=FALSE} +# Change order: gene sets in columns, contrast in rows plotVolcano(DEGs, genes = genesets_example, N = NULL, x = "logFC", y = "-log10(adj.P.Val)", pointSize = 2, @@ -334,7 +318,8 @@ plotVolcano(DEGs, genes = genesets_example, labsize = 10, widthlabs = 24, invert = TRUE) ``` -Next, we perform Gene Set Enrichment Analysis (GSEA) using the differential expression results. This approach evaluates whether members of each gene set are non-randomly distributed toward the top or bottom of the ranked gene list, providing normalized enrichment scores (NES) and adjusted p-values. The ranking of genes can be based on different metrics, reflecting the expected directionality of the gene set. This is also reflected in the plot labels below: gene sets are marked as altered when genes are ordered by the B statistic, or enriched/depleted when ordered by the t-statistic. For a full example and explanation, see the tutorial [here][tutorial-benchmarking]. +We next apply GSEA to the differential expression results. This evaluates whether genes from each set are non-randomly concentrated at the top or bottom of the ranked list. Results are reported as NES with multiple-testing adjusted p-values. Gene ranking can be based on alternative statistics (*e.g.*, moderated t or B-statistic), which influence whether results are interpreted as enrichment/depletion or as altered directionality. For a full example and explanation, see the online tutorial [here][tutorial-benchmarking]. + ```{r GSEA, warning=FALSE, message=FALSE} GSEAresults <- runGSEA(DEGList = DEGs, @@ -344,19 +329,19 @@ GSEAresults <- runGSEA(DEGList = DEGs, GSEAresults ``` - -To visualize the enrichment of each gene set along the ranked gene list, we use enrichment curves from `fgsea.` These plots show the running enrichment score across the ranked list, highlighting where members of each gene set are concentrated. + +Enrichment curves from `fgsea` show the running enrichment score across the ranked list, indicating the distribution of set members therein. -```{r GSEA_plotenrichment, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} +```{r GSEA_plotenrichment, fig.width=10, fig.height=3, warning=FALSE, message=FALSE} plotGSEAenrichment(GSEA_results=GSEAresults, DEGList=DEGs, gene_sets=genesets_example, widthTitle=40, grid = TRUE, titlesize = 10, nrow=1, ncol=3) ``` -We can also summarize enrichment results in a lollipop plot, which compactly shows the normalized enrichment score and highlights statistically significant gene sets. This provides a quick overview of which pathways are most strongly altered. Here, we can see that the HernandezSegura gene set clearly exhibits the strongest enrichment signal. - +Enrichment results can also be summarised with a lollipop plot, which compactly shows the NES and highlights statistically significant gene sets. Here, we can see that the HernandezSegura gene set clearly exhibits the strongest enrichment signal. When many gene sets and contrasts are evaluated (which is not the case in this vignette), NES and adjusted p-values can be summarized in a scatter (volcano-style) plot for a simpler visualisation, using `plotCombinedGSEA`. + ```{r GSEA_lollypop, fig.width=5, fig.height=4, out.width="60%", warning=FALSE, message=FALSE} plotNESlollipop(GSEA_results=GSEAresults, saturation_value=NULL, @@ -368,34 +353,33 @@ plotNESlollipop(GSEA_results=GSEAresults, widthlabels=13, title=NULL, titlesize=12) ``` + -Finally, a volcano-style scatter plot combines NES and significance for all gene sets, making it easy to identify which sets show the strongest and most statistically robust alteration. +From this exercise in Benchmarking Mode, we can see that two gene sets clearly perform best at discriminating between Senescent and Proliferative conditions: HernandezSegura and LiteratureMarkers. The REACTOME_CELLULAR_SENESCENCE gene set does not show a strong signal; the absense of information on putative gene up- or downregulation upon phenotype potentially dilutes the signal, highlighting the importance of providing directionality when available. -```{r GSEA_volcano, fig.width=8, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} -plotCombinedGSEA(GSEAresults, sig_threshold = 0.05, PointSize=6, widthlegend = 26 ) -``` - -From this exercise in benchmarking mode, we can see that two gene sets clearly perform best at discriminating between Senescent and Proliferative conditions: HernandezSegura and LiteratureMarkers. The REACTOME_CELLULAR_SENESCENCE gene set does not show a strong signal; since it is undirected and lacks information on up- or downregulation, this also dilutes the signal, highlighting the importance of providing directionality when available. - -Across methods, scoring and enrichment approaches provide complementary insights. Score-based methods offer sample-level resolution, capturing strong contributions from individual genes, while enrichment-based methods evaluate coordinated behaviour across the set, typically at the group level. Even among scoring methods, rank-based approaches (e.g., ssGSEA or rank scoring) are generally more robust to technical noise, whereas magnitude-based methods (e.g., log2-median) better detect shifts in well-controlled data. Performance also depends on sample size and gene set size, explaining why HernandezSegura may appear stronger in enrichment analyses and LiteratureMarkers in scoring analyses. Integrating both approaches provides the most comprehensive view of gene set behaviour. +Scoring and enrichment approaches provide complementary insights. Score-based methods offer sample-level resolution, capturing strong contributions from individual genes, while enrichment-based methods evaluate coordinated behaviour across the set. -### 3.4.2 Discovery Mode +### Discovery Mode -In **Discovery Mode** (full tutorial [here][tutorial-discovery]), the output focuses on a single gene set: +In **Discovery** (full tutorial [here][tutorial-discovery]), analyses focus on a single gene set of interest, providing a targeted view of its behaviour across experimental variables (*i.e.*, phenotypes). The output includes: -* Score distributions by phenotype -* Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s *f*) -* Enrichment score summaries (NES) with adjusted p-values (e.g., lollipop plots) +* Score distributions stratified by variable; +* Effect sizes for pairwise and multiple-group differences (Cohen's *d* and *f*, respectively); +* Cross-variable summaries of NES and adjusted p-values (*e.g.*, lollipop plots). -We start by selecting the gene set of interest. Here, we use the HernandezSegura gene set from our example collection. +We illustrate this using the HernandezSegura gene set from our example collection: ```{r} HernandezSegura_GeneSet <- list(HernandezSegura=genesets_example$HernandezSegura) ``` -For illustration, we add two hypothetical variables to the metadata: `DaysToSequencing`, representing the number of days between sample preparation and sequencing, and `Researcher`, identifying the person who processed each sample. This allows us to explore associations between gene set activity and different types of variables (categorical or continuous). +For illustration purposes, two synthetic variables were added to the data: +* `DaysToSequencing`, the number of days between sample preparation and sequencing; +* `Researcher`, identifying the person who processed each sample. + +This enables exploration of associations between gene set activity and both categorical and continuous variables. ```{r load_metadata_discovery} data(metadata_example) @@ -405,9 +389,11 @@ metadata_example$DaysToSequencing <- sample(c(1:20),39, replace = TRUE) head(metadata_example) ``` -We can then examine how the selected gene set associates with these variables using enrichment-based approaches in Discovery Mode. The resulting plot highlights significant associations across variables and visually summarizes the direction and strength of the effect. The “simple” mode provides comparison of effect sizes for pairwise contrasts between only two levels of the variable, but can be changed to more levels of comparison (see `?VariableAssociation`). Here, we can see that the HernandezSegura gene set is significantly enriched in samples sequenced by Francisca, when compared to those processed by Ana or John, suggesting that this gene set may be particularly relevant to her experimental conditions or sample processing. This gene set has also a strong enrichment for the comparison between proliferative and senescent, which is expected given the nature of the gene set and the results from the Benchmarking mode. +We can then examine how the gene set associates with these variables using enrichment-based approaches in Discovery Mode. The resulting plot highlights significant associations across variables and visually summarises the direction and strength of the effects. The “simple” mode provides comparison of effect sizes across pairwise contrasts between only two levels of the variable, but can be changed to more levels of comparison (see `?VariableAssociation`). + +In this example, the HernandezSegura gene set shows significant enrichment in samples sequenced by Francisca relative to those processed by Ana or John, which would suggest a differentiated researcher-associated technical impact on the samples' biological phenotype. This gene set shows also a strong depletion in proliferative samples, which is expected given its annotation as senescence-associated and results from the Benchmarking Mode. -```{r GSEA_varassoc, fig.width=6, fig.height=6, out.width="60%", warning=FALSE, message=FALSE} +```{r GSEA_varassoc, fig.width=6, fig.height=6,warning=FALSE, message=FALSE , out.width="70%"} VariableAssociation( data = counts_example, metadata = metadata_example, @@ -426,12 +412,11 @@ VariableAssociation( ) $plot ``` + +Alternatively, **score-based methods** provide a per-sample metric of gene set activity, which can then be summarised across variables. Here, we compute log2-median scores. The `Condition` variable shows a large effect size (Cohen’s f), confirming strong discrimination between Senescent and Proliferative samples. In contrast, `Researcher` does not show a detectable association, in contrast to enrichment-based results. This divergence illustrates the value of applying both enrichment- and score-based approaches in a complementary manner. -Next, we evaluate the association using a score-based method (here, log2-median). This approach calculates per-sample scores for the gene set and summarizes them across variables. In this analysis, Condition (representing whether the sample is senescent or proliferative) shows a strong effect, reflected by a large Cohen’s f. This effect size metric is directly comparable across different variable types (categorical, numeric, etc.), making it particularly versatile. In contrast, the variable Researcher does not show a significant effect here, unlike in the enrichment analysis. This divergence illustrates the value of applying both enrichment- and score-based approaches in a complementary manner. - - -```{r variableassoc_score_sen, fig.width=7, fig.height=7, out.width="100%", warning=FALSE, message=FALSE} +```{r variableassoc_score_sen, fig.width=7, fig.height=7,warning=FALSE, message=FALSE , out.width="70%" } VariableAssociation(data = counts_example, metadata = metadata_example, method = "logmedian", @@ -445,36 +430,37 @@ VariableAssociation(data = counts_example, ``` -Benchmarking mode offers the most comprehensive set of features and allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is required. The main difference from Discovery mode is that Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery mode focuses on quantifying a single, robust gene set. - -## 3.5. Individual Gene Exploration +The Benchmarking Mode offers the most comprehensive set of features. Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is required. Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery focuses on the performance of a single gene set. + -To better understand the contribution of individual genes within a gene set and identify whether specific genes drive the overall signal, `markeR` offers a suite of gene-level exploratory analyses (via `VisualiseIndividualGenes`), including: +## Individual Gene Exploration -* Expression heatmaps of genes across samples and groups -* Violin plots showing expression distributions of individual genes -* Correlation heatmaps to reveal co-expression patterns among genes in the set -* ROC curves and AUC values for individual genes to evaluate their discriminatory power -* Effect size calculations (Cohen’s *d*) per gene to quantify differential expression -* Principal Component Analysis (PCA) on gene set genes to assess variance explained and sample clustering +To better understand the contribution of individual genes within a gene set, and identify whether specific genes drive the set's collective signal, `markeR` provides `VisualiseIndividualGenes.` Available options include: -See `?VisualiseIndividualGenes` for more details on the available options and the extended online tutorial [here](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html#visualise-individual-gene-behaviour). +* Expression heatmaps of genes across samples or groups of samples; +* Violin plots showing cross-sample expression distributions of individual genes; +* Heatmaps of pairwise cross-sample expression correlation between genes in the set; +* ROC curves and AUC values to evaluate single genes' performance as phenotypic markers; +* Effect size estimation (Cohen’s *d*) of expression differences between groups of samples; +* Principal Component Analysis (PCA) of expression of genes in the set, to evaluate which genes dominate collective variance and how samples separate according to the gene set's expression. -## 3.6. Compare with Reference Gene Sets +For a complete overview, see `?VisualiseIndividualGenes` and the extended online tutorial [here](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html#visualise-individual-gene-behaviour). + +## Compare with Reference Gene Sets -`markeR` allows comparison of user-defined gene sets to reference sets (e.g., from MSigDB) using: +`markeR` also supports comparison of user-defined gene sets against reference collections (e.g., MSigDB). Two complementary similarity metrics are implemented: * **Jaccard Index**: -Measures gene overlap relative to union size. +the ratio of the number of genes in common over the total number of genes in the two sets. + +* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. -* **Log Odds Ratio (logOR)**: -Computes enrichment using a user-defined gene universe and Fisher’s exact test. +Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or Fisher's test p-value). -Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or p-value). +Example of Gene Set Similarity (full tutorial [here][tutorial-signaturesimilarity]): We compare two user-defined gene sets against other user gene set and the MSigDB C2:CP:KEGG_LEGACY collection. Similarity is summarised in a heatmap of log odds ratios, highlighting associations above a defined threshold (e.g., OR > 100 for at least one of the signatures being compared). The underlying data for the heatmap can be accessed via `$data` for further analysis. -Example of Gene Set Similarity (full tutorial [here][tutorial-signaturesimilarity]). Here, we are seeing how two user-defined signatures compare to a set of other user-defined signatures, as well as to a collection of reference gene sets from MSigDB (C2:CP:KEGG_LEGACY). The results are visualized in a heatmap, showing the similarity between the signatures based on log odds ratio (at least one with log10OR > 2). -```{r, fig.width=6, fig.height=8, out.width="60%"} +```{r, fig.width=6, fig.height=8} # Example data signature1 <- c("TP53", "BRCA1", "MYC", "EGFR", "CDK2") @@ -499,15 +485,14 @@ geneset_similarity( unlist(signature_list), msigdbr::msigdbr(species = "Homo sapiens", category = "C2")$gene_symbol )), - or_threshold = 100, #log10OR = 2 - pval_threshold = 0.05, - width_text=50 + or_threshold = 100, + width_text=50, + pval_threshold = 0.05 )$plot - ``` + - -# 4. Further Reading +# Further Reading Full tutorials with extended examples: @@ -515,7 +500,7 @@ Full tutorials with extended examples: - [Discovery Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_DiscoveryMode.html) - [Signature Similarity](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html) -# 5. Contact +# Contact 📩 For questions, contact: @@ -529,7 +514,7 @@ Email: [rita.silva@medicina.ulisboa.pt](mailto:rita.silva@medicina.ulisboa.pt) [tutorial-signaturesimilarity]: https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html -# 6. Session Information +# Session Information ```{r} sessionInfo() From d3da934e362eee30b4b540169295e461fa8c227d Mon Sep 17 00:00:00 2001 From: Rita Silva <55759937+ritamtbsilva@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:05:17 +0100 Subject: [PATCH 3/7] Update pkgdown.yaml --- .github/workflows/pkgdown.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 41e7590..bceb80e 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -5,6 +5,9 @@ on: branches: [default] pull_request: branches: [default] + release: + types: [published] + branches: [default] workflow_dispatch: branches: [default] From db5fafec75c4ce8d61cf84cafa98c1a9a9df1f9e Mon Sep 17 00:00:00 2001 From: Rita Silva Date: Fri, 19 Sep 2025 16:03:52 +0100 Subject: [PATCH 4/7] clarify vignettes --- .../articles/Article_BenchmarkingMode.Rmd | 118 +++++++++++------- vignettes/articles/Article_DiscoveryMode.Rmd | 66 +++++----- .../articles/Article_GeneSetSimilarity.Rmd | 16 +-- 3 files changed, 120 insertions(+), 80 deletions(-) diff --git a/vignettes/articles/Article_BenchmarkingMode.Rmd b/vignettes/articles/Article_BenchmarkingMode.Rmd index 065d3c6..ff87934 100644 --- a/vignettes/articles/Article_BenchmarkingMode.Rmd +++ b/vignettes/articles/Article_BenchmarkingMode.Rmd @@ -12,22 +12,24 @@ knitr::opts_chunk$set( ``` -This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Benchmarking Mode**. This mode is designed to evaluate the performance of gene signatures in quantifying specific biological states or phenotypes, such as disease states or cellular conditions. It allows users to assess the robustness and reliability of gene signatures across various conditions, providing a standardized framework for benchmarking. - +This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Benchmarking Mode**. This mode is designed to evaluate gene sets' performance in marking a metadata variable, *i.e.*, a phenotype such as disease or cellular condition, returning comparative visualisations across scoring and enrichment methods. It allows users to assess the robustness and reliability of gene sets across various conditions, providing a standardized framework for benchmarking. + # Case-study: Senescence -We will be using an already pre-processed gene expression dataset, derived from the Marthandan et al. (2016) study (GSE63577), that includes human fibroblast samples cultured under two different conditions: replicative senescence and proliferative control. The dataset has already been filtered and normalized using the `edgeR` package. For more information about the dataset structure, see the help pages for `?counts_example` and `?metadata_example`. +In this vignette, we use a pre‑processed RNA-seq dataset from Marthandan et al. (2016, GSE63577), with normalised read counts for human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for preprocessing details and structure. `markeR` requires as input a filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. + +We use the accompanying metadata from the Marthandan et al. (2016) study (see `?metadata_example`). -This dataset serves as a working example to demonstrate the main functionalities of the markeR package. In particular, it will be used to showcase the two primary modules designed for benchmarking gene signatures: +This dataset serves as a working example to demonstrate the main functionalities of the `markeR` package. In particular, it will be used to showcase the two primary modules of `markeR` for quantifying phenotypes using gene sets: -- **Score**: calculates expression-based signature scores for each sample, and -- **Enrichment**: evaluates the over-representation of gene signatures within ranked gene lists. +- **Score-based methods**: log2-median expression, ranking approaches, and single-sample gene set enrichment analysis (ssGSEA) to quantify coordinated expression within a gene set. +- **Enrichment-based methods**: GSEA using moderated t- or B-statistics. To illustrate the usage of `markeR`, we use three gene sets commonly associated with cellular senescence: -- **LiteratureMarkers**: A small, curated set of well-established senescence-associated genes repeatedly reported in the literature. This concise gene set includes key markers often used for validating senescence phenotypes. This set includes information on the directionality of gene regulation (i.e., whether genes are typically up- or down-regulated in senescence). +- **LiteratureMarkers**: A small, curated set of well-established senescence-associated genes repeatedly reported in the literature. This concise gene set includes key markers often used for validating senescence phenotypes. This set includes information on the expected direction of change in expression of genes upon phenotype (i.e., whether genes are expected to be up- or down-regulated in senescence). - **REACTOME_CELLULAR_SENESCENCE**: A comprehensive gene set from the MSigDB REACTOME collection, representing known molecular pathways involved in cellular senescence and commonly used in enrichment-based analyses. This is also treated as an undirected gene set without information on the up- or down-regulation of individual genes. -- **HernandezSegura**: A transcriptomic gene set identified by Hernandez-Segura et al. (2017) as consistently altered across multiple senescence models. This set includes information on the directionality of gene regulation. It has shown strong performance in classification and enrichment analyses, including in the original paper of markeR. +- **HernandezSegura**: A transcriptomic gene set identified by Hernandez-Segura et al. (2017) as consistently altered across multiple senescence models. This set includes information on the expected direction of change in expression of genes upon phenotype. ```{r example} library(markeR) @@ -45,7 +47,12 @@ data(counts_example) counts_example[1:5,1:5] ``` -For illustration purposes of different variable types, let's imagine we also had two additional variables: one indicating the number of days between sample preparation and sequencing (`DaysToSequencing`), and another identifying the person who processed each sample (`researcher`). These variables are hypothetical and not part of the original study design. +For illustration purposes, two synthetic variables were added to the data: + +* `DaysToSequencing`, the number of days between sample preparation and sequencing; +* `Researcher`, identifying the person who processed each sample. + +This enables exploration of associations between gene set activity and both categorical and continuous variables. ```{r load_metadata} @@ -61,28 +68,40 @@ head(metadata_example) # Calculate Senescence Scores -The `CalculateScores` function computes the signature scores for each sample based on predefined gene sets, such as a senescence gene set. It returns a named list where each entry corresponds to a specific gene set and includes the calculated scores, along with metadata (if available). When setting `method = "all"`, the function returns a list, where each element corresponds to a scoring method and contains the respective data frame of scores, allowing comparison between methods. The function allows users to select from three different scoring methods: +The `CalculateScores` function computes the gene set scores for each sample based on predefined gene sets, such as a senescence gene set. It returns a named list where each entry corresponds to a specific gene set and includes the calculated scores, along with metadata (if available). When setting `method = "all"`, the function returns a list, where each element corresponds to a scoring method and contains the respective data frame of scores, allowing comparison between methods. The function allows users to select from three different scoring methods: + +* **logmedian**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. + +* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. + +* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. + +These methods are very similar and, when applied to a robust gene set, are expected to yield similar results across all three methods. Empirically, a good gene set will be one that shows consistent results, both in the calculated scores and in Cohen's d or f statistics, across different methods. If the gene set is not robust, or if there is considerable noise, the results across methods may differ significantly. -- **ssGSEA**: Computes an enrichment score for each gene set in each sample. -- **logmedian**: Calculates the score as the sum of the normalized (log2-median-centered) expression values of the genes in the gene set, divided by the number of genes. -- **ranking**: Determines the score by ranking the expression of the genes in the gene set and normalizing the result. +The `PlotScores` function computes and visualizes gene set scores according to the chosen method and variable type: -These methods are very similar and, when applied to a robust gene set, will yield similar results across all three methods. Empirically, a good gene set will be one that shows consistent results, both in the calculated scores and in Cohen's d or F statistics, across different methods. If the gene set is not robust, or if there is considerable noise, the results across methods may differ significantly. Consistent scores across methods typically indicate a more reliable and meaningful gene set. These methods are explained in more detail below, allowing the user to select the most appropriate one for their analysis. +- `method = "all"` -The `PlotScores` function can be used to compute and visualize the scores in various ways, depending on the method and variable chosen. + - Categorical `Variable`: produces a heatmap of Cohen's *d* and a volcano plot of all pairwise group contrasts. + - Numeric `Variable`: produces a heatmap of Cohen's *F* statistics and a volcano plot of associations. -- If `method = "all"` and the variable is categorical, it will return a heatmap of Cohen's d or F statistics and a volcano plot showing contrasts between all groups of that variable. -- If `method = "all"` and the variable is numeric, a heatmap of Cohen's F and a volcano plot will be produced. -- If `method != "all"` and the variable is categorical, it will generate a violin plot for each gene set. -- If `method != "all"` and the variable is `NULL`, a density plot of the score distribution will be displayed. -- If `method != "all"` and the variable is numeric, a scatter plot will be created to show the relationship between the scores and the numeric variable. +- `method != "all"` + + - Categorical `Variable`: generates violin plots of scores per gene set. + - Numeric `Variable`: generates scatter plots of scores versus the numeric variable. + - `Variable` is `NULL`: displays a density plot of the score distribution. + +This structure clearly links each combination of method and variable type to the resulting visualization, avoiding ambiguity. ## logmedian method - -The following example uses the **`logmedian`** method to calculate a gene signature score. This method first applies a log2 transformation to the expression values, and then centers them by subtracting the median expression (across all samples) for each genes. The score for each sample is then computed by summing the normalised expression values of the genes in the gene set, and dividing by the number of genes in the gene set. This normalization makes each gene’s expression relative to its typical behavior across the dataset, allowing for meaningful comparisons between genes with different expression scales. By using log2 median-centering, the method ensures that both highly and lowly expressed genes contribute comparably to the score, as long as their variances are similar. This normalization emphasizes relative changes in expression rather than absolute values, allowing the score to reflect the coordinated behavior of the genes in a gene set. Users can calculate the gene signature score for each sample based on one or more predefined gene sets (signatures). -Here’s an example where we calculate the signature score using the "logmedian" method: +The **logmedian** method computes a gene signature score per sample as follows. First, gene expression values are log2-transformed and each gene is median-centered across all samples. For a given sample, the score is the mean of the median-centered expression values of the genes in the set (De Almeida et al., 2019). Each gene’s contribution is thus evaluated relative to its baseline expression, and the resulting score quantifies the coordinated activity of the gene set. + +For bidirectional gene sets, where genes are annotated by expected direction of regulation, the score is calculated as the difference between the mean of the upregulated genes and the mean of the downregulated genes. Users can compute scores for individual samples using one or more predefined gene sets. + +The following example demonstrates calculation of a gene signature score using the logmedian method: + ```{r} df_Scores <- CalculateScores(data = counts_example, @@ -96,9 +115,11 @@ head(df_Scores$HernandezSegura) head(df_Scores$LiteratureMarkers) ``` -The user can also chose to directly plot the scores. +Users can directly visualize gene set scores with the plotting functions. -Effect sizes can be computed using the `compute_cohen` parameter (default = `T`): when the grouping variable has only two levels, Cohen’s d is calculated by default. If there are more than two levels, Cohen’s f is used unless a specific pairwise comparison is defined via `cond_cohend`, in which case Cohen’s d is reported for that comparison. If `pvalcalc = TRUE` (default = `FALSE`), an associated p-value (not corrected for multiple testing) is also reported. The p-value is derived from a two-sample t-test for two-group comparisons or numeric variables, or from an ANOVA for multi-group comparisons. +Effect sizes are computed via the `compute_cohen` parameter (default = `TRUE`). For a categorical metadata variable with two levels, Cohen’s d is calculated; for more than two levels, Cohen’s f is used unless a specific pairwise comparison is specified via `cond_cohend`, in which case Cohen’s d is reported for that comparison. + +If `pvalcalc = TRUE` (default = `FALSE`), an associated p-value is reported (uncorrected for multiple testing). p-values are derived from a two-sample t-test for two-group or numeric-variable comparisons, and from ANOVA for multi-group comparisons. ```{r exampleScore, fig.width=8, fig.height=4, out.width="80%", warning=FALSE, message=FALSE} @@ -133,8 +154,7 @@ PlotScores(data = counts_example, ``` -Interestingly, when we provide directionality for a signature—such as the *Literature_Senescence* set—the interpretation of the results can change substantially. For example, without specifying direction, senescent samples may appear to have lower scores than proliferative ones. But once directionality is accounted for, the scores shift in a way that aligns better with biological expectations. Therefore, it is **strongly advised** that, whenever possible (i.e., if known), the user states the putative regulation “sign” of the genes in the gene set This helps ensure more accurate and meaningful interpretations of the data. - +Providing gene directionality can substantially affect score interpretation. For example, in the *Literature_Senescence* signature, omitting direction of change in expression of genes upon phenotype may lead to senescent samples appearing to have lower scores than proliferative ones. Incorporating directionality aligns the scores with biological expectations. Therefore, it is **strongly recommended** to specify the expected direction of gene expression changes in a set whenever this information is available, ensuring more accurate and meaningful interpretation of the results. ```{r exampleScore_bidirectional, fig.width=6, fig.height=4, out.width="60%", warning=FALSE, message=FALSE} @@ -165,9 +185,11 @@ PlotScores(data = counts_example, ``` -To use the function for numeric variables, the user should specify the relevant parameters, including the numeric variable to be analysed. The function will generate a scatter plot for the numeric variable, optionally calculating Cohen's f as the effect size. The user can choose a correlation method (e.g., Pearson, Spearman, or Kendall) to assess the relationship between the variable and the signature scores. The plot will also include optional p-value calculations for comparisons. +When analyzing a numeric variable, the function generates a scatter plot of the variable against gene set scores and can optionally compute an effect size using Cohen’s f. The user may select a correlation method (Pearson, Spearman, or Kendall) to quantify the association between the numeric variable and the scores. Optional p-value calculations for the association can also be included. + +The following example illustrates how to configure the function for a numeric variable: + -Here is an example of how to configure the function for numeric variables: ```{r examplenumeric, fig.width=8, fig.height=3.5, out.width="80%", warning=FALSE, message=FALSE} PlotScores(data = counts_example, @@ -188,8 +210,9 @@ PlotScores(data = counts_example, widthTitle = 26, cor = "pearson") ``` + +To visualize the overall distribution of scores across gene sets, the `PlotScores` function can be used without specifying the `GroupingVariable` parameter, i.e, without grouping scores by any metadata variable.. In this case, it generates a grid of density plots, with each plot representing the score distribution for a specific gene set. -For users interested in viewing the overall distribution of scores across gene signatures, the `PlotScores` function can be used without specifying the `GroupingVariable` parameter, i.e, without grouping scores by any metadata variable. In this case, the function will automatically generate a grid of density plots, with each plot representing the distribution of scores for a specific gene set ```{r plotscores_density, fig.width=8, fig.height=3, out.width="80%", warning=FALSE, message=FALSE} @@ -210,7 +233,8 @@ PlotScores(data = counts_example, ## ssGSEA method -The same approach can be applied for **ssGSEA** (single-sample Gene Set Enrichment Analysis; Barbie et al. (2009)) for score calculation and visualization, both for unidirectional and bidirectional signatures. ssGSEA computes an enrichment score for each gene signature in each sample using an adaptation of the `gsva()` function from the `GSVA` package. This method is useful for evaluating gene set enrichment in individual samples rather than groups, as described in the sections below. +Single sample Gene Set Enrichment Analysis (**ssGSEA**) was implemented using a modified version of the `GSVA` package’s `gsva()` function (Barbie et al., 2009), based on the original Gene Set Enrichment Analysis (GSEA) method (Subramanian et al., 2005). ssGSEA ranks all genes by their expression within each sample and computes a running-sum statistic over the ranked list. For unidirectional gene sets (i.e., with no information on the expected direction of each gene’s regulation upon phenotype), the ssGSEA sample score reflects overall coordinated expression of the genes in the set. For bidirectional gene sets, the score is calculated as the ssGSEA score computed for the upregulated subset of genes minus the score computed for the downregulated subset. + ```{r examplessGSEA, fig.width=8, fig.height=4, out.width="80%", warning=FALSE, message=FALSE} @@ -247,9 +271,9 @@ PlotScores(data = counts_example, ## Ranking method -The **ranking** method computes gene signature scores for each sample by ranking the expression of signature genes in the dataset and normalizing the score based on the total number of genes. +The **ranking** method calculates gene signature scores using a non-parametric approach based on the relative expression of genes within a set. For each sample, all genes are ranked by expression. The score is then calculated as the sum of the ranks of the genes in the gene set, multiplied by +1 (for upregulated genes those with unspecified direction of regulation change upon phenotype) or -1 (downregulated genes), and normalised by the number of genes in the set. -The following example demonstrates the use of the "ranking" method for both unidirectional and bidirectional signatures: +The following example demonstrates the use of the "ranking" method for both unidirectional and bidirectional gene sets: ```{r ranking, fig.width=8, fig.height=4, out.width="80%", warning=FALSE, message=FALSE} @@ -293,6 +317,7 @@ The `mode` parameter controls how contrasts are generated for categorical variab - **"medium"**: Includes comparisons between one group and the union of other groups (e.g., A - (B + C + D); B - (A + C + D)), allowing for broader contrasts beyond simple pairwise comparisons. - **"extensive"**: Allows for all possible algebraic combinations of group levels (e.g., (A + B) - (C + D)). +In this example, HernandezSegura and LiteratureMarkers consistently discriminate Senescent from Proliferative samples, while REACTOME_CELLULAR_SENESCENCE shows weaker and less consistent separation. ```{r Overall_Scores, fig.width=6, fig.height=3, out.width="80%", warning=FALSE, message=FALSE} @@ -374,6 +399,10 @@ AUC_Scores(data = counts_example, title="Marthandan et al. 2016") ``` + + HernandezSegura and LiteratureMarkers exhibit consistently high AUCs across scoring methods, while REACTOME_CELLULAR_SENESCENCE shows more heterogeneous performance. + + ## False Positive Rate (FPR) Calculations The user can assess the significance of gene set scores by comparing observed effect sizes against a distribution of those originated by random gene sets with the same number of genes and matched directionality. For each original gene set, the function calculates the observed Cohen's d (and p‑value) using (`GroupingVariable`). It then generates a number of simulated gene sets (`number_of_sims`) by randomly sampling the same number of genes from a user provided gene list (`gene_list`) and computes their Cohen's d values. The simulation results are visualised as violin plots of the distribution of Cohen's d values for each method, overlaid with the observed values of the original gene sets, and a 95th percentile threshold. Significance is indicated by distinct point shapes based on the associated p‑value. @@ -430,7 +459,7 @@ DEGs2 <- calculateDE(data = counts_example, DEGs2$`Senescent-Proliferative`[1:5,] ``` -After running differential expression analysis (for example, using the `calculateDE` function), the user can visualize their results with the `plotVolcano` function. This function provides a flexible interface for exploring their data by allowing the user to: +After running differential expression analysis (using the `calculateDE` function), the user can visualize their results with the `plotVolcano` function. This function provides a flexible interface for exploring their data by allowing the user to: - **Plot Differential Gene Expression Statistics:** Display a volcano plot with chosen statistics (e.g., log fold-change on the x-axis and –log₁₀ adjusted p-value on the y-axis). @@ -438,7 +467,7 @@ After running differential expression analysis (for example, using the `calculat Highlight genes that pass user-specified thresholds by adjusting `threshold_y` and `threshold_x`. - **Annotate Top and Bottom N Genes:** Optionally, label the top (and bottom) N genes based on the chosen statistic to quickly identify the most significant genes. -- **Highlight Gene Signatures:** If the user provides a list of gene signatures using the `genes` argument, the function can highlight these genes in the plot. The user can also specify distinct colors for putativelyupregulated and downregulated if their direction is known, or a color for genes that do not have a putative direction. +- **Highlight Gene Signatures:** If the user provides a list of gene signatures using the `genes` argument, the function can highlight these genes in the plot. The user can also specify distinct colors for putatively upregulated and downregulated if their direction is known, or a color for genes that do not have a putative direction. Below is an example usage of `plotVolcano` that visualizes differential expression results from a `DEResultsList`. The first plot shows the default behavior, generating a basic volcano plot without thresholds or gene highlights. Subsequent examples demonstrate how to customize the plot: @@ -446,8 +475,7 @@ Below is an example usage of `plotVolcano` that visualizes differential expressi - Annotating the top and bottom N genes by effect size, - And using gene signatures to color genes across multiple plots arranged by contrast and signature. -These examples illustrate how users can customise the output plot to highlight biologically meaningful patterns or focus on specific gene sets. - +These examples illustrate how users can customise the output plot to highlight biologically meaningful patterns or focus on specific gene sets. ```{r volcanos_DEGs, fig.width=4, fig.height=3, out.width="40%", warning=FALSE, message=FALSE} @@ -469,6 +497,12 @@ plotVolcano(DEGs, genes = NULL, N = 5, ``` +In this example: + +* Genes in the HernandezSegura set annotated as upregulated (green) display positive log2 fold changes, whereas those annotated as downregulated (red) show negative log2 fold changes. This pattern is consistent with the annotation of the set, although these genes are not necessarily those exhibiting the largest absolute fold changes. +* In the LiteratureMarkers set, *LMNB1* and *MKI67* exhibit strongly negative log2 fold change, consistent with their roles as proliferation markers absent in senescent cells. +* Genes from the REACTOME_CELLULAR_SENESCENCE set are undirected and appear across the full range of log2 fold change values, diluting discriminatory power. + ```{r volcanos_DEGs3, fig.width=10, fig.height=3, out.width="90%", warning=FALSE, message=FALSE} # Change order: signatures in columns, contrast in rows @@ -624,10 +658,13 @@ Based on these very simple analyses, the REACTOME_CELLULAR_SENESCENCE showed con These findings highlight important trade-offs: while scoring methods offer per-sample resolution and are less sensitive (in terms of statistical significance) to gene set size, making them useful for classification tasks, they may be overly influenced by a small subset of genes, which could limit biological interpretability. While more robust to sample heterogeneity and better at capturing coordinated expression changes, enrichment-based methods are sensitive to gene set composition and size. Caution is warranted when interpreting results, especially from score-based approaches, as strong signals may not always reflect the intended biological process, but rather a handful of dominant genes. + ## Visualise Individual Gene Behaviour. As highlighted in the previous section, score-based approaches can be disproportionately influenced by a small subset of genes. To address this, `markeR` includes dedicated functions for exploring individual gene behaviour, enabling users to assess if and which genes may be driving the overall signal. We demonstrate this functionality using the `LiteratureMarkers` gene set. +`markeR` provides the wrapper function `VisualiseIndividualGenes` for plotting individual genes. In this tutorial, we illustrate each visualization function separately for clarity. However, the same outputs can be generated through the wrapper by setting the `type` argument to the desired visualization strategy (i.e., `"expression"`, `"correlation"`,`"violin"`, `"roc"`, `"auc"`, `"rocauc"`, `"cohend"`, `"pca"`), and the wrapper automatically dispatches to the correct function with the appropriate parameters. + The `ExpressionHeatmap` function generates a heatmap to display the expression levels of selected senescence genes across samples. Samples are annotated by a chosen condition, and expression values are color-scaled for easy visual comparison. Clustering options and customizable color palettes allow for flexible and informative visualization. ```{r example_exprheatmap, fig.width=8, fig.height=3, out.width="70%", warning=FALSE, message=FALSE} @@ -653,10 +690,6 @@ ExpressionHeatmap(data=counts_example, scale_position="right") ``` -To standardize the visualisation of individual genes across multiple analyses, we created a wrapper function called `VisualiseIndividualGenes`. This function consolidates several internal plotting functions, including `ExpressionHeatmap` and other listed below, into a single, user-friendly interface. - -The output remains consistent with the individual functions, but users can specify the desired type (i.e., "expression", "correlation","violin", "roc", "auc", "rocauc", "cohend", "pca"), and the wrapper automatically dispatches to the correct function with the appropriate parameters. This design simplifies and unifies the workflow for exploring gene-level patterns across various analysis types. - ```{r example_exprheatmap2, fig.width=8, fig.height=3, out.width="70%", warning=FALSE, message=FALSE} VisualiseIndividualGenes(type="expression", data=counts_example, @@ -674,7 +707,6 @@ VisualiseIndividualGenes(type="expression", ``` - The `IndividualGenes_Violins` function creates violin plots to visualize the expression distributions of selected senescence genes across conditions. Jittered points represent individual samples, and grouping (x axis, `GroupingVariable`) and color variables (`ColorVariable` and `ColorValues`) from the metadata allow for additional stratification and insight. Customization options include layout, point size, colors, and axis labeling. ```{r exampleviolins, fig.width=10, fig.height=3, out.width="100%", warning=FALSE, message=FALSE} @@ -790,7 +822,7 @@ plotPCA(data = counts_example, ## Comment -As demonstrated by the behaviour of individual genes in the `LiteratureMarkers` gene set, LMNB1, MKI67, and GLB1 appear to drive the overall signal. These genes consistently show higher performance metrics (e.g., Cohen’s d, AUC), strong expression changes between conditions, and LMNB1 and MKI67 specifically have correlated expression patterns. This underscores the importance of examining gene-level behaviour, as a strong overall signature score may reflect the influence of only a few informative genes, rather than coordinated activity across the entire set. In this case, the strong performance of the `LiteratureMarkers` set in scoring approaches is likely driven by these genes. However, relying heavily on a few markers can be a caveat: for example, MKI67 and LMNB1 (proliferation-related genes) may also change in other biological contexts like quiescence or differentiation, potentially limiting their specificity for senescence. Thus, the choice of gene set and analysis strategy should be guided by the research question, and complemented with both score distributions, enrichment analyses, and individual gene behaviour. Notably, scoring with just these three genes yielded results similar (or, even, slightly better) to the full `LiteratureMarkers` set. +As demonstrated by the behaviour of individual genes in the `LiteratureMarkers` gene set, *LMNB1*, *MKI67*, and *GLB1* appear to drive the overall signal. These genes consistently show higher performance metrics (e.g., Cohen’s d, AUC), strong expression changes between conditions, and *LMNB1* and *MKI67* specifically have correlated expression patterns. This underscores the importance of examining gene-level behaviour, as a strong overall signature score may reflect the influence of only a few informative genes, rather than coordinated activity across the entire set. In this case, the strong performance of the `LiteratureMarkers` set in scoring approaches is likely driven by these genes. However, relying heavily on a few markers can be a caveat: for example, *MKI67* and *LMNB1* (proliferation-related genes) may also change in other biological contexts like quiescence or differentiation, potentially limiting their specificity for senescence. Thus, the choice of gene set and analysis strategy should be guided by the research question, and complemented with both score distributions, enrichment analyses, and individual gene behaviour. Notably, scoring with just these three genes yielded results similar to the full `LiteratureMarkers` set. ```{r example_genesdrivingresults, fig.width=6, fig.height=4, out.width="60%", warning=FALSE, message=FALSE} diff --git a/vignettes/articles/Article_DiscoveryMode.Rmd b/vignettes/articles/Article_DiscoveryMode.Rmd index f7148ef..c51e45c 100644 --- a/vignettes/articles/Article_DiscoveryMode.Rmd +++ b/vignettes/articles/Article_DiscoveryMode.Rmd @@ -12,18 +12,20 @@ knitr::opts_chunk$set( ) ``` -This vignette provides a comprehensive introduction to the **markeR** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in quantifying a gene set in a given dataset to explore associations with phenotypic or clinical variables. This approach is particularly suited to hypothesis generation where the phenotype marked by the gene set is of known biological or clinical relevance. - +This vignette provides a comprehensive introduction to the **`markeR`** package, focusing on its **Discovery Mode**. The discovery mode was designed for users who are interested in examining the relationship between a gene set and one or more metadata variables of interest, being suitable for exploratory or hypothesis-generating analyses. + # Case-study: Senescence -We will be using an already pre-processed gene expression dataset, derived from the Marthandan et al. (2016) study (GSE63577), that includes human fibroblast samples cultured under two different conditions: replicative senescence and proliferative control. The dataset has already been filtered and normalized using the `edgeR` package. For more information about the dataset structure, see the help pages for `?counts_example` and `?metadata_example`. +In this vignette, we use a pre‑processed RNA-seq dataset from Marthandan et al. (2016, GSE63577), with normalised read counts for human fibroblasts under **replicative senescence** and **proliferative control**. See `?counts_example` for preprocessing details and structure. `markeR` requires as input a filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. + +We use the accompanying metadata from the Marthandan et al. (2016) study (see `?metadata_example`). -This dataset serves as a working example to demonstrate the main functionalities of the markeR package. In particular, it will be used to showcase the two primary modules designed for benchmarking gene signatures: +This dataset serves as a working example to demonstrate the main functionalities of the `markeR` package. In particular, it will be used to showcase the two primary modules of `markeR` for quantifying phenotypes using gene sets: -- **Score**: calculates expression-based signature scores for each sample, and -- **Enrichment**: evaluates the over-representation of gene signatures within ranked gene lists. +- **Score-based methods**: log2-median expression, ranking approaches, and single-sample gene set enrichment analysis (ssGSEA) to quantify coordinated expression within a gene set. +- **Enrichment-based methods**: GSEA using moderated t- or B-statistics. -To illustrate the usage of `markeR`, we use the **HernandezSegura** gene set: A transcriptomic gene set identified by Hernandez-Segura et al. (2017) as consistently altered across multiple senescence models. This set includes information on the directionality of gene regulation. It has shown strong performance in classification and enrichment analyses, including in the original paper of markeR, and also in the Tutorial on the *Benchmarking Mode* of `markeR`. +To illustrate the usage of `markeR`, we use the **HernandezSegura** gene set: A transcriptomic gene set identified by Hernandez-Segura et al. (2017) as consistently altered across multiple senescence models. This set includes information on the direction of change in expression of genes upon phenotype. ```{r example} library(markeR) @@ -42,8 +44,12 @@ data(counts_example) counts_example[1:5,1:5] ``` -For illustration purposes of different variable types, let's imagine we also had two additional variables: one indicating the number of days between sample preparation and sequencing (`DaysToSequencing`), and another identifying the person who processed each sample (`researcher`). These variables are hypothetical and not part of the original study design. - +For illustration purposes, two synthetic variables were added to the data: + +* `DaysToSequencing`, the number of days between sample preparation and sequencing; +* `Researcher`, identifying the person who processed each sample. + +This enables exploration of associations between gene set activity and both categorical and continuous variables. ```{r load_metadata} data(metadata_example) @@ -58,24 +64,16 @@ head(metadata_example) # Score-Based approaches -The **Score** module in `markeR` quantifies the association between a gene signature and phenotypic variables by calculating a score for each sample based on the expression of genes in the signature. This score can then be correlated with other variables, such as clinical or experimental conditions: - -- **Quantifies associations** between phenotype variables and a gene signature score using *Cohen's effect sizes* and *p-values*. -- **Visualizes** results through lollipop plots, contrast plots, and distribution plots. - -This is useful for identifying: - -- **Biological relationships** (e.g., phenotype-score associations) -- **Technical confounders** (e.g., batch effects) - +A score summarising the collective expression of a gene set is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). + ## Outputs -The main function returns a structured list with: +If `method = "logmedian"` (or `ssGSEA`, `ranking`), the main function `VariableAssociation()` returns a structured list with: - **`overall`**: Effect sizes (*Cohen’s f*) and p-values for each variable. - **`contrasts`**: For categorical variables, pairwise or grouped comparisons using *Cohen’s d* with BH-adjusted p-values. - **`plot`**: A combined visualization showing: - 1. Lollipop plot of effect sizes (*Cohen’s f*) + 1. Lollipop plot of effect sizes (i.e., *Cohen’s f* per variable) 2. Distribution plots of the score by variable (density or scatter) 3. Lollipop plots of contrasts (*Cohen’s d*) for categorical variables, if applicable - **`plot_overall`**, **`plot_contrasts`**, **`plot_distributions`**: Individual components of the combined plot. @@ -96,13 +94,10 @@ For this example, we use: - The **`logmedian`** scoring method - **`mode = "extensive"`** for thorough contrast analysis -We also include two synthetic phenotypic variables: +Though artificial, `DaysToSequencing` and `Researcher` mimic potential **technical covariates**. Strong associations between these and the score could indicate **batch effects**, where technical variation may confound biological interpretation. -- **`Researcher`** — a categorical variable representing who processed each sample -- **`DaysToSequencing`** — a numeric variable indicating time between preparation and sequencing - -Though artificial, these mimic potential **technical covariates**. Strong associations between these and the score could indicate **batch effects**, where technical variation may confound biological interpretation. +In this example, the `Condition` variable shows a large effect size (Cohen’s f and Cohen's d), confirming strong discrimination between Senescent and Proliferative samples. The remaining variables don't show significant associations, suggesting no major batch effects that might be reflected in the computed scores. ```{r variableassoc_score_sen, fig.width=7, fig.height=7, out.width="100%", warning=FALSE, message=FALSE} results_scoreassoc_bidirect <- VariableAssociation(data = counts_example, @@ -118,20 +113,24 @@ results_scoreassoc_bidirect <- VariableAssociation(data = counts_example, results_scoreassoc_bidirect$Overall results_scoreassoc_bidirect$Contrasts ``` + + + # Enrichment-based approaches -The `GSEA_VariableAssociation()` function evaluates how phenotypic variables are associated with **gene set activity**, using enrichment scores derived from gene expression statistics (B- or t-statistics). This allows users to understand whether a gene set is enriched or depleted in relation to different sample attributes. +Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. + ## Outputs -The `GSEA_VariableAssociation()` function returns a list with two elements: +If `method = "GSEA"`, the main function `VariableAssociation()` returns a structured list with: - **`data`**: A tidy data frame of GSEA results. For each variable contrast, this includes: - **Contrast**: The comparison performed (e.g., A - B, A - (B+C)) - **Statistic**: The metric used for gene ranking (either *t* or *B*) - **NES**: Normalized Enrichment Score - - **Adjusted p-value**: Multiple-testing corrected p-value (e.g., Benjamini–Hochberg) + - **Adjusted p-value**: Multiple-testing corrected p-value (Benjamini–Hochberg) - **Gene Set Name**: The gene set being tested - **`plot`**: A `ggplot2` object showing the NES and significance of each contrast as a **lollipop plot**: @@ -156,8 +155,13 @@ Depending on the statistic used (`t` or `B`), the interpretation of results vari ## Example: Exploring Gene Set Enrichment by Variable -The following code evaluates how three phenotypic variables — `Condition`, `Researcher`, and `DaysToSequencing` — are associated with the **HernandezSegura** gene set: +The following code evaluates how three phenotypic variables (`Condition`, `Researcher`, and `DaysToSequencing`) are associated with the **HernandezSegura** gene set. + + In this example, the HernandezSegura gene set shows significant enrichment in samples sequenced by Francisca relative to those processed by Ana or John, which would suggest a differentiated researcher-associated technical impact on the samples' biological phenotype. This gene set shows also a strong depletion in proliferative samples, which is expected given its annotation as senescence-associated and results from the score-based approach. + + + ```{r GSEA_varassoc, fig.width=6, fig.height=6, out.width="60%", warning=FALSE, message=FALSE} varassoc_gsea <- VariableAssociation( data = counts_example, @@ -179,6 +183,8 @@ varassoc_gsea <- VariableAssociation( varassoc_gsea$data ``` + + # Session Information ```{r} diff --git a/vignettes/articles/Article_GeneSetSimilarity.Rmd b/vignettes/articles/Article_GeneSetSimilarity.Rmd index 59c854e..d950653 100644 --- a/vignettes/articles/Article_GeneSetSimilarity.Rmd +++ b/vignettes/articles/Article_GeneSetSimilarity.Rmd @@ -11,18 +11,22 @@ knitr::opts_chunk$set( ) ``` -Even if a user-defined gene signature demonstrates strong discriminatory power between conditions, it may reflect known biological pathways rather than novel mechanisms. To address this, the `geneset_similarity()` function computes pairwise **Jaccard indices** or **log odds ratios (logOR)** between user-provided gene signatures and a reference set, quantifying their overlap as a percentage or a statistical enrichment. +Even if a user-defined gene signature demonstrates strong discriminatory power between conditions, it may reflect known biological pathways rather than novel mechanisms. To address this, the `geneset_similarity()` function implements two complementary similarity metrics: +* **Jaccard Index**: +the ratio of the number of genes in common over the total number of genes in the two sets. + +* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. + Users can compare their signatures to: -* **Custom gene sets**, defined manually, or +* **Custom gene sets**, defined manually; * **MSigDB collections**, via the [`msigdbr`](https://cran.r-project.org/package=msigdbr) package. The function provides options to: -* Filter by **Jaccard index threshold**, using `jaccard_threshold` -* Filter by **odds ratio and p-value**, using `or_threshold` and `pval_threshold` -* Limit the number of top-matching reference signatures shown, using `num_sigs_toplot` +* Filter by **Jaccard index threshold**, using `jaccard_threshold`; +* Filter by **odds ratio and p-value**, using `or_threshold` and `pval_threshold`, respectively. # Similarity via Jaccard Index @@ -92,12 +96,10 @@ The log odds ratio (logOR) provides a statistically grounded alternative for ass - Genes in both sets - Genes in one but not the other - Gene universe as background - Log-transformed odds ratios are visualized. > **Note**: When using `metric = "odds_ratio"`, the `universe` parameter **must** be supplied. - ## Example 3: Compare against user-defined and MSigDB gene sets ```{r, fig.width=6, fig.height=8, out.width="60%"} From 55addc469807bf79ea0b2e43bc0d3b2455755a2c Mon Sep 17 00:00:00 2001 From: A Wokaty Date: Tue, 28 Apr 2026 09:05:32 -0400 Subject: [PATCH 5/7] bump x.y.z version to even y prior to creation of RELEASE_3_23 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 17cce5f..9ef2f0b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 1.1.2 +Version: 1.2.0 Authors@R: c( person("Rita", "Martins-Silva", From ec844f3429bbab9ed6ea5885b1816ce71f1199a5 Mon Sep 17 00:00:00 2001 From: A Wokaty Date: Tue, 28 Apr 2026 09:05:32 -0400 Subject: [PATCH 6/7] bump x.y.z version to odd y following creation of RELEASE_3_23 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9ef2f0b..d8f41ca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 1.2.0 +Version: 1.3.0 Authors@R: c( person("Rita", "Martins-Silva", From d319036a97b34cb168c5d73b0e71fc7c58b8d2f1 Mon Sep 17 00:00:00 2001 From: Rita Silva Date: Fri, 8 May 2026 15:24:52 +0100 Subject: [PATCH 7/7] bioc > v1.2 --- DESCRIPTION | 2 +- README.Rmd | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 17cce5f..aab1bbb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 1.1.2 +Version: 1.2 Authors@R: c( person("Rita", "Martins-Silva", diff --git a/README.Rmd b/README.Rmd index 94f25d9..4bfc4ca 100644 --- a/README.Rmd +++ b/README.Rmd @@ -30,7 +30,7 @@ knitr::opts_chunk$set( > **To cite `markeR` please use:** > -> Martins-Silva R, Kaizeler A, Barbosa-Morais NL (2025). _markeR: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers_. doi:10.18129/B9.bioc.markeR, R package version 1.1.2, https://bioconductor.org/packages/markeR. +> Martins-Silva R, Kaizeler A, Barbosa-Morais NL (2025). _markeR: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers_. doi:10.18129/B9.bioc.markeR, R package version 1.2, https://bioconductor.org/packages/markeR. The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original `markeR` paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). diff --git a/README.md b/README.md index 0daabf8..521decb 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ across experimental and clinical phenotypes. > > Martins-Silva R, Kaizeler A, Barbosa-Morais NL (2025). *markeR: An R > Toolkit for Evaluating Gene Signatures as Phenotypic Markers*. -> , R package version 1.1.2, +> , R package version 1.2, > . The folder `inst/Paper/` is in the **paper** branch and contains all