diff --git a/.Rbuildignore b/.Rbuildignore index 39e8c0c..d713df3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -19,3 +19,6 @@ ^\\.github$ ^\\.git$ ^codecov\.yml$ +^vignettes/articles$ +^data_aux$ +^python(/.*)?$ \ No newline at end of file diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 115be99..423dbfc 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,8 +2,14 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, default] pull_request: + branches: [main, default] + release: + types: [published] + branches: [main, default] + workflow_dispatch: + branches: [main, default] name: R-CMD-check.yaml @@ -48,6 +54,7 @@ jobs: with: upload-snapshots: true build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' + error-on: '"error"' - name: Test coverage if: runner.os == 'Linux' && matrix.config.r == 'release' diff --git a/.github/workflows/Rminversion.yaml b/.github/workflows/Rminversion.yaml deleted file mode 100644 index 5b2efde..0000000 --- a/.github/workflows/Rminversion.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: Test minimal R version - -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -jobs: - r-cmd-check: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - r-version: ['3.5.0', '3.6.0', '4.0.0', '4.1.0', '4.2.0', '4.3.0', '4.4.0','4.5.0'] - - name: R ${{ matrix.r-version }} on ${{ matrix.os }} - - steps: - - uses: actions/checkout@v4 - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.r-version }} - - - name: Install package dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: rcmdcheck - needs: check - - - name: Run R CMD check - run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") - shell: Rscript {0} diff --git a/.github/workflows/bioc-check.yml b/.github/workflows/bioc-check.yml index a9b7fda..b2c11bc 100644 --- a/.github/workflows/bioc-check.yml +++ b/.github/workflows/bioc-check.yml @@ -2,9 +2,14 @@ name: Bioconductor Check on: push: - branches: [main, master] + branches: [main, default] pull_request: - branches: [main, master] + branches: [main, default] + release: + types: [published] + branches: [main, default] + workflow_dispatch: + branches: [main, default] jobs: bioccheck: @@ -13,7 +18,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - r-version: ['4.4.0','4.5.0'] + r-version: ['4.5.0'] steps: - uses: actions/checkout@v4 @@ -37,4 +42,4 @@ jobs: - name: Run BiocCheck run: BiocCheck::BiocCheck(".", quit_with_status=TRUE) - shell: Rscript {0} \ No newline at end of file + shell: Rscript {0} diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index bfc9f4d..bceb80e 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,11 +2,14 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [default] pull_request: + branches: [default] release: types: [published] + branches: [default] workflow_dispatch: + branches: [default] name: pkgdown.yaml diff --git a/.gitignore b/.gitignore index 953bcfd..32e172a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,11 +9,13 @@ docs renv.lock .obsidian/ -/inst/Paper/data/ -/inst/Paper/localjobs/ markeR.BiocCheck/ /doc/ /Meta/ markeR.Rproj .DS_Store -**/.DS_Store \ No newline at end of file +**/.DS_Store +inst/doc +data_aux +markeR.Rcheck +/python/.venv diff --git a/DESCRIPTION b/DESCRIPTION index 5475c78..f9c3058 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: markeR Title: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers -Version: 0.99.2 +Version: 1.3 Authors@R: c( person("Rita", "Martins-Silva", @@ -14,17 +14,18 @@ Authors@R: comment=c(ORCID="0000-0002-1215-0538")) ) Description: - markeR provides a suite of methods for using gene sets (signatures) to quantify and evaluate the - extent to which a given gene signature marks a specific phenotype. The package implements various - scoring, enrichment and classification approaches, along with tools to compute - performance metrics and visualize results, making it a valuable resource for transcriptomics research (bulk RNA-seq). + markeR is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as + phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of + gene set behaviour across experimental and clinical phenotypes. It implements multiple methods, including score-based and enrichment + approaches, and also allows the exploration of expression behaviour of individual genes. In addition, users can assess the + similarity of their own gene sets against established collections (e.g., those from MSigDB), facilitating biological interpretation. License: Artistic-2.0 biocViews: GeneExpression, Transcriptomics, Visualization, Software, GeneSetEnrichment, Classification Encoding: UTF-8 Language: en-GB LazyData: false Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Additional_repositories: https://bioconductor.org/packages/release/bioc Imports: circlize, @@ -46,7 +47,6 @@ Imports: limma, ggrepel, effectsize, - patchwork, msigdbr, tibble Suggests: @@ -59,11 +59,14 @@ Suggests: rmarkdown, roxygen2, mockery, - covr + covr, + magick, + BiocStyle Config/testthat/edition: 3 Depends: - R (>= 3.5.0) + R (>= 4.5.0) URL: https://diseasetranscriptomicslab.github.io/markeR/, https://github.com/DiseaseTranscriptomicsLab/markeR BugReports: https://github.com/DiseaseTranscriptomicsLab/markeR/issues VignetteBuilder: knitr +Config/Needs/website: rmarkdown diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 99ef319..0000000 --- a/LICENSE +++ /dev/null @@ -1,180 +0,0 @@ -The Artistic License 2.0 - -Copyright (c) 2000-2006, The Perl Foundation. - -Everyone is permitted to copy and distribute verbatim copies of this license -document, but changing it is not allowed. - -Preamble - -This license establishes the terms under which a given free software Package -may be copied, modified, distributed, and/or redistributed. The intent is that -the Copyright Holder maintains some artistic control over the development of -that Package while still keeping the Package available as open source and free -software. - -You are always permitted to make arrangements wholly outside of this license -directly with the Copyright Holder of a given Package. If the terms of this -license do not permit the full use that you propose to make of the Package, -you should contact the Copyright Holder and seek a different licensing -arrangement. - -Definitions - -"Copyright Holder" means the individual(s) or organization(s) named in the -copyright notice for the entire Package. - -"Contributor" means any party that has contributed code or other material to -the Package, in accordance with the Copyright Holder's procedures. - -"You" and "your" means any person who would like to copy, distribute, or -modify the Package. - -"Package" means the collection of files distributed by the Copyright Holder, -and derivatives of that collection and/or of those files. A given Package may -consist of either the Standard Version, or a Modified Version. - -"Distribute" means providing a copy of the Package or making it accessible to -anyone else, or in the case of a company or organization, to others outside of -your company or organization. - -"Distributor Fee" means any fee that you charge for Distributing this Package -or providing support for this Package to another party. It does not mean -licensing fees. - -"Standard Version" refers to the Package if it has not been modified, or has -been modified only in ways explicitly requested by the Copyright Holder. - -"Modified Version" means the Package, if it has been changed, and such changes -were not explicitly requested by the Copyright Holder. - -"Original License" means this Artistic License as Distributed with the -Standard Version of the Package, in its current version or as it may be -modified by The Perl Foundation in the future. - -"Source" form means the source code, documentation source, and configuration -files for the Package. - -"Compiled" form means the compiled bytecode, object code, binary, or any other -form resulting from mechanical transformation or translation of the Source -form. - -Permission for Use and Modification Without Distribution - -(1) You are permitted to use the Standard Version and create and use Modified -Versions for any purpose without restriction, provided that you do not -Distribute the Modified Version. - -Permissions for Redistribution of the Standard Version - -(2) You may Distribute verbatim copies of the Source form of the Standard -Version of this Package in any medium without restriction, either gratis or -for a Distributor Fee, provided that you duplicate all of the original -copyright notices and associated disclaimers. At your discretion, such -verbatim copies may or may not include a Compiled form of the Package. - -(3) You may apply any bug fixes, portability changes, and other modifications -made available from the Copyright Holder. The resulting Package will still be -considered the Standard Version, and as such will be subject to the Original -License. - -Distribution of Modified Versions of the Package as Source - -(4) You may Distribute your Modified Version as Source (either gratis or for a -Distributor Fee, and with or without a Compiled form of the Modified Version) -provided that you clearly document how it differs from the Standard Version, -including, but not limited to, documenting any non-standard features, -executables, or modules, and provided that you do at least ONE of the -following: - -(a) make the Modified Version available to the Copyright Holder of the -Standard Version, under the Original License, so that the Copyright Holder may -include your modifications in the Standard Version. - -(b) ensure that installation of your Modified Version does not prevent the -user installing or running the Standard Version. In addition, the Modified -Version must bear a name that is different from the name of the Standard -Version. - -(c) allow anyone who receives a copy of the Modified Version to make the -Source form of the Modified Version available to others under - -(i) the Original License or - -(ii) a license that permits the licensee to freely copy, modify and -redistribute the Modified Version using the same licensing terms that apply to -the copy that the licensee received, and requires that the Source form of the -Modified Version, and of any works derived from it, be made freely available -in that license fees are prohibited but Distributor Fees are allowed. - -Distribution of Compiled Forms of the Standard Version or Modified Versions -without the Source - -(5) You may Distribute Compiled forms of the Standard Version without the -Source, provided that you include complete instructions on how to get the -Source of the Standard Version. Such instructions must be valid at the time of -your distribution. If these instructions, at any time while you are carrying -out such distribution, become invalid, you must provide new instructions on -demand or cease further distribution. If you provide valid instructions or -cease distribution within thirty days after you become aware that the -instructions are invalid, then you do not forfeit any of your rights under -this license. - -(6) You may Distribute a Modified Version in Compiled form without the Source, -provided that you comply with Section 4 with respect to the Source of the -Modified Version. - -Aggregating or Linking the Package - -(7) You may aggregate the Package (either the Standard Version or Modified -Version) with other packages and Distribute the resulting aggregation provided -that you do not charge a licensing fee for the Package. Distributor Fees are -permitted, and licensing fees for other components in the aggregation are -permitted. The terms of this license apply to the use and Distribution of the -Standard or Modified Versions as included in the aggregation. - -(8) You are permitted to link Modified and Standard Versions with other works, -to embed the Package in a larger work of your own, or to build stand-alone -binary or bytecode versions of applications that include the Package, and -Distribute the result without restriction, provided the result does not expose -a direct interface to the Package. - -Items That are Not Considered Part of a Modified Version - -(9) Works (including, but not limited to, modules and scripts) that merely -extend or make use of the Package, do not, by themselves, cause the Package to -be a Modified Version. In addition, such works are not considered parts of the -Package itself, and are not subject to the terms of this license. - -General Provisions - -(10) Any use, modification, and distribution of the Standard or Modified -Versions is governed by this Artistic License. By using, modifying or -distributing the Package, you accept this license. Do not use, modify, or -distribute the Package, if you do not accept this license. - -(11) If your Modified Version has been derived from a Modified Version made by -someone other than you, you are nevertheless required to ensure that your -Modified Version complies with the requirements of this license. - -(12) This license does not grant you the right to use any trademark, service -mark, tradename, or logo of the Copyright Holder. - -(13) This license includes the non-exclusive, worldwide, free-of-charge patent -license to make, have made, use, offer to sell, sell, import and otherwise -transfer the Package with respect to any patent claims licensable by the -Copyright Holder that are necessarily infringed by the Package. If you -institute patent litigation (including a cross-claim or counterclaim) against -any party alleging that the Package constitutes direct or contributory patent -infringement, then this Artistic License to you shall terminate on the date -that such litigation is filed. - -(14) Disclaimer of Warranty: - -THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' -AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE -DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, -NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE -PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/NAMESPACE b/NAMESPACE index 70ccfba..45c11f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,11 +20,7 @@ export(plotNESlollipop) export(plotPCA) export(plotVolcano) export(runGSEA) -import(RColorBrewer) -import(fgsea) import(ggplot2) -import(ggpubr) -import(grid) importFrom(ComplexHeatmap,Heatmap) importFrom(ComplexHeatmap,HeatmapAnnotation) importFrom(ComplexHeatmap,draw) @@ -33,28 +29,17 @@ importFrom(circlize,colorRamp2) importFrom(edgeR,DGEList) importFrom(effectsize,eta_squared) importFrom(fgsea,fgsea) +importFrom(fgsea,plotEnrichment) importFrom(ggh4x,facet_grid2) -importFrom(ggplot2,aes) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_line) -importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) -importFrom(ggplot2,facet_wrap) -importFrom(ggplot2,geom_hline) -importFrom(ggplot2,geom_line) -importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_tile) -importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) -importFrom(ggplot2,ggtitle) importFrom(ggplot2,labs) importFrom(ggplot2,margin) -importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_gradientn) -importFrom(ggplot2,scale_shape_manual) -importFrom(ggplot2,theme) -importFrom(ggplot2,theme_bw) importFrom(ggplot2,theme_minimal) importFrom(ggpubr,annotate_figure) importFrom(ggpubr,ggarrange) @@ -78,6 +63,7 @@ importFrom(pROC,roc) importFrom(reshape2,melt) importFrom(rstatix,t_test) importFrom(scales,hue_pal) +importFrom(scales,rescale) importFrom(scales,squish) importFrom(stats,TukeyHSD) importFrom(stats,anova) diff --git a/NEWS.md b/NEWS.md index 00b710c..4d66e94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,69 @@ +# markeR 1.1.2 (12 Mar, 2026) + +## Minor Changes +- Moved Python bridge scripts from `inst/python/` to a top-level `python/` + directory, as these are supplementary scripts not part of the R package itself. +- Added `requirements.txt` to the `python/` directory listing all needed + Python dependencies (`rpy2`, `pandas`, `numpy`, and optionally + `ipython` and `jupyter`) for easier environment setup. +- Removed redundant code snippets from the Python bridge scripts. + +# markeR 1.1.1 (11 Mar, 2026) + + - Added `p.adjust.method` parameter across all functions performing or + depending on multiple testing correction, allowing users to specify + any correction method supported by `stats::p.adjust()`, beyond the default + Benjamini-Hochberg FDR. +- Added Python bridge scripts in `inst/python/` for users who wish to call + markeR from a Python environment via `rpy2`. Includes a tutorial workflow + script and a generic command-line wrapper capable of invoking any exported + markeR function. See `inst/python/README.md` for installation and usage. + +# markeR 1.0.0 (31 Oct, 2025) + +- Official Bioconductor Release. + +# markeR 0.99.5 (17 Sep, 2025) + +- Minor fix in `.onAttach()` to avoid errors when checking `ggplot2` version and ensure the startup warning works correctly. + +# markeR 0.99.4 (17 Sep, 2025) + +## General +- Addressed feedback from the Bioconductor review process with updates to documentation and vignette style. + +## Documentation and vignette +- Updated vignette style to **Bioconductor’s BiocStyle** with automatic table of contents. +- Improved vignette content with small corrections. +- Revised dataset documentation by adding explicit `usage: data(object)` entries. + +## Functions +- Updated `geneset_similarity()` color handling: replaced the single `color_values` parameter with three new parameters — `color`, `neutral_color`, and `cold_color`, for more interpretable visualization. + +# markeR 0.99.3 (21 Aug, 2025) + +## Package size and structure +- Reduced package size below the 5 MB limit by converting long vignettes into `pkgdown` articles and keeping only a shorter vignette in the package. +- Moved `inst/Paper` to a dedicated `paper` branch for better repository organization. +- Removed unnecessary `LICENSE` file (already declared in `DESCRIPTION`). + +## Documentation +- Added a concise main vignette (`markeR`) with installation, introduction, and a basic workflow. +- Converted three longer vignettes into `pkgdown` articles (linked at the end of the main vignette). +- Added runnable examples for `VariableAssociation`. + +## NAMESPACE and dependencies +- Replaced broad imports with `importFrom()` for most packages (except `ggplot2`, retained as full import). +- Removed unused `patchwork` import. +- Added missing imports from `stats` and `grDevices` to resolve `R CMD check` notes. + +## Code quality +- Replaced all `sapply()` calls with `vapply()`. +- Replaced `1:...` usage with `seq_len()` or `seq_along()`. +- Standardized assignment to `<-` instead of `=`. +- Fixed some redundant `stop()`/`warning()` conditions to provide clearer input validation. +- Addressed “no visible binding” notes by using `.data$` or `utils::globalVariables()`. + # markeR 0.99.2 (23 Jul, 2025) * Minor fixes in documentation diff --git a/R/CalculateScores.R b/R/CalculateScores.R index 5aae247..88df3b6 100644 --- a/R/CalculateScores.R +++ b/R/CalculateScores.R @@ -113,11 +113,12 @@ CalculateScores <- function(data, metadata, gene_sets, method = c("ssGSEA", "logmedian","ranking", "all")) { data <- as.data.frame(data) # Ensure data is a data frame method <- match.arg(method) # Validate method input - - if (!is.data.frame(data)) stop("Error: data must be a data-frame") - if (!is.null(metadata) && !is.data.frame(metadata)) stop( - "Error: metadata must be a data-frame") - if (!is.list(gene_sets)) stop("Error: gene_sets must be a list") + + + if (!is.data.frame(data) || (!is.null(metadata) && !is.data.frame(metadata)) || !is.list(gene_sets)) { + stop("Invalid input: 'data' must be a data-frame, 'metadata' (if provided) must be a data-frame, and 'gene_sets' must be a list.") + } + # Change first column name to default name "sample", for merging purposes if (!is.null(metadata)) colnames(metadata)[1] <- "sample" diff --git a/R/CalculateScores_Ranking.R b/R/CalculateScores_Ranking.R index 3e242e4..cc7f1fb 100644 --- a/R/CalculateScores_Ranking.R +++ b/R/CalculateScores_Ranking.R @@ -52,11 +52,19 @@ CalculateScores_Ranking <- function(data, metadata = NULL, gene_sets) { data <- as.data.frame(data) # Ensure data is a data frame ResultsList <- list() - - if (!is.data.frame(data)) stop("Error: data must be a data frame") - if (!is.null(metadata) && !is.data.frame(metadata)) stop( - "Error: metadata must be a data frame") - if (!is.list(gene_sets)) stop("Error: gene_sets must be a list") + + + if (!is.data.frame(data) || (!is.null(metadata) && !is.data.frame(metadata)) || !is.list(gene_sets)) { + stop( + paste( + if (!is.data.frame(data)) "Error: 'data' must be a data frame." else NULL, + if (!is.null(metadata) && !is.data.frame(metadata)) "Error: 'metadata' must be a data frame." else NULL, + if (!is.list(gene_sets)) "Error: 'gene_sets' must be a list." else NULL, + collapse = " " + ) + ) + } + # Change first column name to default name "sample" for merging purposes if (!is.null(metadata)) colnames(metadata)[1] <- "sample" @@ -78,12 +86,16 @@ CalculateScores_Ranking <- function(data, metadata = NULL, gene_sets) { signaturegenes_up <- signature[signature[,2] == 1, 1] signaturegenes_down <- signature[signature[,2] == -1, 1] - - # Apply getRanking function to each sample (column) - rankings_up <- sapply(colnames(data), - function(sample) getRanking(data, sample, signaturegenes_up)) - rankings_down <- sapply(colnames(data), - function(sample) getRanking(data, sample, signaturegenes_down)) + + + rankings_up <- vapply(colnames(data), + function(sample) getRanking(data, sample, signaturegenes_up), + numeric(1)) + + rankings_down <- vapply(colnames(data), + function(sample) getRanking(data, sample, signaturegenes_down), + numeric(1)) + ranking_final <- (rankings_up - rankings_down) / length(universe_genes) ranking_final <- data.frame(sample = colnames(data), score = ranking_final) @@ -93,11 +105,11 @@ CalculateScores_Ranking <- function(data, metadata = NULL, gene_sets) { message(paste0("Considering unidirectional gene signature mode for signature ", sig)) signaturegenes <- signature[, 1] - - # Apply getRanking function to each sample (column) - rankings <- sapply(colnames(data), - function(sample) getRanking(data, sample, signaturegenes)) - + + rankings <- vapply(colnames(data), + function(sample) getRanking(data, sample, signaturegenes), + numeric(1)) + ranking_final <- rankings / length(universe_genes) ranking_final <- data.frame(sample = colnames(data), score = ranking_final) @@ -107,9 +119,10 @@ CalculateScores_Ranking <- function(data, metadata = NULL, gene_sets) { signaturegenes <- signature - # Apply getRanking function to each sample (column) - rankings <- sapply(colnames(data), - function(sample) getRanking(data, sample, signaturegenes)) + # Apply getRanking function to each sample (column) + rankings <- vapply(colnames(data), + function(sample) getRanking(data, sample, signaturegenes), + numeric(1)) ranking_final <- rankings / length(universe_genes) ranking_final <- data.frame(sample = colnames(data), score = ranking_final) @@ -169,7 +182,7 @@ getRanking <- function(data, sample, geneset) { # Order from least to most expressed expressiongene <- expressiongene[order(expressiongene, decreasing = FALSE)] ranking <- match(geneset, names(expressiongene)) # Find gene positions in ordered list - ranking <- as.vector(na.omit(ranking)) # Remove missing genes + ranking <- as.vector(stats::na.omit(ranking)) # Remove missing genes return(sum(ranking)) # Return sum of ranks } diff --git a/R/CalculateScores_logmedian.R b/R/CalculateScores_logmedian.R index 6a77d99..b654b51 100644 --- a/R/CalculateScores_logmedian.R +++ b/R/CalculateScores_logmedian.R @@ -93,13 +93,13 @@ calculateScore_logmedian_bidirectional <- function(data, signature) { signaturegenes_down <- signature[signature[, 2] == -1, ] # Compute log-median scores for upregulated genes - data_subset_up <- na.omit(subset(log2(data + 1), row.names(data) %in% signaturegenes_up[, 1])) - data_subset_up <- data_subset_up - apply(data_subset_up, 1, median) + data_subset_up <- stats::na.omit(subset(log2(data + 1), row.names(data) %in% signaturegenes_up[, 1])) + data_subset_up <- data_subset_up - apply(data_subset_up, 1, stats::median) score_up <- colSums(data_subset_up) / nrow(data_subset_up) # Compute log-median scores for downregulated genes - data_subset_down <- na.omit(subset(log2(data + 1), row.names(data) %in% signaturegenes_down[, 1])) - data_subset_down <- data_subset_down - apply(data_subset_down, 1, median) + data_subset_down <- stats::na.omit(subset(log2(data + 1), row.names(data) %in% signaturegenes_down[, 1])) + data_subset_down <- data_subset_down - apply(data_subset_down, 1, stats::median) score_down <- colSums(data_subset_down) / nrow(data_subset_down) score <- score_up - score_down @@ -125,9 +125,9 @@ calculateScore_logmedian_unidirectional <- function(data, signature) { data <- as.data.frame(data) # Ensure data is a data frame if (is.data.frame(signature)) signature <- as.vector(signature[, 1]) - data_subset <- na.omit(subset(log2(data + 1), row.names(data) %in% signature)) + data_subset <- stats::na.omit(subset(log2(data + 1), row.names(data) %in% signature)) # Center gene in its log2 median - data_subset <- data_subset - apply(data_subset, 1, median) + data_subset <- data_subset - apply(data_subset, 1, stats::median) # Normalize by signature size dfScore <- colSums(data_subset) / nrow(data_subset) dfScore <- data.frame(sample = names(dfScore), score = dfScore) diff --git a/R/CalculateScores_ssGSEA.R b/R/CalculateScores_ssGSEA.R index a192b87..c8e9700 100644 --- a/R/CalculateScores_ssGSEA.R +++ b/R/CalculateScores_ssGSEA.R @@ -169,10 +169,11 @@ CalculateScores_ssGSEA_bidirectional <- function(data, signature) { ResultsList <- list() mtx <- log2(data) - mtx <- as.matrix(mtx) - - up_genes <- subset(signature, Signal == 1)$Gene - down_genes <- subset(signature, Signal == -1)$Gene + mtx <- as.matrix(mtx) + + up_genes <- signature[signature[["Signal"]] == 1, "Gene"] + down_genes <- signature[signature[["Signal"]] == -1, "Gene"] + ################## ssGSEA for UP genes ################## diff --git a/R/CohenD_IndividualGenes.R b/R/CohenD_IndividualGenes.R index 4589ea0..4b0d93d 100644 --- a/R/CohenD_IndividualGenes.R +++ b/R/CohenD_IndividualGenes.R @@ -166,8 +166,8 @@ CohenD_IndividualGenes <- function(data, metadata, if(n1 < 2 || n2 < 2) return(NA) m1 <- mean(x) m2 <- mean(y) - s1 <- sd(x) - s2 <- sd(y) + s1 <- stats::sd(x) + s2 <- stats::sd(y) pooled_sd <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)) if (pooled_sd == 0) return(NA) d <- (m1 - m2) / pooled_sd @@ -205,8 +205,8 @@ CohenD_IndividualGenes <- function(data, metadata, if (is.null(title)) title <- paste("Cohen's d for variable", condition_var, "(", paste(class, collapse = ", "), " vs others)") - barplot <- ggplot2::ggplot(effect_values, ggplot2::aes(y = reorder(Gene, CohensD), - x = CohensD)) + + barplot <- ggplot2::ggplot(effect_values, ggplot2::aes(y = stats::reorder(.data$Gene, .data$CohensD), + x = .data$CohensD)) + ggplot2::geom_bar(stat = "identity", fill = fillcolor) + #ggplot2::coord_flip() + coord_cartesian(xlim = c(0, max(effect_values$CohensD) + 0.1))+ diff --git a/R/CorrelationHeatmap.R b/R/CorrelationHeatmap.R index b103512..82e85b5 100644 --- a/R/CorrelationHeatmap.R +++ b/R/CorrelationHeatmap.R @@ -138,7 +138,7 @@ CorrelationHeatmap <- function(data, metadata = NULL, genes, separate.by = NULL, # Subset data to selected genes #data <- data[rownames(data) %in% genes, , drop = FALSE] - data <- na.omit(as.data.frame(data[genes,])) # to keep input order + data <- stats::na.omit(as.data.frame(data[genes,])) # to keep input order if (!is.null(separate.by) && is.null(metadata)) { stop("separate.by is not NULL but metadata is missing. Please specify metadata.") diff --git a/R/FPR_Simulation.R b/R/FPR_Simulation.R index e22566e..126e44a 100644 --- a/R/FPR_Simulation.R +++ b/R/FPR_Simulation.R @@ -1,3 +1,4 @@ +utils::globalVariables(c( "cohen", "method", "contrast" )) #' FPR Simulation Plot #' #' This function simulates false positive rates (FPR) by generating simulated @@ -43,7 +44,11 @@ #' grid layout. If `NULL`, layout is auto-calculated. #' @param nrow Integer. Number of rows for arranging signature plots in a grid #' layout. If `NULL`, layout is auto-calculated. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. #' #' @return Invisibly returns a list containing: #' \describe{ @@ -103,14 +108,14 @@ #' ) #' #' @import ggplot2 -#' @import ggpubr +#' @importFrom ggpubr ggarrange annotate_figure #' @export #' FPR_Simulation <- function(data, metadata, original_signatures, Variable, - gene_list = NULL, number_of_sims=10, title=NULL, + gene_list = NULL, number_of_sims=100, title=NULL, widthTitle = 30, titlesize = 12, pointSize = 2, labsize = 10,mode = c( "none","simple","medium","extensive"), - ColorValues=NULL, ncol=NULL, nrow=NULL) { + ColorValues=NULL, ncol=NULL, nrow=NULL, p.adjust.method="BH") { data <- as.data.frame(data) # Ensure data is a data frame if (is.null(gene_list)) gene_list <- row.names(data) @@ -127,7 +132,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, results <- suppressMessages(CohenF_allConditions(data = data, metadata = metadata, gene_sets = original_signatures, - variable = Variable )) + variable = Variable, p.adjust.method = p.adjust.method )) cohentype <- "f" } else { @@ -137,7 +142,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, results <- suppressMessages(CohenF_allConditions(data = data, metadata = metadata, gene_sets = original_signatures, - variable = Variable )) + variable = Variable, p.adjust.method = p.adjust.method )) cohentype <- "f" } else { @@ -145,7 +150,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, results <- suppressMessages(CohenD_allConditions(data = data, metadata = metadata, gene_sets = original_signatures, - variable = Variable, mode = mode)) + variable = Variable, mode = mode, p.adjust.method = p.adjust.method)) cohentype <- "d" } @@ -204,7 +209,8 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, # Generate simulated signatures based on the current signature simulatedsigs <- list() - for (sim in 1:number_of_sims) { +# for (sim in 1:number_of_sims) { + for (sim in seq_len(number_of_sims)) { cur_model_sig <- cur_sig # copy the current signature cur_model_sig$Gene <- sample(gene_list, nrow(cur_sig)) # simulate by sampling genes simulatedsigs[[paste0("sim", sim)]] <- cur_model_sig @@ -217,14 +223,14 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, metadata = metadata, gene_sets = simulatedsigs, variable = Variable, - mode = mode + mode = mode, p.adjust.method = p.adjust.method )) } else { results2 <- suppressMessages(CohenF_allConditions( data = data, metadata = metadata, gene_sets = simulatedsigs, - variable = Variable + variable = Variable, p.adjust.method = p.adjust.method )) } @@ -239,7 +245,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, } else if (cohentype=="f"){ cohen_mat <- sig_data$CohenF } else { - stop("Error: results2 format not valid.") + stop("Error: results format not valid.") } padj_mat <- sig_data$padj @@ -272,35 +278,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, # needed to define the quantile dashed lines final_df$method <- factor(final_df$method, levels = methods) - - # - # # Restructure simulation results into a list (one data frame per method) - # restructured <- lapply(methods, function(m) { - # data.frame( - # CohensD = sapply(results2, function(sim) sim$CohenD[m, 1]), - # Pval = sapply(results2, function(sim) sim$PValue[m, 1]) - # ) - # }) - # names(restructured) <- methods - - # Combine simulation data into one long-format data frame - # sim_data <- do.call(rbind, lapply(methods, function(m) { - # df <- restructured[[m]] - # df$Method <- m - # df$Shape <- ifelse(df$Pval < 0.05, "Significant", "Not Significant") - # df - # })) - # # Set Method as a factor to control order in the plot - # sim_data$Method <- factor(sim_data$Method, levels = methods) - - # Compute only the 95th percentile (top 5% threshold) for each method - # q_data <- do.call(rbind, lapply(methods, function(m) { - # cd_vals <- final_df_simulated$cohen[final_df_simulated$method == m] - # data.frame( - # Method = m, - # q_high = as.numeric(quantile(cd_vals, 0.95, na.rm = TRUE)) - # ) - # })) + # Calculate FPR for each Original observation final_df$FPR <- NA @@ -322,7 +300,7 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, for (mt in methods) { subset_df <- final_df[final_df$method == mt & final_df$contrast == ct, ] if (nrow(subset_df) == 0) next - q95 <- quantile(subset_df$cohen, 0.95, na.rm = TRUE) + q95 <- stats::quantile(subset_df$cohen, 0.95, na.rm = TRUE) xpos <- which(methods == mt) q_data <- rbind(q_data, data.frame( method = mt, contrast = ct, q_high = q95, @@ -334,8 +312,9 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, # Ensuring the label is always on top # Compute max cohen per method + contrast across both Simulated and Original - all_max <- aggregate(cohen ~ method + contrast, data = final_df, FUN = max) + all_max <- stats::aggregate(cohen ~ method + contrast, data = final_df, FUN = max) + # Extract FPR values from Original rows original_df <- final_df[final_df$type == "Original", ] @@ -359,19 +338,19 @@ FPR_Simulation <- function(data, metadata, original_signatures, Variable, # Build the plot for the current signature p <- ggplot2::ggplot() + geom_jitter(data = final_df[final_df$type == "Simulated",], - aes(y = cohen, x = method, color = type), + aes(y = .data$cohen, x = .data$method, color = .data$type), width = 0.3, height = 0,size = pointSize, alpha = 0.5) + - geom_violin(data = final_df, aes(y = cohen, x = method), + geom_violin(data = final_df, aes(y = .data$cohen, x = .data$method), fill = "#F0F0F0", color = "black", alpha = 0.5) + geom_jitter(data = final_df[final_df$type == "Original",], aes(y = cohen, x = method, color = type), width = 0.3, height = 0, size = pointSize, alpha = 1) + geom_text(data = all_max, - aes(x = method, y = y, label = label), + aes(x = .data$method, y = .data$y, label = .data$label), size = 3, inherit.aes = FALSE) + geom_segment(data = q_data, - aes(x = xmin, xend = xmax, y = q_high, yend = q_high), + aes(x = .data$xmin, xend = .data$xmax, y = .data$q_high, yend = .data$q_high), linetype = "dashed", color = "red", inherit.aes = FALSE) + labs(title = wrap_title(sig, widthTitle), y = ifelse(cohentype == "d", "|Cohen's d|", "|Cohen's f|"), diff --git a/R/GSEA_VariableAssociation.R b/R/GSEA_VariableAssociation.R index d4603da..159d815 100644 --- a/R/GSEA_VariableAssociation.R +++ b/R/GSEA_VariableAssociation.R @@ -57,55 +57,17 @@ #' removed before analysis, leading to a loss of data to be fitted in the #' model. #' @param printplt Boolean specifying if plot is to be printed. Default: `TRUE`. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A list with two elements: #' - `data`: A data frame containing the GSEA results, including normalized #' enrichment scores (NES), adjusted p-values, and contrasts. #' - `plot`: A ggplot2 object visualizing the GSEA results as a lollipop plot. -#' -#' -#' @examples -#' # Simulate gene expression data (genes as rows, samples as columns) -#' set.seed(42) -#' expr <- as.data.frame(matrix(rnorm(500), nrow = 50, ncol = 10)) -#' rownames(expr) <- paste0("Gene", 1:50) -#' colnames(expr) <- paste0("Sample", 1:10) -#' -#' # Simulate metadata (categorical and continuous) -#' metadata <- data.frame( -#' sampleID = paste0("Sample", 1:10), -#' Group = rep(c("A", "B"), each = 5), -#' Age = sample(20:60, 10), -#' row.names = colnames(expr) -#' ) -#' -#' # Define a toy gene set: one gene set only for discovery mode! -#' gene_set <- list( -#' Signature1 = paste0("Gene", 1:10) -#' ) -#' -#' # Score-based association (e.g., logmedian) -#' res_score <- VariableAssociation( -#' method = "logmedian", -#' data = expr, -#' metadata = metadata, -#' cols = c("Group", "Age"), -#' gene_set = gene_set -#' ) -#' print(res_score$Overall) -#' print(res_score$plot) -#' -#' # GSEA-based association (if GSEA_VariableAssociation is available) -#' # res_gsea <- VariableAssociation( -#' # method = "GSEA", -#' # data = expr, -#' # metadata = metadata, -#' # cols = "Group", -#' # gene_set = gene_set -#' # ) -#' # print(res_gsea$data) -#' print(res_score$plot) -#' +#' #' @keywords internal GSEA_VariableAssociation <- function(data, metadata, cols, stat=NULL, mode=c("simple","medium","extensive"), @@ -113,7 +75,8 @@ GSEA_VariableAssociation <- function(data, metadata, cols, stat=NULL, signif_color = "red", saturation_value=NULL, sig_threshold = 0.05, widthlabels=18, labsize=10, titlesize=14, pointSize=5, - ignore_NAs = FALSE, printplt =TRUE) { + ignore_NAs = FALSE, printplt =TRUE, + p.adjust.method = "BH") { data <- as.data.frame(data) # Ensure data is a data frame mode <- match.arg(mode) metadata <- metadata[, cols %in% colnames(metadata), drop = FALSE] @@ -139,7 +102,7 @@ GSEA_VariableAssociation <- function(data, metadata, cols, stat=NULL, if (variable_types[var] == "Numeric") { # Use a model matrix for continuous variables - design <- model.matrix(as.formula(paste("~1+", var)), data = metadata) + design <- stats::model.matrix(as.formula(paste("~1+", var)), data = metadata) DEGs_var <- calculateDE(data = data, metadata = metadata, modelmat = design, contrasts = c(var), @@ -172,15 +135,19 @@ GSEA_VariableAssociation <- function(data, metadata, cols, stat=NULL, combined_results$Contrast <- cont_vec # correct adjusted p value to correct for multiple testing for the contrasts? - combined_results$padj <- p.adjust(combined_results$padj, method = "BH") + combined_results$padj <- stats::p.adjust(combined_results$padj, method = p.adjust.method) combined_results_toreturn <- combined_results # Ensure contrast ordering - combined_results$Contrast <- sapply(combined_results$Contrast, - function(x) wrap_title(x, widthlabels)) + # combined_results$Contrast <- sapply(combined_results$Contrast, + # function(x) wrap_title(x, widthlabels)) + combined_results$Contrast <- vapply(combined_results$Contrast, + function(x) wrap_title(x, widthlabels), + FUN.VALUE = character(1)) + combined_results$Contrast <- factor(combined_results$Contrast, levels = combined_results$Contrast[order(combined_results$NES)]) @@ -196,17 +163,17 @@ GSEA_VariableAssociation <- function(data, metadata, cols, stat=NULL, } - plot <- ggplot2::ggplot(combined_results, ggplot2::aes(x = NES, y = Contrast, - fill = -log10(padj))) + + plot <- ggplot2::ggplot(combined_results, ggplot2::aes(x = .data$NES, y = .data$Contrast, + fill = -log10(.data$padj))) + ggplot2::geom_segment(ggplot2::aes( - yend = Contrast, + yend = .data$Contrast, xend = 0, - linetype = ifelse(stat_used == "B" & NES < 0, "dashed", "solid"), - color = ifelse(stat_used == "B" & NES < 0, "grey", "black") + linetype = ifelse(.data$stat_used == "B" & .data$NES < 0, "dashed", "solid"), + color = ifelse(.data$stat_used == "B" & .data$NES < 0, "grey", "black") ), size = .5) + ggplot2::geom_point(ggplot2::aes( stroke = 1.2, - color = ifelse(stat_used == "B" & NES < 0, "grey", "black") + color = ifelse(.data$stat_used == "B" & .data$NES < 0, "grey", "black") ), shape = 21, size = pointSize) + ggplot2::scale_fill_gradient2(low = nonsignif_color, mid = nonsignif_color, @@ -309,7 +276,8 @@ generate_all_contrasts <- function(levels, mode = "simple") { # 3. Groupwise comparisons (extensive mode) group_contrasts <- c() - for (i in 1:(n-1)) { + # for (i in 1:(n-1)) { + for (i in seq_len(max(0, n - 1))) { left_groups <- combn(levels, i, simplify = FALSE) # Subsets for the first group for (left in left_groups) { right <- setdiff(levels, left) # Remaining elements for the second group diff --git a/R/Heatmap_Cohen.R b/R/Heatmap_Cohen.R index 03840c9..89aac9d 100644 --- a/R/Heatmap_Cohen.R +++ b/R/Heatmap_Cohen.R @@ -33,6 +33,7 @@ #' @param ColorValues A character vector specifying the colors for the gradient #' fill in the heatmaps. Default is \code{c("#F9F4AE", "#B44141")}. #' @param title Title for the grid of plots. +#' #' @return A list with two elements: #' \describe{ #' \item{plt}{A combined heatmap arranged in a grid using \code{ggpubr::ggarrange}.} @@ -55,14 +56,13 @@ #' @seealso \code{\link{CohenD_allConditions}}, #' \code{\link{CohenF_allConditions}} #' -#' @importFrom ggplot2 ggplot geom_tile geom_text labs scale_fill_gradientn -#' theme_minimal element_text element_blank element_line margin +#' @import ggplot2 #' @importFrom ggpubr ggarrange #' #' @keywords internal Heatmap_Cohen <- function(cohenlist, nrow = NULL, ncol = NULL, limits = NULL, widthTitle = 22, titlesize = 12, ColorValues = NULL, - title=NULL ) { + title=NULL) { cohentype <- ifelse("CohenD" %in% names(cohenlist[[1]]), "d", ifelse("CohenF" %in% names(cohenlist[[1]]), "f", NULL)) @@ -112,9 +112,9 @@ Heatmap_Cohen <- function(cohenlist, nrow = NULL, ncol = NULL, limits = NULL, limits <- if (is.null(limits)) c(0 , max(long_data$Cohen, na.rm = TRUE)) else limits # Create heatmap using ggplot2 - p <- ggplot2::ggplot(long_data, ggplot2::aes(x = Var2, y = Var1, fill = Cohen)) + + p <- ggplot2::ggplot(long_data, ggplot2::aes(x = .data$Var2, y = .data$Var1, fill = .data$Cohen)) + ggplot2::geom_tile() + - ggplot2::geom_text(aes(label = label), color = "black", size = 3) + + ggplot2::geom_text(aes(label = .data$label), color = "black", size = 3) + ggplot2::scale_fill_gradientn(colors = ColorValues, limits = limits) + ggplot2::labs(title = signature_title, x = NULL, y = NULL, fill = ifelse(cohentype=="d", "|Cohen\'s d|", "|Cohen\'s f|")) + @@ -202,7 +202,12 @@ Heatmap_Cohen <- function(cohenlist, nrow = NULL, ncol = NULL, limits = NULL, #' groups. #' - `"extensive"`: All possible groupwise contrasts, ensuring balance in the #' number of terms on each side. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A named list where each element corresponds to a gene signature. Each #' signature element is a list with three components: #' \describe{ @@ -228,7 +233,7 @@ Heatmap_Cohen <- function(cohenlist, nrow = NULL, ncol = NULL, limits = NULL, #' #' @keywords internal CohenD_allConditions <- function(data, metadata, gene_sets, variable, - mode = c("simple","medium","extensive")) { + mode = c("simple","medium","extensive"), p.adjust.method = "BH") { data <- as.data.frame(data) # Ensure data is a data frame # Step 1: Check if variable exists in metadata if (!variable %in% colnames(metadata)) { @@ -266,8 +271,8 @@ CohenD_allConditions <- function(data, metadata, gene_sets, variable, quantitative_var = "score", mode=mode) # Convert to named vectors (column names = comparisons) - cohen_d_results[[method]] <- setNames(cohen_results$CohenD, cohen_results$contrast) - p_value_results[[method]] <- setNames(cohen_results$PValue, cohen_results$contrast) + cohen_d_results[[method]] <- stats::setNames(cohen_results$CohenD, cohen_results$contrast) + p_value_results[[method]] <- stats::setNames(cohen_results$PValue, cohen_results$contrast) } # Convert lists to data frames @@ -293,8 +298,8 @@ CohenD_allConditions <- function(data, metadata, gene_sets, variable, } } - # Step 2: Apply BH correction within each method - all_padj <- lapply(all_pvalues, function(pvals) p.adjust(pvals, method = "BH")) + # Step 2: Apply correction within each method + all_padj <- lapply(all_pvalues, function(pvals) stats::p.adjust(pvals, method = p.adjust.method)) # Step 3: Store corrected p-values back into result_list index_tracker <- list() # Track index position for each method @@ -345,8 +350,8 @@ cohen_d <- function(x, y) { if(n1 < 2 || n2 < 2) return(NA) m1 <- mean(x) m2 <- mean(y) - s1 <- sd(x) - s2 <- sd(y) + s1 <- stats::sd(x) + s2 <- stats::sd(y) pooled_sd <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)) if (pooled_sd == 0) return(NA) d <- (m1 - m2) / pooled_sd @@ -403,9 +408,7 @@ compute_cohen_d <- function(dfScore, variable, quantitative_var="score", dfScore_subset <- create_contrast_column(dfScore, variable, pair) group1 <- levels(dfScore_subset$cohentest)[1] group2 <- levels(dfScore_subset$cohentest)[2] - - # group1 <- unique(dfScore_subset$cohentest)[1] - # group2 <- unique(dfScore_subset$cohentest)[2] + x <- dfScore_subset[dfScore_subset[["cohentest"]] == group1, quantitative_var, drop = TRUE] @@ -498,6 +501,12 @@ flatten_results <- function(nested_list) { #' downregulated). #' @param variable A string specifying the categorical variable in #' \code{metadata} used to model the gene signature scores. +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A named list where each element corresponds to a gene signature. Each #' signature element is a list with three components: #' \describe{ @@ -511,7 +520,7 @@ flatten_results <- function(nested_list) { #' } #' #' @keywords internal -CohenF_allConditions <- function(data, metadata, gene_sets, variable ) { +CohenF_allConditions <- function(data, metadata, gene_sets, variable, p.adjust.method = "BH" ) { data <- as.data.frame(data) # Ensure data is a data frame # Step 1: Check if variable exists in metadata if (!variable %in% colnames(metadata)) { @@ -555,8 +564,8 @@ CohenF_allConditions <- function(data, metadata, gene_sets, variable ) { results_var <- compute_cohens_f_pval(model, type) # Convert to named vectors (column names = comparisons) - cohen_f_results[[method]] <- setNames(results_var["Cohen_f"], variable) - p_value_results[[method]] <- setNames(results_var["P_Value"], variable) + cohen_f_results[[method]] <- stats::setNames(results_var["Cohen_f"], variable) + p_value_results[[method]] <- stats::setNames(results_var["P_Value"], variable) } # Convert lists to data frames @@ -582,8 +591,8 @@ CohenF_allConditions <- function(data, metadata, gene_sets, variable ) { } } - # Step 2: Apply BH correction within each method - all_padj <- lapply(all_pvalues, function(pvals) p.adjust(pvals, method = "BH")) + # Step 2: Apply correction within each method + all_padj <- lapply(all_pvalues, function(pvals) stats::p.adjust(pvals, method = p.adjust.method)) # Step 3: Store corrected p-values back into result_list index_tracker <- list() # Track index position for each method diff --git a/R/IndividualGenes_Violins.R b/R/IndividualGenes_Violins.R index 19186ca..13b7c70 100644 --- a/R/IndividualGenes_Violins.R +++ b/R/IndividualGenes_Violins.R @@ -149,7 +149,7 @@ IndividualGenes_Violins <- function(data, metadata=NULL, genes,GroupingVariable, plt <- plt + ggplot2::geom_violin(alpha=0.4) # Add median summary crossbar. - plt <- plt + ggplot2::stat_summary(fun = median, fun.min = median, fun.max = median, + plt <- plt + ggplot2::stat_summary(fun = stats::median, fun.min = stats::median, fun.max = stats::median, geom = "crossbar", width = 0.25, position = ggplot2::position_dodge(width = 0.13)) diff --git a/R/PlotScores.R b/R/PlotScores.R index cc8ed03..0dc36cf 100644 --- a/R/PlotScores.R +++ b/R/PlotScores.R @@ -1,3 +1,5 @@ +utils::globalVariables(c("score")) + #' Plot gene signature scores using various methods. #' #' Computes and visualizes gene signature scores using one or more methods, @@ -112,7 +114,12 @@ #' @param cor Correlation method for numeric variables. One of `"pearson"` #' (default), `"spearman"`, or `"kendall"`. Only applies when the variable is #' numeric and `method != "all"`. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. Only if `method == "all"`. +#' #' @return Depending on `method`: #' #' If `method = "all"`, returns a list with `heatmap` and `volcano` ggplot objects. @@ -239,7 +246,8 @@ PlotScores <- function(data, metadata, gene_sets, cond_cohend = NULL, pvalcalc = FALSE, mode = c("simple","medium","extensive"), widthlegend=22, sig_threshold=0.05, cohen_threshold=0.5, - colorPalette="Set3", cor=c("pearson","spearman","kendall")) { + colorPalette="Set3", cor=c("pearson","spearman","kendall"), + p.adjust.method="BH") { data <- as.data.frame(data) # Ensure data is a data frame method <- match.arg(method) mode <- match.arg(mode) @@ -252,13 +260,13 @@ PlotScores <- function(data, metadata, gene_sets, if (type =="Numeric"){ cohenlist <- CohenF_allConditions(data = data, metadata = metadata, - gene_sets = gene_sets, variable = Variable ) + gene_sets = gene_sets, variable = Variable, p.adjust.method = p.adjust.method ) } else { cohenlist <- CohenD_allConditions(data = data, metadata = metadata, gene_sets = gene_sets, variable = Variable, - mode = mode) + mode = mode, p.adjust.method = p.adjust.method ) } @@ -499,11 +507,11 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, ColorValues <- if (is.null(ColorValues)) "#ECBD78" else ColorValues - p <- ggplot2::ggplot(df, ggplot2::aes(x = score)) + + p <- ggplot2::ggplot(df, ggplot2::aes(x = .data$score)) + ggplot2::geom_density(fill = ColorValues, alpha = 0.5) + ggplot2::labs(title = "Density Plot of Score", x = xlab, y = "Density") + # add points below density - ggplot2::geom_rug(ggplot2::aes(x = score), color=ColorValues, sides = "b", + ggplot2::geom_rug(ggplot2::aes(x = .data$score), color=ColorValues, sides = "b", alpha = 0.8, size = .5, length = grid::unit(0.035, "npc")) # Customize the plot appearance. @@ -596,7 +604,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, p <- p + ggplot2::geom_violin(alpha = 0.5, scale = "width") # Add median summary crossbar. - p <- p + ggplot2::stat_summary(fun = median, fun.min = median, fun.max = median, + p <- p + ggplot2::stat_summary(fun = stats::median, fun.min = stats::median, fun.max = stats::median, geom = "crossbar", width = 0.25, position = ggplot2::position_dodge(width = 0.13)) @@ -624,7 +632,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, line1 <- wrap_title(paste0("Cohen's d = ", format(signif(cohen_d_results, digits=3), - scientific = TRUE)), + scientific = FALSE)), width = widthTitle) line2 <- wrap_title(paste0("p = ", format(signif(p_val, digits=3), scientific = TRUE)), @@ -634,7 +642,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, } else { subtitle <- wrap_title(paste0("Cohen's d = ", format(signif(cohen_d_results, digits=3), - scientific = TRUE)), + scientific = FALSE)), width = widthTitle) } @@ -663,7 +671,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, line1 <- wrap_title(paste0("Cohen's d = ", format(signif(cohen_d_results, digits = 3), - scientific = TRUE)), + scientific = FALSE)), width = widthTitle) line2 <- wrap_title(paste0("p = ", format(signif(p_val, digits = 3), scientific = TRUE)), @@ -672,7 +680,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, } else { subtitle <- wrap_title(paste0("Cohen's d = ", format(signif(cohen_d_results, digits = 3), - scientific = TRUE)), + scientific = FALSE)), width = widthTitle) } @@ -693,7 +701,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, line1 <- wrap_title(paste0("Cohen's f = ", format(signif(results_var["Cohen_f"], digits = 3), - scientific = TRUE)), width = widthTitle) + scientific = FALSE)), width = widthTitle) line2 <- wrap_title(paste0("p = ", format(signif(results_var["P_Value"], digits = 3), scientific = TRUE)), width = widthTitle) @@ -701,7 +709,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, } else { subtitle <- wrap_title(paste0("Cohen's f = ", format(signif(results_var["Cohen_f"], digits = 3), - scientific = TRUE)), width = widthTitle) + scientific = FALSE)), width = widthTitle) } } @@ -718,7 +726,7 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, if (ConnectGroups && !is.null(ColorVariable)) { p <- p + ggplot2::stat_summary(ggplot2::aes_string(group = ColorVariable, color = ColorVariable), - fun.y = median, geom = "line", size = 1.5, + fun.y = stats::median, geom = "line", size = 1.5, alpha = 0.75, show.legend = FALSE) } @@ -784,16 +792,16 @@ PlotScores_Categorical <- function(data, metadata, gene_sets, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - if (method == "ssGSEA") { - ylab <- "ssGSEA Enrichment Score" - } else if (method == "logmedian") { - ylab <- "Normalised Signature Score" - } else if (method == "ranking") { - ylab <- "Signature Genes' Ranking" - } + # if (method == "ssGSEA") { + # ylab <- "ssGSEA Enrichment Score" + # } else if (method == "logmedian") { + # ylab <- "Normalised Signature Score" + # } else if (method == "ranking") { + # ylab <- "Signature Genes' Ranking" + # } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(ylab, + left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, fontsize = labsize)), @@ -1023,7 +1031,7 @@ PlotScores_Numeric <- function(data, if (pvalcalc) { line1 <- wrap_title(paste0("Cohen's f = ", format(signif(results_var["Cohen_f"], digits=3), - scientific = TRUE)), width = widthTitle) + scientific = FALSE)), width = widthTitle) line2 <- wrap_title(paste0("p = ", format(signif(results_var["P_Value"], digits=3), scientific = TRUE)), width = widthTitle) @@ -1032,7 +1040,7 @@ PlotScores_Numeric <- function(data, } else { subtitle <- wrap_title(paste0("Cohen's f = ", format(signif(results_var["Cohen_f"], - digits=3), scientific = TRUE)), + digits=3), scientific = FALSE)), width = widthTitle) } @@ -1091,16 +1099,16 @@ PlotScores_Numeric <- function(data, if (!is.null(title)) title <- wrap_title(title, width = widthTitle) # Create label for y axis based on method. - if (method == "ssGSEA") { - ylab <- "ssGSEA Enrichment Score" - } else if (method == "logmedian") { - ylab <- "Normalised Signature Score" - } else if (method == "ranking") { - ylab <- "Signature Genes' Ranking" - } + # if (method == "ssGSEA") { + # ylab <- "ssGSEA Enrichment Score" + # } else if (method == "logmedian") { + # ylab <- "Normalised Signature Score" + # } else if (method == "ranking") { + # ylab <- "Signature Genes' Ranking" + # } combined_plot <- ggpubr::annotate_figure(combined_plot, - left = grid::textGrob(ylab, + left = grid::textGrob(paste0("Gene Set's Score (", method, ")"), rot = 90, vjust = 1, gp = grid::gpar(cex = 1.3, diff --git a/R/ROCAUC_Scores.R b/R/ROCAUC_Scores.R index f2e3eb0..e8aa3b0 100644 --- a/R/ROCAUC_Scores.R +++ b/R/ROCAUC_Scores.R @@ -123,7 +123,7 @@ ROCAUC_Scores_Calculate <- function(data, metadata, gene_sets, method = c("logme #'@param title Title for the grid of plots. #'@return A `ggplot2` or `ggarrange` object containing the ROC curve plots. #' -#'@importFrom ggplot2 ggplot geom_line aes labs theme scale_color_manual +#'@import ggplot2 #'@importFrom ggpubr annotate_figure ggarrange #' #'@examples @@ -204,7 +204,7 @@ ROC_Scores <- function(data, } # Create the ROC plot with all methods on the same plot - p <- ggplot2::ggplot(combined_df, ggplot2::aes(x = FPR, y = TPR, color = Method)) + + p <- ggplot2::ggplot(combined_df, ggplot2::aes(x = .data$FPR, y = .data$TPR, color = .data$Method)) + ggplot2::geom_line(size = 1) + # Plot all ROC curves on the same plot ggplot2::scale_color_manual(values = colors) + # Ensure correct color mapping for each method ggplot2::labs(title = wrap_title(signature,widthTitle), @@ -225,9 +225,9 @@ ROC_Scores <- function(data, length.out = length(auc_values))) # Adjust the vertical positions p <- p + ggplot2::geom_label(data = auc_texts, - ggplot2::aes(x = x, y = y, - label = paste0("AUC ", Method, " = ", round(AUC, 2), ""), - color = Method), + ggplot2::aes(x = .data$x, y = .data$y, + label = paste0("AUC ", .data$Method, " = ", round(.data$AUC, 2), ""), + color = .data$Method), size = 3, vjust = 0, # Adjust vertical position hjust = 1, # Adjust horizontal position to align to the right @@ -436,9 +436,9 @@ AUC_Scores <- function(data, metadata, gene_sets, limits <- if (is.null(limits)) c(0.5 , 1) else limits # Create heatmap using ggplot2 - p <- ggplot2::ggplot(long_data, ggplot2::aes(x = Method, y = Contrast, fill = AUC)) + + p <- ggplot2::ggplot(long_data, ggplot2::aes(x = .data$Method, y = .data$Contrast, fill = .data$AUC)) + ggplot2::geom_tile() + - ggplot2::geom_text(aes(label = label), color = "black", size = 3) + + ggplot2::geom_text(aes(label = .data$label), color = "black", size = 3) + ggplot2::scale_fill_gradientn(colors = ColorValues, limits = limits) + ggplot2::labs(title = signature_title, x = NULL, y = NULL, fill = "AUC") + ggplot2::theme_bw() + diff --git a/R/ROCandAUCplot.R b/R/ROCandAUCplot.R index 3a727cc..bca12a7 100644 --- a/R/ROCandAUCplot.R +++ b/R/ROCandAUCplot.R @@ -259,9 +259,9 @@ ROCandAUCplot <- function(data, metadata, legend_position <- if (length(unique(roc_df[[group_var]])) > 1 && group_var != "All") "bottom" else "none" - roc_plot_local <- ggplot2::ggplot(roc_df, ggplot2::aes(x = FPR, y = TPR, color = Group, group = Group)) + + roc_plot_local <- ggplot2::ggplot(roc_df, ggplot2::aes(x = .data$FPR, y = .data$TPR, color = .data$Group, group = .data$Group)) + ggplot2::geom_line(size = 1) + - ggplot2::facet_wrap(~ Gene, scales = "free", ncol = roc_params_local$ncol, nrow = roc_params_local$nrow) + + ggplot2::facet_wrap(~ .data$Gene, scales = "free", ncol = roc_params_local$ncol, nrow = roc_params_local$nrow) + ggplot2::theme_minimal() + ggplot2::labs( title = final_title, @@ -356,7 +356,7 @@ ROCandAUCplot <- function(data, metadata, fillcolor <- ifelse(is.null(auc_params$colors), "#3B415B", auc_params$colors[1]) - barplot <- ggplot2::ggplot(auc_sorted, ggplot2::aes(y = reorder(Gene, AUC), x = AUC)) + + barplot <- ggplot2::ggplot(auc_sorted, ggplot2::aes(y = stats::reorder(.data$Gene, .data$AUC), x = .data$AUC)) + ggplot2::geom_bar(stat = "identity", fill = fillcolor) + #ggplot2::coord_flip() + coord_cartesian(xlim = c(0.5, 1))+ diff --git a/R/Score_VariableAssociation.R b/R/Score_VariableAssociation.R index 5cdfdc1..8c38b61 100644 --- a/R/Score_VariableAssociation.R +++ b/R/Score_VariableAssociation.R @@ -161,7 +161,12 @@ create_contrast_column <- function(metadata, variable_name, contrast) { #' @param color_palette A string specifying the color palette for discrete #' variables. Default: `"Set2"`. #' @param printplt Boolean specifying if plot is to be printed. Default: `TRUE`. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A list with: #' - `Overall`: Data frame of effect sizes and p-values for each contrasted #' phenotypic variable. @@ -207,7 +212,8 @@ Score_VariableAssociation <- function(data, discrete_colors=NULL, continuous_color = "#8C6D03", color_palette = "Set2", - printplt =TRUE){ + printplt =TRUE, + p.adjust.method = "BH"){ method <- match.arg(method) # Validate method input mode <- match.arg(mode) data <- as.data.frame(data) # Ensure data is a data frame @@ -296,8 +302,7 @@ Score_VariableAssociation <- function(data, # Would happen if we have only numeric variables if (nrow(df_results_contrast)!=0){ - df_results_contrast$padj <- p.adjust(df_results_contrast$PValue, - method = "BH") + df_results_contrast$padj <- stats::p.adjust(df_results_contrast$PValue, method = p.adjust.method) if(is.null(saturation_value)){ if (min(df_results_contrast$padj)>sig_threshold){ @@ -313,20 +318,24 @@ Score_VariableAssociation <- function(data, ########### CONTRAST MODE PLOT ############ # Ensure contrast ordering - df_results_contrast$Contrast <- sapply(df_results_contrast$Contrast, - function(x) wrap_title( - x, widthlabels)) + # df_results_contrast$Contrast <- sapply(df_results_contrast$Contrast, + # function(x) wrap_title( + # x, widthlabels)) + df_results_contrast$Contrast <- vapply(df_results_contrast$Contrast, + function(x) wrap_title(x, widthlabels), + FUN.VALUE = character(1)) + df_results_contrast$Contrast <- factor(df_results_contrast$Contrast, levels = df_results_contrast$Contrast [order(df_results_contrast$CohenD)]) plot_contrasts <- ggplot2::ggplot(df_results_contrast, - ggplot2::aes(x = CohenD, - y = Contrast, - fill = -log10(padj))) + + ggplot2::aes(x = .data$CohenD, + y = .data$Contrast, + fill = -log10(.data$padj))) + ggplot2::geom_segment(ggplot2::aes( - yend = Contrast, + yend = .data$Contrast, xend = 0, linetype = "solid", color = "black" @@ -359,7 +368,7 @@ Score_VariableAssociation <- function(data, axis.text = ggplot2::element_text(size = labsize), axis.title.y = element_text(face = "bold") ) + - ggplot2::facet_grid(Variable ~., scales = "free", switch = "y", + ggplot2::facet_grid(.data$Variable ~., scales = "free", switch = "y", space = "free" ) + theme(strip.background =element_rect(fill="white")) @@ -381,10 +390,10 @@ Score_VariableAssociation <- function(data, ########### OVERALL MODE PLOT ############ plot_overall <- ggplot2::ggplot(df_results_overall, - ggplot2::aes(x = Cohen_f, y = Variable, - fill = -log10(P_Value))) + + ggplot2::aes(x = .data$Cohen_f, y = .data$Variable, + fill = -log10(.data$P_Value))) + ggplot2::geom_segment(ggplot2::aes( - yend = Variable, + yend = .data$Variable, xend = 0, linetype = "solid", color = "black" @@ -455,7 +464,7 @@ Score_VariableAssociation <- function(data, colors <- discrete_colors[[var]] } else { num_levels <- length(unique(df_ranking[[var]])) - colors <- colorRampPalette(RColorBrewer::brewer.pal( + colors <- grDevices::colorRampPalette(RColorBrewer::brewer.pal( 8, color_palette))(num_levels) } diff --git a/R/VariableAssociation.R b/R/VariableAssociation.R index fb6572e..a321be0 100644 --- a/R/VariableAssociation.R +++ b/R/VariableAssociation.R @@ -31,34 +31,29 @@ identify_variable_type <- function(df, cols = NULL) { if (is.null(cols)) return("Unknown") if (!is.null(cols)) df <- df[, cols, drop = FALSE] - - variable_types <- sapply(names(df), function(col_name) { - + + + variable_types <- vapply(names(df), function(col_name) { col <- df[[col_name]] unique_vals <- length(unique(col)) - + if (is.numeric(col) | is.integer(col)) { - # if (unique_vals > 10) { - return("Numeric") - # } else if (unique_vals == 2) { - # return("Categorical Bin") - # } else { - # return("Categorical Multi") - # } + return("Numeric") } else if (is.character(col) || is.factor(col)) { if (unique_vals == 2) { return("Categorical Bin") } else if (unique_vals > 10) { - warning(paste0("Warning: Number of unique values in '", col_name, "' - is too high (>10). Consider removing this variable - from the analysis.")) + warning(paste0("Warning: Number of unique values in '", col_name, + "' is too high (>10). Consider removing this variable ", + "from the analysis.")) return("Categorical Multi") } else { return("Categorical Multi") } } return("Unknown") - }, USE.NAMES = TRUE) + }, FUN.VALUE = character(1), USE.NAMES = TRUE) + return(variable_types) } @@ -101,7 +96,12 @@ identify_variable_type <- function(df, cols = NULL) { #' @param categorical_multi The statistical test for multi-level categorical #' variables. #' Options: `"anova"` (default) or `"kruskal-wallis"`. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A named list (one entry per variable being analysed) where each #' element is a data frame with: #' - **Metric**: The test statistic (correlation coefficient, t-statistic, @@ -135,7 +135,7 @@ identify_variable_type <- function(df, cols = NULL) { compute_stat_tests <- function(df, target_var, cols = NULL, numeric = "pearson", categorical_bin = "t.test", - categorical_multi = "anova") { + categorical_multi = "anova", p.adjust.method="BH") { # Ensure only one method is selected per variable type if (length(numeric) > 1 | length(categorical_bin) > 1 | @@ -194,7 +194,7 @@ compute_stat_tests <- function(df, target_var, cols = NULL, test_df <- rbind(test_df, tukey_df) } else if (categorical_multi == "kruskal-wallis") { - test_result <- kruskal.test(df[[target_var]] ~ df[[var]]) + test_result <- stats::kruskal.test(df[[target_var]] ~ df[[var]]) test_df <- data.frame(metric = test_result$statistic, p_value = test_result$p.value) row.names(test_df) <- "Kruskal-Wallis" @@ -207,7 +207,7 @@ compute_stat_tests <- function(df, target_var, cols = NULL, # scientific notation test_df$metric <- formatC(test_df$metric, format = "e", digits = 2) # correct for multiple testing per variable - test_df$p_value <- p.adjust(test_df$p_value, method = "BH") + test_df$p_value <- stats::p.adjust(test_df$p_value, method = p.adjust.method) test_df$p_value <- formatC(test_df$p_value, format = "e", digits = 3) @@ -240,7 +240,6 @@ compute_stat_tests <- function(df, target_var, cols = NULL, #' - `"ranking"` #' - `"GSEA"` #' -#' @section Shared Arguments (All Methods): #' @param data A data frame with gene expression data (genes as rows, #' samples as columns). #' @param metadata A data frame containing sample metadata; the first column @@ -273,7 +272,12 @@ compute_stat_tests <- function(df, target_var, cols = NULL, #' (`"B"` or `"t"`). Auto-detected if `NULL`. #' @param ignore_NAs (GSEA only) Logical. If `TRUE`, rows with NA metadata are #' removed. Default: `FALSE`. -#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @return A list with method-specific results and ggplot2-based visualizations: #' #' **For score-based methods (`logmedian`, `ssGSEA`, `ranking`):** @@ -297,6 +301,49 @@ compute_stat_tests <- function(df, target_var, cols = NULL, #' scores (NES), adjusted p-values, and contrasts. #' - `plot`: A ggplot2 lollipop plot of GSEA enrichment across contrasts. #' +#' @examples +#' # Simulate gene expression data (genes as rows, samples as columns) +#' set.seed(42) +#' expr <- as.data.frame(matrix(rnorm(500), nrow = 50, ncol = 10)) +#' rownames(expr) <- paste0("Gene", 1:50) +#' colnames(expr) <- paste0("Sample", 1:10) +#' +#' # Simulate metadata (categorical and continuous) +#' metadata <- data.frame( +#' sampleID = paste0("Sample", 1:10), +#' Group = rep(c("A", "B"), each = 5), +#' Age = sample(20:60, 10), +#' row.names = colnames(expr) +#' ) +#' +#' # Define a toy gene set: one gene set only for discovery mode! +#' gene_set <- list( +#' Signature1 = paste0("Gene", 1:10) +#' ) +#' +#' # Score-based association (e.g., logmedian) +#' res_score <- VariableAssociation( +#' method = "logmedian", +#' data = expr, +#' metadata = metadata, +#' cols = c("Group", "Age"), +#' gene_set = gene_set +#' ) +#' print(res_score$Overall) +#' print(res_score$plot) +#' +#' # GSEA-based association (if GSEA_VariableAssociation is available) +#' # res_gsea <- VariableAssociation( +#' # method = "GSEA", +#' # data = expr, +#' # metadata = metadata, +#' # cols = "Group", +#' # gene_set = gene_set +#' # ) +#' # print(res_gsea$data) +#' print(res_score$plot) +#' +#' #' @export VariableAssociation <- function(method = c("ssGSEA", "logmedian", "ranking", "GSEA"), @@ -318,7 +365,8 @@ VariableAssociation <- function(method = c("ssGSEA", "logmedian", discrete_colors = NULL, continuous_color = "#8C6D03", color_palette = "Set2", - printplt = TRUE) { + printplt = TRUE, + p.adjust.method = "BH") { method <- match.arg(method) mode <- match.arg(mode) data <- as.data.frame(data) # Ensure data is a data frame @@ -338,7 +386,8 @@ VariableAssociation <- function(method = c("ssGSEA", "logmedian", labsize = labsize, titlesize = titlesize, pointSize = pointSize, - ignore_NAs = ignore_NAs + ignore_NAs = ignore_NAs, + p.adjust.method = p.adjust.method ) } else if (method %in% c("ssGSEA", "logmedian", "ranking")) { @@ -360,7 +409,8 @@ VariableAssociation <- function(method = c("ssGSEA", "logmedian", discrete_colors = discrete_colors, continuous_color = continuous_color, color_palette = color_palette, - printplt = printplt + printplt = printplt, + p.adjust.method = p.adjust.method ) } diff --git a/R/Volcano_Cohen.R b/R/Volcano_Cohen.R index c6870b5..fabd21c 100644 --- a/R/Volcano_Cohen.R +++ b/R/Volcano_Cohen.R @@ -39,7 +39,7 @@ #' #' @seealso \code{\link{CohenD_allConditions}} #' -#' @importFrom ggplot2 ggplot geom_point geom_vline geom_hline facet_wrap labs scale_color_manual scale_shape_manual theme_bw ggtitle theme element_text element_rect +#' @import ggplot2 #' @importFrom RColorBrewer brewer.pal #' @keywords internal Volcano_Cohen <- function(cohenlist, @@ -88,12 +88,16 @@ Volcano_Cohen <- function(cohenlist, final_df <- do.call(rbind, rows) # Wrap long signature names - final_df$signature <- sapply(final_df$signature, - function(x) wrap_title(x, widthlegend)) + # final_df$signature <- sapply(final_df$signature, + # function(x) wrap_title(x, widthlegend)) + + final_df$signature <- vapply(final_df$signature, + function(x) wrap_title(x, widthlegend), + character(1)) # Handle colors if (is.null(ColorValues)) { - ColorValues <- colorRampPalette(RColorBrewer::brewer.pal(12, colorPalette))( + ColorValues <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, colorPalette))( length(unique(final_df$signature))) } else { if (!is.null(ColorValues[["volcano"]])) { @@ -105,13 +109,13 @@ Volcano_Cohen <- function(cohenlist, } # Generate plot - plt <- ggplot2::ggplot(final_df, ggplot2::aes(x = abs(cohen), - y = -log10(padj), - shape = method)) + + plt <- ggplot2::ggplot(final_df, ggplot2::aes(x = abs(.data$cohen), + y = -log10(.data$padj), + shape = .data$method)) + ggplot2::geom_point(colour = "black", size = pointSize) + - ggplot2::geom_point(ggplot2::aes(colour = signature), + ggplot2::geom_point(ggplot2::aes(colour = .data$signature), size = pointSize - 1.5) + - ggplot2::facet_wrap(. ~ contrast, scales = "free") + + ggplot2::facet_wrap(. ~ .data$contrast, scales = "free") + ggplot2::geom_hline(yintercept = -log10(sig_threshold), linetype = "dashed", color = "black", size = 0.5) + diff --git a/R/calculateDE.R b/R/calculateDE.R index 41c9801..d2c4c30 100644 --- a/R/calculateDE.R +++ b/R/calculateDE.R @@ -106,53 +106,46 @@ calculateDE <- function(data, metadata=NULL, variables=NULL, modelmat = NULL, variables <- setdiff(variables, "") # Remove empty strings return(variables) } - - + # Validate inputs - if (!is.matrix(data) && !is.data.frame(data)) stop( - "Error: 'data' must be a matrix or a data frame.") - if (is.null(rownames(data))) stop( - "Error: 'data' must have row names corresponding to gene identifiers.") - if (!is.null(metadata) && !is.data.frame(metadata)) stop( - "Error: 'metadata' must be a data frame.") - if (!is.null(metadata) && (ncol(data) != nrow(metadata))) stop( - "Error: Number of samples in 'data' does not match number of rows in 'metadata'.") - - # add "." after each variable and remove spaces - # Important to avoid errors in design matrix - # if (!is.null(metadata)){ - # #metadata <- as.data.frame(lapply(metadata, function(x) paste0(".", x))) - # #colnames(metadata) <- paste0(colnames(metadata),".") - # metadata <- as.data.frame(lapply(metadata, function(x) gsub(" ", "", x))) - # } - # if(!is.null(variables)){ - # variables <- gsub(" ", "", variables) - # #variables <- paste0(variables,".") - # } - #if(!is.null(modelmat)) colnames(modelmat) <- gsub(" ", "", colnames(modelmat)) - - - - # Reorder and subset metadata to match data - # counts: matrix or data frame with column names as sample IDs - # metadata: data frame with at least one column containing sample IDs - - # 1. Find the metadata column that best matches column names of count matrix - sample_ids <- colnames(data) - best_match_col <- which.max(sapply(metadata, function(col) sum(sample_ids %in% col))) - - # 2. Extract matched column - matched_col <- metadata[[best_match_col]] - - # 3. Subset metadata to only those samples present in the count matrix - metadata_matched <- metadata[matched_col %in% sample_ids, ] - - # 4. Reorder metadata to match column order of count matrix - rownames(metadata_matched) <- metadata_matched[[best_match_col]] - # drop = FALSE to preserve data frame format - metadata_matched <- metadata_matched[sample_ids, , drop = FALSE] - metadata <- metadata_matched - + if ((!is.matrix(data) && !is.data.frame(data)) || is.null(rownames(data))) { + stop("Error: 'data' must be a matrix or data frame with row names corresponding to gene identifiers.") + } + + if (!is.null(metadata)) { + if (!is.data.frame(metadata) || ncol(data) != nrow(metadata)) { + stop( + "Error with 'metadata': must be a data frame, and the number of rows must match the number of samples in 'data'." + ) + } + } + + + if (!is.null(metadata)){ + # Reorder and subset metadata to match data + # counts: matrix or data frame with column names as sample IDs + # metadata: data frame with at least one column containing sample IDs + + # 1. Find the metadata column that best matches column names of count matrix + sample_ids <- colnames(data) + #best_match_col <- which.max(sapply(metadata, function(col) sum(sample_ids %in% col))) + best_match_col <- which.max(vapply(metadata, function(col) sum(sample_ids %in% col), numeric(1))) + + # print message saying which column was used to match samples + message("Using metadata column '", colnames(metadata)[best_match_col], "' to match samples (data column names).") + + # 2. Extract matched column + matched_col <- metadata[[best_match_col]] + + # 3. Subset metadata to only those samples present in the count matrix + metadata_matched <- metadata[matched_col %in% sample_ids, ] + + # 4. Reorder metadata to match column order of count matrix + rownames(metadata_matched) <- metadata_matched[[best_match_col]] + # drop = FALSE to preserve data frame format + metadata_matched <- metadata_matched[sample_ids, , drop = FALSE] + metadata <- metadata_matched + } if (ignore_NAs & !is.null(variables)) { @@ -167,27 +160,16 @@ calculateDE <- function(data, metadata=NULL, variables=NULL, modelmat = NULL, # Construct design matrix design_matrix <- tryCatch({ - if (!is.null(modelmat)) { - if (!is.matrix(modelmat)) stop("Error: 'modelmat' must be a matrix.") - if (nrow(modelmat) != ncol(data)) stop( - "Error: Rows in 'modelmat' must match the number of samples in 'data'. - Check if your metadata has any NAs or consider using ignore_NAs = TRUE.") - modelmat - # } else if (!is.null(lmexpression)) { - # lmexpression <- as.formula(lmexpression, env = parent.frame()) - # design_matrix <- model.matrix(lmexpression, data = metadata) - # vars <- extract_variables(lmexpression) - # #colnames(design_matrix) <- gsub("^Condition","",colnames(design_matrix)) - # - # colnames(design_matrix) <- remove_prefix(colnames(design_matrix), vars) - # colnames(design_matrix) <- gsub(" ", "", colnames(design_matrix)) - # #colnames(design_matrix) <- sub("^[^.]*\\.", "", colnames(design_matrix)) - # design_matrix + if (!is.null(modelmat)) { + if (!is.matrix(modelmat) || (nrow(modelmat) != ncol(data))) { + stop("Error: 'modelmat' must be a matrix with rows equal to the number of samples in 'data'. + Check if your metadata has any NAs or consider using ignore_NAs = TRUE.") + } + + modelmat } else { design_formula <- as.formula(paste("~0+", paste(variables, collapse = " + "))) - design_matrix <- model.matrix(design_formula, data = metadata) - #colnames(design_matrix) <- gsub("^Condition","",colnames(design_matrix)) - #colnames(design_matrix) <- sub("^[^.]*\\.", "", colnames(design_matrix)) # remove the variable name + design_matrix <- stats::model.matrix(design_formula, data = metadata) colnames(design_matrix) <- remove_prefix(colnames(design_matrix), variables) colnames(design_matrix) <- gsub(" ", "", colnames(design_matrix)) # remove spaces design_matrix diff --git a/R/data.R b/R/data.R index 748e00a..2c8557a 100644 --- a/R/data.R +++ b/R/data.R @@ -15,6 +15,7 @@ #' for senescent samples, "young" for proliferative).} #' } #' +#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -24,14 +25,15 @@ #' @references Marthandan S, Baumgart M, Priebe S, Groth M et al. Conserved #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. PLoS One 2016;11(5):e0154531. PMID: 27140416 -#' -#' @keywords datasets +#' +#' @usage data(metadata_example) "metadata_example" #' Gene Expression Counts for Marthandan et al. (2016) RNA-Seq Data #' -#' A numeric matrix containing filtered and normalized gene expression data from -#' the Marthandan et al. (2016) study (GEO accession GSE63577). +#' A numeric matrix containing filtered and normalized (non log-transformed) +#' gene expression data from the Marthandan et al. (2016) study (GEO accession +#' GSE63577). #' #' Raw FASTQ files were downloaded using `fasterq-dump` (v2.11.0) and processed #' in a reproducible conda environment (Python v3.11.5). Quality control was @@ -39,8 +41,7 @@ #' Pseudo-alignment to the RefSeq transcriptome (NCBI release 109) was performed #' using kallisto (v0.44.0). Genes with low expression (mean count < 70 in all #' conditions) were filtered out. Count normalization factors were calculated -#' with `edgeR::calcNormFactors`, and log2-transformed values were obtained via -#' `limma::voom`. +#' with `edgeR::calcNormFactors`. #' #' Intermediate time points for HFF and MRC5 cell lines were excluded, resulting #' in a final dataset with 45 high-quality samples across proliferative, @@ -52,7 +53,7 @@ #' #' @format A numeric matrix with rows as genes (gene symbols) and columns as #' samples (sample IDs). -#' +#' #' @source \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63577} #' #' @references Marthandan S, Priebe S, Baumgart M, Groth M et al. Similarities @@ -63,8 +64,8 @@ #' Senescence Associated Genes and Pathways in Primary Human Fibroblasts #' Detected by RNA-Seq. #' *PLoS One* 2016;11(5):e0154531. PMID: 27140416 -#' -#' @keywords datasets +#' +#' @usage data(counts_example) "counts_example" #' Example Gene Sets for Cellular Senescence @@ -75,15 +76,15 @@ #' curated gene set of commonly reported senescence markers, #' with directionality (+1 or -1).} #' \item{REACTOME_Senescence}{Character vector of gene symbols. The -#' REACTOME_CELLULAR_SENESCENCE from MSigDB pathway. No directionality.} +#' REACTOME_CELLULAR_SENESCENCE from MSigDB database No directionality.} #' \item{HernandezSegura}{A data frame with columns `gene` and `direction`. #' A gene set from Hernandez-Segura et al. (2017), with directionality (+1 or -1).} #' } -#' +#' #' @references Hernandez-Segura A, de Jong TV, Melov S, Guryev V, Campisi J, #' Demaria M. Unmasking Transcriptional Heterogeneity in Senescent Cells. #' *Curr Biol.* 2017 Sep 11;27(17):2652-2660.e4. doi: 10.1016/j.cub.2017.07.033. #' Epub 2017 Aug 30. PMID: 28844647; PMCID: PMC5788810. -#' @keywords datasets +#' @usage data(genesets_example) "genesets_example" diff --git a/R/geneset_similarity.R b/R/geneset_similarity.R index 828d326..2079ee6 100644 --- a/R/geneset_similarity.R +++ b/R/geneset_similarity.R @@ -21,12 +21,31 @@ #' Odds Ratio required for a gene set to be included in the plot. Default is #' 1. #' @param pval_threshold (only if method == "odds_ratio" only) Numeric. Maximum -#' adjusted p-value to show a label. Default is 0.05. -#' @param limits Numeric vector of length 2. Limits for color scale. +#' adjusted p-value required for a gene set to be included in the plot. +#' Default is 0.05. +#' @param limits Numeric vector of length 2. Limits for color scale. If `NULL`, +#' is automatically set to c(0,1) for Jaccard or the range of OR for odds +#' ratio. #' @param title_size Integer specifying the font size for the plot title. #' Default is `12`. -#' @param color_values Character vector of colors used for the fill gradient. -#' Default is `c("#F9F4AE", "#B44141")`. +#' @param color Character. The color for the maximum of the scale. Default is +#' `red.` +#' - If `method = "jaccard"`, the scale goes from `neutral_color` to `color`. +#' - If `method = "odds_ratio"` and any OR >= 1, the scale ends at `color`. +#' - If `method = "odds_ratio"` and all OR <= 1, `color` is not used; instead, the scale +#' runs from `cold_color` (minimum) to `neutral_color` (OR = 1, if present; +#' otherwise `neutral_color` is the maximum). +#' @param neutral_color Character. The neutral reference color. Default is +#' `white`. +#' - If `method = "jaccard"`, this is the minimum of the scale. +#' - If `method = "odds_ratio"` and any OR >= 1, this corresponds to OR = 1 if such values exist; otherwise it is the minimum of the scale. +#' - If `method = "odds_ratio"` and all OR <= 1, this corresponds to OR = 1 if such values exist; otherwise it is the maximum of the scale (with `cold_color` as the minimum). +#' @param cold_color Character. The color for values below OR = 1 (only used +#' when `method = "odds_ratio"`). Default is `blue`. +#' - If `method = "odds_ratio"` and any OR < 1, the scale runs from `cold_color` +#' (minimum) to `neutral_color` (OR = 1 if present; otherwise `neutral_color` +#' is the maximum). +#' - Ignored if `method = "jaccard"` or if all OR >= 1. #' @param title Optional. Custom title for the plot. If `NULL`, the title #' defaults to `"Signature Overlap"`. #' @param jaccard_threshold (only if method == "jaccard" only) Numeric. Minimum @@ -45,13 +64,13 @@ #' \describe{ #' \item{\code{plot}}{The \pkg{ggplot2} object of the similarity heatmap.} #' \item{\code{data}}{The data frame object containing the similarity -#' scores aper pair of gene sets.} +#' scores per pair of gene sets.} #' } #' #' @import ggplot2 #' @importFrom tibble tibble #' @importFrom msigdbr msigdbr -#' @importFrom scales squish +#' @importFrom scales squish rescale #' #' @examples #' # Create two simple gene signatures @@ -97,7 +116,9 @@ geneset_similarity <- function( pval_threshold = 0.05, limits = NULL, title_size = 12, - color_values = c("#F9F4AE", "#B44141"), + color = "#B44141", # color for the maximum of the scale + neutral_color = "white", # neutral reference color + cold_color = "#4173B4", # color for OR < 1 when applicable title = NULL, jaccard_threshold = 0, msig_subset = NULL, @@ -107,11 +128,11 @@ geneset_similarity <- function( if (is.null(signatures) || length(signatures) == 0) { stop("You must provide at least one signature.") } - if (!is.list(signatures) || !all(sapply(signatures, is.character))) { + if (!is.list(signatures) || !all(vapply(signatures, is.character, logical(1)))) { stop("Signatures must be a named list of character vectors.") } if (!is.null(other_user_signatures) && (!is.list(other_user_signatures) || - !all(sapply(other_user_signatures, is.character)))) { + !all(vapply(other_user_signatures, is.character, logical(1))))) { stop("Other user signatures must be a named list of character vectors.") } if (!is.null(collection) && !is.character(collection)) { @@ -135,14 +156,15 @@ geneset_similarity <- function( if (!is.null(limits) && (!is.numeric(limits) || length(limits) != 2)) { stop("limits must be a numeric vector of length 2.") } + + if (!is.null(limits) && any(limits < 0 | !is.finite(limits))) { + warning("Limits contain negative, or non-finite values. Ensure limits are positive finite numbers.") + } if (!is.numeric(title_size) || title_size <= 0) { stop("title_size must be a positive numeric value.") } - - if (!is.character(color_values) || length(color_values) < 2) { - stop("color_values must be a character vector with two colors.") - } + if (!is.null(title) && !is.character(title)) { stop("title must be a character string or NULL.") @@ -229,14 +251,15 @@ geneset_similarity <- function( d <- length(setdiff(universe, union(sig1, sig2))) cont_tbl <- matrix(c(a, b, c, d), nrow = 2) - ft <- fisher.test(cont_tbl) + ft <- stats::fisher.test(cont_tbl) score <- log10(ft$estimate) - if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { - label <- sprintf("%.1f", score) - } else { - label <- "" - } + # if (!is.na(ft$p.value) && ft$p.value <= pval_threshold && ft$estimate >= or_threshold) { + # label <- sprintf("%.1f", 10^score) # to show non log values in heatmap + # #label <- sprintf("%.1f", score) + # } else { + # label <- "" + # } pval <- ft$p.value } @@ -244,7 +267,7 @@ geneset_similarity <- function( Reference_Signature = ref_name, Compared_Signature = comp_name, Score = score, - Label = label, + #Label = label, Pval = pval, stringsAsFactors = FALSE ) @@ -259,19 +282,24 @@ geneset_similarity <- function( if (metric == "odds_ratio") { # Filter groups where any 10^Score >= threshold + # keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { + # any(10^group$Score >= or_threshold, na.rm = TRUE) + # }) + keep_rows <- by(similarity_df, similarity_df$Compared_Signature, function(group) { - any(10^group$Score >= or_threshold, na.rm = TRUE) + any(10^group$Score >= or_threshold & group$Pval <= pval_threshold, na.rm = TRUE) }) - + + kept_signatures <- names(keep_rows[keep_rows]) similarity_df <- similarity_df[similarity_df$Compared_Signature %in% kept_signatures, , drop = FALSE] # Add Label column - similarity_df$Label <- ifelse( - similarity_df$Pval <= pval_threshold, - sprintf("%.1f", similarity_df$Score), - "" - ) + # similarity_df$Label <- ifelse( + # similarity_df$Pval <= pval_threshold, + # sprintf("%.1f", similarity_df$Score), + # "" + # ) } if (metric == "jaccard" && jaccard_threshold > 0) { @@ -285,42 +313,170 @@ geneset_similarity <- function( } data <- similarity_df - - similarity_df$Reference_Signature <- sapply(similarity_df$Reference_Signature, - function(x) wrap_title(x, width_text)) - similarity_df$Compared_Signature <- sapply(similarity_df$Compared_Signature, - function(x) wrap_title(x, width_text)) - + + + + if (nrow(similarity_df) == 0) { + stop("No signatures passed the filtering criteria.") + } + + similarity_df$Reference_Signature <- vapply(similarity_df$Reference_Signature, + function(x) wrap_title(x, width_text), + character(1)) + similarity_df$Compared_Signature <- vapply(similarity_df$Compared_Signature, + function(x) wrap_title(x, width_text), + character(1)) +# +# if (is.null(limits)) { +# if (metric == "jaccard") { +# limits <- c(0, 1) +# } else { +# # For odds ratio, we set limits based on the data +# limits <- c(min(similarity_df$Score[is.finite(similarity_df$Score)], na.rm = TRUE), max(similarity_df$Score, na.rm = TRUE)) +# +# } +# } +# +# +# +# plt <- ggplot(similarity_df, aes(x = .data$Reference_Signature, +# y = .data$Compared_Signature, fill = .data$Score)) + +# geom_tile(color = "white") + +# #geom_text(aes(label = .data$Label), color = "black") + +# scale_fill_gradientn(colors = color_values, limits = limits, +# oob = scales::squish, na.value = na_color) + +# labs( +# x = "", +# y = "Compared Signature", +# fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), +# title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) +# ) + +# theme_minimal() + +# theme( +# axis.text.x = element_text(angle = 45, hjust = 1), +# plot.title = element_text(hjust = 0.5, size = title_size) +# ) +# +# plt +# +# invisible(list(plot=plt, +# data=data)) + + + # ---------------------------- + # Safe handling of limits (user provides OR) + # ---------------------------- if (is.null(limits)) { if (metric == "jaccard") { limits <- c(0, 1) - } else { - # For odds ratio, we set limits based on the data - # but ensure they are at least 0 to avoid negative log10 values - limits <- c(0, max(similarity_df$Score, na.rm = TRUE)) - } + } else { # odds_ratio + + # Extract finite scores + finite_scores <- similarity_df$Score[is.finite(similarity_df$Score)] + + # Identify if there are any OR < 1 (logOR < 0) + has_below1 <- any(finite_scores < 0) + + # Compute padding for -Inf (original OR = 0) + if (has_below1) { + # Place -Inf one log unit below the minimum finite score < 0 + min_below <- min(finite_scores[finite_scores < 0]) + pad_value <- min_below - 1 + } else { + # All OR >= 1 → map -Inf slightly above the maximum score, then invert sign + max_score <- max(finite_scores) + pad_value <- -(max_score + 1) + } + + # Replace -Inf in Score with computed padding + similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- pad_value + + # Convert back from log + OR_values <- 10^similarity_df$Score + + # Replace any zero or negative OR with a small number + OR_values[OR_values <= 0] <- 1e-6 + #similarity_df$Score[is.infinite(similarity_df$Score) & similarity_df$Score < 0] <- log10(1e-6) + + # Compute limits + limits <- c(min(OR_values, na.rm = TRUE), max(OR_values, na.rm = TRUE)) + } } - plt <- ggplot(similarity_df, aes(x = Reference_Signature, - y = Compared_Signature, fill = Score)) + + + # Convert OR limits to log space (Score already log10 OR) + log_limits <- if (metric == "odds_ratio") log10(limits) else limits + if (min(log_limits) == max(log_limits)) { + log_limits <- log_limits + c(-0.01, 0.01) # small padding + } + zero <- 0 # neutral color at OR = 1 → log10(1) = 0 + + # ---------------------------- + # Define fill colors + # ---------------------------- + if (metric == "jaccard") { + fill_colors <- c(neutral_color, color) + fill_values <- c(log_limits[1], log_limits[2]) + } else if (metric == "odds_ratio") { + min_lim <- log_limits[1] + max_lim <- log_limits[2] + + if (min_lim >= zero) { + fill_colors <- c(neutral_color, color) + fill_values <- c(min_lim, max_lim) + } else if (max_lim <= zero) { + fill_colors <- c(cold_color, neutral_color) + fill_values <- c(min_lim, max_lim) + } else { + fill_colors <- c(cold_color, neutral_color, color) + fill_values <- c(min_lim, zero, max_lim) + } + + # ---------------------------- + # Safe legend breaks in OR space + # ---------------------------- + valid_OR <- limits[limits > 0 & is.finite(limits)] + if (length(valid_OR) == 0) valid_OR <- 1 # fallback + + log_breaks <- 10^seq(floor(log10(min(valid_OR))), ceiling(log10(max(valid_OR)))) + log_breaks <- log_breaks[log_breaks >= min(valid_OR) & log_breaks <= max(valid_OR)] + } + + # ---------------------------- + # Build plot + # ---------------------------- + plt <- ggplot(similarity_df, aes( + x = .data$Reference_Signature, + y = .data$Compared_Signature, + fill = .data$Score)) + geom_tile(color = "white") + - geom_text(aes(label = Label), color = "black") + - scale_fill_gradientn(colors = color_values, limits = limits, - oob = scales::squish, na.value = na_color) + + scale_fill_gradientn( + colors = fill_colors, + values = scales::rescale(fill_values), + limits = log_limits, + oob = scales::squish, + na.value = na_color, + trans = "identity", # already logged + breaks = if (metric == "odds_ratio") log10(log_breaks) else waiver(), + labels = if (metric == "odds_ratio") log_breaks else waiver() + ) + labs( x = "", y = "Compared Signature", - fill = ifelse(metric == "jaccard", "Jaccard Index", "log10(OR)"), - title = ifelse(is.null(title), paste("Signature Overlap (", metric, ")"), title) + fill = ifelse(metric == "jaccard", "Jaccard Index", "Odds Ratio"), + title = ifelse(is.null(title), "Signature Overlap", title) ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = title_size) ) - - plt - - invisible(list(plot=plt, - data=data)) + + if (metric == "odds_ratio") { + data$Score <- 10^data$Score # convert back to OR for data output + } + + invisible(list(plot = plt, data = data)) + + } diff --git a/R/plotCombinedGSEA.R b/R/plotCombinedGSEA.R index 675b4a0..617d1ef 100644 --- a/R/plotCombinedGSEA.R +++ b/R/plotCombinedGSEA.R @@ -43,7 +43,7 @@ #' plotCombinedGSEA(GSEA_results, sig_threshold = 0.05, PointSize = 4) #' #' @import ggplot2 -#' @import RColorBrewer +#' @importFrom RColorBrewer brewer.pal #' @export plotCombinedGSEA <- function(GSEA_results, sig_threshold = 0.05, PointSize = 4, widthlegend=16) { @@ -65,15 +65,19 @@ plotCombinedGSEA <- function(GSEA_results, sig_threshold = 0.05, PointSize = 4, # Create a color palette for pathways # RColorBrewer has palettes for discrete color scales - pathway_colors <- colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))(length(unique(combined_data$pathway))) + pathway_colors <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))(length(unique(combined_data$pathway))) - combined_data$pathway <- sapply(combined_data$pathway, function(x) wrap_title(x, widthlegend)) + # combined_data$pathway <- sapply(combined_data$pathway, function(x) wrap_title(x, widthlegend)) + combined_data$pathway <- vapply(combined_data$pathway, + function(x) wrap_title(x, widthlegend), + character(1)) + # Create the plot - plot <- ggplot2::ggplot(combined_data, ggplot2::aes(x = NES, y = logpadj, - shape = contrast)) + + plot <- ggplot2::ggplot(combined_data, ggplot2::aes(x = .data$NES, y = .data$logpadj, + shape = .data$contrast)) + ggplot2::geom_point(colour="black", size = PointSize) + - ggplot2::geom_point(ggplot2::aes(colour = factor(pathway)) , + ggplot2::geom_point(ggplot2::aes(colour = factor(.data$pathway)) , size = PointSize-2.5) + ggplot2::geom_hline(yintercept = -log10(sig_threshold), linetype = "dashed", color = "black", size = .5) + diff --git a/R/plotGSEAenrichment.R b/R/plotGSEAenrichment.R index bdfd29c..ea79520 100644 --- a/R/plotGSEAenrichment.R +++ b/R/plotGSEAenrichment.R @@ -53,7 +53,9 @@ #' plot <- plotCombinedGSEA(GSEA_results, sig_threshold = 0.05, PointSize = 7) #' print(plot) #' -#' @import ggplot2 ggpubr fgsea +#' @import ggplot2 +#' @importFrom ggpubr ggarrange +#' @importFrom fgsea plotEnrichment #' @export plotGSEAenrichment <- function(GSEA_results, DEGList, gene_sets, widthTitle = 24, grid = FALSE, nrow=NULL, ncol=NULL, titlesize=12) { @@ -85,7 +87,7 @@ plotGSEAenrichment <- function(GSEA_results, DEGList, gene_sets, widthTitle = 24 stat_used <- gsea_row$stat_used # order ranks by stat used - ranks <- setNames(deg_df[,stat_used, drop=TRUE], rownames(deg_df)) + ranks <- stats::setNames(deg_df[,stat_used, drop=TRUE], rownames(deg_df)) ranks <- sort(ranks, decreasing = TRUE) nes_value <- round(gsea_row$NES, 2) @@ -118,7 +120,7 @@ plotGSEAenrichment <- function(GSEA_results, DEGList, gene_sets, widthTitle = 24 directions <- as.numeric(gs[[2]]) ranks_adjusted <- ranks idx <- which(names(ranks_adjusted) %in% gs_genes) - lookup <- setNames(directions, gs_genes) + lookup <- stats::setNames(directions, gs_genes) ranks_adjusted[idx] <- ranks_adjusted[idx] * lookup[names(ranks_adjusted)[idx]] plot <- fgsea::plotEnrichment(gs_genes, sort(ranks_adjusted, decreasing = TRUE)) diff --git a/R/plotNESlollipop.R b/R/plotNESlollipop.R index 8f30d75..16a0500 100644 --- a/R/plotNESlollipop.R +++ b/R/plotNESlollipop.R @@ -95,7 +95,7 @@ #' #' @import ggplot2 #' @importFrom ggpubr annotate_figure ggarrange -#' @import grid +#' @importFrom grid textGrob gpar #' @export plotNESlollipop <- function(GSEA_results, signif_color = "red", nonsignif_color = "white", @@ -121,15 +121,19 @@ plotNESlollipop <- function(GSEA_results, } # Ensure contrast ordering - res$pathway <- sapply(res$pathway, function(x) wrap_title(x, widthlabels)) + # res$pathway <- sapply(res$pathway, function(x) wrap_title(x, widthlabels)) + res$pathway <- vapply(res$pathway, + function(x) wrap_title(x, widthlabels), + FUN.VALUE = character(1)) + res$pathway <- factor(res$pathway, levels = res$pathway[order(res$NES)]) - plot <- ggplot2::ggplot(res, ggplot2::aes(x = NES, y = pathway, fill = -log10(padj))) + + plot <- ggplot2::ggplot(res, ggplot2::aes(x = .data$NES, y = .data$pathway, fill = -log10(.data$padj))) + # Add a condition for dashed lines and points for B statistic and negative NES - ggplot2::geom_segment(ggplot2::aes(yend = pathway, + ggplot2::geom_segment(ggplot2::aes(yend = .data$pathway, xend = 0, - linetype = ifelse(stat_used == "B" & NES < 0, + linetype = ifelse(.data$stat_used == "B" & .data$NES < 0, "dashed", "solid")), size = .5, color = ifelse(res$stat_used == "B" & res$NES < 0, @@ -138,7 +142,7 @@ plotNESlollipop <- function(GSEA_results, shape = 21, stroke = 1.2, size = 4, - ggplot2::aes(color = ifelse(stat_used == "B" & NES < 0, + ggplot2::aes(color = ifelse(.data$stat_used == "B" & .data$NES < 0, "darkgrey", "black")) # Points color for B and negative NES ) + diff --git a/R/plotPCA.R b/R/plotPCA.R index 3ce34b0..c816788 100644 --- a/R/plotPCA.R +++ b/R/plotPCA.R @@ -118,13 +118,14 @@ plotPCA <- function(data, metadata=NULL, genes=NULL, scale=FALSE, center=TRUE, if (nPCs > ncol(PCAcounts)) stop("Error: Number of genes too low for number of chosen PCs. Please reduce number of PCs.") - PCAcounts <- cbind(PCAcounts[,1:nPCs],y$samples) - + #PCAcounts <- cbind(PCAcounts[,1:nPCs],y$samples) + PCAcounts <- cbind(PCAcounts[, seq_len(nPCs), drop = FALSE], y$samples) + pltList <- list() for (pc in PCs){ pc <- unlist(pc) - ev = PCAdata$sdev^2 + ev <- PCAdata$sdev^2 pc_x <- round(100*ev[pc[1]]/sum(ev),2) pc_y <- round(100*ev[pc[2]]/sum(ev),2) diff --git a/R/plotVolcano.R b/R/plotVolcano.R index 609cc49..c8da073 100644 --- a/R/plotVolcano.R +++ b/R/plotVolcano.R @@ -262,7 +262,7 @@ plotVolcano <- function(DEResultsList, genes = NULL, N = NULL, # Add to plot p <- p + ggplot2::geom_point(data = plot_data, - aes(color = Direction), + aes(color = .data$Direction), size = pointSize, alpha = 0.8) + ggplot2::scale_color_manual(values = c( @@ -281,7 +281,11 @@ plotVolcano <- function(DEResultsList, genes = NULL, N = NULL, ## Annotate top N genes if requested if (!is.null(N)) { genes_stat <- fit[order(fit[, x], decreasing = TRUE), ] - annotationgenes <- row.names(genes_stat)[c(1:N, (nrow(genes_stat) - N + 1):nrow(genes_stat))] + # annotationgenes <- row.names(genes_stat)[c(1:N, (nrow(genes_stat) - N + 1):nrow(genes_stat))] + annotationgenes <- row.names(genes_stat)[ + c(seq_len(N), seq_len(N) + nrow(genes_stat) - N) + ] + p <- p + ggplot2::geom_point(data = fit[annotationgenes, ], color = highlightcolor, size = pointSize) + diff --git a/R/runGSEA.R b/R/runGSEA.R index 38ab412..d80279c 100644 --- a/R/runGSEA.R +++ b/R/runGSEA.R @@ -28,7 +28,13 @@ #' number of contrasts tested per signature and provides more stringent #' control of false discovery rate across multiple comparisons. If `FALSE`, #' the function only corrects for the number of gene sets. -#' +#' +#' @param p.adjust.method Character string specifying the method to use for +#' multiple testing correction. Must be one of \code{"BH"} (Benjamini-Hochberg, +#' default), \code{"holm"}, \code{"hommel"}, \code{"bonferroni"}, +#' \code{"BY"} (Benjamini-Yekutieli), \code{"fdr"}, or \code{"none"}. +#' Passed to \code{\link[stats]{p.adjust}}. +#' #' @param nPermSimple Number of permutations in the simple fgsea implementation #' for preliminary estimation of P-values. Parameter from fgsea. #' @@ -57,7 +63,7 @@ #' #' @importFrom fgsea fgsea #' @export -runGSEA <- function(DEGList, gene_sets, stat = NULL, ContrastCorrection=FALSE, nPermSimple=10000) { +runGSEA <- function(DEGList, gene_sets, stat = NULL, ContrastCorrection=FALSE, nPermSimple=10000, p.adjust.method="BH") { # Initialize storage for results across contrasts results_by_contrast <- list() @@ -80,7 +86,7 @@ runGSEA <- function(DEGList, gene_sets, stat = NULL, ContrastCorrection=FALSE, n } # Create the ranking vector for GSEA - ranks <- setNames(deg_df[[current_stat]], rownames(deg_df)) + ranks <- stats::setNames(deg_df[[current_stat]], rownames(deg_df)) if (current_stat=="t") { @@ -150,7 +156,7 @@ runGSEA <- function(DEGList, gene_sets, stat = NULL, ContrastCorrection=FALSE, n combined_df <- do.call(rbind, Map(cbind, results_by_contrast, df_name = names(results_by_contrast))) # Step 2: Adjust p-values across all data - combined_df$padj <- p.adjust(combined_df$pval, method = "BH") + combined_df$padj <- stats::p.adjust(combined_df$pval, method = p.adjust.method) # Step 3: Split back into the original list structure list_of_dfs <- split(combined_df, combined_df$df_name) @@ -163,7 +169,7 @@ runGSEA <- function(DEGList, gene_sets, stat = NULL, ContrastCorrection=FALSE, n # Step 1: Adjust p-values for each data frame individually results_by_contrast <- lapply(results_by_contrast, function(df) { - df$padj <- p.adjust(df$pval, method = "BH") # Adjust p-values per data frame + df$padj <- stats::p.adjust(df$pval, method = p.adjust.method) # Adjust p-values per data frame return(df) }) diff --git a/R/ssGSEA_alternative.R b/R/ssGSEA_alternative.R index f76acf7..c5642a3 100644 --- a/R/ssGSEA_alternative.R +++ b/R/ssGSEA_alternative.R @@ -79,52 +79,55 @@ ssGSEA_alternative <- function(X, scale = TRUE, norm = FALSE, single = TRUE) { - row_names = rownames(X) - num_genes = nrow(X) - gene_sets = lapply(gene_sets, function(genes) { which(row_names %in% genes) }) + row_names <- rownames(X) + num_genes <- nrow(X) + gene_sets <- lapply(gene_sets, function(genes) { which(row_names %in% genes) }) # Ranks for genes - R = colRanking(X, ties.method = 'average') + R <- colRanking(X, ties.method = 'average') # Calculate enrichment score (es) for each sample (column) - es = apply(R, 2, function(R_col) { - gene_ranks = order(R_col, decreasing = TRUE) - - # Calc es for each gene set - es_sample = sapply(gene_sets, function(gene_set_idx) { - # pos: match (within the gene set) - # neg: non-match (outside the gene set) - indicator_pos = gene_ranks %in% gene_set_idx - indicator_neg = !indicator_pos - - rank_alpha = (R_col[gene_ranks] * indicator_pos) ^ alpha - - step_cdf_pos = cumsum(rank_alpha) / sum(rank_alpha) - step_cdf_neg = cumsum(indicator_neg) / sum(indicator_neg) - - step_cdf_diff = step_cdf_pos - step_cdf_neg - - # Normalize by gene number - if (scale) step_cdf_diff = step_cdf_diff / num_genes - - # Use ssGSEA or not - if (single) { - sum(step_cdf_diff) - } else { - step_cdf_diff[which.max(abs(step_cdf_diff))] - } - }) + es <- apply(R, 2, function(R_col) { + gene_ranks <- order(R_col, decreasing = TRUE) + + + es_sample <- vapply( + gene_sets, + function(gene_set_idx) { + # pos: match (within the gene set) + # neg: non-match (outside the gene set) + indicator_pos <- gene_ranks %in% gene_set_idx + indicator_neg <- !indicator_pos + + rank_alpha <- (R_col[gene_ranks] * indicator_pos) ^ alpha + + step_cdf_pos <- cumsum(rank_alpha) / sum(rank_alpha) + step_cdf_neg <- cumsum(indicator_neg) / sum(indicator_neg) + + step_cdf_diff <- step_cdf_pos - step_cdf_neg + + if (scale) step_cdf_diff <- step_cdf_diff / num_genes + + if (single) { + sum(step_cdf_diff) + } else { + step_cdf_diff[which.max(abs(step_cdf_diff))] + } + }, + numeric(1) # <- expected output per iteration + ) + unlist(es_sample) - }) + }) - if (length(gene_sets) == 1) es = matrix(es, nrow = 1) + if (length(gene_sets) == 1) es <- matrix(es, nrow = 1) # Normalize by absolute diff between max and min - if (norm) es = es / diff(range(es)) + if (norm) es <- es / diff(range(es)) # Prepare output - rownames(es) = names(gene_sets) - colnames(es) = colnames(X) + rownames(es) <- names(gene_sets) + colnames(es) <- colnames(X) return(es) } diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..d34382c --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,13 @@ +.onAttach <- function(libname, pkgname) { + gg_version <- tryCatch( + utils::packageVersion("ggplot2"), + error = function(e) NULL + ) + + if (!is.null(gg_version) && gg_version > "3.5.2") { + packageStartupMessage( + "Warning: markeR has been tested with ggplot2 <= 3.5.2. ", + "Using newer versions may cause incompatibilities." + ) + } +} diff --git a/README.Rmd b/README.Rmd index 032ecd7..da4d6b3 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,22 +17,23 @@ knitr::opts_chunk$set( -![](https://img.shields.io/badge/status-development-yellowgreen) + [![Pkgdown](https://img.shields.io/badge/docs-pkgdown-blue.svg)](https://diseasetranscriptomicslab.github.io/markeR/) -[![Minimal R Version](https://img.shields.io/badge/min%20R-4.4.0-blue.svg)](https://github.com/DiseaseTranscriptomicsLab/markeR/actions/workflows/Rminversion.yaml) +![Minimal R Version](https://img.shields.io/badge/min%20R-4.5.0-blue.svg) [![codecov](https://codecov.io/gh/DiseaseTranscriptomicsLab/markeR/graph/badge.svg?token=7T1I4JCJG6)](https://codecov.io/gh/DiseaseTranscriptomicsLab/markeR) -**markeR** provides a suite of methods for using gene sets (signatures) to quantify and evaluate the extent to which a given gene signature marks a specific phenotype from gene expression data. The package implements various scoring, enrichment and classification approaches, along with tools to compute performance metrics and visualize results. +**`markeR`** is an R package that provides a modular and extensible framework for the systematic evaluation of gene sets as phenotypic markers using transcriptomic data. The package is designed to support both quantitative analyses and visual exploration of gene set behaviour across experimental and clinical phenotypes. -> **To cite markeR please use:** +> **To cite `markeR` please use:** > -> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). _markeR: an R Toolkit for Evaluating Gene Sets as Phenotypic Markers_. Gulbenkian Institute for Molecular Medicine, Faculdade de Medicina, Universidade de Lisboa, Lisbon, Portugal. R package version 0.99.2, https://github.com/DiseaseTranscriptomicsLab/markeR. +> Martins-Silva R, Kaizeler A, Barbosa-Morais NL (2025). _markeR: An R Toolkit for Evaluating Gene Signatures as Phenotypic Markers_. doi:10.18129/B9.bioc.markeR, R package version 1.3, https://bioconductor.org/packages/markeR. -`inst/Paper/` — Contains all scripts and materials used in the original markeR paper to reproduce analyses and figures. +The folder `inst/Paper/` is in the **paper** branch and contains all scripts and materials used in the original `markeR` paper to reproduce analyses and figures. You can browse it [here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). + ![](man/figures/Workflow.png) @@ -51,12 +52,25 @@ knitr::opts_chunk$set( - [4. Visualisation and Evaluation](#4-visualisation-and-evaluation) - [5. Individual Gene Exploration (Optional)](#5-individual-gene-exploration-optional) - [6. Compare with Reference Gene Sets (Optional)](#6-compare-with-reference-gene-sets-optional) +- [Python Bridge](#python-bridge) - [Contact](#contact) ## Installation -The latest development release of markeR from [GitHub](https://github.com/) can be installed with: + +Install the latest release from Bioconductor: + +```{r, eval=FALSE} +# Install from Bioconductor +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +BiocManager::install("markeR") +library(markeR) +``` + + +Or install the latest development release of `markeR` from [GitHub](https://github.com/) with: ``` r # install.packages("devtools") @@ -67,25 +81,17 @@ devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") The following tutorials are available: +* [Introduction to markeR][tutorial-introduction] * [Benchmarking Mode][tutorial-benchmarking] * [Discovery Mode][tutorial-discovery] * [Signature Similarity][tutorial-signaturesimilarity] ## Requirements -This package is officially supported on (based on a GitHub Actions workflow that tests against multiple `R` versions): - -- `R` `4.4.x` -- `R` `4.5.x` - -However, due to `Bioconductor` submission requirements, `R` version `4.5.0` will be listed as required upon package installation in future releases. - -⚠️ Compatibility with older R versions depends on the specific versions of dependencies installed. Older versions of `R` (including `R` `3.5.x`, `3.6.x`, `4.0.x`, `4.1.x`, `4.2.x`, and `4.3.x`) may work, but are not officially supported due to upstream dependency constraints. In some cases, installing older versions of dependencies (e.g., via `renv`, `CRAN` snapshots, or `checkpoint`) can restore compatibility. +This package is officially supported on `R > 4.5.0`. ⚠️ Older versions of `R` may work, but are not officially supported due to upstream dependency constraints. In some cases, installing older versions of dependencies (e.g., via `renv`, `CRAN` snapshots, or `checkpoint`) can restore compatibility. ## Common Workflow -`markeR` provides a modular pipeline to quantify transcriptomic signatures and assess their association with phenotypic or clinical variables. The typical workflow includes the following steps: - ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -122,7 +128,10 @@ gene_sets ``` * **Expression Data Frame**: - A filtered and normalised gene expression data frame (genes × samples). Row names must be gene identifiers, and column names must match the sample IDs in the metadata. + A filtered and normalised, non log-transformed, gene expression matrix (genes × samples). Row names must be gene identifiers; column names must match sample IDs in the metadata. + + **Warning:** If you are using microarray data or outputs from common RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may already be log2-normalised. The input to `markeR` must necessarily be **non-log-transformed**. If your data are log2-transformed, you can revert them by applying `2^data`. + ```{r example-expression-matrix, echo=FALSE} # Simulate expression matrix: 10 genes × 5 samples @@ -138,7 +147,7 @@ head(expr_df) ``` * **Sample Metadata**: - A data frame with annotations for each sample, with the sample ID in the first column. The row names must match the column names of the expression matrix. + A data frame with samples as rows and annotations as columns. The first column should contain sample IDs matching the expression matrix column names. ```{r example-metadata, echo=FALSE} # Simulate sample metadata @@ -155,78 +164,88 @@ metadata ### 2. Select Mode of Analysis -* **Discovery Mode**: - Explore how a single, well-characterised gene set relates to a specific variable of interest. Suitable for hypothesis generation or signature projection. +`markeR` provides two modes of operation: -* **Benchmarking Mode**: - Evaluate one or more gene sets against multiple metadata variables using a standardised scoring and effect size framework. This mode provides comprehensive visualisations and comparisons across methods. +* **Benchmarking**: +evaluates gene sets' performance in marking a metadata variable, *i.e.*, a phenotype, returning comparative visualisations across scoring and enrichment methods. + +* **Discovery**: +examines the relationship between a gene set and one or more variables of interest, suitable for exploratory or hypothesis-generating analyses. ### 3. Choose a Quantification Approach -`markeR` supports two complementary strategies for quantifying the association between gene sets and phenotypes: +Two complementary strategies are implemented for quantifying associations between gene sets and phenotypes: #### 3.1 Score-Based Approach -This strategy generates a **single numeric score per sample**, reflecting the activity of a gene set. It enables flexible downstream analyses, including comparisons across phenotypic groups. - -Three scoring methods are available: -* **Log2-median**: Calculates the median log2 expression of the genes in the set. Sensitive to absolute shifts in expression. +A score summarising the collective expression of a gene set therein is assigned **to each sample**. Scores can be visualised using built-in functions, or used directly in downstream analyses (*e.g.*, comparisons between phenotypic groups of samples, correlations with numerical phenotypes). -* **Ranking**: Ranks all genes within each sample and averages the ranks of gene set members. Captures relative ordering rather than magnitude. +Available methods: -* **ssGSEA**: Computes a single-sample gene set enrichment score using the ssGSEA algorithm. Reflects the coordinated up- or down-regulation of the set in each sample. +* **Log2-median**: mean of the across-sample normalised log2 median-centred expression levels of the genes in the set; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -These methods vary in assumptions and sensitivity. Robust gene sets are expected to perform consistently across all three. +* **Ranking**: mean expression rank of gene set members in each sample; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset, and normalised by the number of genes in the set. -#### 3.2 Enrichment-Based Approach +* **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for bidirectional gene sets, the sample score is the partial score for the subset of putatively upregulated genes minus that of the downregulated subset. -This approach uses a classical **gene set enrichment analysis (GSEA)** framework to evaluate whether the gene set is significantly overrepresented at the top or bottom of a ranked list of genes (e.g., ranked by fold change or correlation with phenotype). +Gene sets that are robust phenotypic markers are expected to yield consistently high scores across methods. -* **GSEA**: Computes a Normalised Enrichment Score (NES) for each contrast or variable of interest, adjusting for gene set size and multiple testing. +#### 3.2 Enrichment-Based Approach -Use this approach when interested in collective behaviour of gene sets in relation to ranked differential signals. +Enrichment-based methods implement **Gene Set Enrichment Analysis (GSEA)**. Genes are ranked according to differential expression statistics, and a Normalised Enrichment Score (NES) per variable of interest is computed, accompanied by a p-value adjusted for multiple hypothesis testing. ### 4. Visualisation and Evaluation -In **Benchmarking Mode**, `markeR` offers a range of visual summaries: +In **Benchmarking Mode**, `markeR` offers a range of visual summaries: -* Violin or scatter plots showing score distributions by phenotype -* Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) -* ROC curves and AUC values -* Null distribution testing using random gene sets matched for size and directionality +* Violin plots of score distributions by categorical phenotype; +* Scatter plots of association between scores and numerical phenotypes; +* Volcano plots and heatmaps of scores or differential gene set expression based on effect sizes (Cohen’s *d* or *f*); +* ROC curves and respective AUC values of gene sets' phenotypic classification performance; +* Violin plots of effect size distributions (Cohen’s *d*) for pairwise group differences in scores, for original and simulated gene sets; +* Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop plots); +* GSEA plots showing running enrichment scores across ranked gene lists. -In **Discovery Mode**, the output focuses on a single gene set: -* Score distributions by phenotype -* Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s *f*) -* Enrichment score summaries (NES) with adjusted p-values (e.g., lollipop plots) +In **Discovery Mode**, the output focuses on a single gene set: -Benchmarking mode offers the most comprehensive set of features and allows users to seamlessly move from discovery to benchmarking mode once a variable of interest has been identified and further testing is required. The main difference from Discovery mode is that Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery mode focuses on quantifying a single, robust gene set. +* Score distributions stratified by variable; +* Effect sizes for pairwise and multiple-group differences (Cohen's *d* and *f*, respectively); +* Cross-variable summaries of NES and adjusted p-values (*e.g.*, lollipop plots). -### 5. Individual Gene Exploration (Optional) +The Benchmarking Mode offers the most comprehensive set of features. Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is required. Benchmarking is designed to evaluate multiple gene sets simultaneously, whereas Discovery focuses on the performance of a single gene set. -To better understand the contribution of individual genes within a gene set and identify whether specific genes drive the overall signal, `markeR` offers a suite of gene-level exploratory analyses, including: +### 5. Individual Gene Exploration + +To better understand the contribution of individual genes within a gene set, and identify whether specific genes drive the set's collective signal, `markeR` provides `VisualiseIndividualGenes.` Available options include: -* Expression heatmaps of genes across samples and groups -* Violin plots showing expression distributions of individual genes -* Correlation heatmaps to reveal co-expression patterns among genes in the set -* ROC curves and AUC values for individual genes to evaluate their discriminatory power -* Effect size calculations (Cohen’s *d*) per gene to quantify differential expression -* Principal Component Analysis (PCA) on gene set genes to assess variance explained and sample clustering +* Expression heatmaps of genes across samples or groups of samples; +* Violin plots showing cross-sample expression distributions of individual genes; +* Heatmaps of pairwise cross-sample expression correlation between genes in the set; +* ROC curves and AUC values to evaluate single genes' performance as phenotypic markers; +* Effect size estimation (Cohen’s *d*) of expression differences between groups of samples; +* Principal Component Analysis (PCA) of expression of genes in the set, to evaluate which genes dominate collective variance and how samples separate according to the gene set's expression. -### 6. Compare with Reference Gene Sets (Optional) +### 6. Compare with Reference Gene Sets -`markeR` allows comparison of user-defined gene sets to reference sets (e.g., from MSigDB) using: +`markeR` also supports comparison of user-defined gene sets against reference collections (e.g., MSigDB). Two complementary similarity metrics are implemented: * **Jaccard Index**: - Measures gene overlap relative to union size. +the ratio of the number of genes in common over the total number of genes in the two sets. -* **Log Odds Ratio (logOR)**: - Computes enrichment using a user-defined gene universe and Fisher’s exact test. +* **Log Odds Ratio (logOR)** from Fisher’s exact test of association between gene sets, given a specified gene universe. -Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or p-value). - +Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR, or Fisher's test p-value). + +## Python Bridge + +For users who prefer Python, a lightweight bridge is available in +`python/` that allows calling any `markeR` function from a Python +environment via [`rpy2`](https://rpy2.github.io/). It includes a tutorial +workflow script and a generic command-line wrapper. See +[`python/README.md`](inst/python/README.md) for installation +instructions and usage examples. ## Contact @@ -236,7 +255,7 @@ Filters can be applied based on similarity thresholds (e.g., minimum Jaccard, OR **Rita Martins-Silva** Email: [rita.silva@medicina.ulisboa.pt](mailto:rita.silva@medicina.ulisboa.pt) - -[tutorial-benchmarking]: https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html -[tutorial-discovery]: https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_DiscoveryMode.html -[tutorial-signaturesimilarity]: https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_GeneSetSimilarity.html +[tutorial-introduction]: https://diseasetranscriptomicslab.github.io/markeR/articles/markeR.html +[tutorial-benchmarking]: https://diseasetranscriptomicslab.github.io/markeR/articles/Article_BenchmarkingMode.html +[tutorial-discovery]: https://diseasetranscriptomicslab.github.io/markeR/articles/Article_DiscoveryMode.html +[tutorial-signaturesimilarity]: https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html diff --git a/README.md b/README.md index a7266d4..af60464 100644 --- a/README.md +++ b/README.md @@ -5,32 +5,34 @@ -![](https://img.shields.io/badge/status-development-yellowgreen) + + [![Pkgdown](https://img.shields.io/badge/docs-pkgdown-blue.svg)](https://diseasetranscriptomicslab.github.io/markeR/) -[![Minimal R -Version](https://img.shields.io/badge/min%20R-4.4.0-blue.svg)](https://github.com/DiseaseTranscriptomicsLab/markeR/actions/workflows/Rminversion.yaml) +![Minimal R +Version](https://img.shields.io/badge/min%20R-4.5.0-blue.svg) [![codecov](https://codecov.io/gh/DiseaseTranscriptomicsLab/markeR/graph/badge.svg?token=7T1I4JCJG6)](https://codecov.io/gh/DiseaseTranscriptomicsLab/markeR) -**markeR** provides a suite of methods for using gene sets (signatures) -to quantify and evaluate the extent to which a given gene signature -marks a specific phenotype from gene expression data. The package -implements various scoring, enrichment and classification approaches, -along with tools to compute performance metrics and visualize results. +**`markeR`** is an R package that provides a modular and extensible +framework for the systematic evaluation of gene sets as phenotypic +markers using transcriptomic data. The package is designed to support +both quantitative analyses and visual exploration of gene set behaviour +across experimental and clinical phenotypes. -> **To cite markeR please use:** +> **To cite `markeR` please use:** > -> Martins-Silva R, Kaizeler A, Barbosa-Morais N (2025). *markeR: an R -> Toolkit for Evaluating Gene Sets as Phenotypic Markers*. Gulbenkian -> Institute for Molecular Medicine, Faculdade de Medicina, Universidade -> de Lisboa, Lisbon, Portugal. R package version 0.99.2, -> . +> Martins-Silva R, Kaizeler A, Barbosa-Morais NL (2025). *markeR: An R +> Toolkit for Evaluating Gene Signatures as Phenotypic Markers*. +> , R package version 1.3, +> . -`inst/Paper/` — Contains all scripts and materials used in the original -markeR paper to reproduce analyses and figures. +The folder `inst/Paper/` is in the **paper** branch and contains all +scripts and materials used in the original `markeR` paper to reproduce +analyses and figures. You can browse it +[here](https://github.com/DiseaseTranscriptomicsLab/markeR/tree/paper/inst/Paper). ![](man/figures/Workflow.png) @@ -51,12 +53,23 @@ markeR paper to reproduce analyses and figures. (Optional)](#5-individual-gene-exploration-optional) - [6. Compare with Reference Gene Sets (Optional)](#6-compare-with-reference-gene-sets-optional) +- [Python Bridge](#python-bridge) - [Contact](#contact) ## Installation -The latest development release of markeR from -[GitHub](https://github.com/) can be installed with: +Install the latest release from Bioconductor: + +``` r +# Install from Bioconductor +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +BiocManager::install("markeR") +library(markeR) +``` + +Or install the latest development release of `markeR` from +[GitHub](https://github.com/) with: ``` r # install.packages("devtools") @@ -67,38 +80,25 @@ devtools::install_github("DiseaseTranscriptomicsLab/markeR@*release") The following tutorials are available: +- [Introduction to + markeR](https://diseasetranscriptomicslab.github.io/markeR/articles/markeR.html) - [Benchmarking - Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_BenchmarkingMode.html) + Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_BenchmarkingMode.html) - [Discovery - Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_DiscoveryMode.html) + Mode](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_DiscoveryMode.html) - [Signature - Similarity](https://diseasetranscriptomicslab.github.io/markeR/articles/Tutorial_GeneSetSimilarity.html) + Similarity](https://diseasetranscriptomicslab.github.io/markeR/articles/Article_GeneSetSimilarity.html) ## Requirements -This package is officially supported on (based on a GitHub Actions -workflow that tests against multiple `R` versions): - -- `R` `4.4.x` -- `R` `4.5.x` - -However, due to `Bioconductor` submission requirements, `R` version -`4.5.0` will be listed as required upon package installation in future -releases. - -⚠️ Compatibility with older R versions depends on the specific versions -of dependencies installed. Older versions of `R` (including `R` `3.5.x`, -`3.6.x`, `4.0.x`, `4.1.x`, `4.2.x`, and `4.3.x`) may work, but are not -officially supported due to upstream dependency constraints. In some -cases, installing older versions of dependencies (e.g., via `renv`, -`CRAN` snapshots, or `checkpoint`) can restore compatibility. +This package is officially supported on `R > 4.5.0`. ⚠️ Older versions +of `R` may work, but are not officially supported due to upstream +dependency constraints. In some cases, installing older versions of +dependencies (e.g., via `renv`, `CRAN` snapshots, or `checkpoint`) can +restore compatibility. ## Common Workflow -`markeR` provides a modular pipeline to quantify transcriptomic -signatures and assess their association with phenotypic or clinical -variables. The typical workflow includes the following steps: - ### 1. Input Requirements Depending on the analysis mode, inputs vary slightly. @@ -128,9 +128,15 @@ gene_sets ``` - **Expression Data Frame**: - A filtered and normalised gene expression data frame (genes × - samples). Row names must be gene identifiers, and column names must - match the sample IDs in the metadata. + A filtered and normalised, non log-transformed, gene expression matrix + (genes × samples). Row names must be gene identifiers; column names + must match sample IDs in the metadata. + + **Warning:** If you are using microarray data or outputs from common + RNA-seq pipelines (*e.g.*, edgeR), note that the expression values may + already be log2-normalised. The input to `markeR` must necessarily be + **non-log-transformed**. If your data are log2-transformed, you can + revert them by applying `2^data`. ``` r head(expr_df) @@ -144,9 +150,9 @@ head(expr_df) ``` - **Sample Metadata**: - A data frame with annotations for each sample, with the sample ID in - the first column. The row names must match the column names of the - expression matrix. + A data frame with samples as rows and annotations as columns. The + first column should contain sample IDs matching the expression matrix + column names. ``` r metadata @@ -160,109 +166,133 @@ metadata ### 2. Select Mode of Analysis -- **Discovery Mode**: Explore how a single, well-characterised gene set - relates to a specific variable of interest. Suitable for hypothesis - generation or signature projection. +`markeR` provides two modes of operation: + +- **Benchmarking**: evaluates gene sets’ performance in marking a + metadata variable, *i.e.*, a phenotype, returning comparative + visualisations across scoring and enrichment methods. -- **Benchmarking Mode**: Evaluate one or more gene sets against multiple - metadata variables using a standardised scoring and effect size - framework. This mode provides comprehensive visualisations and - comparisons across methods. +- **Discovery**: examines the relationship between a gene set and one or + more variables of interest, suitable for exploratory or + hypothesis-generating analyses. ### 3. Choose a Quantification Approach -`markeR` supports two complementary strategies for quantifying the -association between gene sets and phenotypes: +Two complementary strategies are implemented for quantifying +associations between gene sets and phenotypes: #### 3.1 Score-Based Approach -This strategy generates a **single numeric score per sample**, -reflecting the activity of a gene set. It enables flexible downstream -analyses, including comparisons across phenotypic groups. +A score summarising the collective expression of a gene set therein is +assigned **to each sample**. Scores can be visualised using built-in +functions, or used directly in downstream analyses (*e.g.*, comparisons +between phenotypic groups of samples, correlations with numerical +phenotypes). -Three scoring methods are available: +Available methods: -- **Log2-median**: Calculates the median log2 expression of the genes in - the set. Sensitive to absolute shifts in expression. +- **Log2-median**: mean of the across-sample normalised log2 + median-centred expression levels of the genes in the set; for + bidirectional gene sets, the sample score is the partial score for the + subset of putatively upregulated genes minus that of the downregulated + subset. -- **Ranking**: Ranks all genes within each sample and averages the ranks - of gene set members. Captures relative ordering rather than magnitude. +- **Ranking**: mean expression rank of gene set members in each sample; + for bidirectional gene sets, the sample score is the partial score for + the subset of putatively upregulated genes minus that of the + downregulated subset, and normalised by the number of genes in the + set. -- **ssGSEA**: Computes a single-sample gene set enrichment score using - the ssGSEA algorithm. Reflects the coordinated up- or down-regulation - of the set in each sample. +- **ssGSEA**: single-sample gene set enrichment score using ssGSEA; for + bidirectional gene sets, the sample score is the partial score for the + subset of putatively upregulated genes minus that of the downregulated + subset. -These methods vary in assumptions and sensitivity. Robust gene sets are -expected to perform consistently across all three. +Gene sets that are robust phenotypic markers are expected to yield +consistently high scores across methods. #### 3.2 Enrichment-Based Approach -This approach uses a classical **gene set enrichment analysis (GSEA)** -framework to evaluate whether the gene set is significantly -overrepresented at the top or bottom of a ranked list of genes (e.g., -ranked by fold change or correlation with phenotype). - -- **GSEA**: Computes a Normalised Enrichment Score (NES) for each - contrast or variable of interest, adjusting for gene set size and - multiple testing. - -Use this approach when interested in collective behaviour of gene sets -in relation to ranked differential signals. +Enrichment-based methods implement **Gene Set Enrichment Analysis +(GSEA)**. Genes are ranked according to differential expression +statistics, and a Normalised Enrichment Score (NES) per variable of +interest is computed, accompanied by a p-value adjusted for multiple +hypothesis testing. ### 4. Visualisation and Evaluation In **Benchmarking Mode**, `markeR` offers a range of visual summaries: -- Violin or scatter plots showing score distributions by phenotype -- Volcano plots and heatmaps based on effect sizes (Cohen’s *d* or *f*) -- ROC curves and AUC values -- Null distribution testing using random gene sets matched for size and - directionality +- Violin plots of score distributions by categorical phenotype; +- Scatter plots of association between scores and numerical phenotypes; +- Volcano plots and heatmaps of scores or differential gene set + expression based on effect sizes (Cohen’s *d* or *f*); +- ROC curves and respective AUC values of gene sets’ phenotypic + classification performance; +- Violin plots of effect size distributions (Cohen’s *d*) for pairwise + group differences in scores, for original and simulated gene sets; +- Plots summarising NES alongside adjusted p-values (*e.g.*, lollipop + plots); +- GSEA plots showing running enrichment scores across ranked gene lists. In **Discovery Mode**, the output focuses on a single gene set: -- Score distributions by phenotype -- Pairwise contrasts (Cohen’s *d*) and overall effect sizes (Cohen’s - *f*) -- Enrichment score summaries (NES) with adjusted p-values (e.g., - lollipop plots) +- Score distributions stratified by variable; +- Effect sizes for pairwise and multiple-group differences (Cohen’s *d* + and *f*, respectively); +- Cross-variable summaries of NES and adjusted p-values (*e.g.*, + lollipop plots). -Benchmarking mode offers the most comprehensive set of features and -allows users to seamlessly move from discovery to benchmarking mode once +The Benchmarking Mode offers the most comprehensive set of features. +Users are allowed to seamlessly move from Discovery to Benchmarking once a variable of interest has been identified and further testing is -required. The main difference from Discovery mode is that Benchmarking -is designed to evaluate multiple gene sets simultaneously, whereas -Discovery mode focuses on quantifying a single, robust gene set. +required. Benchmarking is designed to evaluate multiple gene sets +simultaneously, whereas Discovery focuses on the performance of a single +gene set. -### 5. Individual Gene Exploration (Optional) +### 5. Individual Gene Exploration To better understand the contribution of individual genes within a gene -set and identify whether specific genes drive the overall signal, -`markeR` offers a suite of gene-level exploratory analyses, including: - -- Expression heatmaps of genes across samples and groups -- Violin plots showing expression distributions of individual genes -- Correlation heatmaps to reveal co-expression patterns among genes in - the set -- ROC curves and AUC values for individual genes to evaluate their - discriminatory power -- Effect size calculations (Cohen’s *d*) per gene to quantify - differential expression -- Principal Component Analysis (PCA) on gene set genes to assess - variance explained and sample clustering +set, and identify whether specific genes drive the set’s collective +signal, `markeR` provides `VisualiseIndividualGenes.` Available options +include: + +- Expression heatmaps of genes across samples or groups of samples; +- Violin plots showing cross-sample expression distributions of + individual genes; +- Heatmaps of pairwise cross-sample expression correlation between genes + in the set; +- ROC curves and AUC values to evaluate single genes’ performance as + phenotypic markers; +- Effect size estimation (Cohen’s *d*) of expression differences between + groups of samples; +- Principal Component Analysis (PCA) of expression of genes in the set, + to evaluate which genes dominate collective variance and how samples + separate according to the gene set’s expression. + +### 6. Compare with Reference Gene Sets + +`markeR` also supports comparison of user-defined gene sets against +reference collections (e.g., MSigDB). Two complementary similarity +metrics are implemented: + +- **Jaccard Index**: the ratio of the number of genes in common over the + total number of genes in the two sets. + +- **Log Odds Ratio (logOR)** from Fisher’s exact test of association + between gene sets, given a specified gene universe. -### 6. Compare with Reference Gene Sets (Optional) - -`markeR` allows comparison of user-defined gene sets to reference sets -(e.g., from MSigDB) using: - -- **Jaccard Index**: Measures gene overlap relative to union size. +Filters can be applied based on similarity thresholds (e.g., minimum +Jaccard, OR, or Fisher’s test p-value). -- **Log Odds Ratio (logOR)**: Computes enrichment using a user-defined - gene universe and Fisher’s exact test. +## Python Bridge -Filters can be applied based on similarity thresholds (e.g., minimum -Jaccard, OR, or p-value). +For users who prefer Python, a lightweight bridge is available in +`python/` that allows calling any `markeR` function from a Python +environment via [`rpy2`](https://rpy2.github.io/). It includes a +tutorial workflow script and a generic command-line wrapper. See +[`python/README.md`](inst/python/README.md) for installation +instructions and usage examples. ## Contact diff --git a/_pkgdown.yml b/_pkgdown.yml index b7206c0..2f0132b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -11,7 +11,7 @@ badges: true home: sidebar: structure: [links, license, community, citation, authors, dev] - + reference: - title: "Package Help Page" contents: diff --git a/docs/authors.html b/docs/authors.html index cd6f9f9..45a8ed8 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -7,9 +7,7 @@ markeR - - 0.9.3 - + 0.99.2