reactome · shivanshuyadav921 · Mar 29, 2026
diff --git a/README.md b/README.md
@@ -2,6 +2,12 @@
 
 An [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) server that exposes the [Reactome](https://reactome.org/) pathway knowledgebase to AI assistants. It wraps Reactome's Content Service and Analysis Service REST APIs, giving LLMs the ability to search, browse, analyse, and export biological pathway data through natural language.
 
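+This project also implements a Retrieval-Augmented Generation (RAG) layer on top of Reactome data. Its query-understanding module uses an OpenAI chat model (configured through the `OPENAI_API_KEY` environment variable) to classify the intent of a question as explanation, comparison, lookup, disease, or mechanism, and to decompose complex, multi-part biological questions into self-contained sub-queries that can be retrieved independently, with the aim of improving retrieval quality.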
+
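+Additionally, it includes a citation and verification layer that grounds generated answers in Reactome: it attaches Reactome stable IDs (currently human `R-HSA-…` identifiers) taken from the retrieved context to sentences that lack a citation, marks sentences for which no matching reference is found, and validates the pathway IDs cited in the answer against the Reactome REST API.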
+
+The result is a more accurate, interpretable, and trustworthy biological question-answering system.
+
 ## Features
 
 - **Pathway enrichment analysis** — submit gene/protein lists and retrieve over-representation results, including p-values, FDR, and found/not-found identifiers
@@ -12,6 +18,7 @@ An [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) server that
 - **Export** — diagrams (PNG/SVG/JPG/GIF), SBGN, SBML, PDF reports, and CSV/JSON analysis results
 - **Species & disease** — list available species and disease annotations
 - **ID mapping** — map external identifiers (UniProt, Ensembl, CHEBI, etc.) to Reactome pathways and reactions
+- **RAG Pipeline** — advanced retrieval-augmented generation with query understanding, citation verification, and Reactome-grounded answers
 
 Over 40 tools and 10 resources are registered — see [Tools](#tools) and [Resources](#resources) below for the full list.
 
@@ -165,6 +172,14 @@ Starts a local web UI with an MCP bridge for browser-based exploration.
 | `reactome_orthology` | Get orthologous events/entities in another species |
 | `reactome_query` | Query any Reactome database object by identifier |
 
+### RAG Pipeline (3 tools)
+
+| Tool | Description |
+|------|-------------|
+| `reactome_query_understanding` | Analyze biological queries to classify intent and decompose into sub-queries |
+| `reactome_citation_verification` | Verify citations in answers and ensure grounding in Reactome data |
+| `reactome_rag_pipeline` | Complete RAG pipeline: understanding → retrieval → generation → verification |
+
 ## Resources
 
 ### Static

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -26,6 +26,7 @@
   "license": "Apache-2.0",
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.12.0",
+    "openai": "^6.33.0",
     "zod": "^3.25.0"
   },
   "devDependencies": {

diff --git a/src/citationVerification.ts b/src/citationVerification.ts
@@ -0,0 +1,146 @@
+import { contentClient } from "./clients/content.js";
+
+/**
+ * Outcome of `verifyCitations`.
+ */
+export interface CitationVerificationResult {
+  /** The answer text after citation injection has been applied to it. */
+  final_answer: string;
+  /** Reactome IDs found in the submitted answer whose Content Service lookup succeeded. */
+  valid_ids: string[];
+  /** Reactome IDs found in the submitted answer whose Content Service lookup threw. */
+  invalid_ids: string[];
+}
+
+/**
+ * One line of retrieval context, split into its text and the Reactome ID
+ * that followed it in square brackets.
+ */
+export interface ContextEntry {
+  /** Free text that preceded the bracketed ID, with surrounding whitespace removed. */
+  statement: string;
+  /** Reactome stable ID taken from the brackets, in the form `R-HSA-<digits>`. */
+  id: string;
+}
+
+/**
+ * Collect the distinct human Reactome stable IDs (`R-HSA-<digits>`) mentioned in `text`.
+ *
+ * @param text - Arbitrary text to scan.
+ * @returns Each distinct ID once, in order of first appearance; an empty array when none occur.
+ */
+function extractReactomeIds(text: string): string[] {
+  const found = text.match(/R-HSA-\d+/g) ?? [];
+  // A Set keeps insertion order, so duplicates collapse onto their first occurrence.
+  return Array.from(new Set(found));
+}
+
+/**
+ * Check whether the Reactome Content Service can resolve `id`.
+ *
+ * @param id - Identifier to look up; it is URL-encoded before being placed in the request path.
+ * @returns `true` when the lookup completes without throwing, `false` when anything in it throws.
+ */
+async function validateReactomeId(id: string): Promise<boolean> {
+  let resolved = false;
+  try {
+    const path = `/data/query/enhanced/${encodeURIComponent(id)}`;
+    await contentClient.get(path);
+    resolved = true;
+  } catch {
+    // Every failure is reported as "not valid".
+    // NOTE(review): presumably this also covers network errors and timeouts, not just
+    // unknown IDs; confirm against contentClient before relying on a `false` result.
+  }
+  return resolved;
+}
+
+/**
+ * Parse a context string into statement/ID pairs.
+ *
+ * Each non-blank line is expected to look like `statement [R-HSA-XXXXX]`, for example
+ * `statement → something [R-HSA-XXXXX]`. Lines that do not end in a bracketed
+ * `R-HSA-<digits>` ID, or that have no text before it, are skipped.
+ *
+ * @param context - Newline-separated context; both LF and CRLF line endings are accepted.
+ * @returns One entry per matching line, in input order.
+ */
+function parseContext(context: string): ContextEntry[] {
+  const entryPattern = /^(.+?)\s*\[(R-HSA-\d+)\]$/;
+  const entries: ContextEntry[] = [];
+
+  for (const rawLine of context.split(/\r?\n/)) {
+    // Trim first: the pattern is anchored with `$`, so a trailing space or a stray `\r`
+    // would otherwise make a well-formed line fail to match.
+    const line = rawLine.trim();
+    if (!line) continue;
+
+    const match = entryPattern.exec(line);
+    const statement = match?.[1]?.trim();
+    const id = match?.[2];
+    // An entry with an empty statement would substring-match every sentence later on.
+    if (statement && id) {
+      entries.push({ statement, id });
+    }
+  }
+
+  return entries;
+}
+
+/**
+ * Find the first context entry that plausibly supports `statement`.
+ *
+ * An entry matches when, ignoring case, either text contains the other, or when the two
+ * share at least two distinct words longer than three characters. Punctuation attached to
+ * the start or end of a word is ignored, so `apoptosis,` and `apoptosis` count as the same word.
+ *
+ * @param statement - Sentence to find a citation for.
+ * @param contextEntries - Candidate entries, checked in order.
+ * @returns The ID of the first matching entry, or `null` when nothing matches or `statement` is blank.
+ */
+function findMatchingId(statement: string, contextEntries: ContextEntry[]): string | null {
+  // Distinct significant words of a lower-cased text, with edge punctuation stripped.
+  const significantWords = (text: string): Set<string> => {
+    const words = new Set<string>();
+    for (const raw of text.split(/\s+/)) {
+      const word = raw.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, '');
+      if (word.length > 3) words.add(word);
+    }
+    return words;
+  };
+
+  const stmtLower = statement.trim().toLowerCase();
+  // Every string contains "", so a blank statement would otherwise match the first entry.
+  if (!stmtLower) return null;
+  const stmtWords = significantWords(stmtLower);
+
+  for (const entry of contextEntries) {
+    const entryLower = entry.statement.trim().toLowerCase();
+    if (!entryLower) continue; // same reason: a blank entry would match any statement
+
+    // Check if statement contains the context statement or vice versa
+    if (stmtLower.includes(entryLower) || entryLower.includes(stmtLower)) {
+      return entry.id;
+    }
+
+    // Count distinct shared words, so one repeated word cannot reach the threshold alone.
+    let shared = 0;
+    for (const word of significantWords(entryLower)) {
+      if (stmtWords.has(word)) shared += 1;
+    }
+    if (shared >= 2) {
+      return entry.id;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Add a citation to every sentence of `answer` that does not already contain a Reactome ID.
+ *
+ * Sentences are split where a run of `.`, `!` or `?` is followed by whitespace, so decimals
+ * such as `0.05` stay intact. A sentence that already contains an ID is kept as is; otherwise
+ * the ID chosen by `findMatchingId` is appended in square brackets, or the marker
+ * `[No validated Reactome reference found]` when there is no match. The citation is placed
+ * before the sentence's own terminator, which is preserved; a sentence without one gets `.`.
+ *
+ * @param answer - Generated answer text.
+ * @param contextEntries - Parsed context entries used to look up citations.
+ * @returns The sentences joined by single spaces, or an empty string when `answer` has none.
+ */
+function injectCitations(answer: string, contextEntries: ContextEntry[]): string {
+  const processedSentences: string[] = [];
+
+  for (const sentence of answer.trim().split(/(?<=[.!?])\s+/)) {
+    // Separate the sentence text from its trailing terminator run ("." / "?!" / "...").
+    const parts = /^(.*?)([.!?]*)$/s.exec(sentence);
+    const body = (parts?.[1] ?? sentence).trim();
+    if (!body) continue; // nothing but punctuation or whitespace
+    const terminator = parts?.[2] || '.';
+
+    let cited = body;
+    if (extractReactomeIds(body).length === 0) {
+      const matchingId = findMatchingId(body, contextEntries);
+      cited = matchingId
+        ? `${body} [${matchingId}]`
+        : `${body} [No validated Reactome reference found]`;
+    }
+    processedSentences.push(cited + terminator);
+  }
+
+  return processedSentences.join(' ');
+}
+
+/**
+ * Validate the Reactome IDs cited in an answer and add citations where they are missing.
+ *
+ * IDs are extracted from `answer` as submitted and looked up concurrently. Citation
+ * injection runs afterwards, so IDs that injection copies in from `context` are not
+ * looked up and do not appear in `valid_ids` or `invalid_ids`.
+ *
+ * @param answer - Generated answer text to check.
+ * @param context - Newline-separated context lines of the form `statement [R-HSA-XXXXX]`.
+ * @returns The citation-injected answer plus the cited IDs split by lookup outcome.
+ */
+export async function verifyCitations(
+  answer: string,
+  context: string
+): Promise<CitationVerificationResult> {
+  const citedIds = extractReactomeIds(answer);
+
+  // One lookup per ID, all in flight at once; outcomes line up with citedIds by index.
+  const outcomes = await Promise.all(citedIds.map((id) => validateReactomeId(id)));
+
+  const valid_ids: string[] = [];
+  const invalid_ids: string[] = [];
+  citedIds.forEach((id, index) => {
+    if (outcomes[index]) {
+      valid_ids.push(id);
+    } else {
+      invalid_ids.push(id);
+    }
+  });
+
+  const final_answer = injectCitations(answer, parseContext(context));
+
+  return { final_answer, valid_ids, invalid_ids };
+}
diff --git a/src/queryUnderstanding.ts b/src/queryUnderstanding.ts
@@ -0,0 +1,79 @@
+import OpenAI from 'openai';
+
+/** Intent labels the classifier is allowed to return. */
+const QUERY_INTENTS = ['explanation', 'comparison', 'lookup', 'disease', 'mechanism'] as const;
+
+/** Structured interpretation of a user query, as returned by `understandQuery`. */
+export interface QueryUnderstandingResult {
+  /** The query exactly as the caller supplied it. */
+  original_query: string;
+  /** Intent category chosen by the model. */
+  intent: 'explanation' | 'comparison' | 'lookup' | 'disease' | 'mechanism';
+  /** Model-reported confidence in the classification, from 0 to 1 inclusive. */
+  confidence: number;
+  /** Sub-queries to retrieve for; always contains at least one entry. */
+  sub_queries: string[];
+}
+
+/** Task instructions sent as the system message; the user's query is sent separately. */
+const QUERY_UNDERSTANDING_INSTRUCTIONS = `
+You are a biological query understanding system for a RAG chatbot. Your task is to analyze a user query and return a structured JSON response.
+
+First, classify the intent into one of these categories:
+- explanation: asking for clarification or description of biological concepts
+- comparison: comparing different biological entities, processes, or conditions
+- lookup: searching for specific information like names, IDs, or basic facts
+- disease: queries related to diseases, their causes, symptoms, or treatments
+- mechanism: asking about how biological processes work at a molecular or cellular level
+
+Then, determine if the query is complex or multi-part. If it is, break it into smaller, independent sub-queries that:
+- Are self-contained
+- Can retrieve relevant biological information independently
+- Preserve the original intent
+- Avoid redundancy
+
+For simple queries, return a single sub-query that is the query itself.
+
+Output must be valid JSON in this exact format:
+{
+  "original_query": "<user query>",
+  "intent": "<classified intent>",
+  "confidence": <0 to 1>,
+  "sub_queries": [
+    "<sub-query 1>",
+    "<sub-query 2>",
+    ...
+  ]
+}
+
+Be robust to simple, complex, and ambiguous queries.
+`;
+
+let cachedClient: OpenAI | undefined;
+
+/**
+ * Return the shared OpenAI client, creating it on first use.
+ *
+ * The client is built lazily because the SDK constructor throws when no API key is
+ * available; constructing it at module scope would make importing this file fail
+ * whenever `OPENAI_API_KEY` is unset, even if query understanding is never called.
+ */
+function getOpenAIClient(): OpenAI {
+  if (cachedClient === undefined) {
+    cachedClient = new OpenAI({
+      apiKey: process.env.OPENAI_API_KEY,
+    });
+  }
+  return cachedClient;
+}
+
+/**
+ * Classify the intent of a biological query and split it into retrievable sub-queries.
+ *
+ * @param query - The user's question, passed to the model as data in its own message.
+ * @returns The validated interpretation. `original_query` is always the `query` argument
+ *   itself, and `sub_queries` falls back to `[query]` when the model returns none.
+ * @throws Error `No response from LLM` when the completion has no content.
+ * @throws Error starting with `Failed to parse LLM response:` when the content is not valid
+ *   JSON or does not have the expected structure.
+ * @throws Whatever the OpenAI SDK throws, including when `OPENAI_API_KEY` is not set.
+ */
+export async function understandQuery(query: string): Promise<QueryUnderstandingResult> {
+  const response = await getOpenAIClient().chat.completions.create({
+    model: 'gpt-4o-mini', // or gpt-3.5-turbo for cost
+    messages: [
+      { role: 'system', content: QUERY_UNDERSTANDING_INSTRUCTIONS },
+      // JSON-quoting keeps quotes or newlines in the query from reading as part of the instructions.
+      { role: 'user', content: `Query: ${JSON.stringify(query)}` },
+    ],
+    temperature: 0.1, // low for consistency
+    // JSON mode: the model must return a single JSON object rather than prose or fenced code.
+    response_format: { type: 'json_object' },
+  });
+
+  const content = response.choices[0]?.message?.content;
+  if (!content) {
+    throw new Error('No response from LLM');
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(content.trim());
+  } catch (error) {
+    throw new Error(`Failed to parse LLM response: ${String(error)}`);
+  }
+
+  if (typeof parsed !== 'object' || parsed === null) {
+    throw new Error('Failed to parse LLM response: Error: Invalid response structure');
+  }
+  const { intent, confidence, sub_queries: subQueries } = parsed as Record<string, unknown>;
+
+  const matchedIntent = QUERY_INTENTS.find((candidate) => candidate === intent);
+  const confidenceOk = typeof confidence === 'number' && confidence >= 0 && confidence <= 1;
+  const subQueriesOk =
+    Array.isArray(subQueries) && subQueries.every((q): q is string => typeof q === 'string');
+
+  if (matchedIntent === undefined || !confidenceOk || !subQueriesOk) {
+    throw new Error('Failed to parse LLM response: Error: Invalid response structure');
+  }
+
+  // Drop blank sub-queries; an empty list would leave retrieval with nothing to search for.
+  const usable = subQueries.map((q) => q.trim()).filter((q) => q.length > 0);
+
+  return {
+    // Use the caller's text, not the model's echo, so the field is never a paraphrase.
+    original_query: query,
+    intent: matchedIntent,
+    confidence,
+    sub_queries: usable.length > 0 ? usable : [query],
+  };
+}