diff --git a/README.md b/README.md index a97e2d1..28460a2 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,12 @@ An [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) server that exposes the [Reactome](https://reactome.org/) pathway knowledgebase to AI assistants. It wraps Reactome's Content Service and Analysis Service REST APIs, giving LLMs the ability to search, browse, analyse, and export biological pathway data through natural language. +This project implements an advanced Retrieval-Augmented Generation (RAG) system on top of Reactome data. It introduces a query understanding module that classifies user intent and decomposes complex biological questions into retrievable sub-queries, improving retrieval quality. + +Additionally, it includes a citation and verification layer that ensures all generated answers are grounded in Reactome by automatically attaching and validating pathway IDs using the Reactome REST API. + +The result is a more accurate, interpretable, and trustworthy biological question-answering system. + ## Features - **Pathway enrichment analysis** — submit gene/protein lists and retrieve over-representation results, including p-values, FDR, and found/not-found identifiers @@ -12,6 +18,7 @@ An [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) server that - **Export** — diagrams (PNG/SVG/JPG/GIF), SBGN, SBML, PDF reports, and CSV/JSON analysis results - **Species & disease** — list available species and disease annotations - **ID mapping** — map external identifiers (UniProt, Ensembl, CHEBI, etc.) to Reactome pathways and reactions +- **RAG Pipeline** — advanced retrieval-augmented generation with query understanding, citation verification, and Reactome-grounded answers Over 40 tools and 10 resources are registered — see [Tools](#tools) and [Resources](#resources) below for the full list. @@ -165,6 +172,14 @@ Starts a local web UI with an MCP bridge for browser-based exploration. 
| `reactome_orthology` | Get orthologous events/entities in another species | | `reactome_query` | Query any Reactome database object by identifier | +### RAG Pipeline (3 tools) + +| Tool | Description | +|------|-------------| +| `reactome_query_understanding` | Analyze biological queries to classify intent and decompose into sub-queries | +| `reactome_citation_verification` | Verify citations in answers and ensure grounding in Reactome data | +| `reactome_rag_pipeline` | Complete RAG pipeline: understanding → retrieval → generation → verification | + ## Resources ### Static diff --git a/package-lock.json b/package-lock.json index 9d63d8e..11f5f86 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,9 +7,10 @@ "": { "name": "reactome-mcp", "version": "1.0.0", - "license": "MIT", + "license": "Apache-2.0", "dependencies": { "@modelcontextprotocol/sdk": "^1.12.0", + "openai": "^6.33.0", "zod": "^3.25.0" }, "bin": { @@ -585,6 +586,16 @@ "node": ">= 0.4" } }, + "node_modules/hono": { + "version": "4.12.9", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.9.tgz", + "integrity": "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA==", + "license": "MIT", + "peer": true, + "engines": { + "node": ">=16.9.0" + } + }, "node_modules/http-errors": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", @@ -781,6 +792,27 @@ "wrappy": "1" } }, + "node_modules/openai": { + "version": "6.33.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-6.33.0.tgz", + "integrity": "sha512-xAYN1W3YsDXJWA5F277135YfkEk6H7D3D6vWwRhJ3OEkzRgcyK8z/P5P9Gyi/wB4N8kK9kM5ZjprfvyHagKmpw==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": 
"https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
diff --git a/package.json b/package.json
index dc0f456..6b8518f 100644
--- a/package.json
+++ b/package.json
@@ -26,6 +26,7 @@
   "license": "Apache-2.0",
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.12.0",
+    "openai": "^6.33.0",
     "zod": "^3.25.0"
   },
   "devDependencies": {
diff --git a/src/citationVerification.ts b/src/citationVerification.ts
new file mode 100644
index 0000000..61db370
--- /dev/null
+++ b/src/citationVerification.ts
@@ -0,0 +1,146 @@
+import { contentClient } from "./clients/content.js";
+
+export interface CitationVerificationResult {
+  final_answer: string;
+  valid_ids: string[];
+  invalid_ids: string[];
+}
+
+export interface ContextEntry {
+  statement: string;
+  id: string;
+}
+
+/**
+ * Extract Reactome stable identifiers from free text.
+ * Matches any species prefix (R-HSA-, R-MMU-, R-ALL-, ...), not only human IDs.
+ */
+function extractReactomeIds(text: string): string[] {
+  const pattern = /R-[A-Z]{3}-\d+/g;
+  const matches = text.match(pattern);
+  return matches ? [...new Set(matches)] : []; // de-duplicate, keep first-seen order
+}
+
+/**
+ * Validate a Reactome ID by querying the Content Service.
+ * Any failure (404, network error, ...) is treated as "invalid".
+ */
+async function validateReactomeId(id: string): Promise<boolean> {
+  try {
+    await contentClient.get(`/data/query/enhanced/${encodeURIComponent(id)}`);
+    return true;
+  } catch (error) {
+    return false;
+  }
+}
+
+/**
+ * Parse a context string into statement-ID pairs.
+ * Expects one entry per line in the form "statement [R-XXX-12345]".
+ */
+function parseContext(context: string): ContextEntry[] {
+  const entries: ContextEntry[] = [];
+  const lines = context.split('\n').filter(line => line.trim());
+
+  for (const line of lines) {
+    // Capture "statement [R-XXX-NNNN]" where the ID is the trailing bracket group
+    const match = line.match(/^(.+?)\s*\[(R-[A-Z]{3}-\d+)\]$/);
+    if (match) {
+      entries.push({
+        statement: match[1].trim(),
+        id: match[2].trim()
+      });
+    }
+  }
+
+  return entries;
+}
+
+/**
+ * Find the best matching Reactome ID for a statement from context.
+ * Heuristic: substring containment first, then at least two shared
+ * "significant" words (longer than 3 characters).
+ */
+function findMatchingId(statement: string, contextEntries: ContextEntry[]): string | null {
+  const stmtLower = statement.toLowerCase();
+
+  for (const entry of contextEntries) {
+    const entryLower = entry.statement.toLowerCase();
+
+    // Direct containment in either direction is the strongest signal
+    if (stmtLower.includes(entryLower) || entryLower.includes(stmtLower)) {
+      return entry.id;
+    }
+
+    // Fall back to counting shared significant words
+    const stmtWords = stmtLower.split(/\s+/);
+    const entryWords = new Set(entryLower.split(/\s+/));
+    const commonWords = stmtWords.filter(word => word.length > 3 && entryWords.has(word));
+    if (commonWords.length >= 2) {
+      return entry.id;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Append a citation to every sentence that lacks one.
+ * Preserves each sentence's own terminator ('.', '!', '?') instead of
+ * normalising everything to a period as a naive split/join would.
+ */
+function injectCitations(answer: string, contextEntries: ContextEntry[]): string {
+  // NOTE: sentence splitting is heuristic; decimals like "0.5" will be split.
+  const sentences = answer.match(/[^.!?]+[.!?]*/g) ?? [];
+  const processed: string[] = [];
+
+  for (const raw of sentences) {
+    const terminator = raw.trim().match(/[.!?]+$/)?.[0] ?? '.';
+    const body = raw.replace(/[.!?]+\s*$/, '').trim();
+    if (!body) continue;
+
+    if (extractReactomeIds(body).length > 0) {
+      // Sentence already carries a Reactome ID; keep it as-is
+      processed.push(`${body}${terminator}`);
+    } else {
+      const matchingId = findMatchingId(body, contextEntries);
+      const citation = matchingId
+        ? `[${matchingId}]`
+        : '[No validated Reactome reference found]';
+      processed.push(`${body} ${citation}${terminator}`);
+    }
+  }
+
+  return processed.join(' ');
+}
+
+/**
+ * Verify citations in an answer and ground every claim in Reactome.
+ * - Extracts all Reactome IDs and validates each against the Content Service.
+ * - Injects citations from the retrieval context into uncited sentences.
+ */
+export async function verifyCitations(
+  answer: string,
+  context: string
+): Promise<CitationVerificationResult> {
+  const extractedIds = extractReactomeIds(answer);
+
+  // Validate all IDs concurrently; each check is an independent HTTP call
+  const validationResults = await Promise.all(
+    extractedIds.map(async (id) => ({ id, isValid: await validateReactomeId(id) }))
+  );
+
+  const validIds = validationResults.filter(r => r.isValid).map(r => r.id);
+  const invalidIds = validationResults.filter(r => !r.isValid).map(r => r.id);
+
+  // Parse context for citation injection
+  const contextEntries = parseContext(context);
+
+  // Inject citations into the answer where missing
+  const finalAnswer = injectCitations(answer, contextEntries);
+
+  return {
+    final_answer: finalAnswer,
+    valid_ids: validIds,
+    invalid_ids: invalidIds
+  };
+}
\ No newline at end of file
diff --git a/src/queryUnderstanding.ts b/src/queryUnderstanding.ts
new file mode 100644
index 0000000..5231b75
--- /dev/null
+++ b/src/queryUnderstanding.ts
@@ -0,0 +1,79 @@
+import OpenAI from 'openai';
+
+const openai = new OpenAI({
+  apiKey: process.env.OPENAI_API_KEY,
+});
+
+export interface QueryUnderstandingResult {
+  original_query: string;
+  intent: 'explanation' | 'comparison' | 'lookup' | 'disease' | 'mechanism';
+  confidence: number;
+  sub_queries: string[];
+}
+
+/**
+ * Classify a biological query's intent and decompose it into sub-queries
+ * using an LLM, returning a validated structured result.
+ */
+export async function understandQuery(query: string): Promise<QueryUnderstandingResult> {
+  const prompt = `
+You are a biological query understanding system for a RAG chatbot. Your task is to analyze a user query and return a structured JSON response.
+
+First, classify the intent into one of these categories:
+- explanation: asking for clarification or description of biological concepts
+- comparison: comparing different biological entities, processes, or conditions
+- lookup: searching for specific information like names, IDs, or basic facts
+- disease: queries related to diseases, their causes, symptoms, or treatments
+- mechanism: asking about how biological processes work at a molecular or cellular level
+
+Then, determine if the query is complex or multi-part. If it is, break it into smaller, independent sub-queries that:
+- Are self-contained
+- Can retrieve relevant biological information independently
+- Preserve the original intent
+- Avoid redundancy
+
+For simple queries, return a single sub-query that is the query itself.
+
+Output must be valid JSON in this exact format:
+{
+  "original_query": "<the original query>",
+  "intent": "<one of: explanation | comparison | lookup | disease | mechanism>",
+  "confidence": <0 to 1>,
+  "sub_queries": [
+    "<sub-query 1>",
+    "<sub-query 2>",
+    ...
+  ]
+}
+
+Be robust to simple, complex, and ambiguous queries.
+
+Query: "${query}"
+`;
+
+  // JSON mode guarantees the reply is a single JSON object (the prompt
+  // mentions "JSON", which JSON mode requires).
+  const response = await openai.chat.completions.create({
+    model: 'gpt-4o-mini', // or gpt-3.5-turbo for cost
+    messages: [{ role: 'user', content: prompt }],
+    temperature: 0.1, // low for consistency
+    response_format: { type: 'json_object' },
+  });
+
+  const content = response.choices[0]?.message?.content;
+  if (!content) {
+    throw new Error('No response from LLM');
+  }
+
+  // Defensively strip Markdown code fences in case the model wraps its JSON
+  const cleaned = content.trim().replace(/^```(?:json)?\s*/i, '').replace(/\s*```$/, '');
+
+  let result: QueryUnderstandingResult;
+  try {
+    result = JSON.parse(cleaned) as QueryUnderstandingResult;
+  } catch (error) {
+    throw new Error(`Failed to parse LLM response: ${error}`);
+  }
+
+  // Validate the structure before trusting it downstream
+  if (
+    typeof result.original_query === 'string' &&
+    ['explanation', 'comparison', 'lookup', 'disease', 'mechanism'].includes(result.intent) &&
+    typeof result.confidence === 'number' &&
+    result.confidence >= 0 && result.confidence <= 1 &&
+    Array.isArray(result.sub_queries) &&
+    result.sub_queries.every(q => typeof q === 'string')
+  ) {
+    return result;
+  }
+  throw new Error('Invalid response structure');
+}
\ No newline at end of file
diff --git a/src/ragPipeline.ts b/src/ragPipeline.ts
new file mode 100644
index 0000000..0b7e8ff
--- /dev/null
+++ b/src/ragPipeline.ts
@@ -0,0 +1,174 @@
+import OpenAI from 'openai';
+import { contentClient } from './clients/content.js';
+import { understandQuery, QueryUnderstandingResult } from './queryUnderstanding.js';
+import { verifyCitations, CitationVerificationResult } from './citationVerification.js';
+import type { SearchResult, SearchEntry } from './types/index.js';
+
+const openai = new OpenAI({
+  apiKey: process.env.OPENAI_API_KEY,
+});
+
+export interface RAGPipelineResult {
+  intent: string;
+  sub_queries: string[];
+  final_answer: string;
+  valid_ids: string[];
+  invalid_ids: string[];
+}
+
+interface MergedSearchEntry extends SearchEntry {
+  sourceSubQuery: string;
+}
+
+/**
+ * Perform a Reactome search for a single sub-query.
+ */
+async function searchSubQuery(subQuery: string): Promise<SearchEntry[]> {
+  try {
+    const params: Record<string, unknown> = {
+      query: subQuery,
+      rows: 50, // Get more
results for better coverage
+      cluster: true,
+    };
+
+    const result = await contentClient.get<SearchResult>('/search/query', params);
+
+    // Flatten grouped results; guard against a missing results field
+    const entries: SearchEntry[] = [];
+    for (const group of result.results ?? []) {
+      entries.push(...group.entries);
+    }
+
+    return entries;
+  } catch (error) {
+    // Best-effort: a failed sub-query must not sink the whole pipeline
+    console.error(`Search failed for sub-query "${subQuery}":`, error);
+    return [];
+  }
+}
+
+/**
+ * Merge and deduplicate search results from multiple sub-queries.
+ * Deduplicates on stId; entries without a stable ID are kept as-is
+ * rather than being collapsed together.
+ */
+function mergeAndDeduplicateResults(results: MergedSearchEntry[]): SearchEntry[] {
+  const seen = new Set<string>();
+  const deduplicated: SearchEntry[] = [];
+
+  for (const entry of results) {
+    const key = entry.stId;
+    if (key) {
+      if (seen.has(key)) continue;
+      seen.add(key);
+    }
+    deduplicated.push(entry);
+  }
+
+  return deduplicated;
+}
+
+/**
+ * Construct structured, human-readable context from search results.
+ * Each entry renders as "Name (Type) [stId]" plus optional detail lines —
+ * the same "statement [ID]" shape the citation verifier parses.
+ */
+function constructContext(entries: SearchEntry[]): string {
+  const contextLines: string[] = [];
+
+  for (const entry of entries) {
+    const name = entry.name.replace(/<[^>]*>/g, ''); // Strip HTML highlight tags
+    const type = entry.exactType;
+    const id = entry.stId;
+
+    contextLines.push(`${name} (${type}) [${id}]`);
+
+    if (entry.summation) {
+      const summary = entry.summation.replace(/<[^>]*>/g, '');
+      contextLines.push(`  Description: ${summary}`);
+    }
+
+    if (entry.species && entry.species.length > 0) {
+      contextLines.push(`  Species: ${entry.species.join(', ')}`);
+    }
+
+    if (entry.referenceIdentifier) {
+      const ref = entry.referenceName
+        ? `${entry.referenceIdentifier} (${entry.referenceName})`
+        : entry.referenceIdentifier;
+      contextLines.push(`  Reference: ${ref}`);
+    }
+
+    contextLines.push(''); // Blank line between entries
+  }
+
+  return contextLines.join('\n');
+}
+
+/**
+ * Generate an answer with the LLM, grounded in the retrieved context and
+ * instructed to cite a Reactome ID for every claim.
+ */
+async function generateAnswer(originalQuery: string, context: string): Promise<string> {
+  const prompt = `
+You are a biological expert answering questions based solely on the provided Reactome context. Your answer must be grounded in the given information and include Reactome IDs for every biological claim.
+
+Context from Reactome:
+${context}
+
+User Question: ${originalQuery}
+
+Instructions:
+- Answer using ONLY the information in the context above
+- For every biological entity, pathway, reaction, or process mentioned, include the Reactome ID in brackets [R-HSA-XXXXX]
+- If something is not covered in the context, state that clearly
+- Be comprehensive but concise
+- Structure your answer logically
+
+Answer:
+`;
+
+  const response = await openai.chat.completions.create({
+    model: 'gpt-4o-mini',
+    messages: [{ role: 'user', content: prompt }],
+    temperature: 0.1, // Low for factual answers
+    max_tokens: 2000,
+  });
+
+  const content = response.choices[0]?.message?.content;
+  if (!content) {
+    throw new Error('No response from LLM');
+  }
+
+  return content.trim();
+}
+
+/**
+ * Main RAG pipeline: understanding → retrieval → merge → context →
+ * generation → citation verification.
+ */
+export async function runRAGPipeline(userQuery: string): Promise<RAGPipelineResult> {
+  // Step 1: Query Understanding
+  const queryResult: QueryUnderstandingResult = await understandQuery(userQuery);
+
+  // Fall back to the raw query if understanding produced no sub-queries,
+  // so retrieval is never skipped entirely
+  const subQueries = queryResult.sub_queries.length > 0 ? queryResult.sub_queries : [userQuery];
+
+  // Step 2: Retrieval using sub-queries (in parallel)
+  const searchPromises = subQueries.map(async (subQuery) => {
+    const entries = await searchSubQuery(subQuery);
+    return entries.map(entry => ({ ...entry, sourceSubQuery: subQuery }));
+  });
+
+  const searchResults = await Promise.all(searchPromises);
+  const allResults = searchResults.flat();
+
+  // Step 3: Merge and deduplicate
+  const deduplicatedEntries = mergeAndDeduplicateResults(allResults);
+
+  // Step 4: Construct context
+  // NOTE(review): rows=50 per sub-query can make this context very large —
+  // consider capping entries if model context limits are hit.
+  const context = constructContext(deduplicatedEntries);
+
+  // Step 5: Generate answer with LLM
+  const rawAnswer = await generateAnswer(userQuery, context);
+
+  // Step 6: Citation verification
+  const verificationResult: CitationVerificationResult = await verifyCitations(rawAnswer, context);
+
+  // Step 7: Return structured result
+  return {
+    intent: queryResult.intent,
+    sub_queries:
queryResult.sub_queries,
+    final_answer: verificationResult.final_answer,
+    valid_ids: verificationResult.valid_ids,
+    invalid_ids: verificationResult.invalid_ids,
+  };
+}
\ No newline at end of file
diff --git a/src/tools/index.ts b/src/tools/index.ts
index af3a1c3..06f7cc9 100644
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -9,6 +9,9 @@
 import { registerSearchTools } from "./search.js";
 import { registerEntityTools } from "./entity.js";
 import { registerExportTools } from "./export.js";
 import { registerInteractorTools } from "./interactors.js";
+import { understandQuery } from "../queryUnderstanding.js";
+import { verifyCitations } from "../citationVerification.js";
+import { runRAGPipeline } from "../ragPipeline.js";
 
 export function registerAllTools(server: McpServer) {
   // Register tools from all modules
@@ -223,4 +226,53 @@ function registerUtilityTools(server: McpServer) {
       };
     }
   );
+
+  // Query understanding for biological RAG chatbot
+  server.tool(
+    "reactome_query_understanding",
+    "Analyze a biological query to classify its intent and decompose it into sub-queries for RAG retrieval.",
+    {
+      query: z.string().describe("The natural language biological query to analyze"),
+    },
+    async ({ query }) => {
+      try {
+        const result = await understandQuery(query);
+        return {
+          content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+        };
+      } catch (error) {
+        // Surface failures (network / LLM errors) as MCP tool errors
+        const message = error instanceof Error ? error.message : String(error);
+        return {
+          content: [{ type: "text", text: `Query understanding failed: ${message}` }],
+          isError: true,
+        };
+      }
+    }
+  );
+
+  // Citation and verification for biological RAG chatbot
+  server.tool(
+    "reactome_citation_verification",
+    "Verify citations in LLM-generated answers and ensure all biological claims are grounded in valid Reactome data.",
+    {
+      answer: z.string().describe("The LLM-generated answer to verify"),
+      context: z.string().describe("The retrieved biological context with Reactome IDs"),
+    },
+    async ({ answer, context }) => {
+      try {
+        const result = await verifyCitations(answer, context);
+        return {
+          content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+        };
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        return {
+          content: [{ type: "text", text: `Citation verification failed: ${message}` }],
+          isError: true,
+        };
+      }
+    }
+  );
+
+  // Complete RAG pipeline for biological queries
+  server.tool(
+    "reactome_rag_pipeline",
+    "Run the complete RAG pipeline: query understanding, retrieval, answer generation, and citation verification.",
+    {
+      query: z.string().describe("The user's biological query"),
+    },
+    async ({ query }) => {
+      try {
+        const result = await runRAGPipeline(query);
+        return {
+          content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+        };
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        return {
+          content: [{ type: "text", text: `RAG pipeline failed: ${message}` }],
+          isError: true,
+        };
+      }
+    }
+  );
 }