diff --git a/src/agent/profiles/react_to_me.py b/src/agent/profiles/react_to_me.py
index dab20f0..1eb17ef 100644
--- a/src/agent/profiles/react_to_me.py
+++ b/src/agent/profiles/react_to_me.py
@@ -73,9 +73,27 @@ async def generate_unsafe_response(
async def call_model(
self, state: ReactToMeState, config: RunnableConfig
) -> ReactToMeState:
+ # Build the query: the rephrased input is expected to be English (embeddings are
+ # English); non-English users get an appended instruction to answer in their language.
+ # NOTE(review): the instruction rides in "input" — confirm it is not also embedded for retrieval.
+ query = state["rephrased_input"]
+ detected_language = state.get("detected_language", "English")
+
+ if detected_language.lower() != "english":
+ query = (
+ f"{query}\n\n"
+ f"[CRITICAL INSTRUCTION: You MUST write your entire response in "
+ f"{detected_language}. The retrieved context is in English because "
+ f"the Reactome database is English-only, but your answer to the user "
+ f"MUST be entirely in {detected_language}. "
+ f"Keep all gene symbols, protein names, pathway identifiers, "
+ f"Reactome IDs (e.g. R-HSA-*), and URLs in their original English "
+ f"form — do NOT translate scientific nomenclature or citation links.]"
+ )
+
result: dict[str, Any] = await self.reactome_rag.ainvoke(
{
- "input": state["rephrased_input"],
+ "input": query,
"chat_history": (
state["chat_history"]
if state["chat_history"]
@@ -97,4 +115,4 @@ def create_reactome_graph(
llm: BaseChatModel,
embedding: Embeddings,
) -> StateGraph:
- return ReactToMeGraphBuilder(llm, embedding).uncompiled_graph
+ return ReactToMeGraphBuilder(llm, embedding).uncompiled_graph
\ No newline at end of file
diff --git a/src/agent/tasks/cross_database/summarize_reactome_uniprot.py b/src/agent/tasks/cross_database/summarize_reactome_uniprot.py
index 8ea1746..a5a52d7 100644
--- a/src/agent/tasks/cross_database/summarize_reactome_uniprot.py
+++ b/src/agent/tasks/cross_database/summarize_reactome_uniprot.py
@@ -4,18 +4,18 @@
from langchain_core.runnables import Runnable
summarization_message = """
-You are an expert in molecular biology with significant experience as a curator for the UniProt Database adn the Reactome Pathway Knowledgebase.
-Your task is to answer user's question in a clear, accurate, and comprehensive and engaging manner based strictly on the context provided from the UniProt and Reactome Pathway Knowledgebases.
+You are an expert in molecular biology with significant experience as a curator for the UniProt Database and the Reactome Pathway Knowledgebase.
+Your task is to answer the user's question in a clear, accurate, comprehensive, and engaging manner based strictly on the context provided from the UniProt and Reactome Pathway Knowledgebases.
Instructions:
1. Provide answers **strictly based on the given context from the Reactome and UniProt Knowledgebase**. Do **not** use or infer information from any external sources.
2. If the answer cannot be derived from the context provided, do **not** answer the question; instead explain that the information is not currently available in Reactome or UniProt.
- 3. Extract Key Insights: Identify the most relevant and accurate details from both databases; Focus on points that directly address the user’s question.
- 4. Merge Information: Combine overlapping infromation concisely while retining key biological terms terminology (e.g., gene names, protein names, pathway names, disease involvement, etc.)
+ 3. Extract Key Insights: Identify the most relevant and accurate details from both databases; Focus on points that directly address the user's question.
+ 4. Merge Information: Combine overlapping information concisely while retaining key biological terminology (e.g., gene names, protein names, pathway names, disease involvement, etc.)
5. Ensure Clarity & Accuracy:
- - The response should be well-structured, factually correct, and directly answer the user’s question.
+ - The response should be well-structured, factually correct, and directly answer the user's question.
- Use clear language and logical transitions so the reader can easily follow the discussion.
- 4. Include all Citations From Sources:
+ 6. Include all Citations From Sources:
- Collect and present **all** relevant citations (links) provided to you.
- Incorporate or list these citations clearly so the user can trace the information back to each respective database.
- Example:
@@ -26,9 +26,14 @@
- GATA6
- NR5A2
- 5. Answer in the Language requested.
- 6. Write in a conversational and engaging tone suitable for a chatbot.
- 6. Use clear, concise language to make complex topics accessible to a wide audience.
+ 7. **LANGUAGE (CRITICAL)**: You MUST write your entire response in the language specified below.
+ - The context from Reactome and UniProt is in English because the databases are English-only.
+ - However, your response MUST be entirely in the requested language.
+ - Preserve ALL scientific terminology in English: gene names, protein names, pathway names,
+ Reactome IDs (R-HSA-*), UniProt IDs, and URLs must remain in their original English form.
+ - Only translate the explanatory narrative text.
+ 8. Write in a conversational and engaging tone suitable for a chatbot.
+ 9. Use clear, concise language to make complex topics accessible to a wide audience.
"""
summarizer_prompt = ChatPromptTemplate.from_messages(
@@ -36,7 +41,10 @@
("system", summarization_message),
(
"human",
- "User question: {input} \n\n Language: {detected_language} \n\n Reactome-drived information: \n {reactome_answer} \n\n UniProt-drived infromation: \n {uniprot_answer}.",
+ "User question: {input} \n\n "
+ "Response Language: {detected_language} \n\n "
+ "Reactome-derived information: \n {reactome_answer} \n\n "
+ "UniProt-derived information: \n {uniprot_answer}.",
),
]
)
@@ -49,4 +57,4 @@ def create_reactome_uniprot_summarizer(
llm = llm.model_copy(update={"streaming": True})
return (summarizer_prompt | llm | StrOutputParser()).with_config(
run_name="summarize_answer"
- )
+ )
\ No newline at end of file
diff --git a/src/agent/tasks/rephrase.py b/src/agent/tasks/rephrase.py
index 1851747..4506c25 100644
--- a/src/agent/tasks/rephrase.py
+++ b/src/agent/tasks/rephrase.py
@@ -4,15 +4,25 @@
from langchain_core.runnables import Runnable
contextualize_q_system_prompt = """
-You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user’s latest query to fully understand their intent and what they seek to learn.
-If the user's question is not in English, reformulate the question and translate it to English, ensuring the meaning and intent are preserved.
-Reformulate the user’s question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
+You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user's latest query to fully understand their intent and what they seek to learn.
+
+## Cross-Lingual Strategy
+The Reactome and UniProt databases are indexed entirely in English. To maximize retrieval quality,
+the reformulated question MUST always be in English regardless of the user's input language.
+The downstream generation step handles translating the response back to the user's language.
+
+If the user's question is not in English, translate it to English while preserving:
+ - The exact biological intent and meaning
+ - All gene symbols, protein names, and identifiers in their original form
+ - The specificity of the question (do not generalize)
+
+Reformulate the user's question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
- Clear, concise, and precise
- Optimized for both vector search (semantic meaning) and case-sensitive keyword search
- - Faithful to the user’s intent and scientific accuracy
+ - Faithful to the user's intent and scientific accuracy
-the returned question should always be in English.
-If the user’s question is already in English, self-contained and well-formed, return it as is.
+The returned question MUST always be in English.
+If the user's question is already in English, self-contained and well-formed, return it as is.
Do NOT answer the question or provide explanations.
"""
@@ -28,4 +38,4 @@
def create_rephrase_chain(llm: BaseChatModel) -> Runnable:
return (contextualize_q_prompt | llm | StrOutputParser()).with_config(
run_name="rephrase_question"
- )
+ )
\ No newline at end of file
diff --git a/src/retrievers/reactome/prompt.py b/src/retrievers/reactome/prompt.py
index d570cb9..ad0f827 100644
--- a/src/retrievers/reactome/prompt.py
+++ b/src/retrievers/reactome/prompt.py
@@ -4,7 +4,7 @@
You are an expert in molecular biology with access to the **Reactome Knowledgebase**.
Your primary responsibility is to answer the user's questions **comprehensively, mechanistically, and with precision**, drawing strictly from the **Reactome Knowledgebase**.
-Your output must emphasize biological processes, molecular complexes, regulatory mechanisms, and interactions most relevant to the user’s question.
+Your output must emphasize biological processes, molecular complexes, regulatory mechanisms, and interactions most relevant to the user's question.
Provide an information-rich narrative that explains not only what is happening but also how and why, based only on Reactome context.
@@ -24,6 +24,7 @@
- Examples:
- Apoptosis
- Cell Cycle
+6. **Language**: If the user's question contains a bracketed language instruction (e.g., "[CRITICAL INSTRUCTION: You MUST write your entire response in French. ...]"), you MUST respond in that language. Preserve all gene symbols, protein names, Reactome IDs, and URLs in their original English form — only translate the explanatory text.
## Internal QA (silent)
- All factual claims are cited correctly.
@@ -37,4 +38,4 @@
MessagesPlaceholder(variable_name="chat_history"),
("user", "Context:\n{context}\n\nQuestion: {input}"),
]
-)
+)
\ No newline at end of file