From 7206ada19d66f33abc6de30d0498597bdebe74c4 Mon Sep 17 00:00:00 2001 From: bleedblack1 Date: Thu, 12 Mar 2026 11:05:58 +0530 Subject: [PATCH] Complete the fix for multi-language support --- src/agent/profiles/react_to_me.py | 22 ++++++++++++-- .../summarize_reactome_uniprot.py | 30 ++++++++++++------- src/agent/tasks/rephrase.py | 24 ++++++++++----- src/retrievers/reactome/prompt.py | 5 ++-- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/agent/profiles/react_to_me.py b/src/agent/profiles/react_to_me.py index dab20f0..1eb17ef 100644 --- a/src/agent/profiles/react_to_me.py +++ b/src/agent/profiles/react_to_me.py @@ -73,9 +73,27 @@ async def generate_unsafe_response( async def call_model( self, state: ReactToMeState, config: RunnableConfig ) -> ReactToMeState: + # Build the query, injecting a language instruction for non-English users. + # Retrieval is always done in English (embeddings are English), but the + # final response must be in the user's detected language. + query = state["rephrased_input"] + detected_language = state.get("detected_language", "English") + + if detected_language.lower() != "english": + query = ( + f"{query}\n\n" + f"[CRITICAL INSTRUCTION: You MUST write your entire response in " + f"{detected_language}. The retrieved context is in English because " + f"the Reactome database is English-only, but your answer to the user " + f"MUST be entirely in {detected_language}. " + f"Keep all gene symbols, protein names, pathway identifiers, " + f"Reactome IDs (e.g. 
R-HSA-*), and URLs in their original English " + f"form — do NOT translate scientific nomenclature or citation links.]" + ) + result: dict[str, Any] = await self.reactome_rag.ainvoke( { - "input": state["rephrased_input"], + "input": query, "chat_history": ( state["chat_history"] if state["chat_history"] @@ -97,4 +115,4 @@ def create_reactome_graph( llm: BaseChatModel, embedding: Embeddings, ) -> StateGraph: - return ReactToMeGraphBuilder(llm, embedding).uncompiled_graph + return ReactToMeGraphBuilder(llm, embedding).uncompiled_graph \ No newline at end of file diff --git a/src/agent/tasks/cross_database/summarize_reactome_uniprot.py b/src/agent/tasks/cross_database/summarize_reactome_uniprot.py index 8ea1746..a5a52d7 100644 --- a/src/agent/tasks/cross_database/summarize_reactome_uniprot.py +++ b/src/agent/tasks/cross_database/summarize_reactome_uniprot.py @@ -4,18 +4,18 @@ from langchain_core.runnables import Runnable summarization_message = """ -You are an expert in molecular biology with significant experience as a curator for the UniProt Database adn the Reactome Pathway Knowledgebase. -Your task is to answer user's question in a clear, accurate, and comprehensive and engaging manner based strictly on the context provided from the UniProt and Reactome Pathway Knowledgebases. +You are an expert in molecular biology with significant experience as a curator for the UniProt Database and the Reactome Pathway Knowledgebase. +Your task is to answer the user's question in a clear, accurate, comprehensive, and engaging manner based strictly on the context provided from the UniProt and Reactome Pathway Knowledgebases. Instructions: 1. Provide answers **strictly based on the given context from the Reactome and UniProt Knowledgebase**. Do **not** use or infer information from any external sources. 2. 
If the answer cannot be derived from the context provided, do **not** answer the question; instead explain that the information is not currently available in Reactome or UniProt. - 3. Extract Key Insights: Identify the most relevant and accurate details from both databases; Focus on points that directly address the user’s question. - 4. Merge Information: Combine overlapping infromation concisely while retining key biological terms terminology (e.g., gene names, protein names, pathway names, disease involvement, etc.) + 3. Extract Key Insights: Identify the most relevant and accurate details from both databases; Focus on points that directly address the user's question. + 4. Merge Information: Combine overlapping information concisely while retaining key biological terminology (e.g., gene names, protein names, pathway names, disease involvement, etc.) 5. Ensure Clarity & Accuracy: - - The response should be well-structured, factually correct, and directly answer the user’s question. + - The response should be well-structured, factually correct, and directly answer the user's question. - Use clear language and logical transitions so the reader can easily follow the discussion. - 4. Include all Citations From Sources: + 6. Include all Citations From Sources: - Collect and present **all** relevant citations (links) provided to you. - Incorporate or list these citations clearly so the user can trace the information back to each respective database. - Example: @@ -26,9 +26,14 @@ - GATA6 - NR5A2 - 5. Answer in the Language requested. - 6. Write in a conversational and engaging tone suitable for a chatbot. - 6. Use clear, concise language to make complex topics accessible to a wide audience. + 7. **LANGUAGE (CRITICAL)**: You MUST write your entire response in the language specified below. + - The context from Reactome and UniProt is in English because the databases are English-only. + - However, your response MUST be entirely in the requested language. 
+ - Preserve ALL scientific terminology in English: gene names, protein names, pathway names, + Reactome IDs (R-HSA-*), UniProt IDs, and URLs must remain in their original English form. + - Only translate the explanatory narrative text. + 8. Write in a conversational and engaging tone suitable for a chatbot. + 9. Use clear, concise language to make complex topics accessible to a wide audience. """ summarizer_prompt = ChatPromptTemplate.from_messages( @@ -36,7 +41,10 @@ ("system", summarization_message), ( "human", - "User question: {input} \n\n Language: {detected_language} \n\n Reactome-drived information: \n {reactome_answer} \n\n UniProt-drived infromation: \n {uniprot_answer}.", + "User question: {input} \n\n " + "Response Language: {detected_language} \n\n " + "Reactome-derived information: \n {reactome_answer} \n\n " + "UniProt-derived information: \n {uniprot_answer}.", ), ] ) @@ -49,4 +57,4 @@ def create_reactome_uniprot_summarizer( llm = llm.model_copy(update={"streaming": True}) return (summarizer_prompt | llm | StrOutputParser()).with_config( run_name="summarize_answer" - ) + ) \ No newline at end of file diff --git a/src/agent/tasks/rephrase.py b/src/agent/tasks/rephrase.py index 1851747..4506c25 100644 --- a/src/agent/tasks/rephrase.py +++ b/src/agent/tasks/rephrase.py @@ -4,15 +4,25 @@ from langchain_core.runnables import Runnable contextualize_q_system_prompt = """ -You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user’s latest query to fully understand their intent and what they seek to learn. -If the user's question is not in English, reformulate the question and translate it to English, ensuring the meaning and intent are preserved. -Reformulate the user’s question into a standalone version that retains its full meaning without requiring prior context. 
The reformulated question should be: +You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user's latest query to fully understand their intent and what they seek to learn. + +## Cross-Lingual Strategy +The Reactome and UniProt databases are indexed entirely in English. To maximize retrieval quality, +the reformulated question MUST always be in English regardless of the user's input language. +The downstream generation step handles translating the response back to the user's language. + +If the user's question is not in English, translate it to English while preserving: + - The exact biological intent and meaning + - All gene symbols, protein names, and identifiers in their original form + - The specificity of the question (do not generalize) + +Reformulate the user's question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be: - Clear, concise, and precise - Optimized for both vector search (semantic meaning) and case-sensitive keyword search - - Faithful to the user’s intent and scientific accuracy + - Faithful to the user's intent and scientific accuracy -the returned question should always be in English. -If the user’s question is already in English, self-contained and well-formed, return it as is. +The returned question MUST always be in English. +If the user's question is already in English, self-contained and well-formed, return it as is. Do NOT answer the question or provide explanations. 
""" @@ -28,4 +38,4 @@ def create_rephrase_chain(llm: BaseChatModel) -> Runnable: return (contextualize_q_prompt | llm | StrOutputParser()).with_config( run_name="rephrase_question" - ) + ) \ No newline at end of file diff --git a/src/retrievers/reactome/prompt.py b/src/retrievers/reactome/prompt.py index d570cb9..ad0f827 100644 --- a/src/retrievers/reactome/prompt.py +++ b/src/retrievers/reactome/prompt.py @@ -4,7 +4,7 @@ You are an expert in molecular biology with access to the **Reactome Knowledgebase**. Your primary responsibility is to answer the user's questions **comprehensively, mechanistically, and with precision**, drawing strictly from the **Reactome Knowledgebase**. -Your output must emphasize biological processes, molecular complexes, regulatory mechanisms, and interactions most relevant to the user’s question. +Your output must emphasize biological processes, molecular complexes, regulatory mechanisms, and interactions most relevant to the user's question. Provide an information-rich narrative that explains not only what is happening but also how and why, based only on Reactome context. @@ -24,6 +24,7 @@ - Examples: - Apoptosis - Cell Cycle +6. **Language**: If the user's question contains a language instruction (e.g., "[CRITICAL INSTRUCTION: ... in French]"), you MUST respond in that language. Preserve all gene symbols, protein names, Reactome IDs, and URLs in their original English form — only translate the explanatory text. ## Internal QA (silent) - All factual claims are cited correctly. @@ -37,4 +38,4 @@ MessagesPlaceholder(variable_name="chat_history"), ("user", "Context:\n{context}\n\nQuestion: {input}"), ] -) +) \ No newline at end of file