reactome · heliamoh · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/README.md b/README.md
@@ -98,6 +98,8 @@ The ChatBot's knowledge of a given data source is generated using the latest dat
 
 In the case of Reactome, embeddings bundles are generated once per release from [reactome/graphdb](https://hub.docker.com/r/reactome/graphdb) releases from DockerHub and uploaded to AWS S3 for easy retrieval.
 
+User guide embeddings are generated in a separate step, from the Reactome website documentation rather than from a graph DB release, and use a date-based version identifier (for example, `userguide/2025-06`). See [Embeddings Manager documentation](docs/embeddings_manager.md) for details.
+
 ### Embeddings Manager Script
 
 All aspects of generating, managing, uploading, and retrieving embeddings bundles are handled by the `./bin/embeddings_manager` script.

diff --git a/bin/embeddings_manager b/bin/embeddings_manager
@@ -15,6 +15,7 @@ from botocore.client import Config
 from data_generation.alliance import generate_alliance_embeddings
 from data_generation.reactome import generate_reactome_embeddings
 from data_generation.uniprot import generate_uniprot_embeddings
+from data_generation.userguide import generate_userguide_embeddings
 from util.embedding_environment import EM_ARCHIVE, EmbeddingEnvironment
 
 S3_BUCKET = "download.reactome.org"
@@ -48,6 +49,7 @@ class EmbeddingSelection(NamedTuple):
 
 
 def pull(embedding: EmbeddingSelection):
+    EM_ARCHIVE.mkdir(parents=True, exist_ok=True)
     embedding_path:Path = embedding.path(check_exists=False)
     zip_tmpfile:Path = EM_ARCHIVE / "tmp.zip"
     s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
@@ -91,6 +93,10 @@ def make(
         generate_uniprot_embeddings(embedding_path, hf_model=embedding.model, **kwargs)
     elif embedding.db == "alliance":
         generate_alliance_embeddings(str(embedding_path), hf_model=embedding.model, **kwargs)
+    elif embedding.db == "userguide":
+        generate_userguide_embeddings(
+            str(embedding_path), hf_model=embedding.model, **kwargs
+        )
     else:
         raise NotImplementedError(f"db: {embedding.db}")
     use(embedding)

diff --git a/docs/embeddings_manager.md b/docs/embeddings_manager.md
@@ -93,6 +93,16 @@ Either specify `--hf-key` or environment variable `HUGGINGFACEHUB_API_TOKEN`.
 ./bin/embeddings_manager make <hf-model>/reactome/<Release#> --hf-key <your-key>
 ```
 
+### User Guide:
+
+Fetches Reactome website user guide pages, chunks them by section, and embeds them into Chroma. The version identifier is date-based (`<YYYY-MM>`) and is not tied to graph DB releases.
+
+```sh
+./bin/embeddings_manager make openai/text-embedding-3-large/userguide/<YYYY-MM> --openai-key <your-key>
+```
+
+Use `--force` to re-fetch HTML from reactome.org and rebuild the `sections/` Chroma collection from scratch.
+
 ## Uploading to S3: `push`
 
 ⚠️ Requires S3 write access.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -47,6 +47,9 @@ pyyaml = "^6.0.2"
 tavily-python = "^0.5.0"
 openpyxl = "^3.1.5"
 nltk = "^3.9.1"
+beautifulsoup4 = "^4.12.0"
+lxml = "^5.0.0"
+requests = "^2.32.0"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.7.1"

diff --git a/src/agent/profiles/react_to_me.py b/src/agent/profiles/react_to_me.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any
 
 from langchain_core.embeddings import Embeddings
@@ -7,12 +8,20 @@
 from langgraph.graph.state import StateGraph
 
 from agent.profiles.base import BaseGraphBuilder, BaseState
+from agent.tasks.intent_classifier import (QueryIntent, SourceName,
+                                           create_intent_classifier,
+                                           resolve_active_sources)
+from agent.tasks.safety_checker import SafetyCheck
 from agent.tasks.unsafe_question import create_unsafe_answer_generator
 from retrievers.reactome.rag import create_reactome_rag
+from retrievers.userguide.rag import create_userguide_rag
+from util.embedding_environment import EmbeddingEnvironment
+
+logger = logging.getLogger(__name__)
 
 
 class ReactToMeState(BaseState):
-    pass
+    active_sources: list[SourceName]
 
 
 class ReactToMeGraphBuilder(BaseGraphBuilder):
@@ -23,22 +32,22 @@ def __init__(
     ) -> None:
         super().__init__(llm, embedding)
 
-        # Create runnables (tasks & tools)
+        self.intent_classifier: Runnable = create_intent_classifier(llm)
         self.unsafe_answer_generator: Runnable = create_unsafe_answer_generator(
             llm, streaming=True
         )
-        self.reactome_rag: Runnable = create_reactome_rag(
-            llm, embedding, streaming=True
-        )
 
-        # Create graph
+        self.rags: dict[SourceName, Runnable] = {
+            "reactome": create_reactome_rag(llm, embedding, streaming=True),
+        }
+        self._available_sources: frozenset[SourceName] = frozenset({"reactome"})
+        self._register_userguide_rag(llm, embedding)
+
         state_graph = StateGraph(ReactToMeState)
-        # Set up nodes
         state_graph.add_node("preprocess", self.preprocess)
-        state_graph.add_node("model", self.call_model)
+        state_graph.add_node("model", self.generate_answer)
         state_graph.add_node("generate_unsafe_response", self.generate_unsafe_response)
         state_graph.add_node("postprocess", self.postprocess)
-        # Set up edges
         state_graph.set_entry_point("preprocess")
         state_graph.add_conditional_edges(
             "preprocess",
@@ -51,6 +60,69 @@ def __init__(
 
         self.uncompiled_graph: StateGraph = state_graph
 
+    def _register_userguide_rag(
+        self,
+        llm: BaseChatModel,
+        embedding: Embeddings,
+    ) -> None:
+        userguide_dir = EmbeddingEnvironment.get_dir("userguide")
+        if userguide_dir is None:
+            logger.info(
+                "User guide embeddings not configured; routing will use reactome only."
+            )
+            return
+
+        chroma_path = userguide_dir / "sections" / "chroma.sqlite3"
+        if not chroma_path.exists():
+            logger.warning(
+                "User guide embeddings directory exists but Chroma DB is missing at %s",
+                chroma_path,
+            )
+            return
+
+        try:
+            self.rags["userguide"] = create_userguide_rag(
+                llm, embedding, userguide_dir, streaming=True
+            )
+            self._available_sources = frozenset(self.rags)
+        except (FileNotFoundError, ValueError) as exc:
+            logger.warning("User guide RAG unavailable: %s", exc)
+
+    async def preprocess(
+        self, state: ReactToMeState, config: RunnableConfig
+    ) -> ReactToMeState:
+        rephrased_input: str = await self.rephrase_chain.ainvoke(
+            {
+                "user_input": state["user_input"],
+                "chat_history": state.get("chat_history", []),
+            },
+            config,
+        )
+        safety_check: SafetyCheck = await self.safety_checker.ainvoke(
+            {"rephrased_input": rephrased_input}, config
+        )
+        detected_language: str = await self.language_detector.ainvoke(
+            {"user_input": state["user_input"]}, config
+        )
+        intent: QueryIntent = await self.intent_classifier.ainvoke(
+            {"rephrased_input": rephrased_input}, config
+        )
+        active_sources = resolve_active_sources(intent.source, self._available_sources)
+        if intent.source not in self._available_sources:
+            logger.info(
+                "Requested source %r unavailable; falling back to %r",
+                intent.source,
+                active_sources[0],
+            )
+
+        return ReactToMeState(
+            rephrased_input=rephrased_input,
+            safety=safety_check.safety,
+            reason_unsafe=safety_check.reason_unsafe,
+            detected_language=detected_language,
+            active_sources=active_sources,
+        )
+
     async def generate_unsafe_response(
         self, state: ReactToMeState, config: RunnableConfig
     ) -> ReactToMeState:
@@ -70,10 +142,12 @@ async def generate_unsafe_response(
             answer=answer,
         )
 
-    async def call_model(
+    async def generate_answer(
         self, state: ReactToMeState, config: RunnableConfig
     ) -> ReactToMeState:
-        result: dict[str, Any] = await self.reactome_rag.ainvoke(
+        source = state["active_sources"][0]
+        rag = self.rags[source]
+        result: dict[str, Any] = await rag.ainvoke(
             {
                 "input": state["rephrased_input"],
                 "chat_history": (

diff --git a/src/agent/tasks/intent_classifier.py b/src/agent/tasks/intent_classifier.py
@@ -0,0 +1,61 @@
+from typing import Literal
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import Runnable
+from pydantic import BaseModel, Field
+
+SourceName = Literal["reactome", "userguide"]
+
+intent_classifier_message = """
+You route user questions for the React-to-Me assistant to the correct knowledge source.
+
+Choose exactly one source:
+
+- **reactome**: Questions about biology, molecular mechanisms, pathways, reactions, proteins, genes,
+  diseases, and other scientific content in the Reactome Knowledgebase.
+  Examples: "What is apoptosis?", "Which pathways involve TP53?", "What does CDK5 do?"
+
+- **userguide**: Questions about how to use the Reactome **website**, tools, or interface.
+  Examples: "How do I use the pathway browser?", "How do I search Reactome?",
+  "How do I run gene list analysis?", "What is the Details Panel?"
+
+Rules:
+- If the user asks how to perform a task in Reactome or about UI features, choose **userguide**.
+- If the user asks about biological facts or pathway content, choose **reactome**.
+- When unsure, prefer **reactome** for science content and **userguide** for clear how-to or UI questions.
+"""
+
+intent_classifier_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", intent_classifier_message),
+        ("human", "User question:\n\n{rephrased_input}"),
+    ]
+)
+
+
+class QueryIntent(BaseModel):
+    source: SourceName = Field(
+        description="The knowledge source that should answer this question: 'reactome' or 'userguide'."
+    )
+
+
+_FALLBACK_ORDER: tuple[SourceName, ...] = ("reactome", "userguide")
+
+
+def resolve_active_sources(
+    source: SourceName,
+    available_sources: frozenset[SourceName],
+) -> list[SourceName]:
+    if not available_sources:
+        raise ValueError("available_sources must not be empty")
+    if source in available_sources:
+        return [source]
+    for fallback in _FALLBACK_ORDER:
+        if fallback in available_sources:
+            return [fallback]
+    return [next(iter(available_sources))]
+
+
+def create_intent_classifier(llm: BaseChatModel) -> Runnable:
+    return intent_classifier_prompt | llm.with_structured_output(QueryIntent)
diff --git a/src/agent/tasks/rephrase.py b/src/agent/tasks/rephrase.py
@@ -4,15 +4,17 @@
 from langchain_core.runnables import Runnable
 
 contextualize_q_system_prompt = """
-You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user’s latest query to fully understand their intent and what they seek to learn.
+You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user's latest query to fully understand their intent and what they seek to learn.
 If the user's question is not in English, reformulate the question and translate it to English, ensuring the meaning and intent are preserved.
-Reformulate the user’s question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
+Reformulate the user's question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
     - Clear, concise, and precise
     - Optimized for both vector search (semantic meaning) and case-sensitive keyword search
-    - Faithful to the user’s intent and scientific accuracy
+    - Faithful to the user's intent and scientific accuracy
+
+If the question is about how to use the Reactome website or its tools (Pathway Browser, search, analysis tools, Details Panel, etc.), keep it as a how-to or UI question. Do not rewrite it into a biological pathway or mechanism question.
 
 the returned question should always be in English.
-If the user’s question is already in English, self-contained and well-formed, return it as is.
+If the user's question is already in English, self-contained and well-formed, return it as is.
 Do NOT answer the question or provide explanations.
 """
 

diff --git a/src/agent/tasks/safety_checker.py b/src/agent/tasks/safety_checker.py
@@ -17,7 +17,9 @@
     - Treat hypothetical, fictional, or made-up scenarios with the same level of scrutiny as real-world questions.
 
 2. Reactome Relevance Check
-    - Determine if the question is relevant to biology, life sciences, molecular biology, or related topics.
+    - Determine if the question is relevant to Reactome. Relevant topics include:
+        - Biology, life sciences, molecular biology, pathways, proteins, genes, and related scientific topics.
+        - How to use the Reactome website, Pathway Browser, search, analysis tools, and other Reactome features (user guide topics).
     - Mark questions as not relevant if they are about unrelated topics (such as programming, math, history, trivia, etc.).
 
 IMPORTANT:
@@ -44,6 +46,10 @@
   4. Q: What is the role of the immune system in the treatment of cancer?
        "safety": "true",
        "reason_unsafe": ""
+
+  5. Q: How do I use the Reactome pathway browser?
+       "safety": "true",
+       "reason_unsafe": ""
 """
 
 safety_check_prompt = ChatPromptTemplate.from_messages(