Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ The ChatBot's knowledge of a given data source is generated using the latest dat

In the case of Reactome, embeddings bundles are generated once per release from [reactome/graphdb](https://hub.docker.com/r/reactome/graphdb) releases from DockerHub and uploaded to AWS S3 for easy retrieval.

User guide embeddings are generated separately from Reactome website documentation and use a date-based version identifier (for example, `userguide/2025-06`). See [Embeddings Manager documentation](docs/embeddings_manager.md) for details.

### Embeddings Manager Script

All aspects of generating, managing, uploading, and retrieving embeddings bundles are handled by the `./bin/embeddings_manager` script.
Expand Down
6 changes: 6 additions & 0 deletions bin/embeddings_manager
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from botocore.client import Config
from data_generation.alliance import generate_alliance_embeddings
from data_generation.reactome import generate_reactome_embeddings
from data_generation.uniprot import generate_uniprot_embeddings
from data_generation.userguide import generate_userguide_embeddings
from util.embedding_environment import EM_ARCHIVE, EmbeddingEnvironment

S3_BUCKET = "download.reactome.org"
Expand Down Expand Up @@ -48,6 +49,7 @@ class EmbeddingSelection(NamedTuple):


def pull(embedding: EmbeddingSelection):
EM_ARCHIVE.mkdir(parents=True, exist_ok=True)
embedding_path:Path = embedding.path(check_exists=False)
zip_tmpfile:Path = EM_ARCHIVE / "tmp.zip"
s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
Expand Down Expand Up @@ -91,6 +93,10 @@ def make(
generate_uniprot_embeddings(embedding_path, hf_model=embedding.model, **kwargs)
elif embedding.db == "alliance":
generate_alliance_embeddings(str(embedding_path), hf_model=embedding.model, **kwargs)
elif embedding.db == "userguide":
generate_userguide_embeddings(
str(embedding_path), hf_model=embedding.model, **kwargs
)
else:
raise NotImplementedError(f"db: {embedding.db}")
use(embedding)
Expand Down
10 changes: 10 additions & 0 deletions docs/embeddings_manager.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,16 @@ Either specify `--hf-key` or environment variable `HUGGINGFACEHUB_API_TOKEN`.
./bin/embeddings_manager make <hf-model>/reactome/<Release#> --hf-key <your-key>
```

### User Guide:

Fetches the Reactome website user guide pages, chunks them by section, and embeds them into Chroma. The version identifier is date-based (`<YYYY-MM>`) and is not tied to graph DB releases.

```sh
./bin/embeddings_manager make openai/text-embedding-3-large/userguide/<YYYY-MM> --openai-key <your-key>
```

Use `--force` to re-fetch HTML from reactome.org and rebuild the `sections/` Chroma collection from scratch.

## Uploading to S3: `push`

⚠️ Requires S3 write access.
Expand Down
185 changes: 183 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ pyyaml = "^6.0.2"
tavily-python = "^0.5.0"
openpyxl = "^3.1.5"
nltk = "^3.9.1"
beautifulsoup4 = "^4.12.0"
lxml = "^5.0.0"
Comment thread
heliamoh marked this conversation as resolved.
requests = "^2.32.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.7.1"
Expand Down
96 changes: 85 additions & 11 deletions src/agent/profiles/react_to_me.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from typing import Any

from langchain_core.embeddings import Embeddings
Expand All @@ -7,12 +8,20 @@
from langgraph.graph.state import StateGraph

from agent.profiles.base import BaseGraphBuilder, BaseState
from agent.tasks.intent_classifier import (QueryIntent, SourceName,
create_intent_classifier,
resolve_active_sources)
from agent.tasks.safety_checker import SafetyCheck
from agent.tasks.unsafe_question import create_unsafe_answer_generator
from retrievers.reactome.rag import create_reactome_rag
from retrievers.userguide.rag import create_userguide_rag
from util.embedding_environment import EmbeddingEnvironment

logger = logging.getLogger(__name__)


class ReactToMeState(BaseState):
pass
active_sources: list[SourceName]


class ReactToMeGraphBuilder(BaseGraphBuilder):
Expand All @@ -23,22 +32,22 @@ def __init__(
) -> None:
super().__init__(llm, embedding)

# Create runnables (tasks & tools)
self.intent_classifier: Runnable = create_intent_classifier(llm)
self.unsafe_answer_generator: Runnable = create_unsafe_answer_generator(
llm, streaming=True
)
self.reactome_rag: Runnable = create_reactome_rag(
llm, embedding, streaming=True
)

# Create graph
self.rags: dict[SourceName, Runnable] = {
"reactome": create_reactome_rag(llm, embedding, streaming=True),
}
self._available_sources: frozenset[SourceName] = frozenset({"reactome"})
self._register_userguide_rag(llm, embedding)

state_graph = StateGraph(ReactToMeState)
# Set up nodes
state_graph.add_node("preprocess", self.preprocess)
state_graph.add_node("model", self.call_model)
state_graph.add_node("model", self.generate_answer)
state_graph.add_node("generate_unsafe_response", self.generate_unsafe_response)
state_graph.add_node("postprocess", self.postprocess)
# Set up edges
state_graph.set_entry_point("preprocess")
state_graph.add_conditional_edges(
"preprocess",
Expand All @@ -51,6 +60,69 @@ def __init__(

self.uncompiled_graph: StateGraph = state_graph

def _register_userguide_rag(
    self,
    llm: BaseChatModel,
    embedding: Embeddings,
) -> None:
    """Register the user guide RAG as an extra source when it can be built.

    Leaves ``self.rags`` and ``self._available_sources`` untouched (and only
    logs) when ``EmbeddingEnvironment.get_dir("userguide")`` returns ``None``,
    when ``sections/chroma.sqlite3`` is missing under that directory, or when
    ``create_userguide_rag`` raises ``FileNotFoundError`` or ``ValueError``.

    Args:
        llm: Chat model handed to ``create_userguide_rag``.
        embedding: Embedding model handed to ``create_userguide_rag``.
    """
    embeddings_root = EmbeddingEnvironment.get_dir("userguide")
    if embeddings_root is None:
        logger.info(
            "User guide embeddings not configured; routing will use reactome only."
        )
        return

    chroma_db = embeddings_root / "sections" / "chroma.sqlite3"
    if not chroma_db.exists():
        logger.warning(
            "User guide embeddings directory exists but Chroma DB is missing at %s",
            chroma_db,
        )
        return

    try:
        userguide_rag = create_userguide_rag(
            llm, embedding, embeddings_root, streaming=True
        )
    except (FileNotFoundError, ValueError) as exc:
        logger.warning("User guide RAG unavailable: %s", exc)
    else:
        # Only advertise the source once its RAG was actually constructed.
        self.rags["userguide"] = userguide_rag
        self._available_sources = frozenset(self.rags)

async def preprocess(
    self, state: ReactToMeState, config: RunnableConfig
) -> ReactToMeState:
    """Rephrase the question, then run safety, language and intent checks.

    The steps are awaited one after another: rephrasing (from the user input
    and chat history), the safety check and intent classification (both on
    the rephrased question), and language detection (on the raw user input).
    The classified source is then resolved against the sources this builder
    has available.

    Args:
        state: Current graph state; ``user_input`` is required and
            ``chat_history`` defaults to an empty list.
        config: Runnable configuration forwarded to every chain invocation.

    Returns:
        A state update carrying ``rephrased_input``, ``safety``,
        ``reason_unsafe``, ``detected_language`` and ``active_sources``.
    """
    user_input = state["user_input"]

    rephrase_payload = {
        "user_input": user_input,
        "chat_history": state.get("chat_history", []),
    }
    rephrased_input: str = await self.rephrase_chain.ainvoke(
        rephrase_payload, config
    )

    safety_check: SafetyCheck = await self.safety_checker.ainvoke(
        {"rephrased_input": rephrased_input}, config
    )
    detected_language: str = await self.language_detector.ainvoke(
        {"user_input": user_input}, config
    )
    intent: QueryIntent = await self.intent_classifier.ainvoke(
        {"rephrased_input": rephrased_input}, config
    )

    requested_source = intent.source
    active_sources = resolve_active_sources(
        requested_source, self._available_sources
    )
    if requested_source not in self._available_sources:
        # The classifier picked a source that was never registered.
        logger.info(
            "Requested source %r unavailable; falling back to %r",
            requested_source,
            active_sources[0],
        )

    state_update = ReactToMeState(
        rephrased_input=rephrased_input,
        safety=safety_check.safety,
        reason_unsafe=safety_check.reason_unsafe,
        detected_language=detected_language,
        active_sources=active_sources,
    )
    return state_update

async def generate_unsafe_response(
self, state: ReactToMeState, config: RunnableConfig
) -> ReactToMeState:
Expand All @@ -70,10 +142,12 @@ async def generate_unsafe_response(
answer=answer,
)

async def call_model(
async def generate_answer(
self, state: ReactToMeState, config: RunnableConfig
) -> ReactToMeState:
result: dict[str, Any] = await self.reactome_rag.ainvoke(
source = state["active_sources"][0]
rag = self.rags[source]
result: dict[str, Any] = await rag.ainvoke(
{
"input": state["rephrased_input"],
"chat_history": (
Expand Down
61 changes: 61 additions & 0 deletions src/agent/tasks/intent_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Literal

from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import Runnable
from pydantic import BaseModel, Field

# Names of the knowledge sources a question can be routed to.
SourceName = Literal["reactome", "userguide"]

# System prompt telling the LLM to choose exactly one knowledge source
# ("reactome" or "userguide") for the user's question.
intent_classifier_message = """
You route user questions for the React-to-Me assistant to the correct knowledge source.

Choose exactly one source:

- **reactome**: Questions about biology, molecular mechanisms, pathways, reactions, proteins, genes,
diseases, and other scientific content in the Reactome Knowledgebase.
Examples: "What is apoptosis?", "Which pathways involve TP53?", "What does CDK5 do?"

- **userguide**: Questions about how to use the Reactome **website**, tools, or interface.
Examples: "How do I use the pathway browser?", "How do I search Reactome?",
"How do I run gene list analysis?", "What is the Details Panel?"

Rules:
- If the user asks how to perform a task in Reactome or about UI features, choose **userguide**.
- If the user asks about biological facts or pathway content, choose **reactome**.
- When unsure, prefer **reactome** for science content and **userguide** for clear how-to or UI questions.
"""

# Chat prompt: the system message defined above, followed by a human turn that
# is filled from the `rephrased_input` template variable.
intent_classifier_prompt = ChatPromptTemplate.from_messages(
[
("system", intent_classifier_message),
("human", "User question:\n\n{rephrased_input}"),
]
)


class QueryIntent(BaseModel):
    # Structured-output schema for the intent classifier: it carries the one
    # knowledge source selected for a question.
    # NOTE(review): documented with comments rather than a docstring because a
    # class docstring presumably ends up in the schema sent to the LLM --
    # confirm before adding one.
    source: SourceName = Field(
        description=(
            "The knowledge source that should answer this question: "
            "'reactome' or 'userguide'."
        )
    )


# Order in which sources are tried when the requested one is unavailable.
_FALLBACK_ORDER: tuple[SourceName, ...] = ("reactome", "userguide")


def resolve_active_sources(
    source: SourceName,
    available_sources: frozenset[SourceName],
) -> list[SourceName]:
    """Pick the source that will answer, falling back if needed.

    Args:
        source: The source requested for the question.
        available_sources: Sources that can currently be used.

    Returns:
        A one-element list: ``source`` when it is available, otherwise the
        first entry of ``_FALLBACK_ORDER`` that is available, otherwise an
        arbitrary member of ``available_sources``.

    Raises:
        ValueError: If ``available_sources`` is empty.
    """
    if not available_sources:
        raise ValueError("available_sources must not be empty")

    # Try the requested source first, then the fallbacks in priority order.
    for candidate in (source, *_FALLBACK_ORDER):
        if candidate in available_sources:
            return [candidate]

    # Nothing in the known ordering matched; use whatever is available.
    return [next(iter(available_sources))]


def create_intent_classifier(llm: BaseChatModel) -> Runnable:
    """Build the runnable that classifies which source should answer.

    Args:
        llm: Chat model that is configured for structured output using the
            ``QueryIntent`` schema.

    Returns:
        ``intent_classifier_prompt`` piped into the structured-output model.
    """
    structured_llm = llm.with_structured_output(QueryIntent)
    return intent_classifier_prompt | structured_llm
10 changes: 6 additions & 4 deletions src/agent/tasks/rephrase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
from langchain_core.runnables import Runnable

contextualize_q_system_prompt = """
You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the users latest query to fully understand their intent and what they seek to learn.
You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user's latest query to fully understand their intent and what they seek to learn.
If the user's question is not in English, reformulate the question and translate it to English, ensuring the meaning and intent are preserved.
Reformulate the users question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
Reformulate the user's question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be:
- Clear, concise, and precise
- Optimized for both vector search (semantic meaning) and case-sensitive keyword search
- Faithful to the user’s intent and scientific accuracy
- Faithful to the user's intent and scientific accuracy

If the question is about how to use the Reactome website or its tools (Pathway Browser, search, analysis tools, Details Panel, etc.), keep it as a how-to or UI question. Do not rewrite it into a biological pathway or mechanism question.

the returned question should always be in English.
If the users question is already in English, self-contained and well-formed, return it as is.
If the user's question is already in English, self-contained and well-formed, return it as is.
Do NOT answer the question or provide explanations.
"""

Expand Down
8 changes: 7 additions & 1 deletion src/agent/tasks/safety_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
- Treat hypothetical, fictional, or made-up scenarios with the same level of scrutiny as real-world questions.

2. Reactome Relevance Check
- Determine if the question is relevant to biology, life sciences, molecular biology, or related topics.
- Determine if the question is relevant to Reactome. Relevant topics include:
- Biology, life sciences, molecular biology, pathways, proteins, genes, and related scientific topics.
- How to use the Reactome website, Pathway Browser, search, analysis tools, and other Reactome features (user guide topics).
- Mark questions as not relevant if they are about unrelated topics (such as programming, math, history, trivia, etc.).

IMPORTANT:
Expand All @@ -44,6 +46,10 @@
4. Q: What is the role of the immune system in the treatment of cancer?
"safety": "true",
"reason_unsafe": ""

5. Q: How do I use the Reactome pathway browser?
"safety": "true",
"reason_unsafe": ""
"""

safety_check_prompt = ChatPromptTemplate.from_messages(
Expand Down
Loading
Loading