From ab631390aec14c20810cb2da91fd07246fdefc07 Mon Sep 17 00:00:00 2001 From: Helia Mohammadi Date: Thu, 25 Jun 2026 18:05:57 -0400 Subject: [PATCH 1/4] Add Reactome user guide Q&A with intent-based routing. --- README.md | 2 + bin/embeddings_manager | 6 + docs/embeddings_manager.md | 10 + poetry.lock | 185 +++++++++++++- pyproject.toml | 2 + src/agent/profiles/react_to_me.py | 97 +++++++- src/agent/tasks/intent_classifier.py | 53 ++++ src/agent/tasks/rephrase.py | 10 +- src/agent/tasks/safety_checker.py | 8 +- src/data_generation/userguide/__init__.py | 91 +++++++ src/data_generation/userguide/fetch.py | 54 +++++ src/data_generation/userguide/html_loader.py | 240 +++++++++++++++++++ src/data_generation/userguide/urls.py | 16 ++ src/retrievers/rag_chain.py | 5 +- src/retrievers/userguide/prompt.py | 37 +++ src/retrievers/userguide/rag.py | 35 +++ src/retrievers/userguide/retriever.py | 37 +++ 17 files changed, 869 insertions(+), 19 deletions(-) create mode 100644 src/agent/tasks/intent_classifier.py create mode 100644 src/data_generation/userguide/__init__.py create mode 100644 src/data_generation/userguide/fetch.py create mode 100644 src/data_generation/userguide/html_loader.py create mode 100644 src/data_generation/userguide/urls.py create mode 100644 src/retrievers/userguide/prompt.py create mode 100644 src/retrievers/userguide/rag.py create mode 100644 src/retrievers/userguide/retriever.py diff --git a/README.md b/README.md index f38a4d3..bc36618 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ The ChatBot's knowledge of a given data source is generated using the latest dat In the case of Reactome, embeddings bundles are generated once per release from [reactome/graphdb](https://hub.docker.com/r/reactome/graphdb) releases from DockerHub and uploaded to AWS S3 for easy retrieval. +User guide embeddings are generated separately from Reactome website documentation and use a date-based version identifier (for example, `userguide/2025-06`). 
See [Embeddings Manager documentation](docs/embeddings_manager.md) for details. + ### Embeddings Manager Script All aspects of generating, managing, uploading, and retrieving embeddings bundles are handled by the `./bin/embeddings_manager` script. diff --git a/bin/embeddings_manager b/bin/embeddings_manager index 385e315..fdef4fa 100755 --- a/bin/embeddings_manager +++ b/bin/embeddings_manager @@ -15,6 +15,7 @@ from botocore.client import Config from data_generation.alliance import generate_alliance_embeddings from data_generation.reactome import generate_reactome_embeddings from data_generation.uniprot import generate_uniprot_embeddings +from data_generation.userguide import generate_userguide_embeddings from util.embedding_environment import EM_ARCHIVE, EmbeddingEnvironment S3_BUCKET = "download.reactome.org" @@ -48,6 +49,7 @@ class EmbeddingSelection(NamedTuple): def pull(embedding: EmbeddingSelection): + EM_ARCHIVE.mkdir(parents=True, exist_ok=True) embedding_path:Path = embedding.path(check_exists=False) zip_tmpfile:Path = EM_ARCHIVE / "tmp.zip" s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) @@ -91,6 +93,10 @@ def make( generate_uniprot_embeddings(embedding_path, hf_model=embedding.model, **kwargs) elif embedding.db == "alliance": generate_alliance_embeddings(str(embedding_path), hf_model=embedding.model, **kwargs) + elif embedding.db == "userguide": + generate_userguide_embeddings( + str(embedding_path), hf_model=embedding.model, **kwargs + ) else: raise NotImplementedError(f"db: {embedding.db}") use(embedding) diff --git a/docs/embeddings_manager.md b/docs/embeddings_manager.md index 20b84da..d36d248 100644 --- a/docs/embeddings_manager.md +++ b/docs/embeddings_manager.md @@ -93,6 +93,16 @@ Either specify `--hf-key` or environment variable `HUGGINGFACEHUB_API_TOKEN`. ./bin/embeddings_manager make /reactome/ --hf-key ``` +### User Guide: + +Fetches Reactome website user guide pages, chunks them by section, and embeds them into Chroma. 
Version is date-based (not tied to graph DB releases). + +```sh +./bin/embeddings_manager make openai/text-embedding-3-large/userguide/<version> --openai-key <your-key> +``` + +Use `--force` to re-fetch HTML from reactome.org and rebuild the `sections/` Chroma collection from scratch. + ## Uploading to S3: `push` ⚠️ Requires S3 write access. diff --git a/poetry.lock b/poetry.lock index e21a020..d4697ad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiofiles" @@ -335,6 +335,28 @@ files = [ tests = ["pytest (>=3.2.1,!=3.3.0)"] typecheck = ["mypy"] +[[package]] +name = "beautifulsoup4" +version = "4.15.0" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "beautifulsoup4-4.15.0-py3-none-any.whl", hash = "sha256:d6f88de62e1d4e38ecb1077eb9724cd0eff29d2a08ca16a401e9b9e93f117cf9"}, + {file = "beautifulsoup4-4.15.0.tar.gz", hash = "sha256:288e3ca7d54b06f2ac191970bc275c1939cb46d450b255bf6718b04aa37ab4f7"}, +] + +[package.dependencies] +soupsieve = ">=1.6.1" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "bidict" version = "0.23.1" @@ -2072,6 +2094,154 @@ httpx = ">=0.23.0" packaging = ">=23.0" pydantic = ">=1,<3" +[[package]] +name = "lxml" +version = "5.4.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e7bc6df34d42322c5289e37e9971d6ed114e3776b45fa879f734bded9d1fea9c"}, + {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6854f8bd8a1536f8a1d9a3655e6354faa6406621cf857dc27b681b69860645c7"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:696ea9e87442467819ac22394ca36cb3d01848dad1be6fac3fb612d3bd5a12cf"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ef80aeac414f33c24b3815ecd560cee272786c3adfa5f31316d8b349bfade28"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b9c2754cef6963f3408ab381ea55f47dabc6f78f4b8ebb0f0b25cf1ac1f7609"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a62cc23d754bb449d63ff35334acc9f5c02e6dae830d78dab4dd12b78a524f4"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f82125bc7203c5ae8633a7d5d20bcfdff0ba33e436e4ab0abc026a53a8960b7"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b67319b4aef1a6c56576ff544b67a2a6fbd7eaee485b241cabf53115e8908b8f"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:a8ef956fce64c8551221f395ba21d0724fed6b9b6242ca4f2f7beb4ce2f41997"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:0a01ce7d8479dce84fc03324e3b0c9c90b1ece9a9bb6a1b6c9025e7e4520e78c"}, + {file = "lxml-5.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:91505d3ddebf268bb1588eb0f63821f738d20e1e7f05d3c647a5ca900288760b"}, + {file = "lxml-5.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a3bcdde35d82ff385f4ede021df801b5c4a5bcdfb61ea87caabcebfc4945dc1b"}, + {file = "lxml-5.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", 
hash = "sha256:aea7c06667b987787c7d1f5e1dfcd70419b711cdb47d6b4bb4ad4b76777a0563"}, + {file = "lxml-5.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:a7fb111eef4d05909b82152721a59c1b14d0f365e2be4c742a473c5d7372f4f5"}, + {file = "lxml-5.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:43d549b876ce64aa18b2328faff70f5877f8c6dede415f80a2f799d31644d776"}, + {file = "lxml-5.4.0-cp310-cp310-win32.whl", hash = "sha256:75133890e40d229d6c5837b0312abbe5bac1c342452cf0e12523477cd3aa21e7"}, + {file = "lxml-5.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:de5b4e1088523e2b6f730d0509a9a813355b7f5659d70eb4f319c76beea2e250"}, + {file = "lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9"}, + {file = "lxml-5.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_28_ppc64le.whl", 
hash = "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8"}, + {file = "lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86"}, + {file = "lxml-5.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056"}, + {file = "lxml-5.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7"}, + {file = "lxml-5.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd"}, + {file = "lxml-5.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751"}, + {file = "lxml-5.4.0-cp311-cp311-win32.whl", hash = "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4"}, + {file = "lxml-5.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539"}, + {file = "lxml-5.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4"}, + {file = "lxml-5.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7"}, + {file = "lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079"}, + {file = "lxml-5.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20"}, + {file = "lxml-5.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8"}, + {file = "lxml-5.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f"}, + {file = "lxml-5.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc"}, + {file = "lxml-5.4.0-cp312-cp312-win32.whl", hash = "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f"}, + {file = "lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2"}, + {file = "lxml-5.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:773e27b62920199c6197130632c18fb7ead3257fce1ffb7d286912e56ddb79e0"}, + {file = 
"lxml-5.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9c671845de9699904b1e9df95acfe8dfc183f2310f163cdaa91a3535af95de"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9454b8d8200ec99a224df8854786262b1bd6461f4280064c807303c642c05e76"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cccd007d5c95279e529c146d095f1d39ac05139de26c098166c4beb9374b0f4d"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0fce1294a0497edb034cb416ad3e77ecc89b313cff7adbee5334e4dc0d11f422"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24974f774f3a78ac12b95e3a20ef0931795ff04dbb16db81a90c37f589819551"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:497cab4d8254c2a90bf988f162ace2ddbfdd806fce3bda3f581b9d24c852e03c"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e794f698ae4c5084414efea0f5cc9f4ac562ec02d66e1484ff822ef97c2cadff"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_28_ppc64le.whl", hash = "sha256:2c62891b1ea3094bb12097822b3d44b93fc6c325f2043c4d2736a8ff09e65f60"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_28_s390x.whl", hash = "sha256:142accb3e4d1edae4b392bd165a9abdee8a3c432a2cca193df995bc3886249c8"}, + {file = "lxml-5.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1a42b3a19346e5601d1b8296ff6ef3d76038058f311902edd574461e9c036982"}, + {file = "lxml-5.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4291d3c409a17febf817259cb37bc62cb7eb398bcc95c1356947e2871911ae61"}, + {file = "lxml-5.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4f5322cf38fe0e21c2d73901abf68e6329dc02a4994e483adbcf92b568a09a54"}, + {file = "lxml-5.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:0be91891bdb06ebe65122aa6bf3fc94489960cf7e03033c6f83a90863b23c58b"}, + {file = "lxml-5.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:15a665ad90054a3d4f397bc40f73948d48e36e4c09f9bcffc7d90c87410e478a"}, + {file = "lxml-5.4.0-cp313-cp313-win32.whl", hash = "sha256:d5663bc1b471c79f5c833cffbc9b87d7bf13f87e055a5c86c363ccd2348d7e82"}, + {file = "lxml-5.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:bcb7a1096b4b6b24ce1ac24d4942ad98f983cd3810f9711bcd0293f43a9d8b9f"}, + {file = "lxml-5.4.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:7be701c24e7f843e6788353c055d806e8bd8466b52907bafe5d13ec6a6dbaecd"}, + {file = "lxml-5.4.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb54f7c6bafaa808f27166569b1511fc42701a7713858dddc08afdde9746849e"}, + {file = "lxml-5.4.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97dac543661e84a284502e0cf8a67b5c711b0ad5fb661d1bd505c02f8cf716d7"}, + {file = "lxml-5.4.0-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:c70e93fba207106cb16bf852e421c37bbded92acd5964390aad07cb50d60f5cf"}, + {file = "lxml-5.4.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9c886b481aefdf818ad44846145f6eaf373a20d200b5ce1a5c8e1bc2d8745410"}, + {file = "lxml-5.4.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:fa0e294046de09acd6146be0ed6727d1f42ded4ce3ea1e9a19c11b6774eea27c"}, + {file = "lxml-5.4.0-cp36-cp36m-win32.whl", hash = "sha256:61c7bbf432f09ee44b1ccaa24896d21075e533cd01477966a5ff5a71d88b2f56"}, + {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"}, + {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"}, + {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"}, + {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"}, + {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"}, + {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"}, + {file = "lxml-5.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:eaf24066ad0b30917186420d51e2e3edf4b0e2ea68d8cd885b14dc8afdcf6556"}, + {file = "lxml-5.4.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b31a3a77501d86d8ade128abb01082724c0dfd9524f542f2f07d693c9f1175f"}, + {file = "lxml-5.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e108352e203c7afd0eb91d782582f00a0b16a948d204d4dec8565024fafeea5"}, + {file = "lxml-5.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a11a96c3b3f7551c8a8109aa65e8594e551d5a84c76bf950da33d0fb6dfafab7"}, + {file = "lxml-5.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = 
"sha256:ca755eebf0d9e62d6cb013f1261e510317a41bf4650f22963474a663fdfe02aa"}, + {file = "lxml-5.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:4cd915c0fb1bed47b5e6d6edd424ac25856252f09120e3e8ba5154b6b921860e"}, + {file = "lxml-5.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:226046e386556a45ebc787871d6d2467b32c37ce76c2680f5c608e25823ffc84"}, + {file = "lxml-5.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:b108134b9667bcd71236c5a02aad5ddd073e372fb5d48ea74853e009fe38acb6"}, + {file = "lxml-5.4.0-cp38-cp38-win32.whl", hash = "sha256:1320091caa89805df7dcb9e908add28166113dcd062590668514dbd510798c88"}, + {file = "lxml-5.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:073eb6dcdf1f587d9b88c8c93528b57eccda40209cf9be549d469b942b41d70b"}, + {file = "lxml-5.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bda3ea44c39eb74e2488297bb39d47186ed01342f0022c8ff407c250ac3f498e"}, + {file = "lxml-5.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9ceaf423b50ecfc23ca00b7f50b64baba85fb3fb91c53e2c9d00bc86150c7e40"}, + {file = "lxml-5.4.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:664cdc733bc87449fe781dbb1f309090966c11cc0c0cd7b84af956a02a8a4729"}, + {file = "lxml-5.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67ed8a40665b84d161bae3181aa2763beea3747f748bca5874b4af4d75998f87"}, + {file = "lxml-5.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b4a3bd174cc9cdaa1afbc4620c049038b441d6ba07629d89a83b408e54c35cd"}, + {file = "lxml-5.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:b0989737a3ba6cf2a16efb857fb0dfa20bc5c542737fddb6d893fde48be45433"}, + {file = "lxml-5.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:dc0af80267edc68adf85f2a5d9be1cdf062f973db6790c1d065e45025fa26140"}, + {file = "lxml-5.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:639978bccb04c42677db43c79bdaa23785dc7f9b83bfd87570da8207872f1ce5"}, + {file = "lxml-5.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5a99d86351f9c15e4a901fc56404b485b1462039db59288b203f8c629260a142"}, + {file = "lxml-5.4.0-cp39-cp39-win32.whl", hash = "sha256:3e6d5557989cdc3ebb5302bbdc42b439733a841891762ded9514e74f60319ad6"}, + {file = "lxml-5.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:a8c9b7f16b63e65bbba889acb436a1034a82d34fa09752d754f88d708eca80e1"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1b717b00a71b901b4667226bba282dd462c42ccf618ade12f9ba3674e1fabc55"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27a9ded0f0b52098ff89dd4c418325b987feed2ea5cc86e8860b0f844285d740"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b7ce10634113651d6f383aa712a194179dcd496bd8c41e191cec2099fa09de5"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:53370c26500d22b45182f98847243efb518d268374a9570409d2e2276232fd37"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c6364038c519dffdbe07e3cf42e6a7f8b90c275d4d1617a69bb59734c1a2d571"}, + {file = "lxml-5.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b12cb6527599808ada9eb2cd6e0e7d3d8f13fe7bbb01c6311255a15ded4c7ab4"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5f11a1526ebd0dee85e7b1e39e39a0cc0d9d03fb527f56d8457f6df48a10dc0c"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b4afaf38bf79109bb060d9016fad014a9a48fb244e11b94f74ae366a64d252"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de6f6bb8a7840c7bf216fb83eec4e2f79f7325eca8858167b68708b929ab2172"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = 
"sha256:5cca36a194a4eb4e2ed6be36923d3cffd03dcdf477515dea687185506583d4c9"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b7c86884ad23d61b025989d99bfdd92a7351de956e01c61307cb87035960bcb1"}, + {file = "lxml-5.4.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:53d9469ab5460402c19553b56c3648746774ecd0681b1b27ea74d5d8a3ef5590"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:56dbdbab0551532bb26c19c914848d7251d73edb507c3079d6805fa8bba5b706"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14479c2ad1cb08b62bb941ba8e0e05938524ee3c3114644df905d2331c76cd57"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32697d2ea994e0db19c1df9e40275ffe84973e4232b5c274f47e7c1ec9763cdd"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:24f6df5f24fc3385f622c0c9d63fe34604893bc1a5bdbb2dbf5870f85f9a404a"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:151d6c40bc9db11e960619d2bf2ec5829f0aaffb10b41dcf6ad2ce0f3c0b2325"}, + {file = "lxml-5.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4025bf2884ac4370a3243c5aa8d66d3cb9e15d3ddd0af2d796eccc5f0244390e"}, + {file = "lxml-5.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9459e6892f59ecea2e2584ee1058f5d8f629446eab52ba2305ae13a32a059530"}, + {file = "lxml-5.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47fb24cc0f052f0576ea382872b3fc7e1f7e3028e53299ea751839418ade92a6"}, + {file = "lxml-5.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50441c9de951a153c698b9b99992e806b71c1f36d14b154592580ff4a9d0d877"}, + {file = "lxml-5.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ab339536aa798b1e17750733663d272038bf28069761d5be57cb4a9b0137b4f8"}, + {file = 
"lxml-5.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9776af1aad5a4b4a1317242ee2bea51da54b2a7b7b48674be736d463c999f37d"}, + {file = "lxml-5.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:63e7968ff83da2eb6fdda967483a7a023aa497d85ad8f05c3ad9b1f2e8c84987"}, + {file = "lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml_html_clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.11,<3.1.0)"] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -4769,6 +4939,17 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "soupsieve" +version = "2.8.4" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.9" +files = [ + {file = "soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65"}, + {file = "soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e"}, +] + [[package]] name = "sqlalchemy" version = "2.0.37" @@ -6142,4 +6323,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <4" -content-hash = "5acb48fa66fd8699daaf4fde50646599217adacc034c994b7dfd79d542dcf6c4" +content-hash = "ff5060b8459b80fb03436b776212a70d58e59704054a30a20191bdd71039c63c" diff --git a/pyproject.toml b/pyproject.toml index 9e89357..cef69ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,8 @@ pyyaml = "^6.0.2" tavily-python = "^0.5.0" openpyxl = "^3.1.5" nltk = "^3.9.1" +beautifulsoup4 = "^4.12.0" +lxml = "^5.0.0" [tool.poetry.group.dev.dependencies] ruff = "^0.7.1" diff --git a/src/agent/profiles/react_to_me.py b/src/agent/profiles/react_to_me.py index dab20f0..80e5500 100644 --- 
a/src/agent/profiles/react_to_me.py +++ b/src/agent/profiles/react_to_me.py @@ -1,3 +1,4 @@ +import logging from typing import Any from langchain_core.embeddings import Embeddings @@ -7,12 +8,23 @@ from langgraph.graph.state import StateGraph from agent.profiles.base import BaseGraphBuilder, BaseState +from agent.tasks.intent_classifier import ( + QueryIntent, + SourceName, + create_intent_classifier, + resolve_active_sources, +) +from agent.tasks.safety_checker import SafetyCheck from agent.tasks.unsafe_question import create_unsafe_answer_generator from retrievers.reactome.rag import create_reactome_rag +from retrievers.userguide.rag import create_userguide_rag +from util.embedding_environment import EmbeddingEnvironment + +logger = logging.getLogger(__name__) class ReactToMeState(BaseState): - pass + active_sources: list[SourceName] class ReactToMeGraphBuilder(BaseGraphBuilder): @@ -23,22 +35,22 @@ def __init__( ) -> None: super().__init__(llm, embedding) - # Create runnables (tasks & tools) + self.intent_classifier: Runnable = create_intent_classifier(llm) self.unsafe_answer_generator: Runnable = create_unsafe_answer_generator( llm, streaming=True ) - self.reactome_rag: Runnable = create_reactome_rag( - llm, embedding, streaming=True - ) - # Create graph + self.rags: dict[SourceName, Runnable] = { + "reactome": create_reactome_rag(llm, embedding, streaming=True), + } + self._available_sources: frozenset[SourceName] = frozenset({"reactome"}) + self._register_userguide_rag(llm, embedding) + state_graph = StateGraph(ReactToMeState) - # Set up nodes state_graph.add_node("preprocess", self.preprocess) - state_graph.add_node("model", self.call_model) + state_graph.add_node("model", self.generate_answer) state_graph.add_node("generate_unsafe_response", self.generate_unsafe_response) state_graph.add_node("postprocess", self.postprocess) - # Set up edges state_graph.set_entry_point("preprocess") state_graph.add_conditional_edges( "preprocess", @@ -51,6 +63,67 @@ def 
__init__( self.uncompiled_graph: StateGraph = state_graph + def _register_userguide_rag( + self, + llm: BaseChatModel, + embedding: Embeddings, + ) -> None: + userguide_dir = EmbeddingEnvironment.get_dir("userguide") + if userguide_dir is None: + logger.info("User guide embeddings not configured; routing will use reactome only.") + return + + chroma_path = userguide_dir / "sections" / "chroma.sqlite3" + if not chroma_path.exists(): + logger.warning( + "User guide embeddings directory exists but Chroma DB is missing at %s", + chroma_path, + ) + return + + try: + self.rags["userguide"] = create_userguide_rag( + llm, embedding, userguide_dir, streaming=True + ) + self._available_sources = frozenset(self.rags) + except (FileNotFoundError, ValueError) as exc: + logger.warning("User guide RAG unavailable: %s", exc) + + async def preprocess( + self, state: ReactToMeState, config: RunnableConfig + ) -> ReactToMeState: + rephrased_input: str = await self.rephrase_chain.ainvoke( + { + "user_input": state["user_input"], + "chat_history": state.get("chat_history", []), + }, + config, + ) + safety_check: SafetyCheck = await self.safety_checker.ainvoke( + {"rephrased_input": rephrased_input}, config + ) + detected_language: str = await self.language_detector.ainvoke( + {"user_input": state["user_input"]}, config + ) + intent: QueryIntent = await self.intent_classifier.ainvoke( + {"rephrased_input": rephrased_input}, config + ) + active_sources = resolve_active_sources(intent.source, self._available_sources) + if intent.source not in self._available_sources: + logger.info( + "Requested source %r unavailable; falling back to %r", + intent.source, + active_sources[0], + ) + + return ReactToMeState( + rephrased_input=rephrased_input, + safety=safety_check.safety, + reason_unsafe=safety_check.reason_unsafe, + detected_language=detected_language, + active_sources=active_sources, + ) + async def generate_unsafe_response( self, state: ReactToMeState, config: RunnableConfig ) -> 
ReactToMeState: @@ -70,10 +143,12 @@ async def generate_unsafe_response( answer=answer, ) - async def call_model( + async def generate_answer( self, state: ReactToMeState, config: RunnableConfig ) -> ReactToMeState: - result: dict[str, Any] = await self.reactome_rag.ainvoke( + source = state["active_sources"][0] + rag = self.rags[source] + result: dict[str, Any] = await rag.ainvoke( { "input": state["rephrased_input"], "chat_history": ( diff --git a/src/agent/tasks/intent_classifier.py b/src/agent/tasks/intent_classifier.py new file mode 100644 index 0000000..932023c --- /dev/null +++ b/src/agent/tasks/intent_classifier.py @@ -0,0 +1,53 @@ +from typing import Literal + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.runnables import Runnable +from pydantic import BaseModel, Field + +SourceName = Literal["reactome", "userguide"] + +intent_classifier_message = """ +You route user questions for the React-to-Me assistant to the correct knowledge source. + +Choose exactly one source: + +- **reactome**: Questions about biology, molecular mechanisms, pathways, reactions, proteins, genes, + diseases, and other scientific content in the Reactome Knowledgebase. + Examples: "What is apoptosis?", "Which pathways involve TP53?", "What does CDK5 do?" + +- **userguide**: Questions about how to use the Reactome **website**, tools, or interface. + Examples: "How do I use the pathway browser?", "How do I search Reactome?", + "How do I run gene list analysis?", "What is the Details Panel?" + +Rules: +- If the user asks how to perform a task in Reactome or about UI features, choose **userguide**. +- If the user asks about biological facts or pathway content, choose **reactome**. +- When unsure, prefer **reactome** for science content and **userguide** for clear how-to or UI questions. 
+""" + +intent_classifier_prompt = ChatPromptTemplate.from_messages( + [ + ("system", intent_classifier_message), + ("human", "User question:\n\n{rephrased_input}"), + ] +) + + +class QueryIntent(BaseModel): + source: SourceName = Field( + description="The knowledge source that should answer this question: 'reactome' or 'userguide'." + ) + + +def resolve_active_sources( + source: SourceName, + available_sources: frozenset[SourceName], +) -> list[SourceName]: + if source in available_sources: + return [source] + return ["reactome"] + + +def create_intent_classifier(llm: BaseChatModel) -> Runnable: + return intent_classifier_prompt | llm.with_structured_output(QueryIntent) diff --git a/src/agent/tasks/rephrase.py b/src/agent/tasks/rephrase.py index 1851747..23aaeaf 100644 --- a/src/agent/tasks/rephrase.py +++ b/src/agent/tasks/rephrase.py @@ -4,15 +4,17 @@ from langchain_core.runnables import Runnable contextualize_q_system_prompt = """ -You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user’s latest query to fully understand their intent and what they seek to learn. +You are an expert in question formulation with deep expertise in molecular biology and experience as a Reactome curator. Your task is to analyze the conversation history and the user's latest query to fully understand their intent and what they seek to learn. If the user's question is not in English, reformulate the question and translate it to English, ensuring the meaning and intent are preserved. -Reformulate the user’s question into a standalone version that retains its full meaning without requiring prior context. The reformulated question should be: +Reformulate the user's question into a standalone version that retains its full meaning without requiring prior context. 
The reformulated question should be: - Clear, concise, and precise - Optimized for both vector search (semantic meaning) and case-sensitive keyword search - - Faithful to the user’s intent and scientific accuracy + - Faithful to the user's intent and scientific accuracy + +If the question is about how to use the Reactome website or its tools (Pathway Browser, search, analysis tools, Details Panel, etc.), keep it as a how-to or UI question. Do not rewrite it into a biological pathway or mechanism question. the returned question should always be in English. -If the user’s question is already in English, self-contained and well-formed, return it as is. +If the user's question is already in English, self-contained and well-formed, return it as is. Do NOT answer the question or provide explanations. """ diff --git a/src/agent/tasks/safety_checker.py b/src/agent/tasks/safety_checker.py index 91e539f..c136013 100644 --- a/src/agent/tasks/safety_checker.py +++ b/src/agent/tasks/safety_checker.py @@ -17,7 +17,9 @@ - Treat hypothetical, fictional, or made-up scenarios with the same level of scrutiny as real-world questions. 2. Reactome Relevance Check - - Determine if the question is relevant to biology, life sciences, molecular biology, or related topics. + - Determine if the question is relevant to Reactome. Relevant topics include: + - Biology, life sciences, molecular biology, pathways, proteins, genes, and related scientific topics. + - How to use the Reactome website, Pathway Browser, search, analysis tools, and other Reactome features (user guide topics). - Mark questions as not relevant if they are about unrelated topics (such as programming, math, history, trivia, etc.). IMPORTANT: @@ -44,6 +46,10 @@ 4. Q: What is the role of the immune system in the treatment of cancer? "safety": "true", "reason_unsafe": "" + + 5. Q: How do I use the Reactome pathway browser? 
+ "safety": "true", + "reason_unsafe": "" """ safety_check_prompt = ChatPromptTemplate.from_messages( diff --git a/src/data_generation/userguide/__init__.py b/src/data_generation/userguide/__init__.py new file mode 100644 index 0000000..5ecdd9b --- /dev/null +++ b/src/data_generation/userguide/__init__.py @@ -0,0 +1,91 @@ +import os +from pathlib import Path +from shutil import rmtree +from typing import Optional + +import torch +from langchain_community.vectorstores import Chroma +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_huggingface import (HuggingFaceEmbeddings, + HuggingFaceEndpointEmbeddings) +from langchain_openai import OpenAIEmbeddings + +from data_generation.userguide.fetch import fetch_userguide_pages +from data_generation.userguide.html_loader import UserGuideHTMLLoader +from data_generation.userguide.urls import USER_GUIDE_URLS + +CHROMA_COLLECTION = "sections" +HTML_CACHE_DIR = "html_snapshots" + + +def upload_to_chromadb( + embeddings_dir: str, + docs: list[Document], + embedding_table: str, + hf_model: Optional[str] = None, + device: Optional[str] = None, +) -> Chroma: + embeddings_instance: Embeddings + if hf_model is None: # Use OpenAI + embeddings_instance = OpenAIEmbeddings( + chunk_size=500, + show_progress_bar=True, + ) + elif hf_model.startswith("openai/text-embedding-"): + embeddings_instance = OpenAIEmbeddings( + model=hf_model[len("openai/") :], + chunk_size=500, + show_progress_bar=True, + ) + elif "HUGGINGFACEHUB_API_TOKEN" in os.environ: + embeddings_instance = HuggingFaceEndpointEmbeddings( + huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"], + model=hf_model, + ) + else: + if device == "cuda": + torch.cuda.empty_cache() + embeddings_instance = HuggingFaceEmbeddings( + model_name=hf_model, + model_kwargs={"device": device, "trust_remote_code": True}, + encode_kwargs={"batch_size": 12, "normalize_embeddings": False}, + ) + + return Chroma.from_documents( 
+ documents=docs, + embedding=embeddings_instance, + persist_directory=os.path.join(embeddings_dir, embedding_table), + ) + + +def generate_userguide_embeddings( + embeddings_dir: str, + force: bool = False, + hf_model: Optional[str] = None, + device: Optional[str] = None, + **_, +) -> None: + embeddings_path = Path(embeddings_dir) + chroma_dir = embeddings_path / CHROMA_COLLECTION + if force and chroma_dir.exists(): + rmtree(chroma_dir) + + cache_dir = embeddings_path / HTML_CACHE_DIR + html_paths = fetch_userguide_pages( + USER_GUIDE_URLS, + cache_dir=cache_dir, + force=force, + ) + + loader = UserGuideHTMLLoader(html_paths) + docs = loader.load() + print(f"Loaded {len(docs)} user guide sections from {len(html_paths)} pages") + + if not docs: + raise RuntimeError("No user guide documents were produced") + + db = upload_to_chromadb( + embeddings_dir, docs, CHROMA_COLLECTION, hf_model, device + ) + print(db._collection.count()) diff --git a/src/data_generation/userguide/fetch.py b/src/data_generation/userguide/fetch.py new file mode 100644 index 0000000..c57f85a --- /dev/null +++ b/src/data_generation/userguide/fetch.py @@ -0,0 +1,54 @@ +import time +from collections.abc import Sequence +from pathlib import Path +from urllib.parse import urlparse + +import requests + +USER_AGENT = "ReactomeChatbot/1.0 (+https://github.com/reactome/reactome_chatbot)" +REQUEST_DELAY_SECONDS = 0.5 + + +def url_to_slug(url: str) -> str: + """Derive a filesystem-safe slug from a user guide URL path.""" + path = urlparse(url).path.strip("/") + return path.replace("/", "_") if path else "index" + + +def fetch_userguide_pages( + urls: Sequence[str], + cache_dir: Path, + *, + force: bool = False, +) -> dict[str, Path]: + """Download user guide HTML pages, using on-disk cache when available. + + Args: + urls: Canonical user guide URLs to download. + cache_dir: Directory for cached ``.html`` files. + force: When ``True``, re-download pages even if cached. 
+ + Returns: + Mapping of each URL to its local cached HTML path. + """ + cache_dir = Path(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) + session = requests.Session() + session.headers["User-Agent"] = USER_AGENT + + html_paths: dict[str, Path] = {} + for i, url in enumerate(urls): + cache_path = cache_dir / f"{url_to_slug(url)}.html" + if cache_path.exists() and not force: + html_paths[url] = cache_path + continue + + response = session.get(url, timeout=60) + response.raise_for_status() + cache_path.write_text(response.text, encoding=response.encoding or "utf-8") + html_paths[url] = cache_path + + if i < len(urls) - 1: + time.sleep(REQUEST_DELAY_SECONDS) + + return html_paths diff --git a/src/data_generation/userguide/html_loader.py b/src/data_generation/userguide/html_loader.py new file mode 100644 index 0000000..03080b1 --- /dev/null +++ b/src/data_generation/userguide/html_loader.py @@ -0,0 +1,240 @@ +import re +from pathlib import Path + +from bs4 import BeautifulSoup, Tag +from langchain_community.document_loaders.base import BaseLoader +from langchain_core.documents import Document +from langchain_text_splitters import RecursiveCharacterTextSplitter + +SPLIT_CANDIDATES = ("h2", "h3", "h4") +MIN_SECTION_HEADINGS = 2 +MAX_CHUNK_CHARS = 4000 +MIN_CHUNK_CHARS = 80 +CHUNK_OVERLAP = 200 + +SPAMBOT_PATTERN = re.compile( + r"This email address is being protected from spambots.*", + re.IGNORECASE | re.DOTALL, +) +COLLECTIBLE_TAGS = frozenset( + { + "p", + "ul", + "ol", + "table", + "blockquote", + "pre", + "div", + "dl", + "h3", + "h4", + "h5", + "h6", + } +) + + +def choose_split_tag(article_body: Tag) -> str | None: + """Pick the shallowest heading level with enough sections for this page.""" + for tag in SPLIT_CANDIDATES: + if len(article_body.find_all(tag)) >= MIN_SECTION_HEADINGS: + return tag + return None + + +def _heading_title(heading: Tag) -> str: + title = heading.get_text(separator=" ", strip=True) + if title: + return title + image = 
heading.find("img", alt=True) + if image is not None: + alt = image.get("alt", "").strip() + if alt: + return alt + return "" + + +def split_article_body_into_sections( + article_body: Tag, +) -> list[tuple[str, int, list[Tag]]]: + """Split article body into titled sections using adaptive heading boundaries.""" + split_tag = choose_split_tag(article_body) + if split_tag is None: + return [("Introduction", 0, _collect_all_content(article_body))] + + headings = article_body.find_all(split_tag) + sections: list[tuple[str, int, list[Tag]]] = [] + + intro_nodes = _collect_intro(article_body, headings[0]) + if intro_nodes: + sections.append(("Introduction", 0, intro_nodes)) + + for index, heading in enumerate(headings): + title = _heading_title(heading) + level = int(split_tag[1]) + next_heading = headings[index + 1] if index + 1 < len(headings) else None + nodes = _collect_between(heading, next_heading, split_tag) + if not title and not nodes: + continue + sections.append((title or "Untitled", level, nodes)) + + return sections + + +def _should_collect_block(element: Tag, collected: list[Tag]) -> bool: + if element.name not in COLLECTIBLE_TAGS: + return False + if not element.get_text(strip=True): + return False + for other in collected: + if element in other.descendants or other in element.descendants: + return False + return True + + +def _collect_intro(article_body: Tag, first_heading: Tag) -> list[Tag]: + collected: list[Tag] = [] + for element in article_body.descendants: + if element is first_heading: + break + if isinstance(element, Tag) and _should_collect_block(element, collected): + collected.append(element) + return collected + + +def _collect_between( + start_heading: Tag, + end_heading: Tag | None, + split_tag: str, +) -> list[Tag]: + collected: list[Tag] = [] + for element in start_heading.next_elements: + if end_heading is not None and element is end_heading: + break + if isinstance(element, Tag) and element.name == split_tag: + break + if 
isinstance(element, Tag) and _should_collect_block(element, collected): + collected.append(element) + return collected + + +def _collect_all_content(article_body: Tag) -> list[Tag]: + collected: list[Tag] = [] + for element in article_body.descendants: + if isinstance(element, Tag) and _should_collect_block(element, collected): + collected.append(element) + return collected + + +class UserGuideHTMLLoader(BaseLoader): + """Loads Reactome user guide HTML pages into section-level documents. + + Each document represents one section of a user guide page. The loader picks + the shallowest heading level (``h2``, ``h3``, or ``h4``) that yields at + least two sections on that page. Deeper headings remain within their parent + section. Oversized sections are split for embedding. + + The ``source`` metadata field is set to the canonical page URL. Section + titles and page titles are included in both metadata and ``page_content``. + + Output Example: + .. code-block:: txt + + Page: Pathway Browser + Section: Event Hierarchy + + The order of reactions from top to bottom... + """ + + def __init__(self, html_paths: dict[str, Path]) -> None: + """ + Args: + html_paths: Mapping of canonical page URLs to local HTML file paths. 
+ """ + self.html_paths = html_paths + self._splitter = RecursiveCharacterTextSplitter( + chunk_size=MAX_CHUNK_CHARS, + chunk_overlap=CHUNK_OVERLAP, + ) + + def load(self) -> list[Document]: + """Load data into document objects.""" + documents: list[Document] = [] + for url, path in self.html_paths.items(): + documents.extend(self._load_page(url, path)) + return documents + + def _load_page(self, url: str, path: Path) -> list[Document]: + html = path.read_text(encoding="utf-8") + soup = BeautifulSoup(html, "lxml") + page_title = self._extract_page_title(soup) + article_body = soup.select_one('[itemprop="articleBody"]') + if article_body is None: + raise ValueError(f"No article body found for {url}") + + sections = split_article_body_into_sections(article_body) + + documents: list[Document] = [] + for section_title, section_level, nodes in sections: + text = self._nodes_to_text(nodes) + text = SPAMBOT_PATTERN.sub("", text).strip() + if section_level == 0 and len(text) < MIN_CHUNK_CHARS: + continue + if section_level > 0 and len(text) < MIN_CHUNK_CHARS: + text = ( + f"{section_title}\n\n{text}".strip() + if text + else section_title + ) + + page_content_prefix = ( + f"URL: {url}\nPage: {page_title}\nSection: {section_title}\n\n" + ) + chunks = self._splitter.split_text(text) + for chunk_index, chunk in enumerate(chunks): + documents.append( + Document( + page_content=page_content_prefix + chunk, + metadata={ + "source": url, + "page_title": page_title, + "section_title": section_title, + "section_level": str(section_level), + "chunk_index": str(chunk_index), + }, + ) + ) + return documents + + def _extract_page_title(self, soup: BeautifulSoup) -> str: + header = soup.select_one(".page-header h2") + if header: + title = header.get_text(strip=True) + if title: + return title + if soup.title and soup.title.string: + return soup.title.string.replace(" - Reactome Pathway Database", "").strip() + return "Unknown" + + def _nodes_to_text(self, nodes: list[Tag]) -> str: + 
parts: list[str] = [] + for node in nodes: + if node.name == "ul": + for li in node.find_all("li", recursive=False): + item = li.get_text(separator=" ", strip=True) + if item: + parts.append(f"- {item}") + elif node.name == "ol": + for i, li in enumerate(node.find_all("li", recursive=False), start=1): + item = li.get_text(separator=" ", strip=True) + if item: + parts.append(f"{i}. {item}") + elif node.name == "table": + text = node.get_text(separator=" ", strip=True) + if text: + parts.append(text) + else: + text = node.get_text(separator="\n", strip=True) + if text: + parts.append(text) + return "\n\n".join(parts) diff --git a/src/data_generation/userguide/urls.py b/src/data_generation/userguide/urls.py new file mode 100644 index 0000000..94970e5 --- /dev/null +++ b/src/data_generation/userguide/urls.py @@ -0,0 +1,16 @@ +"""Canonical Reactome user guide URLs for ingestion.""" + +REACTOME_BASE = "https://reactome.org" + +USER_GUIDE_URLS: tuple[str, ...] = ( + f"{REACTOME_BASE}/userguide", + f"{REACTOME_BASE}/userguide/pathway-browser", + f"{REACTOME_BASE}/userguide/searching", + f"{REACTOME_BASE}/userguide/details-panel", + f"{REACTOME_BASE}/userguide/analysis", + f"{REACTOME_BASE}/userguide/analysis/gsa", + f"{REACTOME_BASE}/userguide/diseases", + f"{REACTOME_BASE}/userguide/cytomics", + f"{REACTOME_BASE}/userguide/review-status", + f"{REACTOME_BASE}/userguide/reactome-fiviz", +) diff --git a/src/retrievers/rag_chain.py b/src/retrievers/rag_chain.py index 3e5df8e..11c0051 100644 --- a/src/retrievers/rag_chain.py +++ b/src/retrievers/rag_chain.py @@ -1,7 +1,7 @@ from langchain.chains.combine_documents import create_stuff_documents_chain from langchain.chains.retrieval import create_retrieval_chain from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.prompts import ChatPromptTemplate +from langchain_core.prompts import BasePromptTemplate, ChatPromptTemplate from langchain_core.retrievers import BaseRetriever from 
langchain_core.runnables import Runnable @@ -10,11 +10,14 @@ def create_rag_chain( llm: BaseChatModel, retriever: BaseRetriever, qa_prompt: ChatPromptTemplate, + *, + document_prompt: BasePromptTemplate | None = None, ) -> Runnable: # Create the documents chain question_answer_chain: Runnable = create_stuff_documents_chain( llm=llm, prompt=qa_prompt, + document_prompt=document_prompt, ) # Create the retrieval chain diff --git a/src/retrievers/userguide/prompt.py b/src/retrievers/userguide/prompt.py new file mode 100644 index 0000000..cbb3efb --- /dev/null +++ b/src/retrievers/userguide/prompt.py @@ -0,0 +1,37 @@ +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder + +userguide_system_prompt = """ +You are a helpful guide to the **Reactome website** and its tools. +Your primary responsibility is to answer questions about **how to use Reactome** — the Pathway Browser, search, analysis tools, Details Panel, and related features — using only the user guide excerpts provided in the context. + +## Answering Guidelines +1. Strict source discipline: Use only the information explicitly provided from the Reactome user guide. Do not invent steps, buttons, menus, or workflows. + - If the context does not contain enough information to answer, say the user guide does not currently cover that topic. Do **not** guess. +2. Inline citations required: Every factual statement must include ≥1 inline anchor citation in the format: display_name + - Use the **exact** URL from the context (the line starting with `URL:`). Copy it verbatim. + - Never guess, shorten, or construct URLs from page titles (for example, do not turn "ReactomeGSA" into `/userguide/reactomegsa`). + - Use a clear display name (page title or section title). + - If multiple excerpts support the same fact, cite them together (space-separated). +3. How-to focus: Give clear, actionable steps when the user asks how to perform a task. 
Name UI elements accurately (buttons, panels, tabs) as they appear in the context. +4. Tone and style: + - Write in a clear, friendly, and conversational tone. + - Use accessible language; avoid unnecessary jargon. + - Prefer numbered steps for multi-step procedures. +5. Source list at the end: After the main answer, provide a bullet-point list of each unique citation anchor exactly once, in the same display_name format. + - Examples: + - Pathway Browser + - Searching Reactome + +## Internal QA (silent) +- All factual claims are cited correctly. +- No UI steps or features are invented beyond the provided context. +- The Sources list is complete and de-duplicated. +""" + +userguide_qa_prompt = ChatPromptTemplate.from_messages( + [ + ("system", userguide_system_prompt), + MessagesPlaceholder(variable_name="chat_history"), + ("user", "Context:\n{context}\n\nQuestion: {input}"), + ] +) diff --git a/src/retrievers/userguide/rag.py b/src/retrievers/userguide/rag.py new file mode 100644 index 0000000..68f72d3 --- /dev/null +++ b/src/retrievers/userguide/rag.py @@ -0,0 +1,35 @@ +from pathlib import Path + +from langchain_core.embeddings import Embeddings +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import PromptTemplate +from langchain_core.runnables import Runnable + +from retrievers.rag_chain import create_rag_chain +from retrievers.userguide.prompt import userguide_qa_prompt +from retrievers.userguide.retriever import create_userguide_retriever +from util.embedding_environment import EmbeddingEnvironment + +USERGUIDE_DOCUMENT_PROMPT = PromptTemplate.from_template( + "URL: {source}\n{page_content}" +) + + +def create_userguide_rag( + llm: BaseChatModel, + embedding: Embeddings, + embeddings_directory: Path = EmbeddingEnvironment.get_dir("userguide"), + *, + streaming: bool = False, +) -> Runnable: + userguide_retriever = create_userguide_retriever(embedding, embeddings_directory) + + if streaming: + llm = 
llm.model_copy(update={"streaming": True}) + + return create_rag_chain( + llm, + userguide_retriever, + userguide_qa_prompt, + document_prompt=USERGUIDE_DOCUMENT_PROMPT, + ) diff --git a/src/retrievers/userguide/retriever.py b/src/retrievers/userguide/retriever.py new file mode 100644 index 0000000..8500cb8 --- /dev/null +++ b/src/retrievers/userguide/retriever.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from langchain_chroma.vectorstores import Chroma +from langchain_core.embeddings import Embeddings +from langchain_core.retrievers import BaseRetriever + +from retrievers.csv_chroma import chroma_settings + +CHROMA_COLLECTION = "sections" +DEFAULT_SEARCH_K = 6 + + +def create_userguide_retriever( + embedding: Embeddings, + embeddings_directory: Path | None, + *, + k: int = DEFAULT_SEARCH_K, +) -> BaseRetriever: + if embeddings_directory is None: + raise ValueError( + "User guide embeddings are not configured. " + "Run ./bin/embeddings_manager use /userguide/." + ) + + chroma_path = Path(embeddings_directory) / CHROMA_COLLECTION + if not (chroma_path / "chroma.sqlite3").is_file(): + raise FileNotFoundError( + f"User guide Chroma collection not found at {chroma_path}. " + "Run ./bin/embeddings_manager make /userguide/." 
+ ) + + vectordb = Chroma( + persist_directory=str(chroma_path), + embedding_function=embedding, + client_settings=chroma_settings, + ) + return vectordb.as_retriever(search_kwargs={"k": k}) From e285e0c30bcbe2a5f82aa1d15d1fdde2c8a03634 Mon Sep 17 00:00:00 2001 From: Helia Mohammadi Date: Thu, 25 Jun 2026 18:14:55 -0400 Subject: [PATCH 2/4] fix lint and formatting --- src/agent/profiles/react_to_me.py | 13 ++++++------- src/data_generation/userguide/__init__.py | 4 +--- src/data_generation/userguide/html_loader.py | 6 +----- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/agent/profiles/react_to_me.py b/src/agent/profiles/react_to_me.py index 80e5500..fac0ea2 100644 --- a/src/agent/profiles/react_to_me.py +++ b/src/agent/profiles/react_to_me.py @@ -8,12 +8,9 @@ from langgraph.graph.state import StateGraph from agent.profiles.base import BaseGraphBuilder, BaseState -from agent.tasks.intent_classifier import ( - QueryIntent, - SourceName, - create_intent_classifier, - resolve_active_sources, -) +from agent.tasks.intent_classifier import (QueryIntent, SourceName, + create_intent_classifier, + resolve_active_sources) from agent.tasks.safety_checker import SafetyCheck from agent.tasks.unsafe_question import create_unsafe_answer_generator from retrievers.reactome.rag import create_reactome_rag @@ -70,7 +67,9 @@ def _register_userguide_rag( ) -> None: userguide_dir = EmbeddingEnvironment.get_dir("userguide") if userguide_dir is None: - logger.info("User guide embeddings not configured; routing will use reactome only.") + logger.info( + "User guide embeddings not configured; routing will use reactome only." 
+ ) return chroma_path = userguide_dir / "sections" / "chroma.sqlite3" diff --git a/src/data_generation/userguide/__init__.py b/src/data_generation/userguide/__init__.py index 5ecdd9b..bbc113e 100644 --- a/src/data_generation/userguide/__init__.py +++ b/src/data_generation/userguide/__init__.py @@ -85,7 +85,5 @@ def generate_userguide_embeddings( if not docs: raise RuntimeError("No user guide documents were produced") - db = upload_to_chromadb( - embeddings_dir, docs, CHROMA_COLLECTION, hf_model, device - ) + db = upload_to_chromadb(embeddings_dir, docs, CHROMA_COLLECTION, hf_model, device) print(db._collection.count()) diff --git a/src/data_generation/userguide/html_loader.py b/src/data_generation/userguide/html_loader.py index 03080b1..8654dde 100644 --- a/src/data_generation/userguide/html_loader.py +++ b/src/data_generation/userguide/html_loader.py @@ -181,11 +181,7 @@ def _load_page(self, url: str, path: Path) -> list[Document]: if section_level == 0 and len(text) < MIN_CHUNK_CHARS: continue if section_level > 0 and len(text) < MIN_CHUNK_CHARS: - text = ( - f"{section_title}\n\n{text}".strip() - if text - else section_title - ) + text = f"{section_title}\n\n{text}".strip() if text else section_title page_content_prefix = ( f"URL: {url}\nPage: {page_title}\nSection: {section_title}\n\n" From d8a704c7a9ecb02f13314135d965026c8f94120d Mon Sep 17 00:00:00 2001 From: Helia Mohammadi Date: Thu, 25 Jun 2026 18:21:38 -0400 Subject: [PATCH 3/4] URL dedup, DOM perf, safe routing fallback, requests dep. 
--- poetry.lock | 2 +- pyproject.toml | 1 + src/agent/tasks/intent_classifier.py | 10 +++++++++- src/data_generation/userguide/html_loader.py | 2 +- src/retrievers/userguide/rag.py | 12 +----------- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index d4697ad..483ab18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6323,4 +6323,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <4" -content-hash = "ff5060b8459b80fb03436b776212a70d58e59704054a30a20191bdd71039c63c" +content-hash = "44ddb6b6c435385a7e316dd81e05eb72bbc5b413d69d82f2359a3cfa086dcae3" diff --git a/pyproject.toml b/pyproject.toml index cef69ee..8829238 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ openpyxl = "^3.1.5" nltk = "^3.9.1" beautifulsoup4 = "^4.12.0" lxml = "^5.0.0" +requests = "^2.32.0" [tool.poetry.group.dev.dependencies] ruff = "^0.7.1" diff --git a/src/agent/tasks/intent_classifier.py b/src/agent/tasks/intent_classifier.py index 932023c..17aa2b0 100644 --- a/src/agent/tasks/intent_classifier.py +++ b/src/agent/tasks/intent_classifier.py @@ -40,13 +40,21 @@ class QueryIntent(BaseModel): ) +_FALLBACK_ORDER: tuple[SourceName, ...] 
= ("reactome", "userguide") + + def resolve_active_sources( source: SourceName, available_sources: frozenset[SourceName], ) -> list[SourceName]: + if not available_sources: + raise ValueError("available_sources must not be empty") if source in available_sources: return [source] - return ["reactome"] + for fallback in _FALLBACK_ORDER: + if fallback in available_sources: + return [fallback] + return [next(iter(available_sources))] def create_intent_classifier(llm: BaseChatModel) -> Runnable: diff --git a/src/data_generation/userguide/html_loader.py b/src/data_generation/userguide/html_loader.py index 8654dde..03d8ceb 100644 --- a/src/data_generation/userguide/html_loader.py +++ b/src/data_generation/userguide/html_loader.py @@ -87,7 +87,7 @@ def _should_collect_block(element: Tag, collected: list[Tag]) -> bool: if not element.get_text(strip=True): return False for other in collected: - if element in other.descendants or other in element.descendants: + if other in element.parents or element in other.parents: return False return True diff --git a/src/retrievers/userguide/rag.py b/src/retrievers/userguide/rag.py index 68f72d3..6bf1c9f 100644 --- a/src/retrievers/userguide/rag.py +++ b/src/retrievers/userguide/rag.py @@ -2,7 +2,6 @@ from langchain_core.embeddings import Embeddings from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.prompts import PromptTemplate from langchain_core.runnables import Runnable from retrievers.rag_chain import create_rag_chain @@ -10,10 +9,6 @@ from retrievers.userguide.retriever import create_userguide_retriever from util.embedding_environment import EmbeddingEnvironment -USERGUIDE_DOCUMENT_PROMPT = PromptTemplate.from_template( - "URL: {source}\n{page_content}" -) - def create_userguide_rag( llm: BaseChatModel, @@ -27,9 +22,4 @@ def create_userguide_rag( if streaming: llm = llm.model_copy(update={"streaming": True}) - return create_rag_chain( - llm, - userguide_retriever, - userguide_qa_prompt, - 
document_prompt=USERGUIDE_DOCUMENT_PROMPT, - ) + return create_rag_chain(llm, userguide_retriever, userguide_qa_prompt) From f99b816a8e2b575d127186481424f550ff5acaac Mon Sep 17 00:00:00 2001 From: Helia Mohammadi Date: Thu, 25 Jun 2026 18:30:16 -0400 Subject: [PATCH 4/4] resolve mypy error --- src/data_generation/userguide/html_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data_generation/userguide/html_loader.py b/src/data_generation/userguide/html_loader.py index 03d8ceb..28ecc81 100644 --- a/src/data_generation/userguide/html_loader.py +++ b/src/data_generation/userguide/html_loader.py @@ -48,9 +48,11 @@ def _heading_title(heading: Tag) -> str: return title image = heading.find("img", alt=True) if image is not None: - alt = image.get("alt", "").strip() - if alt: - return alt + alt_attr = image.get("alt") + if isinstance(alt_attr, str): + alt = alt_attr.strip() + if alt: + return alt return ""