diff --git a/bin/embeddings_manager b/bin/embeddings_manager index 385e315..819521a 100755 --- a/bin/embeddings_manager +++ b/bin/embeddings_manager @@ -2,6 +2,7 @@ import os import re +import sys from argparse import ArgumentParser from pathlib import Path, PurePosixPath from shutil import rmtree @@ -9,6 +10,7 @@ from typing import NamedTuple, Self from zipfile import ZIP_DEFLATED, ZipFile import boto3 +import botocore.exceptions from botocore import UNSIGNED from botocore.client import Config @@ -47,6 +49,42 @@ class EmbeddingSelection(NamedTuple): return cls(*match.groups()) + +def _handle_s3_error(error: botocore.exceptions.ClientError, action: str) -> None: + """Print a human-readable S3 error message and exit.""" + code = error.response["Error"]["Code"] + message = error.response["Error"].get("Message", "") + if code in ("403", "AccessDenied"): + print( + f"\nERROR: S3 access denied while trying to {action}.\n" + f" Bucket : {S3_BUCKET}\n" + f" Code : {code} - {message}\n\n" + f"Possible causes:\n" + f" - The S3 bucket is temporarily restricted.\n" + f" - The requested embedding does not exist on S3.\n\n" + f"What to do:\n" + f" 1. Check open issues at https://github.com/reactome/reactome_chatbot/issues\n" + f" 2. Contact maintainers if the bucket should be publicly accessible.\n", + file=sys.stderr, + ) + elif code == "NoSuchKey": + print( + f"\nERROR: Embedding not found on S3 while trying to {action}.\n" + f" The requested path does not exist in the bucket.\n\n" + f"Run this to see available embeddings:\n" + f" bin/embeddings_manager ls-remote\n", + file=sys.stderr, + ) + else: + print( + f"\nERROR: S3 error while trying to {action}.\n" + f" Code : {code}\n" + f" Message : {message}\n", + file=sys.stderr, + ) + sys.exit(1) + + def pull(embedding: EmbeddingSelection): embedding_path:Path = embedding.path(check_exists=False) zip_tmpfile:Path = EM_ARCHIVE / "tmp.zip" @@ -59,6 +97,8 @@ def pull(embedding: EmbeddingSelection): print("Decompressing...") with ZipFile(zip_tmpfile, "r") as zipf: zipf.extractall(embedding_path) + except botocore.exceptions.ClientError as e: + _handle_s3_error(e, action=f"download '{embedding}'") finally: zip_tmpfile.unlink(missing_ok=True) print(f"Saved to {embedding_path}") @@ -130,10 +170,13 @@ def ls(): def ls_remote(): s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) s3_bucket = s3.Bucket(S3_BUCKET) - for obj in s3_bucket.objects.filter(Prefix=str(S3_PREFIX)): - relative_path = PurePosixPath(obj.key).relative_to(S3_PREFIX) - if len(relative_path.parts) == 4: - print(relative_path) + try: + for obj in s3_bucket.objects.filter(Prefix=str(S3_PREFIX)): + relative_path = PurePosixPath(obj.key).relative_to(S3_PREFIX) + if len(relative_path.parts) == 4: + print(relative_path) + except botocore.exceptions.ClientError as e: + _handle_s3_error(e, action="list remote embeddings") def which(): @@ -144,7 +187,6 @@ def which(): if __name__ == "__main__": parser = ArgumentParser() - # Parent parser for selecting embeddings selection_parser = ArgumentParser(add_help=False) selection_parser.add_argument( "embedding", @@ -152,7 +194,6 @@ if __name__ == "__main__": help="Embedding selection: ///" ) - # Subcommands subparsers = parser.add_subparsers(required=True) pull_parser = subparsers.add_parser( "pull", @@ -206,7 +247,6 @@ if __name__ == "__main__": ) which_parser.set_defaults(func=which) - # Command-specific arguments make_parser.add_argument( "--openai-key", help="API key for OpenAI" diff --git a/src/retrievers/reactome/rag.py b/src/retrievers/reactome/rag.py index 485b6e5..aa8c574 100644 --- a/src/retrievers/reactome/rag.py +++ b/src/retrievers/reactome/rag.py @@ -15,7 +15,7 @@ def create_reactome_rag( llm: BaseChatModel, embedding: Embeddings, - embeddings_directory: Path = EmbeddingEnvironment.get_dir("reactome"), + embeddings_directory: Path = EmbeddingEnvironment.get_dir_or_raise("reactome"), *, streaming: bool = False, ) -> Runnable: diff --git a/src/retrievers/uniprot/rag.py b/src/retrievers/uniprot/rag.py index 99702d7..8bf4cf2 100644 --- a/src/retrievers/uniprot/rag.py +++ b/src/retrievers/uniprot/rag.py @@ -15,7 +15,7 @@ def create_uniprot_rag( llm: BaseChatModel, embedding: Embeddings, - embeddings_directory: Path = EmbeddingEnvironment.get_dir("uniprot"), + embeddings_directory: Path = EmbeddingEnvironment.get_dir_or_raise("uniprot"), *, streaming: bool = False, ) -> Runnable: diff --git a/src/util/embedding_environment.py b/src/util/embedding_environment.py index ab4a43f..0ac97dd 100644 --- a/src/util/embedding_environment.py +++ b/src/util/embedding_environment.py @@ -14,7 +14,7 @@ def __init__(self, env_path: str): self.embeddings[db] = embedding_path @classmethod - def _get(cls): # -> Self + def _get(cls): if EM_CURRENT.exists(): with EM_CURRENT.open("r") as current_fp: env_path = current_fp.read() @@ -33,6 +33,39 @@ def get_dir(cls, key: str) -> Path | None: else: return None + @classmethod + def get_dir_or_raise(cls, key: str) -> Path: + """ + Like get_dir(), but raises RuntimeError with actionable install + instructions instead of returning None. + + Prevents downstream AttributeError: 'NoneType' object has no + attribute 'glob' when embeddings are not installed. + + Raises: + RuntimeError: if no embeddings are configured for `key`, + or if the configured directory does not exist on disk. + """ + directory = cls.get_dir(key) + if directory is None: + raise RuntimeError( + f"\n[ERROR] No embeddings configured for '{key}'.\n" + f"Install them with:\n\n" + f" ./bin/embeddings_manager install " + f"openai/text-embedding-3-large/{key}/ReleaseXX\n\n" + f"List available versions with:\n" + f" ./bin/embeddings_manager ls-remote\n" + ) + if not directory.exists(): + raise RuntimeError( + f"\n[ERROR] Embeddings directory configured but missing on disk:\n" + f" {directory}\n\n" + f"Re-install with:\n" + f" ./bin/embeddings_manager install " + f"openai/text-embedding-3-large/{key}/ReleaseXX\n" + ) + return directory + @classmethod def get_model(cls, key: str) -> str: return str(cls._get().embeddings[key].parent.parent) @@ -44,4 +77,4 @@ def set_one(cls, embedding_path: Path) -> None: embeddings_dict[db] = embedding_path env_path: str = ":".join(map(str, embeddings_dict.values())) with EM_CURRENT.open("w") as current_fp: - current_fp.write(env_path) + current_fp.write(env_path) \ No newline at end of file