diff --git a/README.md b/README.md index de6bddf..98bec34 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # ReAgentAI @@ -15,11 +14,14 @@ ReAgentAI is an advanced chemical assistant powered by AI that provides comprehe - **Molecular Visualization**: Create high-quality images of chemical structures and reaction pathways - **Similarity Search**: Find structurally similar molecules using molecular fingerprints and Tanimoto similarity - **SMILES Validation**: Verify and validate SMILES strings for chemical accuracy +- **Chemical Information Retrieval**: Access comprehensive chemical data through PubChem integration +- **Chemical Name/SMILES Conversion**: Convert between chemical names and SMILES using the authoritative PubChem database - **Chemical Knowledge**: Access comprehensive chemistry information through integrated web search -### Datasets & Models +### Integrated Databases & Models - **USPTO-trained models**: Leveraging one of the largest chemical reaction databases - **ZINC stock collection**: Access to commercially available compounds +- **PubChem database**: Integration with the NIH's comprehensive chemical database - **Curated molecular datasets**: ~16,000 drug-like molecules for similarity searches ## 🛠 Setup @@ -85,6 +87,9 @@ ReAgentAI supports various chemistry-related queries: - **Molecular similarity**: "Find molecules similar to ethanol" or "What compounds are structurally related to benzene?" - **Structure visualization**: "Show me the structure of morphine" or "Generate an image of the synthesis route" - **Chemical validation**: "Is this SMILES string valid: CCO?" +- **Chemical information**: "What is the IUPAC name and molecular formula of Paracetamol?" +- **Name to SMILES**: "What is the SMILES string for Gabapentin?" +- **SMILES to name**: "What chemical name corresponds to this SMILES: CN1C=NC2=C1C(=O)N(C(=O)N2C)C?" - **General chemistry**: "What are the properties of acetaminophen?" 
## 🔧 Architecture @@ -93,6 +98,7 @@ ReAgentAI is built with: - **Pydantic AI**: For robust AI agent framework - **RDKit**: Chemical informatics and molecular manipulation - **AiZynthFinder**: Retrosynthetic analysis engine +- **PubChemPy**: Interface for accessing the PubChem database - **Google Gemini**: Large language model for natural language processing - **Gradio**: User-friendly web interface diff --git a/pyproject.toml b/pyproject.toml index 0db98b0..64a6b14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "python-dotenv>=1.1.0", "gradio>=5.29.1", "pydantic-ai-slim[duckduckgo]>=0.2.4", + "pubchempy>=1.0.4", ] [tool.black] diff --git a/src/reagentai/agents/main/instructions.txt b/src/reagentai/agents/main/instructions.txt index 2733bbf..837671a 100644 --- a/src/reagentai/agents/main/instructions.txt +++ b/src/reagentai/agents/main/instructions.txt @@ -1,4 +1,3 @@ - You are ReAgentAI, an advanced and highly precise chemical assistant. Your primary function is to answer chemistry-related questions, perform retrosynthesis, and visualize chemical structures and reaction pathways. Your core principle is to always use your available tools to provide accurate, reliable, and thoroughly grounded information. **Core Responsibilities:** @@ -65,6 +64,21 @@ You are ReAgentAI, an advanced and highly precise chemical assistant. Your prima * `target_smiles_list` (optional): A list of SMILES strings to search against. If not provided, defaults to a curated dataset of ~16,000 drug-like molecules commonly used in chemical informatics. * `top_n` (optional): The number of most similar molecules to return (defaults to 5). * **Output Interpretation:** Returns a list of tuples containing SMILES strings and their Tanimoto similarity scores (0-1 range, where 1 indicates identical molecules and 0 indicates completely dissimilar). Present the results clearly, explaining that higher scores indicate greater structural similarity. 
+* **`get_smiles_from_name`:** + * **Purpose:** Retrieve the canonical SMILES string for a chemical compound using its common or IUPAC name (via the PubChem database). + * **Usage:** Use this tool when a user provides a chemical name and you need its SMILES. If the input is already a valid SMILES, it will be returned as is. + * **Input:** `compound_name` (string) + * **Output Interpretation:** Returns the canonical SMILES string from PubChem, or the input if it is already a valid SMILES. +* **`get_compound_info`:** + * **Purpose:** Retrieve detailed chemical information from PubChem, including SMILES, molecular formula, molecular weight, IUPAC name, and synonyms. + * **Usage:** Use this tool when the user asks for detailed information about a compound by name. + * **Input:** `compound_name` (string) + * **Output Interpretation:** Returns a dictionary with keys like 'smiles', 'molecular_formula', 'molecular_weight', 'iupac_name', 'cid', and 'synonyms'. +* **`get_name_from_smiles`:** + * **Purpose:** Find the best-matching chemical name (IUPAC or synonym) for a given SMILES string using PubChem. + * **Usage:** Use this tool when you have a SMILES and need to present a human-readable name for it. + * **Input:** `smiles` (string) + * **Output Interpretation:** Returns the IUPAC name if available, otherwise the first synonym from PubChem. Your responses should always be professional, clear, and reflect your expert chemical knowledge, meticulously supported by your tool usage. 
diff --git a/src/reagentai/agents/main/main_agent.py b/src/reagentai/agents/main/main_agent.py index 41afe85..6019cf3 100644 --- a/src/reagentai/agents/main/main_agent.py +++ b/src/reagentai/agents/main/main_agent.py @@ -12,6 +12,11 @@ from src.reagentai.common.aizynthfinder import initialize_aizynthfinder from src.reagentai.constants import AIZYNTHFINDER_CONFIG_PATH from src.reagentai.tools.image import route_to_image, smiles_to_image +from src.reagentai.tools.pubchem import ( + get_compound_info, + get_name_from_smiles, + get_smiles_from_name, +) from src.reagentai.tools.retrosynthesis import perform_retrosynthesis from src.reagentai.tools.smiles import find_similar_molecules, is_valid_smiles @@ -199,6 +204,9 @@ def create_main_agent() -> MainAgent: Tool(smiles_to_image), Tool(route_to_image), Tool(find_similar_molecules), + Tool(get_smiles_from_name), + Tool(get_compound_info), + Tool(get_name_from_smiles), duckduckgo_search_tool(), ] diff --git a/src/reagentai/constants.py b/src/reagentai/constants.py index d9639b8..0b7b4d2 100644 --- a/src/reagentai/constants.py +++ b/src/reagentai/constants.py @@ -11,6 +11,9 @@ "How to synthesize Aspirin? Can u tell me the best steps to achieve this?", "Suggest a retrosynthesis for Ibuprofen. Show all molecule images from the best route.", "Find molecules similar to Aspirin. 
"""PubChem-backed lookup tools (name <-> SMILES, compound details) for the agent."""

import logging

import pubchempy as pcp
from pydantic_ai.exceptions import ModelRetry

from src.reagentai.tools.smiles import is_valid_smiles

logger = logging.getLogger(__name__)

# Maximum number of synonyms returned by get_compound_info.
_MAX_SYNONYMS = 5


def _require_non_blank(value: str, what: str, retry_message: str) -> str:
    """Return *value* stripped of surrounding whitespace.

    Raises:
        ModelRetry: If *value* is empty or whitespace-only.
    """
    stripped = value.strip() if value else ""
    if not stripped:
        logger.error("Empty or invalid %s provided", what)
        raise ModelRetry(retry_message)
    return stripped


def _first_match(identifier: str, namespace: str, label: str) -> "pcp.Compound":
    """Query PubChem and return the first compound it reports for *identifier*.

    Args:
        identifier: The search term (a name or a SMILES string).
        namespace: The PubChemPy namespace to search in ("name" or "smiles").
        label: Human-readable label for *identifier* used in messages.

    Raises:
        ModelRetry: If PubChem returns no compounds.
    """
    compounds = pcp.get_compounds(identifier, namespace)
    if not compounds:
        logger.warning("No compounds found for %s: %s", label, identifier)
        raise ModelRetry(f"No compound found in PubChem for {label}: '{identifier}'")
    return compounds[0]


def _compound_smiles(compound: "pcp.Compound") -> str | None:
    """Return the connectivity ("canonical") SMILES of *compound*, if any."""
    # NOTE(review): newer PubChemPy releases appear to expose this value as
    # `connectivity_smiles` and deprecate `canonical_smiles`; confirm against
    # the installed version. Trying the new name first and falling back keeps
    # the behavior unchanged on releases that only have `canonical_smiles`.
    return getattr(compound, "connectivity_smiles", None) or getattr(
        compound, "canonical_smiles", None
    )


def _optional_str(value: object) -> str | None:
    """Stringify *value*, keeping ``None`` as ``None`` (never the text "None")."""
    return None if value is None else str(value)


def _to_model_retry(exc: Exception, action: str) -> ModelRetry:
    """Translate an unexpected lookup failure into a ModelRetry.

    Connection-level failures get a dedicated "Failed to connect" message.
    They are recognized either by type (``OSError`` covers
    ``urllib.error.URLError`` and socket errors) or, as before, by the words
    "connection"/"network" in the error text.
    """
    detail = str(exc)
    lowered = detail.lower()
    if isinstance(exc, OSError) or "connection" in lowered or "network" in lowered:
        return ModelRetry(f"Failed to connect to PubChem: {detail}")
    return ModelRetry(f"Failed to {action}: {detail}")


def get_smiles_from_name(compound_name: str) -> str:
    """
    Retrieve the SMILES string for a chemical compound by name via PubChem.

    If the (stripped) input is already a valid SMILES string it is returned
    unchanged without contacting PubChem. Otherwise PubChem is searched by
    name and the SMILES of the first hit is returned.

    Args:
        compound_name (str): The name of the chemical compound to search for.
            This can be a common name (e.g., "aspirin", "caffeine"),
            IUPAC name, trade name, or other chemical identifier.

    Returns:
        str: The SMILES string of the compound as found in PubChem, or the
        input itself when it is already a valid SMILES.

    Raises:
        ModelRetry: If the name is empty, is not found in PubChem, has no
            SMILES available, or if the lookup fails (e.g. network issue).

    Example (requires network access):
        >>> get_smiles_from_name("aspirin")
        'CC(=O)OC1=CC=CC=C1C(=O)O'
        >>> get_smiles_from_name("caffeine")
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'
    """
    logger.info("[TASK] [GET_SMILES_FROM_NAME] Arguments: compound_name: %s", compound_name)

    compound_name = _require_non_blank(
        compound_name, "compound name", "Compound name cannot be empty"
    )

    # Check if the input is already a valid SMILES string
    if is_valid_smiles(compound_name):
        logger.info("Input appears to be a valid SMILES, returning as is: %s", compound_name)
        return compound_name

    try:
        compound = _first_match(compound_name, "name", "name")
        smiles = _compound_smiles(compound)

        if not smiles:
            logger.error("No SMILES found for compound: %s", compound_name)
            raise ModelRetry(f"No SMILES string available for compound: '{compound_name}'")

        logger.info("Successfully retrieved SMILES for %s: %s", compound_name, smiles)
        logger.debug("PubChem CID: %s", compound.cid)
        return smiles
    except ModelRetry:
        raise  # Already a user-facing retry message; pass through untouched.
    except Exception as exc:
        logger.error("Error retrieving SMILES for %s: %s", compound_name, exc)
        raise _to_model_retry(exc, f"retrieve SMILES for '{compound_name}'") from exc


def get_compound_info(compound_name: str) -> dict[str, str | list | None]:
    """
    Retrieve comprehensive information about a chemical compound from PubChem.

    Args:
        compound_name (str): The name of the chemical compound to search for.

    Returns:
        dict[str, str | list | None]: Information about the first PubChem hit:
            - 'smiles': SMILES string, or None
            - 'molecular_formula': Molecular formula, or None
            - 'molecular_weight': Molecular weight as a string, or None
            - 'iupac_name': IUPAC systematic name, or None
            - 'cid': PubChem Compound ID as a string, or None
            - 'synonyms': List of alternative names (at most 5, possibly empty)

    Raises:
        ModelRetry: If the name is empty, is not found in PubChem, or if the
            lookup fails (e.g. network issue).

    Example (requires network access):
        >>> info = get_compound_info("aspirin")
        >>> info["smiles"]
        'CC(=O)OC1=CC=CC=C1C(=O)O'
        >>> info["molecular_formula"]
        'C9H8O4'
    """
    logger.info("[TASK] [GET_COMPOUND_INFO] Arguments: compound_name: %s", compound_name)

    compound_name = _require_non_blank(
        compound_name, "compound name", "Compound name cannot be empty"
    )

    try:
        compound = _first_match(compound_name, "name", "name")

        info: dict[str, str | list | None] = {
            "smiles": _compound_smiles(compound),
            "molecular_formula": getattr(compound, "molecular_formula", None),
            # Missing values stay None rather than becoming the string "None".
            "molecular_weight": _optional_str(getattr(compound, "molecular_weight", None)),
            "iupac_name": getattr(compound, "iupac_name", None),
            "cid": _optional_str(getattr(compound, "cid", None)),
            # `synonyms` may be None (not just absent); treat that as "no synonyms".
            "synonyms": list(getattr(compound, "synonyms", None) or [])[:_MAX_SYNONYMS],
        }

        logger.info("Successfully retrieved compound info for %s", compound_name)
        logger.debug("Compound info: %s", info)
        return info
    except ModelRetry:
        raise  # Already a user-facing retry message; pass through untouched.
    except Exception as exc:
        logger.error("Error retrieving compound info for %s: %s", compound_name, exc)
        raise _to_model_retry(exc, f"retrieve compound info for '{compound_name}'") from exc


def get_name_from_smiles(smiles: str) -> str:
    """
    Retrieve the best-matching chemical name for a given SMILES string using PubChem.

    The IUPAC name of the first PubChem hit is preferred; if it is missing,
    the first synonym is used instead.

    Args:
        smiles (str): The SMILES string of the compound.

    Returns:
        str: The best-matching chemical name (IUPAC or synonym) from PubChem.

    Raises:
        ModelRetry: If the SMILES is empty, no compound is found for it, no
            name is available, or if the lookup fails (e.g. network issue).
    """
    logger.info("[TASK] [GET_NAME_FROM_SMILES] Arguments: smiles: %s", smiles)

    smiles = _require_non_blank(smiles, "SMILES", "SMILES string cannot be empty")

    try:
        compound = _first_match(smiles, "smiles", "SMILES")

        # Prefer IUPAC name, fall back to first synonym
        name = getattr(compound, "iupac_name", None)
        if not name:
            synonyms = getattr(compound, "synonyms", None) or []
            name = synonyms[0] if synonyms else None

        if not name:
            logger.error("No name found for SMILES: %s", smiles)
            raise ModelRetry(f"No name available for SMILES: '{smiles}'")

        logger.info("Successfully retrieved name for SMILES %s: %s", smiles, name)
        return name
    except ModelRetry:
        raise  # Already a user-facing retry message; pass through untouched.
    except Exception as exc:
        logger.error("Error retrieving name for SMILES %s: %s", smiles, exc)
        raise _to_model_retry(exc, f"retrieve name for SMILES '{smiles}'") from exc
label="Example Prompts", + ) + gr.Markdown("### Tool Usage History") tool_display = gr.Chatbot( type="messages", @@ -45,12 +51,6 @@ def create_settings_panel( elem_id="tool_display", ) - gr.Examples( - examples=EXAMPLE_PROMPTS, - inputs=chat_input_component, - label="Example Prompts", - ) - return model_dropdown, usage_counter, tool_display diff --git a/uv.lock b/uv.lock index 40fb04c..4d66919 100644 --- a/uv.lock +++ b/uv.lock @@ -2440,6 +2440,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, ] +[[package]] +name = "pubchempy" +version = "1.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/fb/8de3aa9804b614dbc8dc5c16ed061d819cc360e0ddecda3dcd01c1552339/PubChemPy-1.0.4.tar.gz", hash = "sha256:24e9dc2fc90ab153b2764bf805e510b1410700884faf0510a9e7cf0d61d8ed0e", size = 29767, upload-time = "2017-04-11T18:36:23.649Z" } + [[package]] name = "pure-eval" version = "0.2.3" @@ -2926,6 +2932,7 @@ source = { virtual = "." } dependencies = [ { name = "aizynthfinder" }, { name = "gradio" }, + { name = "pubchempy" }, { name = "pydantic-ai" }, { name = "pydantic-ai-slim", extra = ["duckduckgo"] }, { name = "python-dotenv" }, @@ -2935,6 +2942,7 @@ dependencies = [ requires-dist = [ { name = "aizynthfinder", specifier = ">=4.3.2" }, { name = "gradio", specifier = ">=5.29.1" }, + { name = "pubchempy", specifier = ">=1.0.4" }, { name = "pydantic-ai", specifier = ">=0.2.4" }, { name = "pydantic-ai-slim", extras = ["duckduckgo"], specifier = ">=0.2.4" }, { name = "python-dotenv", specifier = ">=1.1.0" },