From c8e3bc0620ee1e9ce9be2ed99f728e10d4fa0c3e Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 4 Nov 2025 20:37:57 +0000 Subject: [PATCH] Migrate from OpenAI to LiteLLM for provider-agnostic LLM support BREAKING CHANGE: This is a major version update (0.4.3 -> 0.5.0) Changes: - Replace openai dependency with litellm for multi-provider support - Add support for OpenAI, OpenRouter, Anthropic, and 100+ other providers - Add api_base parameter for custom API endpoints - Add **litellm_kwargs for provider-specific configuration - Make api_key optional (can use environment variables) - Update _clean_batch to use litellm.completion() with JSON mode - Update tests to mock litellm instead of openai - Update documentation (README.md and CLAUDE.md) with provider examples - Update example.py with multi-provider usage examples Benefits: - Users can now choose from 100+ LLM providers - Better support for OpenRouter (access multiple models through one API) - More flexible API key management (environment variables or direct) - Consistent interface across all providers Migration guide: - Existing OpenAI usage continues to work with minimal changes - API key can now be set via OPENAI_API_KEY environment variable - For other providers, use model prefixes (e.g., "openrouter/model-name") --- CLAUDE.md | 53 ++++++++++++++++---- README.md | 72 +++++++++++++++++++++++++-- example.py | 18 +++++-- llm_data_cleaner/cleaner.py | 99 ++++++++++++++++++++++++++++++++----- pyproject.toml | 6 +-- tests/test_cleaner.py | 62 ++++++++++++----------- 6 files changed, 249 insertions(+), 61 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0edad3a..4bb07ef 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,10 +2,11 @@ ## Project Overview -**LLM Data Cleaner** is a Python package that automates the transformation of messy text columns into well-structured data using OpenAI models. It eliminates the need for complex regular expressions or manual parsing while ensuring output conforms to a schema. +**LLM Data Cleaner** is a Python package that automates the transformation of messy text columns into well-structured data using LLM APIs. It supports **OpenAI, OpenRouter, Anthropic, and 100+ other providers** through LiteLLM. The package eliminates the need for complex regular expressions or manual parsing while ensuring output conforms to a schema. ### Key Features -- **Automated data cleaning** using OpenAI's language models +- **Provider-agnostic** - Works with OpenAI, OpenRouter, Anthropic, and 100+ providers via LiteLLM +- **Automated data cleaning** using language models - **Schema validation** with Pydantic models - **Batch processing** to respect API rate limits - **YAML-based configuration** for reusable cleaning instructions @@ -37,24 +38,27 @@ The main class that orchestrates data cleaning operations. 
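+A minimal usage sketch (hedged: `EducationItem`, the prompt text, and the sample
+data are illustrative, not part of the package):
+
+```python
+import pandas as pd
+from typing import Optional
+from pydantic import BaseModel
+
+from llm_data_cleaner import DataCleaner
+
+class EducationItem(BaseModel):
+    index: int  # required so results can be matched back to rows
+    year: Optional[int]
+    university: Optional[str]
+
+instructions = {
+    "education": {"prompt": "Extract graduation year and university.", "schema": EducationItem},
+}
+df = pd.DataFrame({"education": ["Graduated from Example University in 2020"]})
+
+cleaner = DataCleaner(batch_size=10)  # reads OPENAI_API_KEY from the environment
+result = cleaner.clean_dataframe(df, instructions)  # results appear in cleaned_* columns
+```
+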
**Key Responsibilities:** - Batch processing of DataFrame columns -- Communication with OpenAI API using structured outputs +- Communication with LLM APIs using LiteLLM for provider-agnostic access +- Structured output parsing with JSON mode and Pydantic validation - Retry logic for failed API calls - Progress tracking with tqdm **Important Methods:** - `clean_dataframe(df, instructions)` - Main entry point for cleaning data - `_process_batch()` - Processes a single batch of rows -- `_clean_batch()` - Makes API calls with retry logic +- `_clean_batch()` - Makes API calls with retry logic using LiteLLM - `_make_batch_model()` - Creates Pydantic models for batch responses **Configuration Parameters:** -- `api_key`: OpenAI API key (required) -- `model`: Model to use (default: "gpt-4o-2024-08-06") +- `api_key`: API key for the LLM provider (optional if set via environment variables) +- `model`: Model name (default: "gpt-4o-2024-08-06"). Use provider prefixes for non-OpenAI models (e.g., "openrouter/anthropic/claude-3-opus") - `batch_size`: Number of rows per API call (default: 10) - `max_retries`: Retry attempts for failed calls (default: 3) - `retry_delay`: Seconds between retries (default: 5) - `temperature`: Model temperature (default: 0.0) - `system_prompt`: Custom system prompt template (optional) +- `api_base`: Base URL for the API (e.g., "https://openrouter.ai/api/v1" for OpenRouter) +- `**litellm_kwargs`: Additional arguments to pass to litellm.completion() ### 2. Utilities (`llm_data_cleaner/utils.py`) @@ -91,7 +95,7 @@ column_name: ## Dependencies ### Production -- `openai ^1.0.0` - OpenAI API client +- `litellm ^1.0.0` - Unified API client for 100+ LLM providers (OpenAI, OpenRouter, Anthropic, etc.) - `pydantic ^2.0.0` - Data validation and schema definition - `pandas ^2.2.3` - DataFrame operations - `pyyaml ^6.0.2` - YAML parsing @@ -164,13 +168,23 @@ Test files are located in `tests/`: ### Common Issues -1. **API Key Not Set**: Store OpenAI API key in `.secrets/OPENAI_API_KEY` or pass directly +1. **API Key Not Set**: Set API key via environment variables (OPENAI_API_KEY, OPENROUTER_API_KEY, ANTHROPIC_API_KEY) or pass directly to constructor 2. **Rate Limits**: Adjust `batch_size` to control API call frequency 3. **Schema Validation Errors**: Ensure Pydantic models include `index: int` field 4. **Missing Columns**: DataCleaner skips columns not in DataFrame with a warning +5. **Provider-specific Configuration**: Use `api_base` parameter for custom endpoints (e.g., OpenRouter) +6. **Model Naming**: Use provider prefixes for non-OpenAI models (e.g., "openrouter/model-name", "anthropic/model-name") ## API Changes +### Version 0.5.0 +- **BREAKING**: Migrated from OpenAI library to LiteLLM for provider-agnostic support +- Now supports OpenAI, OpenRouter, Anthropic, and 100+ other providers +- Added `api_base` parameter for custom API endpoints +- Added `**litellm_kwargs` for provider-specific configuration +- API key is now optional in constructor (can use environment variables) +- Model parameter now supports provider prefixes (e.g., "openrouter/", "anthropic/") + ### Version 0.4.x - Migrated from deprecated `client.responses.parse()` to supported OpenAI methods - Added `jsonize()` utility for consistent data serialization @@ -184,11 +198,32 @@ Test files are located in `tests/`: 1. **Define schemas** (Pydantic or YAML) 2. **Create instructions** dictionary mapping columns to prompts and schemas -3. **Initialize DataCleaner** with API key and configuration +3. 
**Initialize DataCleaner** with API key, model name, and configuration
 4. **Load data** into pandas DataFrame
 5. **Call `clean_dataframe()`** to process
 6. **Access results** in `cleaned_*` columns
 
+### Provider-Specific Examples
+
+**OpenAI (default):**
+```python
+cleaner = DataCleaner(api_key="sk-...", model="gpt-4o-2024-08-06")
+```
+
+**OpenRouter:**
+```python
+cleaner = DataCleaner(
+    api_key="sk-or-...",
+    model="openrouter/anthropic/claude-3-opus",
+    api_base="https://openrouter.ai/api/v1"
+)
+```
+
+**Anthropic:**
+```python
+cleaner = DataCleaner(api_key="sk-ant-...", model="claude-3-opus-20240229")
+```
+
 ## Authors
 - Miklós Koren (koren@codedthinking.com)
 - Gergely Attila Kiss (kiss@codedthinking.com)
diff --git a/README.md b/README.md
index 2652055..50f5833 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # LLM Data Cleaner
 
-LLM Data Cleaner automates the transformation of messy text columns into well structured data using OpenAI models. It eliminates the need for complex regular expressions or manual parsing while ensuring the output conforms to a schema.
+LLM Data Cleaner automates the transformation of messy text columns into well-structured data using LLM APIs. It supports **OpenAI, OpenRouter, Anthropic, and 100+ other providers** through LiteLLM. The package eliminates the need for complex regular expressions or manual parsing while ensuring the output conforms to a schema.
 
 ## Why use it?
 
+- **Provider-agnostic** – works with OpenAI, OpenRouter, Anthropic, and many other LLM providers.
 - **Less manual work** – delegate repetitive cleaning tasks to a language model.
 - **Consistent results** – validate responses with Pydantic models.
 - **Batch processing** – send rows in chunks to respect API rate limits.
@@ -26,13 +27,24 @@ poetry add git+https://github.com/codedthinking/llm_data_cleaner.git
 
 1. Create Pydantic models describing the cleaned values.
 2. Define a dictionary of instructions mapping column names to a prompt and schema.
-3. Instantiate `DataCleaner` with your OpenAI API key.
+3. Instantiate `DataCleaner` with your API key and model name.
 4. Load your raw CSV file with `pandas`.
 5. Call `clean_dataframe(df, instructions)`.
 6. Inspect the returned DataFrame which contains new `cleaned_*` columns.
 7. Save or further process the cleaned data.
 
-## Example: inline models
+## Supported Providers
+
+Thanks to LiteLLM, this package supports 100+ LLM providers including:
+
+- **OpenAI** (GPT-4, GPT-3.5, etc.)
+- **OpenRouter** (access to multiple models through one API)
+- **Anthropic** (Claude models)
+- **Cohere**, **AI21**, **Replicate**, **Hugging Face**, and many more
+
+See [LiteLLM's provider list](https://docs.litellm.ai/docs/providers) for the complete list.
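+
+As a quick smoke test before a full cleaning run, you can call LiteLLM directly
+(a hedged sketch: it assumes the matching API key is already exported, and the
+model name is illustrative):
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="gpt-4o-2024-08-06",  # or e.g. "openrouter/anthropic/claude-3-opus"
+    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
+)
+print(response.choices[0].message.content)
+```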
+ +## Example: inline models with OpenAI ```python import pandas as pd @@ -60,6 +72,7 @@ instructions = { }, } +# OpenAI (default) cleaner = DataCleaner(api_key="YOUR_OPENAI_API_KEY") raw_df = pd.DataFrame({ "address": ["Budapest Váci út 1", "1200 Vienna Mariahilfer Straße 10"], @@ -69,6 +82,36 @@ cleaned = cleaner.clean_dataframe(raw_df, instructions) print(cleaned) ``` +## Example: using OpenRouter + +```python +from llm_data_cleaner import DataCleaner + +# OpenRouter allows you to access multiple models through one API +cleaner = DataCleaner( + api_key="YOUR_OPENROUTER_API_KEY", + model="openrouter/anthropic/claude-3-opus", + api_base="https://openrouter.ai/api/v1" +) + +# Use the same instructions and DataFrame as above +cleaned = cleaner.clean_dataframe(raw_df, instructions) +``` + +## Example: using Anthropic Claude + +```python +from llm_data_cleaner import DataCleaner + +# Anthropic Claude models +cleaner = DataCleaner( + api_key="YOUR_ANTHROPIC_API_KEY", + model="claude-3-opus-20240229" +) + +cleaned = cleaner.clean_dataframe(raw_df, instructions) +``` + ## Example: loading YAML instructions ```python @@ -76,7 +119,7 @@ from llm_data_cleaner import DataCleaner, load_yaml_instructions import pandas as pd instructions = load_yaml_instructions("instructions.yaml") -cleaner = DataCleaner(api_key="YOUR_OPENAI_API_KEY", system_prompt="{column_prompt}") +cleaner = DataCleaner(api_key="YOUR_API_KEY", system_prompt="{column_prompt}") raw_df = pd.read_csv("data.csv") result = cleaner.clean_dataframe(raw_df, instructions) ``` @@ -84,6 +127,27 @@ result = cleaner.clean_dataframe(raw_df, instructions) `load_yaml_instructions` reads the same structure shown above from a YAML file so cleaning rules can be shared without modifying code. +## Environment Variables + +You can also set API keys via environment variables instead of passing them directly: + +```bash +# For OpenAI +export OPENAI_API_KEY="sk-..." + +# For OpenRouter +export OPENROUTER_API_KEY="sk-or-..." + +# For Anthropic +export ANTHROPIC_API_KEY="sk-ant-..." +``` + +Then initialize without the api_key parameter: + +```python +cleaner = DataCleaner(model="gpt-4o-2024-08-06") # Uses OPENAI_API_KEY from environment +``` + ## Authors - Miklós Koren diff --git a/example.py b/example.py index ae82287..24765af 100755 --- a/example.py +++ b/example.py @@ -19,12 +19,13 @@ class JobTitleItem(BaseModel): yaml_instructions = load_yaml_instructions("instructions.yaml") -# Set your OpenAI API key, reading from .secrets/OPENAI_API_KEY +# Set your API key, reading from .secrets/OPENAI_API_KEY +# You can use OpenAI, OpenRouter, Anthropic, or other providers with open(".secrets/OPENAI_API_KEY", "r") as f: api_key = f.read().strip() # Ensure the API key is set if not api_key: - raise ValueError("API key is not set. Please provide a valid OpenAI API key.") + raise ValueError("API key is not set. Please provide a valid API key.") # Create a sample DataFrame data = { "education": [ @@ -61,12 +62,21 @@ class JobTitleItem(BaseModel): }, } # Initialize the cleaner with a batch size (default is 20) +# For OpenAI (default): cleaner = DataCleaner( - api_key=api_key, - batch_size=20, + api_key=api_key, + batch_size=20, system_prompt='Follow these instructions, but return the answers in Greek. 
{column_prompt}.',
 )
 
+# For OpenRouter, you would use:
+# cleaner = DataCleaner(
+#     api_key="your-openrouter-key",
+#     model="openrouter/anthropic/claude-3-opus",
+#     api_base="https://openrouter.ai/api/v1",
+#     batch_size=20
+# )
+
 # Clean the data
 result = cleaner.clean_dataframe(df, instructions)
 
diff --git a/llm_data_cleaner/cleaner.py b/llm_data_cleaner/cleaner.py
index aa7e04f..5d8949e 100644
--- a/llm_data_cleaner/cleaner.py
+++ b/llm_data_cleaner/cleaner.py
@@ -1,35 +1,81 @@
 import os
 import pandas as pd
 from typing import Dict, Any, Type, List, Optional
-from openai import OpenAI
+import litellm
 from pydantic import BaseModel, create_model, ConfigDict
 from llm_data_cleaner.utils import InstructionField, InstructionSchema
 import time
 from tqdm import tqdm
 from .utils import jsonize
+import json
 
 
 class DataCleaner:
     """
-    Batch DataCleaner that uses OpenAI's responses.parse method with auto-generated prompts.
+    Batch DataCleaner that uses LiteLLM for provider-agnostic LLM API calls with auto-generated prompts.
+    Supports OpenAI, OpenRouter, Anthropic, and 100+ other providers.
    """
 
     def __init__(
         self,
-        api_key: str,
+        api_key: Optional[str] = None,
         model: str = "gpt-4o-2024-08-06",
         max_retries: int = 3,
         retry_delay: int = 5,
         batch_size: int = 10,
         system_prompt: str = None,
         temperature: float = 0.0,
+        api_base: Optional[str] = None,
+        **litellm_kwargs,
     ):
-        self.client = OpenAI(api_key=api_key)
+        """
+        Initialize DataCleaner with LiteLLM support for multiple providers.
+
+        Args:
+            api_key: API key for the LLM provider (can also be set via environment variables)
+            model: Model name (e.g., "gpt-4o-2024-08-06" for OpenAI, "openrouter/model-name" for OpenRouter)
+            max_retries: Number of retry attempts for failed API calls
+            retry_delay: Delay in seconds between retries
+            batch_size: Number of rows to process per API call
+            system_prompt: Custom system prompt template (must contain {column_prompt})
+            temperature: Model temperature for generation
+            api_base: Base URL for the API (e.g., for OpenRouter: "https://openrouter.ai/api/v1")
+            **litellm_kwargs: Additional arguments to pass to litellm.completion()
+
+        Examples:
+            # OpenAI (default)
+            cleaner = DataCleaner(api_key="sk-...")
+
+            # OpenRouter
+            cleaner = DataCleaner(
+                api_key="sk-or-...",
+                model="openrouter/anthropic/claude-3-opus",
+                api_base="https://openrouter.ai/api/v1"
+            )
+
+            # Anthropic
+            cleaner = DataCleaner(api_key="sk-ant-...", model="claude-3-opus-20240229")
+        """
+        self.api_key = api_key
         self.model = model
         self.max_retries = max_retries
         self.retry_delay = retry_delay
         self.batch_size = batch_size
         self.temperature = temperature
+        self.api_base = api_base
+        self.litellm_kwargs = litellm_kwargs
+
+        # Set API key in environment if provided (LiteLLM reads from env vars)
+        if api_key:
+            # LiteLLM automatically detects the provider from the model name
+            # and uses the appropriate environment variable
+            if model.startswith("openrouter/"):
+                os.environ["OPENROUTER_API_KEY"] = api_key
+            elif model.startswith("anthropic/") or "claude" in model.lower():
+                os.environ["ANTHROPIC_API_KEY"] = api_key
+            else:
+                # Default to OpenAI
+                os.environ["OPENAI_API_KEY"] = api_key
 
         # General system prompt format, set once for all tasks (you may tweak further)
         if system_prompt:
@@ -136,15 +182,46 @@ def _clean_batch(
         messages: list,
         pyd_model_batch: Type[BaseModel]
     ):
+        """
+        Clean a batch using LiteLLM completion API with JSON mode.
+        Parses the response back into the Pydantic model.
+        """
         for attempt in range(self.max_retries):
             try:
-                resp = self.client.responses.parse(
-                    model=self.model,
-                    input=messages,
-                    text_format=pyd_model_batch,
-                    temperature=self.temperature,
-                )
-                return resp.output_parsed
+                # Get JSON schema from Pydantic model for the prompt
+                schema_json = pyd_model_batch.model_json_schema()
+
+                # Enhance system message with schema information
+                enhanced_messages = messages.copy()
+                enhanced_messages[0] = {
+                    "role": "system",
+                    "content": f"{messages[0]['content']}\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema_json, indent=2)}"
+                }
+
+                # Prepare LiteLLM completion arguments
+                completion_kwargs = {
+                    "model": self.model,
+                    "messages": enhanced_messages,
+                    "temperature": self.temperature,
+                    "response_format": {"type": "json_object"},
+                    **self.litellm_kwargs
+                }
+
+                # Add api_base if provided
+                if self.api_base:
+                    completion_kwargs["api_base"] = self.api_base
+
+                # Call LiteLLM
+                response = litellm.completion(**completion_kwargs)
+
+                # Parse JSON response
+                content = response.choices[0].message.content
+                parsed_json = json.loads(content)
+
+                # Convert JSON to Pydantic model
+                parsed_model = pyd_model_batch.model_validate(parsed_json)
+
+                return parsed_model
             except Exception as e:
                 print(f"Batch cleaning error: {e} (attempt {attempt+1}/{self.max_retries})")
                 time.sleep(self.retry_delay)
diff --git a/pyproject.toml b/pyproject.toml
index 2778b32..ae5ad3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "llm_data_cleaner"
-version = "0.4.3"
-description = "A Python package for cleaning data using OpenAI API"
+version = "0.5.0"
+description = "A Python package for cleaning data using LLM APIs (OpenAI, OpenRouter, and more)"
 authors = ["Miklós Koren <koren@codedthinking.com>", "Gergely Attila Kiss <kiss@codedthinking.com>"]
 readme = "README.md"
 packages = [{include = "llm_data_cleaner"}]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-openai = "^1.0.0"
+litellm = "^1.0.0"
 pydantic = "^2.0.0"
 jsonschema = "^4.0.0"
 tqdm = "^4.65.0"
diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
index affb7a4..ceef3e6 100644
--- a/tests/test_cleaner.py
+++ b/tests/test_cleaner.py
@@ -71,51 +71,53 @@ def test_validate_instructions(self):
 
 class TestDataCleaner(unittest.TestCase):
 
-    @patch('llm_data_cleaner.cleaner.OpenAI')
-    def test_clean_dataframe(self, mock_openai):
-        # Create a mock response
-        mock_completion = MagicMock()
-        mock_completion.choices[0].message.content = json.dumps({"year": 2020, "university": "Example University"})
-
-        # Set up the mock client
-        mock_client = MagicMock()
-        mock_client.chat.completions.create.return_value = mock_completion
-        mock_openai.return_value = mock_client
-
+    @patch('llm_data_cleaner.cleaner.litellm.completion')
+    def test_clean_dataframe(self, mock_completion):
+        # Create a mock response matching LiteLLM's response structure
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = json.dumps({
+            "cleaned": [
+                {"index": 0, "year": 2020, "university": "Example University"}
+            ]
+        })
+        mock_completion.return_value = mock_response
+
         # Create a test DataFrame
         df = pd.DataFrame({
             "education": ["Graduated from Example University in 2020"]
         })
-
+
+        # Create Pydantic model for testing
+        from typing import Optional
+        from pydantic import BaseModel
+
+        class EducationItem(BaseModel):
+            index: int
+            year: Optional[int]
+            university: Optional[str]
+
         # Create cleaning instructions
         instructions = {
             "education": {
                 "prompt": "Extract year and university name",
-                "schema": {
-                    "type": "object",
-
"properties": { - "year": {"type": ["integer", "null"]}, - "university": {"type": ["string", "null"]} - }, - "required": ["year", "university"] - } + "schema": EducationItem } } - + # Initialize the cleaner cleaner = DataCleaner(api_key="dummy-key") - + # Clean the data result = cleaner.clean_dataframe(df, instructions) - + # Assertions - self.assertIn("original_education", result.columns) - self.assertIn("cleaned_education", result.columns) - - # Check the API was called with the correct arguments - mock_client.chat.completions.create.assert_called_once() - call_args = mock_client.chat.completions.create.call_args[1] - self.assertEqual(call_args["model"], "gpt-4o") + self.assertIn("cleaned_year", result.columns) + self.assertIn("cleaned_university", result.columns) + + # Check the API was called + mock_completion.assert_called() + call_args = mock_completion.call_args[1] + self.assertEqual(call_args["model"], "gpt-4o-2024-08-06") self.assertEqual(call_args["temperature"], 0.0) self.assertEqual(call_args["response_format"], {"type": "json_object"})