From c8e3bc0620ee1e9ce9be2ed99f728e10d4fa0c3e Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 4 Nov 2025 20:37:57 +0000 Subject: [PATCH] Migrate from OpenAI to LiteLLM for provider-agnostic LLM support BREAKING CHANGE: This is a major version update (0.4.3 -> 0.5.0) Changes: - Replace openai dependency with litellm for multi-provider support - Add support for OpenAI, OpenRouter, Anthropic, and 100+ other providers - Add api_base parameter for custom API endpoints - Add **litellm_kwargs for provider-specific configuration - Make api_key optional (can use environment variables) - Update _clean_batch to use litellm.completion() with JSON mode - Update tests to mock litellm instead of openai - Update documentation (README.md and CLAUDE.md) with provider examples - Update example.py with multi-provider usage examples Benefits: - Users can now choose from 100+ LLM providers - Better support for OpenRouter (access multiple models through one API) - More flexible API key management (environment variables or direct) - Consistent interface across all providers Migration guide: - Existing OpenAI usage continues to work with minimal changes - API key can now be set via OPENAI_API_KEY environment variable - For other providers, use model prefixes (e.g., "openrouter/model-name") --- CLAUDE.md | 53 ++++++++++++++++---- README.md | 72 +++++++++++++++++++++++++-- example.py | 18 +++++-- llm_data_cleaner/cleaner.py | 99 ++++++++++++++++++++++++++++++++----- pyproject.toml | 6 +-- tests/test_cleaner.py | 62 ++++++++++++----------- 6 files changed, 249 insertions(+), 61 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0edad3a..4bb07ef 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,10 +2,11 @@ ## Project Overview -**LLM Data Cleaner** is a Python package that automates the transformation of messy text columns into well-structured data using OpenAI models. It eliminates the need for complex regular expressions or manual parsing while ensuring output conforms to a schema. +**LLM Data Cleaner** is a Python package that automates the transformation of messy text columns into well-structured data using LLM APIs. It supports **OpenAI, OpenRouter, Anthropic, and 100+ other providers** through LiteLLM. The package eliminates the need for complex regular expressions or manual parsing while ensuring output conforms to a schema. ### Key Features -- **Automated data cleaning** using OpenAI's language models +- **Provider-agnostic** - Works with OpenAI, OpenRouter, Anthropic, and 100+ providers via LiteLLM +- **Automated data cleaning** using language models - **Schema validation** with Pydantic models - **Batch processing** to respect API rate limits - **YAML-based configuration** for reusable cleaning instructions @@ -37,24 +38,27 @@ The main class that orchestrates data cleaning operations. 
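+A minimal usage sketch (hedged: `EducationItem`, the prompt text, and the sample
+data are illustrative, not part of the package):
+
+```python
+import pandas as pd
+from typing import Optional
+from pydantic import BaseModel
+
+from llm_data_cleaner import DataCleaner
+
+class EducationItem(BaseModel):
+    index: int  # required so results can be matched back to rows
+    year: Optional[int]
+    university: Optional[str]
+
+instructions = {
+    "education": {"prompt": "Extract graduation year and university.", "schema": EducationItem},
+}
+df = pd.DataFrame({"education": ["Graduated from Example University in 2020"]})
+
+cleaner = DataCleaner(batch_size=10)  # reads OPENAI_API_KEY from the environment
+result = cleaner.clean_dataframe(df, instructions)  # results appear in cleaned_* columns
+```
+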
**Key Responsibilities:** - Batch processing of DataFrame columns -- Communication with OpenAI API using structured outputs +- Communication with LLM APIs using LiteLLM for provider-agnostic access +- Structured output parsing with JSON mode and Pydantic validation - Retry logic for failed API calls - Progress tracking with tqdm **Important Methods:** - `clean_dataframe(df, instructions)` - Main entry point for cleaning data - `_process_batch()` - Processes a single batch of rows -- `_clean_batch()` - Makes API calls with retry logic +- `_clean_batch()` - Makes API calls with retry logic using LiteLLM - `_make_batch_model()` - Creates Pydantic models for batch responses **Configuration Parameters:** -- `api_key`: OpenAI API key (required) -- `model`: Model to use (default: "gpt-4o-2024-08-06") +- `api_key`: API key for the LLM provider (optional if set via environment variables) +- `model`: Model name (default: "gpt-4o-2024-08-06"). Use provider prefixes for non-OpenAI models (e.g., "openrouter/anthropic/claude-3-opus") - `batch_size`: Number of rows per API call (default: 10) - `max_retries`: Retry attempts for failed calls (default: 3) - `retry_delay`: Seconds between retries (default: 5) - `temperature`: Model temperature (default: 0.0) - `system_prompt`: Custom system prompt template (optional) +- `api_base`: Base URL for the API (e.g., "https://openrouter.ai/api/v1" for OpenRouter) +- `**litellm_kwargs`: Additional arguments to pass to litellm.completion() ### 2. Utilities (`llm_data_cleaner/utils.py`) @@ -91,7 +95,7 @@ column_name: ## Dependencies ### Production -- `openai ^1.0.0` - OpenAI API client +- `litellm ^1.0.0` - Unified API client for 100+ LLM providers (OpenAI, OpenRouter, Anthropic, etc.) - `pydantic ^2.0.0` - Data validation and schema definition - `pandas ^2.2.3` - DataFrame operations - `pyyaml ^6.0.2` - YAML parsing @@ -164,13 +168,23 @@ Test files are located in `tests/`: ### Common Issues -1. **API Key Not Set**: Store OpenAI API key in `.secrets/OPENAI_API_KEY` or pass directly +1. **API Key Not Set**: Set API key via environment variables (OPENAI_API_KEY, OPENROUTER_API_KEY, ANTHROPIC_API_KEY) or pass directly to constructor 2. **Rate Limits**: Adjust `batch_size` to control API call frequency 3. **Schema Validation Errors**: Ensure Pydantic models include `index: int` field 4. **Missing Columns**: DataCleaner skips columns not in DataFrame with a warning +5. **Provider-specific Configuration**: Use `api_base` parameter for custom endpoints (e.g., OpenRouter) +6. **Model Naming**: Use provider prefixes for non-OpenAI models (e.g., "openrouter/model-name", "anthropic/model-name") ## API Changes +### Version 0.5.0 +- **BREAKING**: Migrated from OpenAI library to LiteLLM for provider-agnostic support +- Now supports OpenAI, OpenRouter, Anthropic, and 100+ other providers +- Added `api_base` parameter for custom API endpoints +- Added `**litellm_kwargs` for provider-specific configuration +- API key is now optional in constructor (can use environment variables) +- Model parameter now supports provider prefixes (e.g., "openrouter/", "anthropic/") + ### Version 0.4.x - Migrated from deprecated `client.responses.parse()` to supported OpenAI methods - Added `jsonize()` utility for consistent data serialization @@ -184,11 +198,32 @@ Test files are located in `tests/`: 1. **Define schemas** (Pydantic or YAML) 2. **Create instructions** dictionary mapping columns to prompts and schemas -3. **Initialize DataCleaner** with API key and configuration +3. 
**Initialize DataCleaner** with API key, model name, and configuration
 4. **Load data** into pandas DataFrame
 5. **Call `clean_dataframe()`** to process
 6. **Access results** in `cleaned_*` columns
 
+### Provider-Specific Examples
+
+**OpenAI (default):**
+```python
+cleaner = DataCleaner(api_key="sk-...", model="gpt-4o-2024-08-06")
+```
+
+**OpenRouter:**
+```python
+cleaner = DataCleaner(
+    api_key="sk-or-...",
+    model="openrouter/anthropic/claude-3-opus",
+    api_base="https://openrouter.ai/api/v1"
+)
+```
+
+**Anthropic:**
+```python
+cleaner = DataCleaner(api_key="sk-ant-...", model="claude-3-opus-20240229")
+```
+
 ## Authors
 - Miklós Koren (koren@codedthinking.com)
 - Gergely Attila Kiss (kiss@codedthinking.com)
diff --git a/README.md b/README.md
index 2652055..50f5833 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # LLM Data Cleaner
 
-LLM Data Cleaner automates the transformation of messy text columns into well structured data using OpenAI models. It eliminates the need for complex regular expressions or manual parsing while ensuring the output conforms to a schema.
+LLM Data Cleaner automates the transformation of messy text columns into well-structured data using LLM APIs. It supports **OpenAI, OpenRouter, Anthropic, and 100+ other providers** through LiteLLM. The package eliminates the need for complex regular expressions or manual parsing while ensuring the output conforms to a schema.
 
 ## Why use it?
 
+- **Provider-agnostic** – works with OpenAI, OpenRouter, Anthropic, and many other LLM providers.
 - **Less manual work** – delegate repetitive cleaning tasks to a language model.
 - **Consistent results** – validate responses with Pydantic models.
 - **Batch processing** – send rows in chunks to respect API rate limits.
@@ -26,13 +27,24 @@ poetry add git+https://github.com/codedthinking/llm_data_cleaner.git
 
 1. Create Pydantic models describing the cleaned values.
 2. Define a dictionary of instructions mapping column names to a prompt and schema.
-3. Instantiate `DataCleaner` with your OpenAI API key.
+3. Instantiate `DataCleaner` with your API key and model name.
 4. Load your raw CSV file with `pandas`.
 5. Call `clean_dataframe(df, instructions)`.
 6. Inspect the returned DataFrame which contains new `cleaned_*` columns.
 7. Save or further process the cleaned data.
 
-## Example: inline models
+## Supported Providers
+
+Thanks to LiteLLM, this package supports 100+ LLM providers including:
+
+- **OpenAI** (GPT-4, GPT-3.5, etc.)
+- **OpenRouter** (access to multiple models through one API)
+- **Anthropic** (Claude models)
+- **Cohere**, **AI21**, **Replicate**, **Hugging Face**, and many more
+
+See [LiteLLM's provider list](https://docs.litellm.ai/docs/providers) for the complete list.
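+
+As a quick smoke test before a full cleaning run, you can call LiteLLM directly
+(a hedged sketch: it assumes the matching API key is already exported, and the
+model name is illustrative):
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="gpt-4o-2024-08-06",  # or e.g. "openrouter/anthropic/claude-3-opus"
+    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
+)
+print(response.choices[0].message.content)
+```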
+ +## Example: inline models with OpenAI ```python import pandas as pd @@ -60,6 +72,7 @@ instructions = { }, } +# OpenAI (default) cleaner = DataCleaner(api_key="YOUR_OPENAI_API_KEY") raw_df = pd.DataFrame({ "address": ["Budapest Váci út 1", "1200 Vienna Mariahilfer Straße 10"], @@ -69,6 +82,36 @@ cleaned = cleaner.clean_dataframe(raw_df, instructions) print(cleaned) ``` +## Example: using OpenRouter + +```python +from llm_data_cleaner import DataCleaner + +# OpenRouter allows you to access multiple models through one API +cleaner = DataCleaner( + api_key="YOUR_OPENROUTER_API_KEY", + model="openrouter/anthropic/claude-3-opus", + api_base="https://openrouter.ai/api/v1" +) + +# Use the same instructions and DataFrame as above +cleaned = cleaner.clean_dataframe(raw_df, instructions) +``` + +## Example: using Anthropic Claude + +```python +from llm_data_cleaner import DataCleaner + +# Anthropic Claude models +cleaner = DataCleaner( + api_key="YOUR_ANTHROPIC_API_KEY", + model="claude-3-opus-20240229" +) + +cleaned = cleaner.clean_dataframe(raw_df, instructions) +``` + ## Example: loading YAML instructions ```python @@ -76,7 +119,7 @@ from llm_data_cleaner import DataCleaner, load_yaml_instructions import pandas as pd instructions = load_yaml_instructions("instructions.yaml") -cleaner = DataCleaner(api_key="YOUR_OPENAI_API_KEY", system_prompt="{column_prompt}") +cleaner = DataCleaner(api_key="YOUR_API_KEY", system_prompt="{column_prompt}") raw_df = pd.read_csv("data.csv") result = cleaner.clean_dataframe(raw_df, instructions) ``` @@ -84,6 +127,27 @@ result = cleaner.clean_dataframe(raw_df, instructions) `load_yaml_instructions` reads the same structure shown above from a YAML file so cleaning rules can be shared without modifying code. +## Environment Variables + +You can also set API keys via environment variables instead of passing them directly: + +```bash +# For OpenAI +export OPENAI_API_KEY="sk-..." + +# For OpenRouter +export OPENROUTER_API_KEY="sk-or-..." + +# For Anthropic +export ANTHROPIC_API_KEY="sk-ant-..." +``` + +Then initialize without the api_key parameter: + +```python +cleaner = DataCleaner(model="gpt-4o-2024-08-06") # Uses OPENAI_API_KEY from environment +``` + ## Authors - Miklós Koren diff --git a/example.py b/example.py index ae82287..24765af 100755 --- a/example.py +++ b/example.py @@ -19,12 +19,13 @@ class JobTitleItem(BaseModel): yaml_instructions = load_yaml_instructions("instructions.yaml") -# Set your OpenAI API key, reading from .secrets/OPENAI_API_KEY +# Set your API key, reading from .secrets/OPENAI_API_KEY +# You can use OpenAI, OpenRouter, Anthropic, or other providers with open(".secrets/OPENAI_API_KEY", "r") as f: api_key = f.read().strip() # Ensure the API key is set if not api_key: - raise ValueError("API key is not set. Please provide a valid OpenAI API key.") + raise ValueError("API key is not set. Please provide a valid API key.") # Create a sample DataFrame data = { "education": [ @@ -61,12 +62,21 @@ class JobTitleItem(BaseModel): }, } # Initialize the cleaner with a batch size (default is 20) +# For OpenAI (default): cleaner = DataCleaner( - api_key=api_key, - batch_size=20, + api_key=api_key, + batch_size=20, system_prompt='Follow these instructions, but return the answers in Greek. 
{column_prompt}.',
 )
 
+# For OpenRouter, you would use:
+# cleaner = DataCleaner(
+#     api_key="your-openrouter-key",
+#     model="openrouter/anthropic/claude-3-opus",
+#     api_base="https://openrouter.ai/api/v1",
+#     batch_size=20
+# )
+
 # Clean the data
 result = cleaner.clean_dataframe(df, instructions)
 
diff --git a/llm_data_cleaner/cleaner.py b/llm_data_cleaner/cleaner.py
index aa7e04f..5d8949e 100644
--- a/llm_data_cleaner/cleaner.py
+++ b/llm_data_cleaner/cleaner.py
@@ -1,35 +1,81 @@
 import os
 import pandas as pd
 from typing import Dict, Any, Type, List, Optional
-from openai import OpenAI
+import litellm
 from pydantic import BaseModel, create_model, ConfigDict
 from llm_data_cleaner.utils import InstructionField, InstructionSchema
 import time
 from tqdm import tqdm
 from .utils import jsonize
+import json
 
 
 class DataCleaner:
     """
-    Batch DataCleaner that uses OpenAI's responses.parse method with auto-generated prompts.
+    Batch DataCleaner that uses LiteLLM for provider-agnostic LLM API calls with auto-generated prompts.
+    Supports OpenAI, OpenRouter, Anthropic, and 100+ other providers.
    """
 
     def __init__(
         self,
-        api_key: str,
+        api_key: Optional[str] = None,
         model: str = "gpt-4o-2024-08-06",
         max_retries: int = 3,
         retry_delay: int = 5,
         batch_size: int = 10,
         system_prompt: str = None,
         temperature: float = 0.0,
+        api_base: Optional[str] = None,
+        **litellm_kwargs,
     ):
-        self.client = OpenAI(api_key=api_key)
+        """
+        Initialize DataCleaner with LiteLLM support for multiple providers.
+
+        Args:
+            api_key: API key for the LLM provider (can also be set via environment variables)
+            model: Model name (e.g., "gpt-4o-2024-08-06" for OpenAI, "openrouter/model-name" for OpenRouter)
+            max_retries: Number of retry attempts for failed API calls
+            retry_delay: Delay in seconds between retries
+            batch_size: Number of rows to process per API call
+            system_prompt: Custom system prompt template (must contain {column_prompt})
+            temperature: Model temperature for generation
+            api_base: Base URL for the API (e.g., for OpenRouter: "https://openrouter.ai/api/v1")
+            **litellm_kwargs: Additional arguments to pass to litellm.completion()
+
+        Examples:
+            # OpenAI (default)
+            cleaner = DataCleaner(api_key="sk-...")
+
+            # OpenRouter
+            cleaner = DataCleaner(
+                api_key="sk-or-...",
+                model="openrouter/anthropic/claude-3-opus",
+                api_base="https://openrouter.ai/api/v1"
+            )
+
+            # Anthropic
+            cleaner = DataCleaner(api_key="sk-ant-...", model="claude-3-opus-20240229")
+        """
+        self.api_key = api_key
         self.model = model
         self.max_retries = max_retries
         self.retry_delay = retry_delay
         self.batch_size = batch_size
         self.temperature = temperature
+        self.api_base = api_base
+        self.litellm_kwargs = litellm_kwargs
+
+        # Set API key in environment if provided (LiteLLM reads from env vars)
+        if api_key:
+            # LiteLLM automatically detects the provider from the model name
+            # and uses the appropriate environment variable
+            if model.startswith("openrouter/"):
+                os.environ["OPENROUTER_API_KEY"] = api_key
+            elif model.startswith("anthropic/") or "claude" in model.lower():
+                os.environ["ANTHROPIC_API_KEY"] = api_key
+            else:
+                # Default to OpenAI
+                os.environ["OPENAI_API_KEY"] = api_key
 
         # General system prompt format, set once for all tasks (you may tweak further)
         if system_prompt:
@@ -136,15 +182,46 @@ def _clean_batch(
         messages: list,
         pyd_model_batch: Type[BaseModel]
     ):
+        """
+        Clean a batch using LiteLLM completion API with JSON mode.
+        Parses the response back into the Pydantic model.
+        """
         for attempt in range(self.max_retries):
             try:
-                resp = self.client.responses.parse(
-                    model=self.model,
-                    input=messages,
-                    text_format=pyd_model_batch,
-                    temperature=self.temperature,
-                )
-                return resp.output_parsed
+                # Get JSON schema from Pydantic model for the prompt
+                schema_json = pyd_model_batch.model_json_schema()
+
+                # Enhance system message with schema information
+                enhanced_messages = messages.copy()
+                enhanced_messages[0] = {
+                    "role": "system",
+                    "content": f"{messages[0]['content']}\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema_json, indent=2)}"
+                }
+
+                # Prepare LiteLLM completion arguments
+                completion_kwargs = {
+                    "model": self.model,
+                    "messages": enhanced_messages,
+                    "temperature": self.temperature,
+                    "response_format": {"type": "json_object"},
+                    **self.litellm_kwargs
+                }
+
+                # Add api_base if provided
+                if self.api_base:
+                    completion_kwargs["api_base"] = self.api_base
+
+                # Call LiteLLM
+                response = litellm.completion(**completion_kwargs)
+
+                # Parse JSON response
+                content = response.choices[0].message.content
+                parsed_json = json.loads(content)
+
+                # Convert JSON to Pydantic model
+                parsed_model = pyd_model_batch.model_validate(parsed_json)
+
+                return parsed_model
             except Exception as e:
                 print(f"Batch cleaning error: {e} (attempt {attempt+1}/{self.max_retries})")
                 time.sleep(self.retry_delay)
diff --git a/pyproject.toml b/pyproject.toml
index 2778b32..ae5ad3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "llm_data_cleaner"
-version = "0.4.3"
-description = "A Python package for cleaning data using OpenAI API"
+version = "0.5.0"
+description = "A Python package for cleaning data using LLM APIs (OpenAI, OpenRouter, and more)"
 authors = ["Miklós Koren <koren@codedthinking.com>", "Gergely Attila Kiss <kiss@codedthinking.com>"]
 readme = "README.md"
 packages = [{include = "llm_data_cleaner"}]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-openai = "^1.0.0"
+litellm = "^1.0.0"
 pydantic = "^2.0.0"
 jsonschema = "^4.0.0"
 tqdm = "^4.65.0"
diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
index affb7a4..ceef3e6 100644
--- a/tests/test_cleaner.py
+++ b/tests/test_cleaner.py
@@ -71,51 +71,53 @@ def test_validate_instructions(self):
 
 class TestDataCleaner(unittest.TestCase):
 
-    @patch('llm_data_cleaner.cleaner.OpenAI')
-    def test_clean_dataframe(self, mock_openai):
-        # Create a mock response
-        mock_completion = MagicMock()
-        mock_completion.choices[0].message.content = json.dumps({"year": 2020, "university": "Example University"})
-
-        # Set up the mock client
-        mock_client = MagicMock()
-        mock_client.chat.completions.create.return_value = mock_completion
-        mock_openai.return_value = mock_client
-
+    @patch('llm_data_cleaner.cleaner.litellm.completion')
+    def test_clean_dataframe(self, mock_completion):
+        # Create a mock response matching LiteLLM's response structure
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = json.dumps({
+            "cleaned": [
+                {"index": 0, "year": 2020, "university": "Example University"}
+            ]
+        })
+        mock_completion.return_value = mock_response
+
         # Create a test DataFrame
         df = pd.DataFrame({
             "education": ["Graduated from Example University in 2020"]
         })
-
+
+        # Create Pydantic model for testing
+        from typing import Optional
+        from pydantic import BaseModel
+
+        class EducationItem(BaseModel):
+            index: int
+            year: Optional[int]
+            university: Optional[str]
+
         # Create cleaning instructions
         instructions = {
             "education": {
                 "prompt": "Extract year and university name",
-                "schema": {
-                    "type": "object",
-
"properties": { - "year": {"type": ["integer", "null"]}, - "university": {"type": ["string", "null"]} - }, - "required": ["year", "university"] - } + "schema": EducationItem } } - + # Initialize the cleaner cleaner = DataCleaner(api_key="dummy-key") - + # Clean the data result = cleaner.clean_dataframe(df, instructions) - + # Assertions - self.assertIn("original_education", result.columns) - self.assertIn("cleaned_education", result.columns) - - # Check the API was called with the correct arguments - mock_client.chat.completions.create.assert_called_once() - call_args = mock_client.chat.completions.create.call_args[1] - self.assertEqual(call_args["model"], "gpt-4o") + self.assertIn("cleaned_year", result.columns) + self.assertIn("cleaned_university", result.columns) + + # Check the API was called + mock_completion.assert_called() + call_args = mock_completion.call_args[1] + self.assertEqual(call_args["model"], "gpt-4o-2024-08-06") self.assertEqual(call_args["temperature"], 0.0) self.assertEqual(call_args["response_format"], {"type": "json_object"})