diff --git a/README.md b/README.md index 5464c6c8..1ef8d521 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,8 @@ pre-commit install ``` You can then run all tests by running `pytest`, or only the CI/CD tests by -running `CICD=1 pytest`. +running `CICD=1 pytest`. See [test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md) for +details on running specific test categories (e.g., by backend, resource requirements). Tip: you can bypass the hooks by passing the `-n` flag to `git commit`. This is sometimes helpful for intermediate commits that you intend to later diff --git a/docs/tutorial.md b/docs/tutorial.md index 5270ab2c..4bca5c72 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -2,27 +2,54 @@ ## Table of Contents -- [Chapter 1: What Is Generative Programming?](#chapter-1-what-is-generative-programming) -- [Chapter 2: Getting Started with Generative Programming in Mellea](#chapter-2-getting-started-with-generative-programming-in-mellea) - - [Requirements](#requirements) - - [Validating Requirements](#validating-requirements) - - [Instruct - Validate - Repair](#instruct---validate---repair) - - [Model Options](#modeloptions) -- [Chapter 3: Overview of the Standard Library](#chapter-3-overview-of-the-standard-library) -- [Chapter 4: Generative Slots](#chapter-4-generative-slots) -- [Chapter 5: MObjects](#chapter-5-mobjects) - - [Case Study: Working with Documents](#case-study-working-with-documents) -- [Chapter 6: Tuning for Requirements and Components](#chapter-6-tuning-requirements-and-components) -- [Chapter 7: Context Management](#chapter-7-on-context-management) -- [Chapter 8: Implementing Agents](#chapter-8-implementing-agents) - - [Case Study: ReACT](#case-study-implementing-react-in-mellea) - - [The Guarded Nondeterminism Pattern](#guarded-nondeterminism) -- [Chapter 9: Interoperability with Other Frameworks](#chapter-9-interoperability-with-other-frameworks) -- [Chapter 10: Prompt Engineering for Mellea](#chapter-10-prompt-engineering-for-m) - - [Custom Templates](#custom-templates) -- [Chapter 11: Tool Calling](#chapter-11-tool-calling) -- [Chapter 12: Asynchronicity](#chapter-12-asynchronicity) -- [Appendix: Contributing to Mellea](#appendix-contributing-to-mellea) +- [Principles of Generative Programming: The Mellea Approach](#principles-of-generative-programming-the-mellea-approach) + - [Table of Contents](#table-of-contents) + - [Chapter 1: What Is Generative Programming](#chapter-1-what-is-generative-programming) + - [Chapter 2: Getting Started with Generative Programming in Mellea](#chapter-2-getting-started-with-generative-programming-in-mellea) + - [Requirements](#requirements) + - [Validating Requirements](#validating-requirements) + - [Instruct - Validate - Repair](#instruct---validate---repair) + - [ModelOptions](#modeloptions) + - [System Messages](#system-messages) + - [Conclusion](#conclusion) + - [Chapter 3: Overview of the Standard Library](#chapter-3-overview-of-the-standard-library) + - [Chapter 4: Generative Slots](#chapter-4-generative-slots) + - [Example: Sentiment Classifier](#example-sentiment-classifier) + - [Using Generative slots to Provide Compositionality Across Module Boundaries](#using-generative-slots-to-provide-compositionality-across-module-boundaries) + - [Chapter 5: MObjects](#chapter-5-mobjects) + - [Example: A Table as an MObject](#example-a-table-as-an-mobject) + - [Case Study: Working with Documents](#case-study-working-with-documents) + - [MObject methods are tools](#mobject-methods-are-tools) + - [Chapter 6: Tuning Requirements and 
Components](#chapter-6-tuning-requirements-and-components) + - [Problem Statement](#problem-statement) + - [Training the aLoRA Adapter](#training-the-alora-adapter) + - [Parameters](#parameters) + - [Upload to Hugging Face (Optional)](#upload-to-hugging-face-optional) + - [Integrating the Tuned Model into Mellea](#integrating-the-tuned-model-into-mellea) + - [Chapter 7: On Context Management](#chapter-7-on-context-management) + - [Chapter 8: Implementing Agents](#chapter-8-implementing-agents) + - [Case Study: Implementing ReACT in Mellea](#case-study-implementing-react-in-mellea) + - [Guarded Nondeterminism](#guarded-nondeterminism) + - [Chapter 9: Interoperability with Other Frameworks](#chapter-9-interoperability-with-other-frameworks) + - [Simple mcp server running Mellea](#simple-mcp-server-running-mellea) + - [Running Mellea programs as an openai compatible server (Experimental)](#running-mellea-programs-as-an-openai-compatible-server-experimental) + - [Example `m serve` application](#example-m-serve-application) + - [Chapter 10: Prompt Engineering for M](#chapter-10-prompt-engineering-for-m) + - [Templates](#templates) + - [Template Representations](#template-representations) + - [Customization](#customization) + - [Choosing a Template](#choosing-a-template) + - [Editing an Existing Class](#editing-an-existing-class) + - [Chapter 11: Tool Calling](#chapter-11-tool-calling) + - [Chapter 12: Asynchronicity](#chapter-12-asynchronicity) + - [Asynchronous Functions:](#asynchronous-functions) + - [Asynchronicity in Synchronous Functions](#asynchronicity-in-synchronous-functions) + - [Appendix: Contributing to Mellea](#appendix-contributing-to-mellea) + - [Contributor Guide: Getting Started](#contributor-guide-getting-started) + - [Contributor Guide: Requirements and Verifiers](#contributor-guide-requirements-and-verifiers) + - [Contributor Guide: Components](#contributor-guide-components) + - [Contributor Guide: Specialized Mify](#contributor-guide-specialized-mify) + - [Contributor Guide: Sessions](#contributor-guide-sessions) ## Chapter 1: What Is Generative Programming @@ -1408,7 +1435,8 @@ pre-commit install ``` You can then run all tests by running `pytest`, or only the CI/CD tests by -running `CICD=1 pytest`. +running `CICD=1 pytest`. See [test/MARKERS_GUIDE.md](../test/MARKERS_GUIDE.md) for +details on running specific test categories (e.g., by backend, resource requirements). Tip: you can bypass the hooks by passing the `-n` flag to `git commit`. This is sometimes helpful for intermediate commits that you intend to later diff --git a/pyproject.toml b/pyproject.toml index 2431f6b0..061edec6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ hf = [ "peft>=0.18.0", # aLoRA support was added in Peft 0.18.0 "transformers>=4.53.2", "trl==0.19.1", + "xgrammar", # Required for constrained decoding in granite_common ] vllm = [ @@ -112,6 +113,7 @@ dev = [ "pdm>=2.24.0", "pytest", "pytest-asyncio", + "psutil", # For test infrastructure: RAM detection in conftest.py "mypy>=1.17.0", "python-semantic-release~=7.32", "nbmake>=1.5.5", @@ -198,7 +200,22 @@ python_version = "3.10" [tool.pytest.ini_options] markers = [ - "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for CICD. 
All tests marked with this will xfail in CI/CD" + # Backend markers + "ollama: Tests requiring Ollama backend (local, light)", + "openai: Tests requiring OpenAI API (requires API key)", + "watsonx: Tests requiring Watsonx API (requires API key)", + "huggingface: Tests requiring HuggingFace backend (local, heavy)", + "vllm: Tests requiring vLLM backend (local, GPU required)", + "litellm: Tests requiring LiteLLM backend", + + # Capability markers + "requires_api_key: Tests requiring external API keys", + "requires_gpu: Tests requiring GPU", + "requires_heavy_ram: Tests requiring 48GB+ RAM", + "qualitative: Non-deterministic quality tests", + + # Composite markers + "llm: Tests that make LLM calls (needs at least Ollama)", ] asyncio_mode = "auto" # Don't require explicitly marking async tests. diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md new file mode 100644 index 00000000..56029b6f --- /dev/null +++ b/test/MARKERS_GUIDE.md @@ -0,0 +1,433 @@ +# Pytest Markers Guide for Mellea Tests + +## Overview + +This guide explains the pytest marker system for categorizing and running mellea tests based on backend requirements, resource availability, and test characteristics. + +## 🎯 What's Automatic vs Manual + +### ✅ Automatic (No Configuration Needed) +When you run `pytest`, the system **automatically detects** and skips tests based on: +- **Ollama availability** - Checks if port 11434 is listening +- **API keys** - Checks environment variables (`OPENAI_API_KEY`, `WATSONX_APIKEY`, etc.) +- **GPU availability** - Checks for CUDA (NVIDIA) or MPS (Apple Silicon) via torch +- **System RAM** - Checks via `psutil.virtual_memory()` (if psutil installed) + +**You don't need to configure anything!** Just run `pytest` and tests will automatically skip with helpful messages if requirements aren't met. + +**Note:** +- GPU detection requires `torch` (included in `mellea[hf]` and `mellea[vllm]`) +- RAM detection requires `psutil` (included in dev dependencies) +- If you're not using dev dependencies, install with: `pip install psutil` + +### ⚠️ Manual (Developer Adds to Test Files) +Developers must **add markers** to test files to indicate what each test needs: +```python +# Developer adds these markers once per test file +pytestmark = [pytest.mark.ollama, pytest.mark.llm] +``` + +**Summary:** Markers are manual (one-time setup per test file), detection is automatic (every test run). + +### 🔧 Override Auto-Detection (Advanced) +Want to try running tests even when requirements aren't met? Use these pytest options: + +```bash +# Try GPU tests without GPU (will use CPU, may be slow/fail) +pytest --ignore-gpu-check test/backends/test_vllm.py + +# Try with less RAM than recommended +pytest --ignore-ram-check test/backends/test_huggingface.py + +# Try without Ollama running +pytest --ignore-ollama-check test/backends/test_ollama.py + +# Try without API keys (will fail at API call) +pytest --ignore-api-key-check test/backends/test_openai.py + +# Combine multiple overrides +pytest --ignore-gpu-check --ignore-ram-check -m "huggingface" +``` + +**Use Cases:** +- Testing with CPU when GPU tests might work (slower but functional) +- Trying with less RAM (might work for smaller models) +- Debugging test infrastructure + +**Warning:** Tests will likely fail if requirements aren't actually met! 
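+
+Under the hood, these flags are ordinary pytest options registered in `test/conftest.py`; the skip hook checks them before enforcing each auto-detected requirement. A simplified sketch of that logic (abridged from `pytest_runtest_setup` in `test/conftest.py`):
+
+```python
+# Abridged sketch of the override logic in test/conftest.py (GPU check only;
+# the real hook applies the same pattern to RAM, API keys, and Ollama).
+import pytest
+
+def pytest_runtest_setup(item):
+    capabilities = get_system_capabilities()  # defined earlier in conftest.py
+    ignore_gpu = item.config.getoption("--ignore-gpu-check", default=False)
+
+    # Skip only when the test declares the requirement, the system lacks it,
+    # and the user has not asked to ignore the check.
+    if item.get_closest_marker("requires_gpu") and not ignore_gpu:
+        if not capabilities["has_gpu"]:
+            pytest.skip("Skipping test: GPU not available")
+```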
+ +## Quick Start + +```bash +# Run all tests (auto-skips based on your system) +pytest + +# Run only fast unit tests (no LLM calls) +pytest -m "not llm" + +# Run Ollama tests only (local, light resources) +pytest -m "ollama" + +# Run tests that don't require API keys +pytest -m "not requires_api_key" + +# Run infrastructure tests only (skip quality tests) +pytest -m "not qualitative" + +# Run quality tests for Ollama +pytest -m "ollama and qualitative" +``` + +## Marker Categories + +### Backend Markers + +Specify which backend the test uses: + +- **`@pytest.mark.ollama`**: Tests requiring Ollama backend + - Local execution + - Light resources (CPU, ~2-4GB RAM) + - No API key required + - Example: `test/backends/test_ollama.py` + +- **`@pytest.mark.openai`**: Tests requiring OpenAI API + - Requires `OPENAI_API_KEY` environment variable + - Light resources (API calls only) + - Incurs API costs + - Example: `test/backends/test_vision_openai.py` + +- **`@pytest.mark.watsonx`**: Tests requiring Watsonx API + - Requires `WATSONX_APIKEY` or `IBM_CLOUD_API_KEY` environment variable + - Light resources (API calls only) + - Incurs API costs + - Example: `test/backends/test_watsonx.py` + +- **`@pytest.mark.huggingface`**: Tests requiring HuggingFace backend + - Local execution + - Heavy resources (GPU recommended, 16-32GB RAM, ~8GB VRAM) + - Downloads models (~3-8GB) + - No API key required (unless using gated models) + - Example: `test/backends/test_huggingface.py` + +- **`@pytest.mark.vllm`**: Tests requiring vLLM backend + - Local execution + - Heavy resources (GPU required, 16-32GB RAM, 8GB+ VRAM) + - Requires `VLLM_USE_V1=0` environment variable + - Example: `test/backends/test_vllm.py` + +- **`@pytest.mark.litellm`**: Tests requiring LiteLLM backend + - Requirements depend on underlying backend + - Example: `test/backends/test_litellm_ollama.py` + +### Capability Markers + +Specify resource or authentication requirements: + +- **`@pytest.mark.requires_api_key`**: Tests requiring external API keys + - Auto-skipped if required API key not found + - Use with backend markers (openai, watsonx) + +- **`@pytest.mark.requires_gpu`**: Tests requiring GPU + - Auto-skipped if no GPU detected + - Typically used with huggingface, vllm + +- **`@pytest.mark.requires_heavy_ram`**: Tests requiring 48GB+ RAM + - Auto-skipped if insufficient RAM detected + - Typically used with huggingface, vllm + +- **`@pytest.mark.qualitative`**: Non-deterministic quality tests + - Tests LLM output quality rather than infrastructure + - Skipped in CI (when `CICD=1`) + - May be flaky due to model variability + +### Composite Markers + +- **`@pytest.mark.llm`**: Tests that make LLM calls + - Requires at least Ollama to be available + - Use to distinguish from pure unit tests + +## Auto-Detection and Skipping + +The test suite automatically detects your system capabilities and skips tests that cannot run: + +### API Key Detection +```python +# Automatically checks for: +OPENAI_API_KEY # For OpenAI tests +WATSONX_APIKEY # For Watsonx tests +IBM_CLOUD_API_KEY # Alternative for Watsonx +HF_TOKEN # For gated HuggingFace models +``` + +### Backend Availability Detection +```python +# Automatically detects: +- Ollama availability (checks if port 11434 is listening) +``` + +### Resource Detection +```python +# Automatically detects: +- GPU availability (via torch.cuda.is_available()) +- GPU memory (via torch.cuda.get_device_properties()) +- System RAM (via psutil.virtual_memory()) +``` + +### Skip Messages +When a test is skipped, 
you'll see helpful messages: +``` +SKIPPED [1] test/conftest.py:120: Skipping test: OPENAI_API_KEY not found in environment +SKIPPED [1] test/conftest.py:125: Skipping test: GPU not available +SKIPPED [1] test/conftest.py:130: Skipping test: Insufficient RAM (16.0GB < 32GB) +SKIPPED [1] test/conftest.py:165: Skipping test: Ollama not available (port 11434 not listening) +``` + +## Usage Examples + +### Module-Level Markers + +Apply markers to all tests in a module using `pytestmark`: + +```python +# test/backends/test_ollama.py +import pytest + +# All tests in this module require Ollama and make LLM calls +pytestmark = [pytest.mark.ollama, pytest.mark.llm] + +def test_simple_instruct(session): + # This test inherits ollama and llm markers + ... +``` + +### Multiple Markers + +Combine markers for complex requirements: + +```python +# test/backends/test_huggingface.py +pytestmark = [ + pytest.mark.huggingface, + pytest.mark.llm, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, +] +``` + +### Individual Test Markers + +Add markers to specific tests: + +```python +@pytest.mark.qualitative +def test_output_quality(session): + # This test checks LLM output quality + result = session.instruct("Write a poem") + assert "poem" in result.value.lower() +``` + +## Running Tests by Category + +### By Backend +```bash +# Ollama only +pytest -m "ollama" + +# HuggingFace only +pytest -m "huggingface" + +# All API-based backends +pytest -m "openai or watsonx" +``` + +### By Resource Requirements +```bash +# Light tests only (no GPU, no heavy RAM) +pytest -m "not (requires_gpu or requires_heavy_ram)" + +# Tests that work without API keys +pytest -m "not requires_api_key" + +# GPU tests only +pytest -m "requires_gpu" +``` + +### By Test Type +```bash +# Infrastructure tests only (deterministic) +pytest -m "not qualitative" + +# Quality tests only (non-deterministic) +pytest -m "qualitative" + +# Fast unit tests (no LLM calls) +pytest -m "not llm" +``` + +### Complex Combinations +```bash +# Ollama infrastructure tests +pytest -m "ollama and not qualitative" + +# All tests that work with just Ollama (no API keys, no GPU) +pytest -m "not (requires_api_key or requires_gpu or requires_heavy_ram)" + +# Quality tests for local backends only +pytest -m "qualitative and (ollama or huggingface or vllm)" +``` + +## CI/CD Integration + +### Current Behavior +- `CICD=1` environment variable skips all qualitative tests +- Module-level skips for heavy backends (huggingface, vllm, watsonx) + +### Recommended CI Matrix +```yaml +# .github/workflows/test.yml +jobs: + unit-tests: + # Fast unit tests, no LLM + run: pytest -m "not llm" + + ollama-tests: + # Ollama infrastructure tests + run: pytest -m "ollama and not qualitative" + + quality-tests: + # Optional: Run quality tests on schedule + if: github.event_name == 'schedule' + run: pytest -m "qualitative and ollama" +``` + +## Adding Markers to New Tests + +### Step 1: Identify Requirements +Ask yourself: +1. Which backend does this test use? +2. Does it require an API key? +3. Does it need a GPU? +4. Does it need heavy RAM (48GB+)? +5. Is it testing output quality (qualitative) or infrastructure? + +### Step 2: Add Appropriate Markers + +For a new Ollama test: +```python +# Use module-level marker if all tests use same backend +pytestmark = [pytest.mark.ollama, pytest.mark.llm] + +@pytest.mark.qualitative # Add if testing output quality +def test_my_new_feature(session): + ... 
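+
+# Purely infrastructural tests in the same module need no extra marker; they
+# simply inherit the module-level ollama/llm markers. (Illustrative sketch:
+# the test name and body below are hypothetical.)
+def test_backend_responds(session):
+    result = session.instruct("Say hello")
+    assert result is not None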
+```
+
+For a new HuggingFace test:
+```python
+pytestmark = [
+    pytest.mark.huggingface,
+    pytest.mark.llm,
+    pytest.mark.requires_gpu,
+    pytest.mark.requires_heavy_ram,
+]
+
+@pytest.mark.qualitative
+def test_my_new_feature(session):
+    ...
+```
+
+### Step 3: Test Your Markers
+```bash
+# Verify your test is properly marked
+pytest --collect-only -m "your_marker"
+
+# Run just your test
+pytest -k "test_my_new_feature"
+```
+
+## Troubleshooting
+
+### Test Not Running
+```bash
+# Check which markers are applied
+pytest --collect-only test/path/to/test.py
+
+# Check why test is being skipped
+pytest -v test/path/to/test.py
+
+# Force run despite auto-skip using the --ignore-*-check flags (will likely fail if requirements not met)
+pytest test/path/to/test.py --ignore-gpu-check --ignore-ram-check --ignore-ollama-check --ignore-api-key-check
+```
+
+### Marker Not Recognized
+```bash
+# List all registered markers
+pytest --markers
+
+# Check the marker configuration in pyproject.toml ([tool.pytest.ini_options])
+grep -A 25 "tool.pytest.ini_options" pyproject.toml
+```
+
+### Auto-Skip Not Working
+```bash
+# Debug system capabilities
+pytest --setup-show test/path/to/test.py
+
+# Check conftest.py detection logic
+# See test/conftest.py:get_system_capabilities()
+
+# Run with verbose output to see skip reasons
+pytest -v -s test/path/to/test.py
+```
+
+### Force Run Tests (Override Auto-Skip)
+```bash
+# Run a specific test ignoring auto-skip (useful for debugging)
+pytest test/backends/test_ollama.py --ignore-ollama-check
+
+# Run a specific marker with its checks overridden; tests fail if requirements aren't actually met
+pytest -m "ollama" -v --ignore-ollama-check
+
+# Note: Tests will fail if actual requirements (Ollama, GPU, etc.) aren't met
+# This is useful for testing the test infrastructure itself
+```
+
+## Best Practices
+
+1. **Use module-level markers** for consistent backend requirements
+2. **Combine markers** to accurately describe test requirements
+3. **Keep qualitative marker** for non-deterministic tests
+4. **Test locally** before pushing to ensure markers work correctly
+5. **Document special requirements** in test docstrings
+
+## Migration from Old System
+
+### Before (Old System)
+```python
+# Only qualitative marker
+@pytest.mark.qualitative
+def test_ollama_instruct(session):
+    ...
+```
+
+### After (New System)
+```python
+# Module-level backend markers
+pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+
+# Keep qualitative for quality tests
+@pytest.mark.qualitative
+def test_ollama_instruct(session):
+    ...
+```
+
+## Related Files
+
+- `test/conftest.py`: Auto-detection and skip logic
+- `pyproject.toml`: Marker definitions and pytest configuration
+
+## Questions?
+
+For questions or issues with the marker system:
+1. Check this guide first
+2. 
Open an issue on GitHub with the `testing` label \ No newline at end of file diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 925a07e3..6ae8da8a 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -1,29 +1,35 @@ import asyncio -from copy import copy import faulthandler import os import random import time -from typing import Any, Coroutine +from collections.abc import Coroutine +from copy import copy +from typing import Annotated, Any from unittest.mock import Mock import pydantic import pytest import torch -from typing_extensions import Annotated -# Skip entire module in CI since 17/18 tests are qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping HuggingFace tests in CI - mostly qualitative tests", -) +# Mark all tests in this module with backend and resource requirements +pytestmark = [ + pytest.mark.huggingface, + pytest.mark.llm, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + # Skip entire module in CI since 17/18 tests are qualitative + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping HuggingFace tests in CI - mostly qualitative tests", + ), +] from mellea import MelleaSession +from mellea.backends import ModelOption from mellea.backends.adapters import GraniteCommonAdapter from mellea.backends.cache import SimpleLRUCache -from mellea.formatters import TemplateFormatter from mellea.backends.huggingface import LocalHFBackend, _assert_correct_adapters -from mellea.backends import ModelOption from mellea.core import ( CBlock, Context, @@ -31,10 +37,9 @@ ValidationResult, default_output_to_bool, ) +from mellea.formatters import TemplateFormatter +from mellea.stdlib.components import Intrinsic, Message from mellea.stdlib.context import ChatContext, SimpleContext - -from mellea.stdlib.components import Message -from mellea.stdlib.components import Intrinsic from mellea.stdlib.requirements import ALoraRequirement, LLMaJRequirement diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index a908803f..9bbd097b 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -3,16 +3,22 @@ import pytest # Skip entire module in CI since the single test is qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping HuggingFace tools tests in CI - qualitative test", -) +pytestmark = [ + pytest.mark.huggingface, + pytest.mark.llm, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping HuggingFace tools tests in CI - qualitative test", + ), +] import mellea.backends.model_ids as model_ids from mellea import MelleaSession +from mellea.backends import ModelOption from mellea.backends.cache import SimpleLRUCache from mellea.backends.huggingface import LocalHFBackend -from mellea.backends import ModelOption from mellea.stdlib.context import ChatContext diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index bb2d3316..2fb70e19 100644 --- a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -1,16 +1,18 @@ import asyncio import os + import pytest +# Mark all tests in this module as requiring Ollama via LiteLLM +pytestmark = [pytest.mark.litellm, pytest.mark.ollama, pytest.mark.llm] + from mellea import MelleaSession, generative -from mellea.backends import 
ModelOption +from mellea.backends import ModelOption, model_ids from mellea.backends.litellm import LiteLLMBackend from mellea.core import CBlock -from mellea.stdlib.context import SimpleContext from mellea.stdlib.components import Message +from mellea.stdlib.context import SimpleContext from mellea.stdlib.sampling import RejectionSamplingStrategy -from mellea.backends import model_ids - _MODEL_ID = f"ollama_chat/{model_ids.IBM_GRANITE_4_MICRO_3B.ollama_name}" diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py index 6116c428..014eeb2b 100644 --- a/test/backends/test_litellm_watsonx.py +++ b/test/backends/test_litellm_watsonx.py @@ -1,6 +1,15 @@ import asyncio + import pytest +# Mark all tests in this module as requiring Watsonx via LiteLLM +pytestmark = [ + pytest.mark.litellm, + pytest.mark.watsonx, + pytest.mark.llm, + pytest.mark.requires_api_key, +] + from mellea import MelleaSession from mellea.backends.litellm import LiteLLMBackend from mellea.core import CBlock @@ -16,7 +25,6 @@ def session(): def test_has_potential_event_loop_errors(session): """This test is specific to litellm backends that use watsonx/. It can be removed once that bug is fixed.""" - backend: LiteLLMBackend = session.backend potential_err = backend._has_potential_event_loop_errors() assert not potential_err, "first invocation in an event loop shouldn't flag errors" diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 20636e8b..c679d95c 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -1,17 +1,20 @@ import asyncio import json +from typing import Annotated import pydantic import pytest -from typing_extensions import Annotated from mellea import start_session -from mellea.backends.ollama import OllamaModelBackend from mellea.backends import ModelOption +from mellea.backends.ollama import OllamaModelBackend from mellea.core import CBlock, Requirement from mellea.stdlib.context import SimpleContext from mellea.stdlib.requirements import simple_validate +# Mark all tests in this module as requiring Ollama +pytestmark = [pytest.mark.ollama, pytest.mark.llm] + @pytest.fixture(scope="function") def session(): @@ -40,7 +43,7 @@ def test_instruct_with_requirement(session): ) email_word_count_req = Requirement( - f"The email should be at most 100", + "The email should be at most 100", validation_fn=simple_validate(lambda x: len(" ".split(x)) <= 100), ) @@ -96,7 +99,6 @@ class Email(pydantic.BaseModel): # this is not guaranteed, due to the lack of regexp pattern # assert "@" in email.to.email_address # assert email.to.email_address.endswith("example.com") - pass @pytest.mark.qualitative diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 71ee0635..86f35174 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -7,6 +7,9 @@ import pydantic import pytest +# Mark all tests in this module as requiring Ollama via OpenAI-compatible API +pytestmark = [pytest.mark.openai, pytest.mark.ollama, pytest.mark.llm] + from mellea import MelleaSession from mellea.backends import ModelOption from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B @@ -103,7 +106,6 @@ class Email(pydantic.BaseModel): # this is not guaranteed, due to the lack of regexp pattern # assert "@" in email.to.email_address # assert email.to.email_address.endswith("example.com") - pass @pytest.mark.qualitative diff --git a/test/backends/test_vision_ollama.py 
b/test/backends/test_vision_ollama.py index 740043d9..90549120 100644 --- a/test/backends/test_vision_ollama.py +++ b/test/backends/test_vision_ollama.py @@ -5,11 +5,13 @@ import pytest from PIL import Image +# Mark all tests in this module as requiring Ollama with vision support +pytestmark = [pytest.mark.ollama, pytest.mark.llm] + from mellea import MelleaSession, start_session from mellea.backends import ModelOption from mellea.core import ImageBlock, ModelOutputThunk -from mellea.stdlib.components import Message -from mellea.stdlib.components import Instruction +from mellea.stdlib.components import Instruction, Message @pytest.fixture(scope="module") diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py index 9c958efe..0b2b92da 100644 --- a/test/backends/test_vision_openai.py +++ b/test/backends/test_vision_openai.py @@ -6,6 +6,9 @@ import pytest from PIL import Image +# Mark all tests in this module as requiring OpenAI API with vision support +pytestmark = [pytest.mark.openai, pytest.mark.llm, pytest.mark.requires_api_key] + from mellea import MelleaSession, start_session from mellea.backends import ModelOption from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py index 05bfc0f8..ed4a0354 100644 --- a/test/backends/test_vllm.py +++ b/test/backends/test_vllm.py @@ -1,19 +1,27 @@ import asyncio import os +from typing import Annotated + import pydantic import pytest -from typing_extensions import Annotated -# Skip entire module in CI since all 8 tests are qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping vLLM tests in CI - all qualitative tests", -) +# Mark all tests in this module with backend and resource requirements +pytestmark = [ + pytest.mark.vllm, + pytest.mark.llm, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + # Skip entire module in CI since all 8 tests are qualitative + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping vLLM tests in CI - all qualitative tests", + ), +] +import mellea.backends.model_ids as model_ids from mellea import MelleaSession -from mellea.backends.vllm import LocalVLLMBackend from mellea.backends import ModelOption -import mellea.backends.model_ids as model_ids +from mellea.backends.vllm import LocalVLLMBackend from mellea.core import CBlock from mellea.stdlib.context import ChatContext, SimpleContext diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py index 0f6b21de..2d085deb 100644 --- a/test/backends/test_vllm_tools.py +++ b/test/backends/test_vllm_tools.py @@ -1,16 +1,23 @@ import os + import pytest # Skip entire module in CI since the single test is qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping vLLM tools tests in CI - qualitative test", -) +pytestmark = [ + pytest.mark.vllm, + pytest.mark.llm, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping vLLM tools tests in CI - qualitative test", + ), +] +import mellea.backends.model_ids as model_ids from mellea import MelleaSession -from mellea.backends.vllm import LocalVLLMBackend from mellea.backends import ModelOption -import mellea.backends.model_ids as model_ids +from mellea.backends.vllm import LocalVLLMBackend from mellea.stdlib.context import ChatContext diff --git a/test/backends/test_watsonx.py 
b/test/backends/test_watsonx.py index 1631f488..6902e675 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -5,17 +5,23 @@ import pydantic import pytest -# Skip entire module in CI since 8/9 tests are qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping Watsonx tests in CI - mostly qualitative tests", -) +# Mark all tests in this module with backend and auth requirements +pytestmark = [ + pytest.mark.watsonx, + pytest.mark.llm, + pytest.mark.requires_api_key, + # Skip entire module in CI since 8/9 tests are qualitative + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping Watsonx tests in CI - mostly qualitative tests", + ), +] from mellea import MelleaSession -from mellea.formatters import TemplateFormatter from mellea.backends import ModelOption from mellea.backends.watsonx import WatsonxAIBackend from mellea.core import CBlock, ModelOutputThunk +from mellea.formatters import TemplateFormatter from mellea.stdlib.context import ChatContext, SimpleContext @@ -45,7 +51,6 @@ def session(backend: WatsonxAIBackend): @pytest.mark.qualitative def test_filter_chat_completions_kwargs(backend: WatsonxAIBackend): """Detect changes to the WatsonxAI TextChatParameters.""" - known_keys = [ "frequency_penalty", "logprobs", @@ -66,7 +71,7 @@ def test_filter_chat_completions_kwargs(backend: WatsonxAIBackend): "guided_grammar", "guided_json", ] - test_dict = {key: 1 for key in known_keys} + test_dict = dict.fromkeys(known_keys, 1) # Make sure keys that we think should be in the TextChatParameters are there. filtered_dict = backend.filter_chat_completions_kwargs(test_dict) @@ -133,7 +138,6 @@ class Email(pydantic.BaseModel): # this is not guaranteed, due to the lack of regexp pattern # assert "@" in email.to.email_address # assert email.to.email_address.endswith("example.com") - pass @pytest.mark.qualitative diff --git a/test/conftest.py b/test/conftest.py index 10c96e74..8c99a7b1 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,27 +1,257 @@ import gc import os +import subprocess +import sys import pytest +# Try to import optional dependencies for system detection +try: + import psutil + + HAS_PSUTIL = True +except ImportError: + HAS_PSUTIL = False + +try: + import torch + + HAS_TORCH = True +except ImportError: + HAS_TORCH = False + + +# ============================================================================ +# System Capability Detection +# ============================================================================ + + +def _check_ollama_available(): + """Check if Ollama is available by checking if port 11434 is listening. + + Note: This only checks if Ollama is running, not which models are loaded. + Tests may still fail if required models (e.g., granite4:micro) are not pulled. 
+ """ + import socket + + try: + # Try to connect to Ollama's default port + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex(("localhost", 11434)) + sock.close() + return result == 0 + except Exception: + return False + + +def get_system_capabilities(): + """Detect system capabilities for test requirements.""" + capabilities = { + "has_gpu": False, + "gpu_memory_gb": 0, + "ram_gb": 0, + "has_api_keys": {}, + "has_ollama": False, + } + + # Detect GPU (CUDA for NVIDIA, MPS for Apple Silicon) + if HAS_TORCH: + has_cuda = torch.cuda.is_available() + has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + capabilities["has_gpu"] = has_cuda or has_mps + + if has_cuda: + try: + capabilities["gpu_memory_gb"] = torch.cuda.get_device_properties( + 0 + ).total_memory / (1024**3) + except Exception: + pass + # Note: MPS doesn't provide easy memory query, leave at 0 + + # Detect RAM + if HAS_PSUTIL: + capabilities["ram_gb"] = psutil.virtual_memory().total / (1024**3) + + # Detect API keys + api_key_vars = { + "openai": "OPENAI_API_KEY", + "watsonx": ["WATSONX_APIKEY", "IBM_CLOUD_API_KEY"], + "huggingface": "HF_TOKEN", + } + + for backend, env_vars in api_key_vars.items(): + if isinstance(env_vars, str): + env_vars = [env_vars] + capabilities["has_api_keys"][backend] = any( + os.environ.get(var) for var in env_vars + ) + + # Detect Ollama availability + capabilities["has_ollama"] = _check_ollama_available() + + return capabilities + + +@pytest.fixture(scope="session") +def system_capabilities(): + """Fixture providing system capabilities.""" + return get_system_capabilities() + @pytest.fixture(scope="session") def gh_run() -> int: return int(os.environ.get("CICD", 0)) # type: ignore +# ============================================================================ +# Pytest Marker Registration and CLI Options +# ============================================================================ + + +def pytest_addoption(parser): + """Add custom command-line options.""" + parser.addoption( + "--ignore-gpu-check", + action="store_true", + default=False, + help="Ignore GPU requirement checks (tests may fail without GPU)", + ) + parser.addoption( + "--ignore-ram-check", + action="store_true", + default=False, + help="Ignore RAM requirement checks (tests may fail with insufficient RAM)", + ) + parser.addoption( + "--ignore-ollama-check", + action="store_true", + default=False, + help="Ignore Ollama availability checks (tests will fail if Ollama not running)", + ) + parser.addoption( + "--ignore-api-key-check", + action="store_true", + default=False, + help="Ignore API key checks (tests will fail without valid API keys)", + ) + + +def pytest_configure(config): + """Register custom markers.""" + # Backend markers + config.addinivalue_line( + "markers", "ollama: Tests requiring Ollama backend (local, light)" + ) + config.addinivalue_line( + "markers", "openai: Tests requiring OpenAI API (requires API key)" + ) + config.addinivalue_line( + "markers", "watsonx: Tests requiring Watsonx API (requires API key)" + ) + config.addinivalue_line( + "markers", "huggingface: Tests requiring HuggingFace backend (local, heavy)" + ) + config.addinivalue_line( + "markers", "vllm: Tests requiring vLLM backend (local, GPU required)" + ) + config.addinivalue_line("markers", "litellm: Tests requiring LiteLLM backend") + + # Capability markers + config.addinivalue_line( + "markers", "requires_api_key: Tests requiring external API keys" + ) + 
config.addinivalue_line("markers", "requires_gpu: Tests requiring GPU")
+    config.addinivalue_line("markers", "requires_heavy_ram: Tests requiring 48GB+ RAM")
+    config.addinivalue_line("markers", "qualitative: Non-deterministic quality tests")
+
+    # Composite markers
+    config.addinivalue_line(
+        "markers", "llm: Tests that make LLM calls (needs at least Ollama)"
+    )
+
+
+# ============================================================================
+# Test Skipping Logic
+# ============================================================================
+
+
 def pytest_runtest_setup(item):
-    """Skip qualitative tests when running in CI environment."""
-    # Runs tests *not* marked with `@pytest.mark.qualitative` to run normally.
-    if not item.get_closest_marker("qualitative"):
-        return
+    """Skip tests based on markers and system capabilities.
 
+    Can be overridden with command-line options:
+    - pytest --ignore-gpu-check
+    - pytest --ignore-ram-check
+    - pytest --ignore-ollama-check
+    - pytest --ignore-api-key-check
+    """
+    capabilities = get_system_capabilities()
     gh_run = int(os.environ.get("CICD", 0))
+    config = item.config
 
-    if gh_run == 1:
+    # Check for override flags from CLI
+    ignore_gpu = config.getoption("--ignore-gpu-check", default=False)
+    ignore_ram = config.getoption("--ignore-ram-check", default=False)
+    ignore_ollama = config.getoption("--ignore-ollama-check", default=False)
+    ignore_api_key = config.getoption("--ignore-api-key-check", default=False)
+
+    # Skip qualitative tests in CI (existing behavior)
+    if item.get_closest_marker("qualitative") and gh_run == 1:
         pytest.skip(
             reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows."
         )
 
+    # Skip tests requiring API keys if not available (unless override)
+    if item.get_closest_marker("requires_api_key") and not ignore_api_key:
+        # Check specific backend markers
+        for backend in ["openai", "watsonx"]:
+            if item.get_closest_marker(backend):
+                if not capabilities["has_api_keys"].get(backend):
+                    pytest.skip(
+                        f"Skipping test: {backend} API key not found in environment"
+                    )
+
+    # Skip tests requiring GPU if not available (unless override)
+    if item.get_closest_marker("requires_gpu") and not ignore_gpu:
+        if not capabilities["has_gpu"]:
+            pytest.skip("Skipping test: GPU not available")
+
+    # Skip tests requiring heavy RAM if insufficient (unless override)
+    # NOTE: The 48GB threshold is based on empirical testing:
+    # - HuggingFace tests with granite-3.3-8b-instruct failed on 32GB M1 MacBook
+    # - Also failed on 36GB system
+    # - Set to 48GB as safe threshold for 8B model + overhead
+    # TODO: Consider per-model thresholds or make configurable
+    # Can be overridden with: pytest --ignore-ram-check
+    if item.get_closest_marker("requires_heavy_ram") and not ignore_ram:
+        RAM_THRESHOLD_GB = 48  # Based on real-world testing
+        if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB:
+            pytest.skip(
+                f"Skipping test: Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)"
+            )
+
+    # Backend-specific skipping
+    if item.get_closest_marker("openai") and not ignore_api_key:
+        if not capabilities["has_api_keys"].get("openai"):
+            pytest.skip("Skipping test: OPENAI_API_KEY not found in environment")
+
+    if item.get_closest_marker("watsonx") and not ignore_api_key:
+        if not capabilities["has_api_keys"].get("watsonx"):
+            pytest.skip(
+                "Skipping test: Watsonx API credentials not found in environment"
+            )
+
+    if item.get_closest_marker("vllm") and not ignore_gpu:
+        if not 
capabilities["has_gpu"]: + pytest.skip("Skipping test: vLLM requires GPU") + + if item.get_closest_marker("ollama") and not ignore_ollama: + if not capabilities["has_ollama"]: + pytest.skip( + "Skipping test: Ollama not available (port 11434 not listening)" + ) + def memory_cleaner(): """Aggressive memory cleanup function.""" @@ -46,6 +276,34 @@ def memory_cleaner(): pass +@pytest.fixture(autouse=True, scope="session") +def normalize_ollama_host(): + """Normalize OLLAMA_HOST to work with client libraries. + + If OLLAMA_HOST is set to 0.0.0.0 (server bind address), change it to + 127.0.0.1:11434 for client connections. This prevents connection errors + when tests try to connect to Ollama. + """ + original_host = os.environ.get("OLLAMA_HOST") + + # If OLLAMA_HOST starts with 0.0.0.0, replace with 127.0.0.1 + if original_host and original_host.startswith("0.0.0.0"): + # Extract port if present, default to 11434 + if ":" in original_host: + port = original_host.split(":", 1)[1] + else: + port = "11434" + os.environ["OLLAMA_HOST"] = f"127.0.0.1:{port}" + + yield + + # Restore original value + if original_host is not None: + os.environ["OLLAMA_HOST"] = original_host + elif "OLLAMA_HOST" in os.environ: + del os.environ["OLLAMA_HOST"] + + @pytest.fixture(autouse=True, scope="function") def aggressive_cleanup(): """Aggressive memory cleanup after each test to prevent OOM on CI runners.""" diff --git a/test/core/test_component_typing.py b/test/core/test_component_typing.py index d6f10f7c..4c83045c 100644 --- a/test/core/test_component_typing.py +++ b/test/core/test_component_typing.py @@ -1,8 +1,11 @@ """Tests for checking the functionality of typed components, model output thunks and sampling results.""" -import pytest from typing import get_args -from mellea import start_session + +import pytest + +import mellea.stdlib.functional as mfuncs +from mellea import MelleaSession, start_session from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B from mellea.backends.ollama import OllamaModelBackend from mellea.core import ( @@ -11,15 +14,20 @@ ComponentParseError, Context, ModelOutputThunk, + Requirement, + ValidationResult, ) +from mellea.stdlib.components import Instruction, Message from mellea.stdlib.context import ChatContext, SimpleContext -from mellea.stdlib.components import Message -from mellea.stdlib.components import Instruction -from mellea.core import Requirement, ValidationResult from mellea.stdlib.sampling import BaseSamplingStrategy -from mellea import MelleaSession -import mellea.stdlib.functional as mfuncs +# Module-level markers: Uses granite3.3:8b (8B, heavy) in local mode +pytestmark = [ + pytest.mark.ollama, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, +] class FloatComp(Component[float]): @@ -78,7 +86,7 @@ def session(backend) -> MelleaSession: def test_mot_init_typing(): mot = ModelOutputThunk[float](value="1") assert hasattr(mot, "__orig_class__"), ( - f"mots are generics and should have this field" + "mots are generics and should have this field" ) assert get_args(mot.__orig_class__)[0] == float, ( # type: ignore f"expected float, got {get_args(mot.__orig_class__)[0]} as mot type" # type: ignore @@ -86,7 +94,7 @@ def test_mot_init_typing(): unknown_mot = ModelOutputThunk(value="2") assert not hasattr(unknown_mot, "__orig_class__"), ( - f"unknown mots / mots with no type defined at instantiate don't have this attribute" + "unknown mots / mots with no type defined at instantiate don't have this attribute" ) diff --git 
a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 27646ba7..6e386a59 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -9,16 +9,20 @@ import torch from mellea.backends.huggingface import LocalHFBackend -from mellea.stdlib.components import Document -from mellea.stdlib.context import ChatContext -from mellea.stdlib.components import Message +from mellea.stdlib.components import Document, Message from mellea.stdlib.components.intrinsic import rag +from mellea.stdlib.context import ChatContext # Skip entire module in CI since all 7 tests are qualitative -pytestmark = pytest.mark.skipif( - int(os.environ.get("CICD", 0)) == 1, - reason="Skipping RAG tests in CI - all qualitative tests", -) +pytestmark = [ + pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping RAG tests in CI - all qualitative tests", + ), + pytest.mark.huggingface, + pytest.mark.requires_gpu, + pytest.mark.llm, +] DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" """Location of data files for the tests in this file.""" @@ -30,7 +34,6 @@ @pytest.fixture(name="backend") def _backend(): """Backend used by the tests in this file.""" - # Prevent thrashing if the default device is CPU torch.set_num_threads(4) @@ -48,7 +51,8 @@ def _backend(): def _read_input_json(file_name: str): """Shared code for reading data stored in JSON files and converting to Mellea - types.""" + types. + """ with open(DATA_ROOT / "input_json" / file_name, encoding="utf-8") as f: json_data = json.load(f) @@ -70,7 +74,8 @@ def _read_input_json(file_name: str): def _read_output_json(file_name: str): """Shared code for reading canned outputs stored in JSON files and converting - to Mellea types.""" + to Mellea types. + """ with open(DATA_ROOT / "output_json" / file_name, encoding="utf-8") as f: json_data = json.load(f) diff --git a/test/stdlib/components/test_genslot.py b/test/stdlib/components/test_genslot.py index 3c59568f..ba956507 100644 --- a/test/stdlib/components/test_genslot.py +++ b/test/stdlib/components/test_genslot.py @@ -3,20 +3,27 @@ import pytest -from mellea import generative, start_session +from mellea import MelleaSession, generative, start_session from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B from mellea.backends.ollama import OllamaModelBackend from mellea.core import Requirement -from mellea.stdlib.context import ChatContext, Context from mellea.stdlib.components.genslot import ( AsyncGenerativeSlot, GenerativeSlot, PreconditionException, SyncGenerativeSlot, ) +from mellea.stdlib.context import ChatContext, Context from mellea.stdlib.requirements import simple_validate from mellea.stdlib.sampling import RejectionSamplingStrategy -from mellea import MelleaSession + +# Module-level markers: Uses granite3.3:8b (8B, heavy) in local mode +pytestmark = [ + pytest.mark.ollama, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, +] @pytest.fixture(scope="module") @@ -161,7 +168,6 @@ def test_arg_extraction(backend, arg_choices, kwarg_choices, errs): Python should catch most of these issues itself. We have to manually raise an exception for the arguments of the original function being positional. """ - # List of all needed values. backend = backend ctx = ChatContext() @@ -242,7 +248,6 @@ def test_with_no_args(session): @generative def generate_text() -> str: """Generate text!""" - ... 
generate_text(m=session) diff --git a/test/stdlib/sampling/test_sofai_graph_coloring.py b/test/stdlib/sampling/test_sofai_graph_coloring.py index 3f0d9c25..37b06ffc 100644 --- a/test/stdlib/sampling/test_sofai_graph_coloring.py +++ b/test/stdlib/sampling/test_sofai_graph_coloring.py @@ -13,8 +13,7 @@ from mellea.core import GenerateLog, Requirement, ValidationResult from mellea.stdlib.components import Instruction, Message, ModelOutputThunk from mellea.stdlib.context import ChatContext -from mellea.stdlib.sampling import SOFAISamplingStrategy, SamplingResult - +from mellea.stdlib.sampling import SamplingResult, SOFAISamplingStrategy # ============================================================================= # Graph Coloring Test Domain @@ -709,10 +708,13 @@ async def s2_generate(*args, **kwargs): @pytest.mark.qualitative +@pytest.mark.ollama +@pytest.mark.llm class TestSOFAIGraphColoringIntegration: """Integration tests with actual LLM backends. These tests are marked qualitative and skipped in CI. + Uses llama3.2:1b (lightweight, no heavy RAM needed). """ def test_graph_coloring_fresh_start(self): diff --git a/test/stdlib/sampling/test_sofai_sampling.py b/test/stdlib/sampling/test_sofai_sampling.py index ae68587c..bae95ff4 100644 --- a/test/stdlib/sampling/test_sofai_sampling.py +++ b/test/stdlib/sampling/test_sofai_sampling.py @@ -267,10 +267,13 @@ def test_fallback_without_tags(self): @pytest.mark.qualitative +@pytest.mark.ollama +@pytest.mark.llm class TestSOFAIIntegration: """Integration tests for SOFAISamplingStrategy. These tests require actual LLM backends and are marked as qualitative. + Uses llama3.2:1b (lightweight, no heavy RAM needed). """ def test_sofai_with_ollama(self, gh_run): diff --git a/test/stdlib/sampling/test_think_budget_forcing.py b/test/stdlib/sampling/test_think_budget_forcing.py index 747849f1..f4025f0d 100644 --- a/test/stdlib/sampling/test_think_budget_forcing.py +++ b/test/stdlib/sampling/test_think_budget_forcing.py @@ -10,6 +10,15 @@ MODEL_ID = OPENAI_GPT_OSS_20B +# Module-level markers: gpt-oss:20b is a 20B model requiring heavy resources +pytestmark = [ + pytest.mark.ollama, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, + pytest.mark.qualitative, +] + @pytest.fixture(scope="module") def m_session(gh_run): diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py index 0ddb4bc1..71b03ed0 100644 --- a/test/stdlib/test_spans.py +++ b/test/stdlib/test_spans.py @@ -1,11 +1,19 @@ import pytest from mellea.backends import ModelOption +from mellea.backends.huggingface import LocalHFBackend +from mellea.backends.model_ids import IBM_GRANITE_3_3_8B from mellea.core import CBlock from mellea.stdlib.components import SimpleComponent -from mellea.stdlib.session import start_session, MelleaSession -from mellea.backends.model_ids import IBM_GRANITE_3_3_8B -from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.session import MelleaSession, start_session + +# Module-level markers for all tests using granite-3.3-8b (8B model) +pytestmark = [ + pytest.mark.huggingface, + pytest.mark.requires_gpu, + pytest.mark.requires_heavy_ram, + pytest.mark.llm, +] # We edit the context type in the async tests below. Don't change the scope here. 
diff --git a/uv.lock b/uv.lock index 8d1f179a..f50bf4bb 100644 --- a/uv.lock +++ b/uv.lock @@ -3252,6 +3252,7 @@ all = [ { name = "transformers" }, { name = "trl" }, { name = "vllm" }, + { name = "xgrammar" }, ] docling = [ { name = "docling" }, @@ -3265,6 +3266,7 @@ hf = [ { name = "peft" }, { name = "transformers" }, { name = "trl" }, + { name = "xgrammar" }, ] litellm = [ { name = "litellm" }, @@ -3287,6 +3289,7 @@ dev = [ { name = "nbmake" }, { name = "pdm" }, { name = "pre-commit" }, + { name = "psutil" }, { name = "pylint" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -3344,6 +3347,7 @@ requires-dist = [ { name = "types-tqdm" }, { name = "uvicorn" }, { name = "vllm", marker = "extra == 'vllm'", specifier = ">=0.9.1" }, + { name = "xgrammar", marker = "extra == 'hf'" }, ] provides-extras = ["hf", "vllm", "litellm", "watsonx", "docling", "all"] @@ -3354,6 +3358,7 @@ dev = [ { name = "nbmake", specifier = ">=1.5.5" }, { name = "pdm", specifier = ">=2.24.0" }, { name = "pre-commit", specifier = ">=4.2.0" }, + { name = "psutil" }, { name = "pylint", specifier = ">=3.3.4" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -3416,6 +3421,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/f0/8282d9641415e9e33df173516226b404d367a0fc55e1a60424a152913abc/mistune-3.1.4-py3-none-any.whl", hash = "sha256:93691da911e5d9d2e23bc54472892aff676df27a75274962ff9edc210364266d", size = 53481, upload-time = "2025-08-29T07:20:42.218Z" }, ] +[[package]] +name = "mlx" +version = "0.30.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mlx-metal", marker = "sys_platform == 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/9c/d6f72f04eeeeaeee8309397efcfa0e923189d0b720f4ac6b3887d0a2f40b/mlx-0.30.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:685051761e428336f8f19ae76a761ce99d29ff67c52738f15ce6409e2ff34e6b", size = 568453, upload-time = "2026-01-14T01:16:40.796Z" }, + { url = "https://files.pythonhosted.org/packages/db/59/505717fd63f62d766f054ab8770d08e98b10217c0995bd2555429863fd31/mlx-0.30.3-cp310-cp310-macosx_15_0_arm64.whl", hash = "sha256:e405e6575e3b0b00dd6bd02bdb415b638cd5c2e5faedb696df2b2c8fbe871240", size = 568451, upload-time = "2026-01-14T01:16:42.027Z" }, + { url = "https://files.pythonhosted.org/packages/86/9c/a5319ae8ed0baa76fde80def12391ae13acec1b88904d4ead9bbabc9a083/mlx-0.30.3-cp310-cp310-macosx_26_0_arm64.whl", hash = "sha256:46894eb528457483aec44227f61afdff424cb76d146a6e1727d03ea0f52be41b", size = 568309, upload-time = "2026-01-14T05:52:06.915Z" }, + { url = "https://files.pythonhosted.org/packages/78/b6/dfcfffc41d832a86249715fab336dc8638c2237035287eb24af792484c53/mlx-0.30.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:794e79587a4906bdb3c5473ef936f45008eaaa609a3c498cc29a442b2c829621", size = 568664, upload-time = "2026-01-14T01:16:45.573Z" }, + { url = "https://files.pythonhosted.org/packages/22/9f/22d494b83b611380063da31c2b482db8c620f7ad6531cfcd1e11f7c35852/mlx-0.30.3-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:472cdc6eaca8610224621a1561e8c36477eab1a2f0dd3eb49b95484d739c4605", size = 568663, upload-time = "2026-01-14T01:16:46.588Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/b6fb0500aef8e9ed65d4730d8c34b13d7a770ca863b9af363b5713a16040/mlx-0.30.3-cp311-cp311-macosx_26_0_arm64.whl", hash = "sha256:a5d82be69c7e671dc4d5855d2f6aedcb507817e5985478903ab754b642d9ba01", size = 568522, upload-time = "2026-01-14T05:52:08.334Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/b3/e24c3a69dad0cf4404bb174c6fed0d804022da64758cd815a254e1cd0627/mlx-0.30.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:0b275168b80645a155b456e1a457a37fb5ee2c251e8fbd8db9e153351a9e2d2f", size = 569398, upload-time = "2026-01-14T01:16:49.804Z" }, + { url = "https://files.pythonhosted.org/packages/0b/87/d0804443da97a06d3439f6efb0ceffa178f530a121f0f4a6c77b39f8bfd7/mlx-0.30.3-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6e818de14864982e832344198240a1dafba7d3316c4eb6f1b8e43b4dd25dd2ef", size = 569396, upload-time = "2026-01-14T01:16:51.007Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/7cdd95e4561b73fba8c86bf11293797076120400e472fe2a72ef483b6d8d/mlx-0.30.3-cp312-cp312-macosx_26_0_arm64.whl", hash = "sha256:d23b422209fd4b7ecacef59070321f8c6a122f906a5e9b6683a5fc9e1b8fcd5c", size = 569192, upload-time = "2026-01-14T05:52:09.715Z" }, + { url = "https://files.pythonhosted.org/packages/d0/22/42935d593fe82d3b98eb9d60e4620ed99703886635106f89d407c68f33bc/mlx-0.30.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:743fac1e4f9e8e46c8262943c643a31139c255cdb256c99ad496958215ccac1e", size = 569344, upload-time = "2026-01-14T01:16:54.847Z" }, + { url = "https://files.pythonhosted.org/packages/7d/27/f2e7a5236289d45315d0215e8553b4dd7e2faaba3bcb5025b34b25d5ab66/mlx-0.30.3-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:3b04ae81655aa0e63a6e8f2c749de3bbce64cf5b168ae10f39ed086dfa99e7f8", size = 569345, upload-time = "2026-01-14T01:16:56.564Z" }, + { url = "https://files.pythonhosted.org/packages/01/41/06b042457f51952456e9bb46b2c6e205ab3a28fc52d6751b5787fdb762b2/mlx-0.30.3-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:ba9b5bdb1e929cc130af72efd7f73508c0f4e526d224489af7ec1c6419564659", size = 569213, upload-time = "2026-01-14T05:52:10.86Z" }, + { url = "https://files.pythonhosted.org/packages/82/e2/6e551bd48fb350fbf0ee4cc5cd09485437d260b8f4937f22d8623e14687a/mlx-0.30.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2c27fd8daaae14ca6cf407fcd236006a6e968f7708c8f61a2709116f2e754852", size = 571920, upload-time = "2026-01-14T01:16:59.683Z" }, + { url = "https://files.pythonhosted.org/packages/82/c0/561d1c9d3d12830b0e7fdcbd807585ef20909e398d4bcdbf25e4367543eb/mlx-0.30.3-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:b755fd4ed4b6a2ae4dee3766b5a2ea52fcbe83ebd1cf018458e18b74139409f3", size = 571921, upload-time = "2026-01-14T01:17:00.868Z" }, + { url = "https://files.pythonhosted.org/packages/42/1a/fb573fc2edc22a777fa254ff5c0c886ffd2c88aeb1f21c45778ef170f990/mlx-0.30.3-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:7e352c0369a2f7e54d4f317b434eab3333918ea9edde1c43c61d36386b6f76bf", size = 571732, upload-time = "2026-01-14T05:52:11.893Z" }, +] + +[[package]] +name = "mlx-lm" +version = "0.29.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jinja2", marker = "sys_platform == 'darwin'" }, + { name = "mlx", marker = "sys_platform == 'darwin'" }, + { name = "numpy", marker = "sys_platform == 'darwin'" }, + { name = "protobuf", marker = "sys_platform == 'darwin'" }, + { name = "pyyaml", marker = "sys_platform == 'darwin'" }, + { name = "sentencepiece", marker = "sys_platform == 'darwin'" }, + { name = "transformers", marker = "sys_platform == 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, 
upload-time = "2025-12-16T16:58:27.959Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/53/913099c91d384e115ea078325efd9a0bc1ea3eb3458c694b4596cbd267f2/mlx_lm-0.29.1-py3-none-any.whl", hash = "sha256:440941b3054c2a2216e97615de584cc90fa1ea874782e20699b9895721fad8dc", size = 324884, upload-time = "2025-12-16T16:58:26.36Z" }, +] + +[[package]] +name = "mlx-metal" +version = "0.30.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/63/4d8f6fefb507c028df4454dabfe8d8e0ad2961bb06510b6aca23d2d5b2be/mlx_metal-0.30.3-py3-none-macosx_14_0_arm64.whl", hash = "sha256:6276312b02353714c7c6515169569fe1c4bebe3229c8ecf1fdb375a13e78c966", size = 37716245, upload-time = "2026-01-14T01:16:34.838Z" }, + { url = "https://files.pythonhosted.org/packages/35/91/1d452e48a4bb4958844fd3bb28ae31b8de110549c009ebec5024ce27ebf3/mlx_metal-0.30.3-py3-none-macosx_15_0_arm64.whl", hash = "sha256:c096c0a3428f3f96a06220f97a36f9528b18bc05173f821eb05bc8458e723fa8", size = 37712125, upload-time = "2026-01-14T01:16:38.619Z" }, + { url = "https://files.pythonhosted.org/packages/fe/36/7a3cbca85542b5ca4faf871e35927f43aa0e3fc830ae5b699780fe723677/mlx_metal-0.30.3-py3-none-macosx_26_0_arm64.whl", hash = "sha256:69068533bd1ee8b0379ce5de57ed5fd313577a10ecab58e1332fd1ff7248a75e", size = 46488962, upload-time = "2026-01-14T05:52:04.523Z" }, +] + [[package]] name = "more-itertools" version = "10.8.0" @@ -8126,6 +8184,7 @@ name = "xgrammar" version = "0.1.19" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "mlx-lm", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" }, { name = "ninja" }, { name = "pydantic" }, { name = "sentencepiece" },