diff --git a/pyproject.toml b/pyproject.toml index 589eea9..3e242ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,9 @@ markers = [ "sgd", "stableadam", ] +filterwarnings = [ + "ignore:Parameter `foreach` is deprecated:DeprecationWarning", +] [tool.ruff] line-length = 140 @@ -157,4 +160,4 @@ explicit = true [[tool.uv.index]] name = "pytorch-xpu" url = "https://download.pytorch.org/whl/xpu" -explicit = true \ No newline at end of file +explicit = true diff --git a/tests/adam_test.py b/tests/adam_test.py deleted file mode 100644 index 022d698..0000000 --- a/tests/adam_test.py +++ /dev/null @@ -1,75 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - - -optimizers = {} - -optimizers["adam"] = ({'optim':torch.optim.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=0)}, - {'optim':optimi.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=0)}) - -optimizers["adam_l2"] = ({'optim':torch.optim.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}, - {'optim':optimi.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2, decouple_wd=False)}) - -optimizers["adam_dw"] = ({'optim':torch.optim.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}, - {'optim':optimi.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2, decouple_wd=True)}) - -optimizers["adamw"] = ({'optim':torch.optim.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}, - {'optim':optimi.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}) - -optimizers["adamw_dlr"] = ({'optim':reference.DecoupledAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-5)}, - {'optim':optimi.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-5, decouple_lr=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", 
cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - if optim_name in ["adam_l2"]: - pytest.skip("Skip tests for Adam with L2 weight decay.") - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) \ No newline at end of file diff --git a/tests/adan_test.py b/tests/adan_test.py deleted file mode 100644 index 6086452..0000000 --- a/tests/adan_test.py +++ /dev/null @@ -1,73 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - - - -optimizers = {} - -optimizers["adan"] = ({'optim':reference.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6)}, - {'optim':optimi.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=0)}) - -optimizers["adan_wd"] = ({'optim':reference.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-2)}, - {'optim':optimi.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-2)}) - -optimizers["adan_awd"] = ({'optim':reference.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-2, no_prox=True)}, - {'optim':optimi.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-2, adam_wd=True)}) - -optimizers["adan_dlr"] = ({'optim':reference.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-2)}, - {'optim':optimi.Adan, 'kwargs':dict(lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-6, weight_decay=2e-5, decouple_lr=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.adan -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.adan -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - # Adan bfloat16 updates are noisier, so GPU uses fewer test iterations - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer, - iterations=20 if dtype == torch.bfloat16 else None) - - - -cuda_values = 
list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.adan -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.adan -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) diff --git a/tests/anyadam_test.py b/tests/anyadam_test.py deleted file mode 100644 index 3d51011..0000000 --- a/tests/anyadam_test.py +++ /dev/null @@ -1,48 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import buffer, run_optimizer, cpu_dim1, cpu_dim2, cpu_dtype, cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gpu_device - - - -optimizers = {} - -optimizers["any_adam"] = ({'optim':reference.AnyPrecisionAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=0)}, - {'optim':optimi.Adam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=0, kahan_sum=True)}) - -optimizers["any_adamw"] = ({'optim':reference.AnyPrecisionAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}, - {'optim':optimi.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2, kahan_sum=True)}) - -optimizers["any_adamw_dlr"] = ({'optim':reference.AnyPrecisionAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-2)}, - {'optim':optimi.AdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-6, weight_decay=1e-5, decouple_lr=True, kahan_sum=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_dtype = [torch.bfloat16] -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer, any_precision=True) - - - -gpu_dtype = [torch.bfloat16] -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.adam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer, any_precision=True) \ No newline at end of file diff --git a/tests/config.py b/tests/config.py new file mode 100644 index 0000000..28446ae --- /dev/null +++ b/tests/config.py @@ -0,0 +1,341 @@ +from __future__ 
import annotations + +import importlib +import inspect +import warnings +from dataclasses import asdict, dataclass, field, replace +from enum import Enum +from pathlib import Path +from typing import Any + +import torch +from optimi.optimizer import OptimiOptimizer +from torch.optim import Optimizer + + +from optimi.utils import MIN_TORCH_2_6 + + +class OptTestType(Enum): + normal = "normal" + gradient_release = "gradient_release" + accumulation = "accumulation" + + +class DeviceType(Enum): + cpu = "cpu" + gpu = "gpu" + + def is_available(self) -> bool: + if self == DeviceType.cpu: + return True + if self == DeviceType.gpu: + return ( + torch.cuda.is_available() + or (hasattr(torch, "xpu") and torch.xpu.is_available()) + or (hasattr(torch.backends, "mps") and torch.backends.mps.is_available()) + ) + return False + + +class Backend(Enum): + torch = "torch" + triton = "triton" + foreach = "foreach" + + def is_supported(self, device: DeviceType) -> bool: + if self == Backend.triton: + # Triton requires torch >= 2.6 + if not MIN_TORCH_2_6: + return False + # Triton not supported on CPU + if device == DeviceType.cpu: + return False + # Triton not supported on MPS + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return False + # Triton requires GPU/XPU + if not (torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())): + return False + return True + + +@dataclass +class Tolerance: + atol: float = 1e-6 + rtol: float = 1e-5 + max_error_rate: float = 5e-4 + equal_nan: bool = False + + +@dataclass() +class NormalSpec: + iterations_cpu: int = 20 + iterations_gpu: int = 40 + batch_cpu: int = 1 + batch_gpu: int = 32 + max_error_cpu: int = 2 + max_error_gpu: int = 5 + + tolerance: dict[torch.dtype, Tolerance] = field( + default_factory=lambda: { + torch.float32: Tolerance(atol=1e-6, rtol=1e-5, max_error_rate=5e-4), + torch.bfloat16: Tolerance(atol=1e-3, rtol=1e-2, max_error_rate=0.01), + torch.float16: Tolerance(atol=1e-4, rtol=1e-3, max_error_rate=0.01), + } + ) + + +@dataclass() +class GradientReleaseSpec: + iterations: int = 40 + batch: int = 32 + max_error_count: int = 12 # more lenient for noisy updates + + tolerance: dict[torch.dtype, Tolerance] = field( + default_factory=lambda: { + torch.float32: Tolerance(atol=1e-6, rtol=1e-5, max_error_rate=5e-4), + torch.bfloat16: Tolerance(atol=1e-3, rtol=1e-2, max_error_rate=0.01), + torch.float16: Tolerance(atol=1e-4, rtol=1e-3, max_error_rate=0.01), + } + ) + + +@dataclass() +class AccumulationSpec: + iterations: int = 40 + batch: int = 32 + max_error_rate: float = 0.035 + gradient_accumulation_steps: int = 4 + + tolerance: dict[torch.dtype, Tolerance] = field( + default_factory=lambda: { + torch.float32: Tolerance(rtol=1e-2, atol=1e-2), + torch.bfloat16: Tolerance(rtol=1e-2, atol=1e-2), + torch.float16: Tolerance(rtol=1e-2, atol=1e-2), + } + ) + + +@dataclass() +class TestSpec: + normal: NormalSpec = field(default_factory=NormalSpec) + gradient_release: GradientReleaseSpec = field(default_factory=GradientReleaseSpec) + accumulation: AccumulationSpec = field(default_factory=AccumulationSpec) + + +def with_updated_spec( + spec: TestSpec | NormalSpec | GradientReleaseSpec | AccumulationSpec | None, + test_type: OptTestType | None = None, + tolerances_override: dict[torch.dtype, Tolerance] | None = None, +) -> TestSpec: + if isinstance(spec, (NormalSpec, GradientReleaseSpec, AccumulationSpec)): + if isinstance(spec, NormalSpec): + base = TestSpec(normal=spec) + elif isinstance(spec, GradientReleaseSpec): 
+ base = TestSpec(gradient_release=spec) + else: + base = TestSpec(accumulation=spec) + else: + base = spec or TestSpec() + + if tolerances_override is None: + tolerances_override = {} + + if test_type is None: + return base + + if test_type == OptTestType.normal: + merged = {**base.normal.tolerance, **tolerances_override} + return replace(base, normal=replace(base.normal, tolerance=merged)) + if test_type == OptTestType.gradient_release: + merged = {**base.gradient_release.tolerance, **tolerances_override} + return replace(base, gradient_release=replace(base.gradient_release, tolerance=merged)) + if test_type == OptTestType.accumulation: + merged = {**base.accumulation.tolerance, **tolerances_override} + return replace(base, accumulation=replace(base.accumulation, tolerance=merged)) + raise ValueError(f"Unknown test type: {test_type}") + + +@dataclass +class BaseParams: + lr: float = 1e-3 + weight_decay: float = 0.0 + decouple_wd: bool = False + decouple_lr: bool = False + triton: bool = False + + def with_(self, **overrides: Any) -> "BaseParams": + return replace(self, **overrides) + + def _kwargs_for(self, cls: type | None) -> dict[str, Any]: + if cls is None: + return {} + sig = inspect.signature(cls.__init__) + ok = set(sig.parameters) - {"self"} + values = asdict(self) + if values.get("triton") and "triton" not in ok: + warnings.warn(f"{cls.__name__} does not accept triton; ignoring BaseParams.triton=True.", RuntimeWarning) + return {k: v for k, v in values.items() if k in ok} + + def to_optimi_kwargs(self, cls: type[OptimiOptimizer]) -> dict[str, Any]: + return self._kwargs_for(cls) + + def to_reference_kwargs(self, cls: type[Optimizer]) -> dict[str, Any]: + return self._kwargs_for(cls) + + +@dataclass +class OptTest: + # Identification + name: str # e.g. 
"adam_base" + + # Classes + params + optimi_class: type[OptimiOptimizer] + optimi_params: BaseParams + reference_class: type[Optimizer] + reference_params: BaseParams | None = None + + # Optional fully decoupled reference for decoupled-lr variant + fully_decoupled_reference: type[Optimizer] | None = None + + # Behavior / constraints + skip_tests: list[OptTestType] = field(default_factory=list) + any_precision: bool = False + test_decoupled_wd: bool = True + custom_iterations: dict[OptTestType | tuple[OptTestType, DeviceType] | tuple[OptTestType, DeviceType, torch.dtype], int] | None = None + spec: TestSpec = field(default_factory=TestSpec) + only_dtypes: list[torch.dtype] | None = None + + def __post_init__(self): + if self.reference_params is None: + self.reference_params = self.optimi_params + + @property + def optimizer_name(self) -> str: + return self.name.split("_", 1)[0] + + @property + def variant_name(self) -> str: + return self.name.split("_", 1)[1] if "_" in self.name else "base" + + def to_optimi_kwargs(self, backend: Backend | None = None) -> dict[str, Any]: + kw = self.optimi_params.to_optimi_kwargs(self.optimi_class) + + # Centralize backend controls so runners don't mutate kwargs later + if backend is not None: + if backend == Backend.triton: + kw["triton"] = True + kw["foreach"] = False + elif backend == Backend.torch: + kw["triton"] = False + kw["foreach"] = False + elif backend == Backend.foreach: + kw["triton"] = False + kw["foreach"] = True + else: + raise ValueError(f"Unknown backend: {backend}") + return kw + + def to_reference_kwargs(self, backend: Backend | None = None) -> dict[str, Any]: + assert self.reference_params is not None + kwargs = self.reference_params.to_reference_kwargs(self.reference_class) + # Centralize fused handling for reference optimizers: when not testing + # Optimi's Triton backend, avoid fused codepaths on the reference side + # to mirror legacy parity expectations. 
+ if backend is not None and backend != Backend.triton: + try: + if "fused" in inspect.signature(self.reference_class.__init__).parameters: + kwargs = {**kwargs, "fused": False} + except (ValueError, TypeError): + pass + return kwargs + + def supports_l2_weight_decay(self) -> bool: + return "decouple_wd" in inspect.signature(self.optimi_class.__init__).parameters + + + + +def default_variants(base: OptTest) -> list[OptTest]: + """Generate base + L2 + decoupled variants with minimal boilerplate.""" + out: list[OptTest] = [] + + base_test = OptTest( + name=f"{base.optimizer_name}_base", + optimi_class=base.optimi_class, + optimi_params=base.optimi_params.with_(weight_decay=0.0, decouple_wd=False, decouple_lr=False), + reference_class=base.reference_class, + reference_params=(base.reference_params or base.optimi_params).with_(weight_decay=0.0, decouple_wd=False, decouple_lr=False), + test_decoupled_wd=base.test_decoupled_wd, + skip_tests=list(base.skip_tests), + any_precision=base.any_precision, + custom_iterations=base.custom_iterations, + spec=base.spec, + only_dtypes=base.only_dtypes, + fully_decoupled_reference=base.fully_decoupled_reference, + ) + out.append(base_test) + + optimi_params = inspect.signature(base.optimi_class.__init__).parameters + + # L2 weight decay if optimizer supports decouple_wd arg + if "decouple_wd" in optimi_params: + out.append( + replace( + base_test, + name=f"{base.optimizer_name}_l2_wd", + optimi_params=base.optimi_params.with_(weight_decay=0.01, decouple_wd=False), + reference_params=(base.reference_params or base.optimi_params).with_(weight_decay=0.01, decouple_wd=False), + ) + ) + + # Decoupled weight decay + if base.test_decoupled_wd and "decouple_lr" in optimi_params: + out.append( + replace( + base_test, + name=f"{base.optimizer_name}_decoupled_wd", + optimi_params=base.optimi_params.with_(weight_decay=0.01, decouple_wd=True), + reference_params=(base.reference_params or base.optimi_params).with_(weight_decay=0.01, decouple_wd=True), + ) + ) + + # Decoupled LR (optionally swap reference class) + ref_cls = base.fully_decoupled_reference or base.reference_class + out.append( + replace( + base_test, + name=f"{base.optimizer_name}_decoupled_lr", + optimi_params=base.optimi_params.with_(weight_decay=1e-5, decouple_lr=True), + reference_class=ref_cls, + reference_params=(base.reference_params or base.optimi_params).with_( + weight_decay=1e-5 if base.fully_decoupled_reference else 0.01, + decouple_lr=True, + ), + ) + ) + return out + + +def discover_tests(root: Path | None = None) -> list[OptTest]: + """ + Discover `opt_*.py` modules in this package. Accept exactly: + - TESTS: list[OptTest] + - BASE: OptTest -> expanded via default_variants(BASE) + """ + if root is None: + root = Path(__file__).parent + cases: list[OptTest] = [] + for f in root.glob("opt_*.py"): + mod = importlib.import_module(f".{f.stem}", package=__package__) + if hasattr(mod, "TESTS"): + cases.extend(getattr(mod, "TESTS")) + elif hasattr(mod, "BASE"): + base = getattr(mod, "BASE") + cases.extend(default_variants(base)) + return cases + + +def optimizer_names() -> list[str]: + return sorted({c.optimizer_name for c in discover_tests()}) diff --git a/tests/conftest.py b/tests/conftest.py index 07c912e..f2a0008 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,18 +1,75 @@ +"""Pytest configuration and fixtures for the unified optimizer test framework. + +This module provides pytest configuration, custom mark registration, and the +`gpu_device` fixture used by tests. 
+""" + +import pytest import torch -try: - import triton -except ImportError: - triton = None +from .config import optimizer_names -def pytest_report_header(config): - if triton is None: - return f"libaries: PyTorch {torch.__version__}" - else: - return f"libaries: PyTorch {torch.__version__}, Triton: {triton.__version__}" +def pytest_configure(config): + "Configure pytest with custom marks for optimizer testing." + # Register device marks + config.addinivalue_line("markers", "cpu: mark test to run on CPU") + config.addinivalue_line("markers", "gpu: mark test to run on GPU") + + # Register dtype marks + config.addinivalue_line("markers", "float32: mark test to run with float32 dtype") + config.addinivalue_line("markers", "bfloat16: mark test to run with bfloat16 dtype") + + # Register backend marks + config.addinivalue_line("markers", "torch: mark test to run with torch backend") + config.addinivalue_line("markers", "foreach: mark test to run with foreach backend") + config.addinivalue_line("markers", "triton: mark test to run with triton backend") + + # Per-optimizer marks (e.g., -m adam, -m sgd) + for opt_name in optimizer_names(): + config.addinivalue_line("markers", f"{opt_name}: mark test for {opt_name} optimizer") def pytest_addoption(parser): - """Add command-line option to specify a single GPU""" + "Add command-line option to specify a single GPU." parser.addoption("--gpu-id", action="store", type=int, default=None, help="Specify a single GPU to use (e.g. --gpu-id=0)") + + +@pytest.fixture() +def gpu_device(worker_id, request): + """Map xdist workers to available GPU devices in a round-robin fashion, + supporting CUDA (NVIDIA/ROCm) and XPU (Intel) backends. + Use a single specified GPU if --gpu-id is provided""" + + # Check if specific GPU was requested + specific_gpu = request.config.getoption("--gpu-id") + + # Determine available GPU backend and device count + if torch.cuda.is_available(): + backend = "cuda" + device_count = torch.cuda.device_count() + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + backend = "xpu" + device_count = torch.xpu.device_count() + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + backend = "mps" + device_count = 0 + else: + pytest.skip("No GPU backend available") + + if specific_gpu is not None: + return torch.device(f"{backend}:{specific_gpu}") + + if worker_id == "master": + return torch.device(backend) + + # If no devices available, return default backend + if device_count == 0: + return torch.device(backend) + + # Extract worker number from worker_id (e.g., 'gw6' -> 6) + worker_num = int(worker_id.replace("gw", "")) + + # Map worker to GPU index using modulo to round-robin + gpu_idx = (worker_num - 1) % device_count + return torch.device(f"{backend}:{gpu_idx}") diff --git a/tests/lion_test.py b/tests/lion_test.py deleted file mode 100644 index f35e783..0000000 --- a/tests/lion_test.py +++ /dev/null @@ -1,68 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - - - -optimizers = {} - -optimizers["lion"] = ({'optim':reference.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=0)}, - {'optim':optimi.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=0)}) - -optimizers["lion_wd"] = 
({'optim':reference.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=0.1)}, - {'optim':optimi.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=0.1)}) - -optimizers["lion_dlr"] = ({'optim':reference.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=0.1)}, - {'optim':optimi.Lion, 'kwargs':dict(lr=1e-4, betas=(0.9, 0.99), weight_decay=1e-5, decouple_lr=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.lion -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.lion -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.lion -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.lion -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) \ No newline at end of file diff --git a/tests/opt_adam.py b/tests/opt_adam.py new file mode 100644 index 0000000..e662184 --- /dev/null +++ b/tests/opt_adam.py @@ -0,0 +1,26 @@ +"""Adam optimizer definitions using the new OptTest/variants flow.""" + +from dataclasses import dataclass + +import optimi +import torch + +from .config import BaseParams, OptTest + + +@dataclass +class AdamParams(BaseParams): + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-6 + + +# Provide BASE so the framework generates base/l2/decoupled variants as applicable. +# Disable decoupled WD/LR generation as this is tested in AdamW tests. 
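+# With test_decoupled_wd=False, default_variants(BASE) skips the decoupled
+# weight-decay variant and is expected to yield roughly "adam_base",
+# "adam_l2_wd" (optimi.Adam accepts decouple_wd), and "adam_decoupled_lr".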
+BASE = OptTest( + name="adam", + optimi_class=optimi.Adam, + optimi_params=AdamParams(), + reference_class=torch.optim.Adam, + reference_params=AdamParams(), + test_decoupled_wd=False, +) diff --git a/tests/opt_adamw.py b/tests/opt_adamw.py new file mode 100644 index 0000000..9ea41d3 --- /dev/null +++ b/tests/opt_adamw.py @@ -0,0 +1,26 @@ +"""AdamW optimizer definitions using the new OptTest/variants flow.""" + +from dataclasses import dataclass + +import optimi +import torch +from tests import reference + +from .config import BaseParams, OptTest + + +@dataclass +class AdamWParams(BaseParams): + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-6 + + +# Provide BASE with fully_decoupled_reference so decoupled_lr uses DecoupledAdamW +BASE = OptTest( + name="adamw", + optimi_class=optimi.AdamW, + optimi_params=AdamWParams(), + reference_class=torch.optim.AdamW, + reference_params=AdamWParams(), + fully_decoupled_reference=reference.DecoupledAdamW, +) diff --git a/tests/opt_adan.py b/tests/opt_adan.py new file mode 100644 index 0000000..524d5c7 --- /dev/null +++ b/tests/opt_adan.py @@ -0,0 +1,61 @@ +"""Adan optimizer tests using the new OptTest format (manual list).""" + +from dataclasses import dataclass +from typing import Any + +import optimi +import torch +from tests import reference + +from .config import BaseParams, DeviceType, OptTest, OptTestType + + +@dataclass +class AdanParams(BaseParams): + betas: tuple[float, float, float] = (0.98, 0.92, 0.99) + eps: float = 1e-6 + weight_decouple: bool = False # For adam_wd variant (maps to no_prox in reference) + adam_wd: bool = False # For optimi optimizer + + def to_reference_kwargs(self, reference_class: type) -> dict[str, Any]: + kwargs = super().to_reference_kwargs(reference_class) + if "weight_decouple" in kwargs: + kwargs["no_prox"] = kwargs.pop("weight_decouple") + kwargs.pop("adam_wd", None) + return kwargs + + +TESTS = [ + OptTest( + name="adan_base", + optimi_class=optimi.Adan, + optimi_params=AdanParams(), + reference_class=reference.Adan, + reference_params=AdanParams(), + custom_iterations={(OptTestType.normal, DeviceType.gpu, torch.bfloat16): 20}, + ), + OptTest( + name="adan_weight_decay", + optimi_class=optimi.Adan, + optimi_params=AdanParams(weight_decay=2e-2), + reference_class=reference.Adan, + reference_params=AdanParams(weight_decay=2e-2), + custom_iterations={(OptTestType.normal, DeviceType.gpu, torch.bfloat16): 20}, + ), + OptTest( + name="adan_adam_wd", + optimi_class=optimi.Adan, + optimi_params=AdanParams(weight_decay=2e-2, adam_wd=True), + reference_class=reference.Adan, + reference_params=AdanParams(weight_decay=2e-2, weight_decouple=True), + custom_iterations={(OptTestType.normal, DeviceType.gpu, torch.bfloat16): 20}, + ), + OptTest( + name="adan_decoupled_lr", + optimi_class=optimi.Adan, + optimi_params=AdanParams(weight_decay=2e-5, decouple_lr=True), + reference_class=reference.Adan, + reference_params=AdanParams(weight_decay=2e-2), + custom_iterations={(OptTestType.normal, DeviceType.gpu, torch.bfloat16): 20}, + ), +] diff --git a/tests/opt_anyadam.py b/tests/opt_anyadam.py new file mode 100644 index 0000000..17d87ab --- /dev/null +++ b/tests/opt_anyadam.py @@ -0,0 +1,73 @@ +"""AnyAdam optimizer tests using the new OptTest format (manual list).""" + +from dataclasses import dataclass + +import optimi +import torch +from tests.reference import AnyPrecisionAdamW + +from .config import BaseParams, OptTest, OptTestType, Tolerance, with_updated_spec + + +@dataclass +class AnyAdamParams(BaseParams): + betas: 
tuple[float, float] = (0.9, 0.999) + eps: float = 1e-6 + kahan_sum: bool = False + use_kahan_summation: bool = False + + def to_reference_kwargs(self, reference_class: type) -> dict: + kwargs = super().to_reference_kwargs(reference_class) + if "kahan_sum" in kwargs: + kwargs["use_kahan_summation"] = kwargs.pop("kahan_sum") + if reference_class.__name__ == "AnyPrecisionAdamW": + kwargs.setdefault("momentum_dtype", torch.bfloat16) + kwargs.setdefault("variance_dtype", torch.bfloat16) + kwargs.setdefault("compensation_buffer_dtype", torch.bfloat16) + return kwargs + + +TESTS = [ + OptTest( + name="anyadam_kahan", + optimi_class=optimi.Adam, + optimi_params=AnyAdamParams(betas=(0.9, 0.99), kahan_sum=True), + reference_class=AnyPrecisionAdamW, + reference_params=AnyAdamParams(betas=(0.9, 0.99), use_kahan_summation=True), + only_dtypes=[torch.bfloat16], + any_precision=True, + spec=with_updated_spec( + spec=None, + test_type=OptTestType.normal, + tolerances_override={torch.bfloat16: Tolerance(rtol=2e-2, atol=2e-3, max_error_rate=0.01)}, + ), + ), + OptTest( + name="anyadam_kahan_wd", + optimi_class=optimi.AdamW, + optimi_params=AnyAdamParams(betas=(0.9, 0.99), weight_decay=0.01, kahan_sum=True), + reference_class=AnyPrecisionAdamW, + reference_params=AnyAdamParams(betas=(0.9, 0.99), weight_decay=0.01, use_kahan_summation=True), + only_dtypes=[torch.bfloat16], + any_precision=True, + spec=with_updated_spec( + spec=None, + test_type=OptTestType.normal, + tolerances_override={torch.bfloat16: Tolerance(rtol=5e-2, atol=1e-2, max_error_rate=0.01)}, + ), + ), + OptTest( + name="anyadam_kahan_decoupled_lr", + optimi_class=optimi.AdamW, + optimi_params=AnyAdamParams(betas=(0.9, 0.99), weight_decay=1e-5, decouple_lr=True, kahan_sum=True), + reference_class=AnyPrecisionAdamW, + reference_params=AnyAdamParams(betas=(0.9, 0.99), weight_decay=1e-2, use_kahan_summation=True), + only_dtypes=[torch.bfloat16], + any_precision=True, + spec=with_updated_spec( + spec=None, + test_type=OptTestType.normal, + tolerances_override={torch.bfloat16: Tolerance(rtol=2e-2, atol=2e-3, max_error_rate=0.01)}, + ), + ), +] diff --git a/tests/opt_lion.py b/tests/opt_lion.py new file mode 100644 index 0000000..93cf97a --- /dev/null +++ b/tests/opt_lion.py @@ -0,0 +1,39 @@ +"""Lion optimizer tests in new OptTest format (manual list to match prior values).""" + +from dataclasses import dataclass + +import optimi +from tests.reference import lion as reference_lion + +from .config import BaseParams, OptTest + + +@dataclass +class LionParams(BaseParams): + lr: float = 1e-4 + betas: tuple[float, float] = (0.9, 0.99) + + +TESTS = [ + OptTest( + name="lion_base", + optimi_class=optimi.Lion, + optimi_params=LionParams(), + reference_class=reference_lion.Lion, + reference_params=LionParams(), + ), + OptTest( + name="lion_decoupled_wd", + optimi_class=optimi.Lion, + optimi_params=LionParams(weight_decay=0.1, decouple_wd=True), + reference_class=reference_lion.Lion, + reference_params=LionParams(weight_decay=0.1), + ), + OptTest( + name="lion_decoupled_lr", + optimi_class=optimi.Lion, + optimi_params=LionParams(weight_decay=1e-5, decouple_lr=True), + reference_class=reference_lion.Lion, + reference_params=LionParams(weight_decay=0.1), + ), +] diff --git a/tests/opt_radam.py b/tests/opt_radam.py new file mode 100644 index 0000000..08aadf8 --- /dev/null +++ b/tests/opt_radam.py @@ -0,0 +1,31 @@ +"""RAdam optimizer definitions using the new OptTest/variants flow.""" + +import inspect +from dataclasses import dataclass, field + +import optimi 
+import torch + +from .config import BaseParams, OptTest, OptTestType, Tolerance, with_updated_spec + + +@dataclass +class RAdamParams(BaseParams): + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-8 + decoupled_weight_decay: bool = field(default=False) + + def __post_init__(self): + if self.decouple_wd or self.decouple_lr: + self.decoupled_weight_decay = True + + +BASE = OptTest( + name="radam", + optimi_class=optimi.RAdam, + optimi_params=RAdamParams(), + reference_class=torch.optim.RAdam, + reference_params=RAdamParams(), + spec=with_updated_spec(spec=None, test_type=OptTestType.normal, tolerances_override={torch.float32: Tolerance(max_error_rate=0.001)}), + test_decoupled_wd="decoupled_weight_decay" in inspect.signature(torch.optim.RAdam.__init__).parameters, +) diff --git a/tests/opt_ranger.py b/tests/opt_ranger.py new file mode 100644 index 0000000..771cd86 --- /dev/null +++ b/tests/opt_ranger.py @@ -0,0 +1,29 @@ +"""Ranger optimizer tests using new OptTest format (base only).""" + +from dataclasses import dataclass + +import optimi +from tests import reference + +from .config import BaseParams, OptTest, OptTestType + + +@dataclass +class RangerParams(BaseParams): + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-8 + k: int = 6 # Lookahead steps + alpha: float = 0.5 # Lookahead alpha + + +TESTS = [ + OptTest( + name="ranger_base", + optimi_class=optimi.Ranger, + optimi_params=RangerParams(), + reference_class=reference.Ranger, + reference_params=RangerParams(), + # Match legacy longer gradient-release coverage due to Lookahead cadence. + custom_iterations={OptTestType.gradient_release: 160}, + ) +] diff --git a/tests/opt_sgd.py b/tests/opt_sgd.py new file mode 100644 index 0000000..5d0f055 --- /dev/null +++ b/tests/opt_sgd.py @@ -0,0 +1,66 @@ +"""SGD optimizer definitions using the new OptTest/variants flow (manual list).""" + +from dataclasses import dataclass +from typing import Any + +import optimi +import torch +from tests import reference + +from .config import BaseParams, OptTest, OptTestType + + +@dataclass +class SGDParams(BaseParams): + momentum: float = 0.0 + dampening: bool = False # Optimi uses bool instead of float + torch_init: bool = False + + def to_reference_kwargs(self, reference_class: type) -> dict[str, Any]: + kwargs = super().to_reference_kwargs(reference_class) + # Convert dampening bool to float for reference optimizer + if "dampening" in kwargs and isinstance(kwargs["dampening"], bool): + kwargs["dampening"] = 0.9 if kwargs["dampening"] else 0.0 + return kwargs + + +# Manual list to mirror the original explicit coverage for SGD +TESTS = [ + OptTest( + name="sgd_base", + optimi_class=optimi.SGD, + optimi_params=SGDParams(), + reference_class=torch.optim.SGD, + reference_params=SGDParams(), + skip_tests=[OptTestType.accumulation], + ), + OptTest( + name="sgd_momentum", + optimi_class=optimi.SGD, + optimi_params=SGDParams(momentum=0.9), + reference_class=torch.optim.SGD, + reference_params=SGDParams(momentum=0.9), + ), + OptTest( + name="sgd_dampening", + optimi_class=optimi.SGD, + optimi_params=SGDParams(momentum=0.9, dampening=True, torch_init=True), + reference_class=torch.optim.SGD, + reference_params=SGDParams(momentum=0.9, dampening=0.9), + ), + OptTest( + name="sgd_weight_decay", + optimi_class=optimi.SGD, + optimi_params=SGDParams(momentum=0.9, weight_decay=1e-2), + reference_class=torch.optim.SGD, + reference_params=SGDParams(momentum=0.9, weight_decay=1e-2), + skip_tests=[OptTestType.accumulation], + ), + OptTest( + 
name="sgd_decoupled_lr", + optimi_class=optimi.SGD, + optimi_params=SGDParams(momentum=0.9, dampening=True, decouple_lr=True, weight_decay=1e-5, torch_init=True), + reference_class=reference.DecoupledSGDW, + reference_params=SGDParams(momentum=0.9, dampening=0.9, weight_decay=1e-5), + ), +] diff --git a/tests/opt_stableadamw.py b/tests/opt_stableadamw.py new file mode 100644 index 0000000..69f1507 --- /dev/null +++ b/tests/opt_stableadamw.py @@ -0,0 +1,23 @@ +"""StableAdamW optimizer definitions using new OptTest/variants flow.""" + +from dataclasses import dataclass + +import optimi +from tests import reference + +from .config import BaseParams, OptTest + + +@dataclass +class StableAdamWParams(BaseParams): + betas: tuple[float, float] = (0.9, 0.99) + eps: float = 1e-6 + + +BASE = OptTest( + name="stableadamw", + optimi_class=optimi.StableAdamW, + optimi_params=StableAdamWParams(), + reference_class=reference.StableAdamWUnfused, + reference_params=StableAdamWParams(), +) diff --git a/tests/optimizer_test.py b/tests/optimizer_test.py deleted file mode 100644 index 6eecc4a..0000000 --- a/tests/optimizer_test.py +++ /dev/null @@ -1,400 +0,0 @@ -# Optimizer testing modified from bitsandbytes: https://github.com/TimDettmers/bitsandbytes/blob/main/tests/test_optim.py -# bitsandbytes - MIT License - Copyright (c) Facebook, Inc. and its affiliates. - -import inspect -import io -from typing import Optional - -import pytest -import torch -from torch import Tensor -from optimi.utils import MIN_TORCH_2_6 -from optimi import prepare_for_gradient_release, remove_gradient_release - - -@pytest.fixture() -def gpu_device(worker_id, request): - """Map xdist workers to available GPU devices in a round-robin fashion, - supporting CUDA (NVIDIA/ROCm) and XPU (Intel) backends. 
- Use a single specified GPU if --gpu-id is provided""" - - # Check if specific GPU was requested - specific_gpu = request.config.getoption("--gpu-id") - - # Determine available GPU backend and device count - if torch.cuda.is_available(): - backend = "cuda" - device_count = torch.cuda.device_count() - elif hasattr(torch, 'xpu') and torch.xpu.is_available(): - backend = "xpu" - device_count = torch.xpu.device_count() - else: - # Fallback to cuda for compatibility - backend = "cuda" - device_count = 0 - - if specific_gpu is not None: - return f"{backend}:{specific_gpu}" - - if worker_id == "master": - return backend - - # If no devices available, return default backend - if device_count == 0: - return backend - - # Extract worker number from worker_id (e.g., 'gw6' -> 6) - worker_num = int(worker_id.replace('gw', '')) - - # Map worker to GPU index using modulo to round-robin - gpu_idx = (worker_num - 1) % device_count - return f"{backend}:{gpu_idx}" - - -class MLP(torch.nn.Module): - def __init__(self, input_size, hidden_size, device, dtype): - super().__init__() - self.norm = torch.nn.LayerNorm(input_size, device=device, dtype=dtype) - self.fc1 = torch.nn.Linear(input_size, hidden_size, bias=False, device=device, dtype=dtype) - self.act = torch.nn.Mish() - self.fc2 = torch.nn.Linear(hidden_size, 1, bias=False, device=device, dtype=dtype) - - def forward(self, x): - x = self.norm(x) - x = self.fc1(x) - x = self.act(x) - x = self.fc2(x) - return x - - -def assert_most_approx_close(a: Tensor, b: Tensor, rtol: float = 1e-3, atol: float = 1e-3, max_error_count: int = 0, max_error_rate: float | None = None, name: str = ''): - idx = torch.isclose(a.float(), b.float(), rtol=rtol, atol=atol) - error_count = (idx == 0).sum().item() - if max_error_rate is not None: - if error_count > (a.numel()) * max_error_rate and error_count > max_error_count: - print(f"{name}Too many values not close: assert {error_count} < {(a.numel()) * max_error_rate}") - torch.testing.assert_close(a.float(), b.float(), rtol=rtol, atol=atol) - elif error_count > max_error_count: - print(f"{name}Too many values not close: assert {error_count} < {max_error_count}") - torch.testing.assert_close(a.float(), b.float(), rtol=rtol, atol=atol) - - -def load_optimizer(params, optimizers, optim_name, key, ftype, skip=False) -> torch.optim.Optimizer: - def update_kwargs(key, argspec, value=True): - if key in argspec.kwonlyargs or key in argspec.args: - kwargs.update({key: value}) - elif value and skip: - pytest.skip(f"Skipping {key} for {optim_name}") - - if optim_name in optimizers: - optimizer = optimizers[optim_name][key]['optim'] - kwargs = optimizers[optim_name][key]['kwargs'] - else: - raise ValueError(f"{optim_name} optimizer not defined") - - argspec = inspect.getfullargspec(optimizer) - update_kwargs('fused', argspec, False) - update_kwargs('foreach', argspec, False) - update_kwargs('triton', argspec, False) - if ftype != '': - update_kwargs(ftype, argspec, True) - - return optimizer(params, **kwargs) - - -def run_optimizer(optimizers:dict, dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, - ftype:str, device:torch.device, buffer:io.BytesIO, iterations:Optional[int]=None, - any_precision:bool=False, atol_override:Optional[dict[torch.dtype, float]]=None, - rtol_override:Optional[dict[torch.dtype, float]]=None, - max_error_rate_override:Optional[dict[torch.dtype, float]]=None): - if dim1 == 1 and dim2 == 1: - pytest.skip("Skipping 1x1 optimizer test") - - ftype = ftype.replace('_', '') - - if atol_override is None: - 
atol_override = {} - if rtol_override is None: - rtol_override = {} - if max_error_rate_override is None: - max_error_rate_override = {} - - if iterations is None: - if device == torch.device('cpu'): - iterations = 20 - else: - iterations = 40 - - # allow for a small number of errors on low dimension tests - max_error_count = 2 if device == torch.device('cpu') else 5 - - if dtype == torch.float32: - atol = atol_override.get(torch.float32, 1e-6) - rtol = rtol_override.get(torch.float32, 1e-5) - max_error_rate = max_error_rate_override.get(torch.float32, 0.0005) - elif dtype == torch.bfloat16: - atol = atol_override.get(torch.bfloat16, 1e-3) - rtol = rtol_override.get(torch.bfloat16, 1e-2) - max_error_rate = max_error_rate_override.get(torch.bfloat16, 0.01) - elif dtype == torch.float16: - atol = atol_override.get(torch.float16, 1e-4) - rtol = rtol_override.get(torch.float16, 1e-3) - max_error_rate = max_error_rate_override.get(torch.float16, 0.01) - - # Create MLP models instead of simple parameters - m1 = MLP(dim1, dim2, device=device, dtype=dtype) - m2 = MLP(dim1, dim2, device=device, dtype=dtype) - m2.load_state_dict(m1.state_dict()) - - # Convert model parameters to float for non-any_precision testing - if not any_precision and dtype != torch.float32: - for p in m1.parameters(): - p.data = p.data.float() - - torch_optimizer = load_optimizer(m1.parameters(), optimizers, optim_name, 0, ftype) - optimi_optimizer = load_optimizer(m2.parameters(), optimizers, optim_name, 1, ftype, skip=True) - - bs = 1 if device.type == "cpu" else 32 - - for i in range(iterations): - # Training loop with input/target generation - input1 = torch.randn(bs, dim1, device=device, dtype=dtype) - input2 = input1.detach().clone() - target1 = torch.randn(bs, 1, device=device, dtype=dtype) - target2 = target1.detach().clone() - - # Convert model parameters to float for non-any_precision testing - if not any_precision and dtype != torch.float32: - input1 = input1.float() - target1 = target1.float() - - # Forward pass - output1 = m1(input1) - output2 = m2(input2) - - # Loss calculation - loss1 = torch.nn.functional.mse_loss(output1, target1) - loss2 = torch.nn.functional.mse_loss(output2, target2) - - # Backward pass - loss1.backward() - loss2.backward() - - # Optimizer step - optimi_optimizer.step() - torch_optimizer.step() - - # Zero gradients - optimi_optimizer.zero_grad() - torch_optimizer.zero_grad() - - # Compare model weights - assert_most_approx_close(m1.fc1.weight, m2.fc1.weight, atol=atol, rtol=rtol, - max_error_count=max_error_count, max_error_rate=max_error_rate, - name='fc1: ') - assert_most_approx_close(m1.fc2.weight, m2.fc2.weight, atol=atol, rtol=rtol, - max_error_count=max_error_count, max_error_rate=max_error_rate, - name='fc2: ') - - # # Test state_dict saving and loading periodically - if i % (iterations // 10) == 0 and i > 0: - # Save optimizer state - torch.save(optimi_optimizer.state_dict(), buffer) - buffer.seek(0) - # Load checkpoint - ckpt = torch.load(buffer, weights_only=True) - # Recreate optimizer and load its state - optimi_optimizer = load_optimizer(m2.parameters(), optimizers, optim_name, 1, ftype) - optimi_optimizer.load_state_dict(ckpt) - # Clear buffer - buffer.seek(0) - buffer.truncate(0) - - # Verify models are still aligned after state_dict loading - assert_most_approx_close(m1.fc1.weight, m2.fc1.weight, atol=atol, rtol=rtol, - max_error_count=max_error_count, max_error_rate=max_error_rate, - name='fc1 after load: ') - assert_most_approx_close(m1.fc2.weight, m2.fc2.weight, 
atol=atol, rtol=rtol, - max_error_count=max_error_count, max_error_rate=max_error_rate, - name='fc2 after load: ') - - -def gradient_release(optimizers:dict, dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, - ftype:str, device:torch.device, iterations:int=40, framework_opt_step:bool=False, - atol_override:Optional[dict[torch.dtype, float]]=None, - rtol_override:Optional[dict[torch.dtype, float]]=None, - max_error_rate_override:Optional[dict[torch.dtype, float]]=None): - def optimizer_hook(parameter) -> None: - torch_optimizers[parameter].step() - torch_optimizers[parameter].zero_grad() - - # Since Lion & Adan can have noisy updates, allow up to 12 errors - max_error_count = 12 - - if atol_override is None: - atol_override = {} - if rtol_override is None: - rtol_override = {} - if max_error_rate_override is None: - max_error_rate_override = {} - - if dtype == torch.float32: - atol = atol_override.get(torch.float32, 2e-6) - rtol = rtol_override.get(torch.float32, 1e-5) - elif dtype == torch.bfloat16: - atol = atol_override.get(torch.bfloat16, 2e-3) - rtol = rtol_override.get(torch.bfloat16, 1e-2) - elif dtype == torch.float16: - atol = atol_override.get(torch.float16, 2e-4) - rtol = rtol_override.get(torch.float16, 1e-3) - - m1 = MLP(dim1, dim2, device=device, dtype=dtype) - m2 = MLP(dim1, dim2, device=device, dtype=dtype) - m3 = MLP(dim1, dim2, device=device, dtype=dtype) - m2.load_state_dict(m1.state_dict()) - m3.load_state_dict(m1.state_dict()) - - regular_optimizer = load_optimizer(m1.parameters(), optimizers, optim_name, 0, ftype) - - - # PyTorch Method: taken from https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html - torch_optimizers = {p: load_optimizer([p], optimizers, optim_name, 0, ftype) for p in m2.parameters()} - - pytorch_hooks = [] - for p in m2.parameters(): - pytorch_hooks.append(p.register_post_accumulate_grad_hook(optimizer_hook)) - - - # Optimim Method - # add the gradient release flag to the optimizer kwargs - optimizers[optim_name][1]['kwargs']['gradient_release'] = True - optimi_optimizer = load_optimizer(m3.parameters(), optimizers, optim_name, 1, ftype) - - prepare_for_gradient_release(m3, optimi_optimizer) - bs = 1 if device.type == "cpu" else 32 - - - # Training loop - for i in range(iterations): - input1 = torch.randn(bs, dim1, device=device, dtype=dtype) - input2 = input1.clone() - input3 = input1.clone() - target1 = torch.randn(bs, 1, device=device, dtype=dtype) - target2 = target1.clone() - target3 = target1.clone() - - output1 = m1(input1) - output2 = m2(input2) - output3 = m3(input3) - - loss1 = torch.nn.functional.mse_loss(output1, target1) - loss2 = torch.nn.functional.mse_loss(output2, target2) - loss3 = torch.nn.functional.mse_loss(output3, target3) - - loss1.backward() - loss2.backward() - loss3.backward() - - regular_optimizer.step() - regular_optimizer.zero_grad() - - # simulates using an optimi gradient release optimizer in a framework - # where the optimizer step and zero_grad cannot be disabled. 
- if framework_opt_step: - optimi_optimizer.step() - optimi_optimizer.zero_grad() - - assert_most_approx_close(m1.fc1.weight, m2.fc1.weight, rtol=rtol, atol=atol, - max_error_count=max_error_count, name='PyTorch-PyTorch: ') - assert_most_approx_close(m1.fc2.weight, m2.fc2.weight, rtol=rtol, atol=atol, - max_error_count=max_error_count, name='PyTorch-PyTorch: ') - assert_most_approx_close(m1.fc1.weight, m3.fc1.weight, rtol=rtol, atol=atol, - max_error_count=max_error_count, name='PyTorch-Optimi: ') - assert_most_approx_close(m1.fc2.weight, m3.fc2.weight, rtol=rtol, atol=atol, - max_error_count=max_error_count, name='PyTorch-Optimi: ') - - for h in pytorch_hooks: - h.remove() - remove_gradient_release(m3) - - -def optimizer_accumulation(optimizers:dict, dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, - ftype:str, device:torch.device, iterations:int=40, framework_opt_step:bool=False, - atol_override:Optional[dict[torch.dtype, float]]=None, - rtol_override:Optional[dict[torch.dtype, float]]=None, - max_error_rate_override:Optional[dict[torch.dtype, float]]=None): - # Since optimizer accumulation approximates gradient accumulation, the tolerances - # compared to normal optimizers are high despite the low number of iterations - max_error_rate = 0.035 - atol, rtol = 1e-2, 1e-2 - - m1 = MLP(dim1, dim2, device=device, dtype=dtype) - m2 = MLP(dim1, dim2, device=device, dtype=dtype) - m2.load_state_dict(m1.state_dict()) - - regular_optimizer = load_optimizer(m1.parameters(), optimizers, optim_name, 0, ftype) - - - # Optimim Method - # add the gradient release flag to the optimizer kwargs - optimizers[optim_name][1]['kwargs']['gradient_release'] = True - optimi_optimizer = load_optimizer(m2.parameters(), optimizers, optim_name, 1, ftype) - - prepare_for_gradient_release(m2, optimi_optimizer) - - gradient_accumulation_steps = 4 - bs = 1 if device.type == "cpu" else 32 - - # Training loop - for i in range(iterations): - input1 = torch.randn(bs, dim1, device=device, dtype=dtype) - input2 = input1.clone() - target1 = torch.randn(bs, 1, device=device, dtype=dtype) - target2 = target1.clone() - - optimi_optimizer.optimizer_accumulation = (i+1) % gradient_accumulation_steps != 0 - - output1 = m1(input1) - output2 = m2(input2) - - loss1 = torch.nn.functional.mse_loss(output1, target1) - loss2 = torch.nn.functional.mse_loss(output2, target2) - - loss1.backward() - loss2.backward() - - if not optimi_optimizer.optimizer_accumulation: - regular_optimizer.step() - regular_optimizer.zero_grad() - - # simulates using an optimi gradient release optimizer in a framework - # where the optimizer step and zero_grad cannot be disabled. 
- if framework_opt_step: - optimi_optimizer.step() - optimi_optimizer.zero_grad() - - # unlike other tests, compare that the weights are in the same approximate range at the end of training - assert_most_approx_close(m1.fc1.weight, m2.fc1.weight, rtol=rtol, atol=atol, max_error_rate=max_error_rate) - assert_most_approx_close(m1.fc2.weight, m2.fc2.weight, rtol=rtol, atol=atol, max_error_rate=max_error_rate) - - remove_gradient_release(m2) - - -buffer = io.BytesIO() - - -cpu_dim1 = [64] -cpu_dim2 = [64, 128] -cpu_dtype = [torch.float32] -cpu_ftype = ['', '_foreach'] - - -gpu_dim1 = [256] -gpu_dim2 = [256, 512, 1024, 2048] -gpu_dtype = [torch.float32, torch.bfloat16] -gpu_ftype = ['', '_foreach'] + (['_triton'] if MIN_TORCH_2_6 else []) - -gr_dim1 = [128] -gr_dim2 = [256, 1024] -gr_dtype = [torch.float32] -gr_ftype = [''] + (['_triton'] if MIN_TORCH_2_6 else []) \ No newline at end of file diff --git a/tests/radam_test.py b/tests/radam_test.py deleted file mode 100644 index 6012d85..0000000 --- a/tests/radam_test.py +++ /dev/null @@ -1,75 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from packaging.version import parse - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - -# PyTorch's RAdam adds epsilon before debiasing V while Optimi debases before. -# RAdam tests with a smaller epsilon then other optimizers to prevent numerical divergances. - -optimizers = {} - -optimizers["radam"] = ({'optim':torch.optim.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)}, - {'optim':optimi.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)}) - -optimizers["radam_l2"] = ({'optim':torch.optim.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2)}, - {'optim':optimi.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2, decouple_wd=False)}) - -if parse(torch.__version__) >= parse("2.2"): - optimizers["radamw"] = ({'optim':torch.optim.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2, decoupled_weight_decay=True)}, - {'optim':optimi.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2, decouple_wd=True)}) - - optimizers["radam_dlr"] = ({'optim':torch.optim.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2, decoupled_weight_decay=True)}, - {'optim':optimi.RAdam, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-5, decouple_lr=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.radam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.radam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) 
-def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer, max_error_rate_override={torch.float32: 0.001}) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.radam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5, max_error_rate_override={torch.float32: 0.001}) - - -@pytest.mark.gpu -@pytest.mark.radam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - if optim_name in ["radam_l2"]: - pytest.skip("Skip tests for RAdam with L2 weight decay.") - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5, max_error_rate_override={torch.float32: 0.001}) diff --git a/tests/ranger_test.py b/tests/ranger_test.py deleted file mode 100644 index c38b3fb..0000000 --- a/tests/ranger_test.py +++ /dev/null @@ -1,67 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - -# The reference Ranger adds epsilon before debiasing V while Optimi debases before. -# Ranger tests with a smaller epsilon then other optimizers to prevent numerical divergances. 
- -optimizers = {} - -optimizers["ranger"] = ({'optim':reference.Ranger, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)}, - {'optim':optimi.Ranger, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)}) - -# reference doesn't perform the normal weight decay step, so no wd tests - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.ranger -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.ranger -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - # test ranger longer due to the lookahead step - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.ranger -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - # test ranger longer due to the lookahead step - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - iterations=160, framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.ranger -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) diff --git a/tests/runner.py b/tests/runner.py new file mode 100644 index 0000000..f0cecba --- /dev/null +++ b/tests/runner.py @@ -0,0 +1,332 @@ +from __future__ import annotations + +import io +import random + +import torch +from optimi import prepare_for_gradient_release, remove_gradient_release +from torch import Tensor + +from .config import Backend, DeviceType, OptTest, OptTestType + + +def _device_type(device: torch.device) -> DeviceType: + return DeviceType.cpu if device.type == "cpu" else DeviceType.gpu + + +def _get_iterations( + opttest: OptTest, + test_type: OptTestType, + default: int, + device: torch.device | None = None, + dtype: torch.dtype | None = None, +) -> int: + if not opttest.custom_iterations: + return default + if device is not None: + key = (test_type, _device_type(device)) + if dtype is not None: + dtype_key = (test_type, _device_type(device), dtype) + if dtype_key in opttest.custom_iterations: + return opttest.custom_iterations[dtype_key] + if key in opttest.custom_iterations: + return opttest.custom_iterations[key] + 
    return opttest.custom_iterations.get(test_type, default)
+
+
+def assert_most_approx_close(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    rtol: float = 1e-3,
+    atol: float = 1e-3,
+    max_error_count: int = 0,
+    max_error_rate: float | None = None,
+    name: str = "",
+) -> None:
+    """Assert that most values in two tensors are approximately close."""
+    idx = torch.isclose(a.float(), b.float(), rtol=rtol, atol=atol)
+    error_count = (idx == 0).sum().item()
+
+    if max_error_rate is not None:
+        if error_count > (a.numel()) * max_error_rate and error_count > max_error_count:
+            msg = f"{name}Too many values not close: assert {error_count} < {(a.numel()) * max_error_rate}"
+            torch.testing.assert_close(a.float(), b.float(), rtol=rtol, atol=atol, msg=msg)
+    elif error_count > max_error_count:
+        msg = f"{name}Too many values not close: assert {error_count} < {max_error_count}"
+        torch.testing.assert_close(a.float(), b.float(), rtol=rtol, atol=atol, msg=msg)
+
+
+class MLP(torch.nn.Module):
+    def __init__(self, input_size: int, hidden_size: int, device: torch.device, dtype: torch.dtype):
+        super().__init__()
+        self.norm = torch.nn.LayerNorm(input_size, device=device, dtype=dtype)
+        self.fc1 = torch.nn.Linear(input_size, hidden_size, bias=False, device=device, dtype=dtype)
+        self.act = torch.nn.Mish()
+        self.fc2 = torch.nn.Linear(hidden_size, 1, bias=False, device=device, dtype=dtype)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.norm(x)
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+
+
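+# run_test builds identical MLP copies, trains one with the reference optimizer and one with the
+# optimi optimizer, and compares the resulting weights with assert_most_approx_close. Normal tests
+# also round-trip the optimi state_dict through an in-memory buffer partway through training to
+# exercise checkpoint save and load. Gradient release tests add a third model stepped through
+# optimi's gradient release hooks, and accumulation tests toggle optimizer_accumulation on the
+# optimi optimizer between steps.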
+def run_test(
+    opttest: OptTest,
+    device: torch.device,
+    dtype: torch.dtype,
+    backend: Backend,
+    test_type: OptTestType,
+    dims: tuple[int, int] | None = None,
+) -> None:
+    if test_type == OptTestType.normal:
+        normal_spec = opttest.spec.normal
+        normal_iters = normal_spec.iterations_cpu if device.type == "cpu" else normal_spec.iterations_gpu
+        iterations = _get_iterations(opttest, test_type, normal_iters, device=device, dtype=dtype)
+        tolerance = normal_spec.tolerance[dtype]
+
+        if dims is None:
+            dim1, dim2 = (64, 128) if device.type == "cpu" else (256, 512)
+        else:
+            dim1, dim2 = dims
+
+        batch_size = normal_spec.batch_cpu if device.type == "cpu" else normal_spec.batch_gpu
+        max_error_count = normal_spec.max_error_cpu if device.type == "cpu" else normal_spec.max_error_gpu
+        max_error_rate = tolerance.max_error_rate
+
+    elif test_type == OptTestType.gradient_release:
+        gradient_spec = opttest.spec.gradient_release
+        iterations = _get_iterations(opttest, test_type, gradient_spec.iterations, device=device, dtype=dtype)
+        tolerance = gradient_spec.tolerance[dtype]
+
+        dim1, dim2 = dims if dims is not None else (128, 256)
+        batch_size = gradient_spec.batch
+        max_error_count = gradient_spec.max_error_count
+        max_error_rate = tolerance.max_error_rate
+
+    elif test_type == OptTestType.accumulation:
+        accumulation_spec = opttest.spec.accumulation
+        iterations = _get_iterations(opttest, test_type, accumulation_spec.iterations, device=device, dtype=dtype)
+        tolerance = accumulation_spec.tolerance[dtype]
+        dim1, dim2 = dims if dims is not None else (128, 256)
+        batch_size = accumulation_spec.batch
+        max_error_count = 0
+        max_error_rate = accumulation_spec.max_error_rate
+    else:
+        raise ValueError(f"Unknown test type: {test_type}")
+
+    m1 = MLP(dim1, dim2, device=device, dtype=dtype)
+    m2 = MLP(dim1, dim2, device=device, dtype=dtype)
+    m2.load_state_dict(m1.state_dict())
+
+    if test_type == OptTestType.gradient_release:
+        m3 = MLP(dim1, dim2, device=device, dtype=dtype)
+        m3.load_state_dict(m1.state_dict())
+    else:
+        m3 = None
+
+    if test_type == OptTestType.normal and not opttest.any_precision and dtype != torch.float32:
+        for p in m1.parameters():
+            p.data = p.data.float()
+
+    reference_kwargs = opttest.to_reference_kwargs(backend)
+    optimi_kwargs = opttest.to_optimi_kwargs(backend)
+    reference_class = opttest.reference_class
+
+    reference_optimizer = None
+    optimi_optimizer = None
+    torch_optimizers: dict[torch.nn.Parameter, torch.optim.Optimizer] | None = None
+    pytorch_hooks: list[torch.utils.hooks.RemovableHandle] = []
+
+    if test_type == OptTestType.normal:
+        reference_optimizer = reference_class(m1.parameters(), **reference_kwargs)
+        optimi_optimizer = opttest.optimi_class(m2.parameters(), **optimi_kwargs)
+        buffer = io.BytesIO()
+    elif test_type == OptTestType.gradient_release:
+        reference_optimizer = reference_class(m1.parameters(), **reference_kwargs)
+
+        def optimizer_hook(parameter) -> None:
+            assert torch_optimizers is not None
+            torch_optimizers[parameter].step()
+            torch_optimizers[parameter].zero_grad()
+
+        torch_optimizers = {p: reference_class([p], **reference_kwargs) for p in m2.parameters()}
+        for p in m2.parameters():
+            pytorch_hooks.append(p.register_post_accumulate_grad_hook(optimizer_hook))
+
+        optimi_kwargs["gradient_release"] = True
+        optimi_optimizer = opttest.optimi_class(m3.parameters(), **optimi_kwargs)
+        prepare_for_gradient_release(m3, optimi_optimizer)
+    else:
+        reference_optimizer = reference_class(m1.parameters(), **reference_kwargs)
+        optimi_kwargs["gradient_release"] = True
+        optimi_optimizer = opttest.optimi_class(m2.parameters(), **optimi_kwargs)
+        prepare_for_gradient_release(m2, optimi_optimizer)
+        gradient_accumulation_steps = accumulation_spec.gradient_accumulation_steps
+
+    for i in range(iterations):
+        input1 = torch.randn(batch_size, dim1, device=device, dtype=dtype)
+        if test_type == OptTestType.normal:
+            input2 = input1.detach().clone()
+        else:
+            input2 = input1.clone()
+        target1 = torch.randn(batch_size, 1, device=device, dtype=dtype)
+        if test_type == OptTestType.normal:
+            target2 = target1.detach().clone()
+        else:
+            target2 = target1.clone()
+
+        if test_type == OptTestType.gradient_release:
+            input3 = input1.clone()
+            target3 = target1.clone()
+        else:
+            input3 = None
+            target3 = None
+
+        if test_type == OptTestType.normal and not opttest.any_precision and dtype != torch.float32:
+            input1 = input1.float()
+            target1 = target1.float()
+
+        if test_type == OptTestType.accumulation:
+            optimi_optimizer.optimizer_accumulation = (i + 1) % gradient_accumulation_steps != 0
+
+        output1 = m1(input1)
+        output2 = m2(input2)
+        output3 = m3(input3) if m3 is not None else None
+
+        loss1 = torch.nn.functional.mse_loss(output1, target1)
+        loss2 = torch.nn.functional.mse_loss(output2, target2)
+        loss3 = torch.nn.functional.mse_loss(output3, target3) if output3 is not None else None
+
+        loss1.backward()
+        loss2.backward()
+        if loss3 is not None:
+            loss3.backward()
+
+        if test_type == OptTestType.normal:
+            reference_optimizer.step()
+            optimi_optimizer.step()
+            reference_optimizer.zero_grad()
+            optimi_optimizer.zero_grad()
+        elif test_type == OptTestType.gradient_release:
+            reference_optimizer.step()
+            reference_optimizer.zero_grad()
+        elif not optimi_optimizer.optimizer_accumulation:
+            reference_optimizer.step()
+            reference_optimizer.zero_grad()
+
+        if test_type in (OptTestType.gradient_release, OptTestType.accumulation):
+            if random.random() < 0.5:
+                optimi_optimizer.step()
+                optimi_optimizer.zero_grad()
+
+        if test_type ==
OptTestType.normal: + assert_most_approx_close( + m1.fc1.weight, + m2.fc1.weight, + atol=tolerance.atol, + rtol=tolerance.rtol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="fc1: ", + ) + assert_most_approx_close( + m1.fc2.weight, + m2.fc2.weight, + atol=tolerance.atol, + rtol=tolerance.rtol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="fc2: ", + ) + + if i % max(1, iterations // 10) == 0 and i > 0: + torch.save(optimi_optimizer.state_dict(), buffer) + buffer.seek(0) + ckpt = torch.load(buffer, weights_only=True) + optimi_optimizer = opttest.optimi_class(m2.parameters(), **optimi_kwargs) + optimi_optimizer.load_state_dict(ckpt) + buffer.seek(0) + buffer.truncate(0) + + assert_most_approx_close( + m1.fc1.weight, + m2.fc1.weight, + atol=tolerance.atol, + rtol=tolerance.rtol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="fc1 after load: ", + ) + assert_most_approx_close( + m1.fc2.weight, + m2.fc2.weight, + atol=tolerance.atol, + rtol=tolerance.rtol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="fc2 after load: ", + ) + elif test_type == OptTestType.gradient_release: + assert_most_approx_close( + m1.fc1.weight, + m2.fc1.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="PyTorch-PyTorch: ", + ) + assert_most_approx_close( + m1.fc2.weight, + m2.fc2.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="PyTorch-PyTorch: ", + ) + assert_most_approx_close( + m1.fc1.weight, + m3.fc1.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="PyTorch-Optimi: ", + ) + assert_most_approx_close( + m1.fc2.weight, + m3.fc2.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + name="PyTorch-Optimi: ", + ) + + if test_type == OptTestType.accumulation: + assert_most_approx_close( + m1.fc1.weight, + m2.fc1.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + ) + assert_most_approx_close( + m1.fc2.weight, + m2.fc2.weight, + rtol=tolerance.rtol, + atol=tolerance.atol, + max_error_count=max_error_count, + max_error_rate=max_error_rate, + ) + + for h in pytorch_hooks: + h.remove() + if test_type == OptTestType.gradient_release: + remove_gradient_release(m3) + elif test_type == OptTestType.accumulation: + remove_gradient_release(m2) diff --git a/tests/sgd_test.py b/tests/sgd_test.py deleted file mode 100644 index 02df73a..0000000 --- a/tests/sgd_test.py +++ /dev/null @@ -1,77 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - - - -optimizers = {} - -optimizers["sgd"] = ({'optim':torch.optim.SGD, 'kwargs':dict(lr=1e-3, momentum=0, dampening=0, weight_decay=0)}, - {'optim':optimi.SGD, 'kwargs':dict(lr=1e-3, momentum=0, dampening=False, weight_decay=0)}) - -optimizers["sgd_mom"] = ({'optim':torch.optim.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=0, weight_decay=0)}, - {'optim':optimi.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, 
dampening=False, weight_decay=0)}) - -optimizers["sgd_damp"] = ({'optim':torch.optim.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=0.9, weight_decay=0)}, - {'optim':optimi.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=True, weight_decay=0, torch_init=True)}) - -optimizers["sgd_l2"] = ({'optim':torch.optim.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=0, weight_decay=1e-2)}, - {'optim':optimi.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=False, weight_decay=1e-2, decouple_wd=False)}) - -optimizers["sgdw_dlr"] = ({'optim':reference.DecoupledSGDW, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=0.9, weight_decay=1e-5)}, - {'optim':optimi.SGD, 'kwargs':dict(lr=1e-3, momentum=0.9, dampening=True, decouple_lr=True, weight_decay=1e-5, torch_init=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.sgd -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.sgd -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.sgd -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.sgd -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - if optim_name in ["sgd", "sgd_l2"]: - pytest.skip("Skip tests for SGD and SGD with L2 weight decay.") - # SGD will error out more often if iterations is the default of 80 - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - iterations=20, framework_opt_step=torch.rand(1).item() > 0.5) diff --git a/tests/stableadam_test.py b/tests/stableadam_test.py deleted file mode 100644 index c5ea260..0000000 --- a/tests/stableadam_test.py +++ /dev/null @@ -1,68 +0,0 @@ -from itertools import product - -import pytest -import torch - -import optimi -from tests import reference - -from tests.optimizer_test import (buffer, run_optimizer, gradient_release, cpu_dim1, cpu_dim2, cpu_dtype, - cpu_ftype, gpu_dim1, gpu_dim2, gpu_dtype, gpu_ftype, gr_dim1, - gr_dim2, gr_dtype, gr_ftype, optimizer_accumulation, gpu_device) - - - 
-optimizers = {} - -optimizers["stableadam"] = ({'optim':reference.StableAdamWUnfused, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=0, eps=1e-6)}, - {'optim':optimi.StableAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=0, eps=1e-6)}) - -optimizers["stableadam_wd"] = ({'optim':reference.StableAdamWUnfused, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=1e-2, eps=1e-6)}, - {'optim':optimi.StableAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=1e-2, eps=1e-6)}) - -optimizers["stableadam_dlr"] = ({'optim':reference.StableAdamWUnfused, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=1e-2, eps=1e-6)}, - {'optim':optimi.StableAdamW, 'kwargs':dict(lr=1e-3, betas=(0.9, 0.99), weight_decay=1e-5, eps=1e-6, decouple_lr=True)}) - -optimizer_names = [key for key in optimizers.keys()] - - - -cpu_values = list(product(cpu_dim1, cpu_dim2, cpu_dtype, optimizer_names, cpu_ftype)) -cpu_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cpu_values] - -@pytest.mark.cpu -@pytest.mark.stableadam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cpu_values, ids=cpu_names) -def test_optimizer_cpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device('cpu'), buffer) - - - -cuda_values = list(product(gpu_dim1, gpu_dim2, gpu_dtype, optimizer_names, gpu_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.stableadam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_gpu(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - run_optimizer(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), buffer, iterations=80) - - - -cuda_values = list(product(gr_dim1, gr_dim2, gr_dtype, optimizer_names, gr_ftype)) -cuda_names = ["dim1_{}_dim2_{}_dtype_{}_optim_{}{}".format(*vals) for vals in cuda_values] - -@pytest.mark.gpu -@pytest.mark.stableadam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_gradient_release(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - gradient_release(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) - - -@pytest.mark.gpu -@pytest.mark.stableadam -@pytest.mark.parametrize("dim1, dim2, dtype, optim_name, ftype", cuda_values, ids=cuda_names) -def test_optimizer_accumulation(dim1:int, dim2:int, dtype:torch.dtype, optim_name:str, ftype:str, gpu_device:str): - optimizer_accumulation(optimizers, dim1, dim2, dtype, optim_name, ftype, torch.device(gpu_device), - framework_opt_step=torch.rand(1).item() > 0.5) diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py new file mode 100644 index 0000000..5b2e65f --- /dev/null +++ b/tests/test_optimizers.py @@ -0,0 +1,161 @@ +import pytest +import torch +from _pytest.mark.structures import ParameterSet + + +from .config import Backend, DeviceType, OptTest, OptTestType, discover_tests +from .runner import run_test + +DEVICE_PARAMS = [ + pytest.param(DeviceType.cpu, marks=pytest.mark.cpu, id=DeviceType.cpu.value), + pytest.param(DeviceType.gpu, marks=pytest.mark.gpu, id=DeviceType.gpu.value), +] +DTYPE_PARAMS = [ + pytest.param(torch.float32, marks=pytest.mark.float32, id="float32"), + pytest.param(torch.bfloat16, 
marks=pytest.mark.bfloat16, id="bfloat16"),
+]
+BACKEND_PARAMS = [
+    pytest.param(Backend.torch, marks=pytest.mark.torch, id=Backend.torch.value),
+    pytest.param(Backend.foreach, marks=pytest.mark.foreach, id=Backend.foreach.value),
+    pytest.param(Backend.triton, marks=pytest.mark.triton, id=Backend.triton.value),
+]
+
+# Attach per-optimizer marks so users can -m adam, -m sgd, etc.
+OPTIMIZERS = [pytest.param(c, id=c.name, marks=getattr(pytest.mark, c.optimizer_name)) for c in discover_tests()]
+
+# Full dimensions: CPU -> (64,64), (64,128); GPU -> (256,256), (256,512), (256,1024), (256,2048)
+CPU_DIMS = [
+    pytest.param((64, 64), id="cpu-64x64"),
+    pytest.param((64, 128), id="cpu-64x128"),
+]
+GPU_FULL_DIMS = [
+    pytest.param((256, 256), id="gpu-256x256"),
+    pytest.param((256, 512), id="gpu-256x512"),
+    pytest.param((256, 1024), id="gpu-256x1024"),
+    pytest.param((256, 2048), id="gpu-256x2048"),
+]
+
+# Gradient release and accumulation dims: (128,256) and (128,1024)
+SUBSET_DIMS = [
+    pytest.param((128, 256), id="gr-128x256"),
+    pytest.param((128, 1024), id="gr-128x1024"),
+]
+
+
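+# Generated test ids join the dims, backend, dtype, device, and optimizer ids defined above,
+# for example "gpu-256x512-foreach-bfloat16-gpu-adamw" (assuming the Backend and DeviceType
+# string values and an OptTest named "adamw"), so a slice of the matrix can be selected with
+# `pytest -k 256x512` or a marker expression such as `pytest -m "adam and gpu and not triton"`.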
+def _should_skip(test_type: OptTestType, opttest: OptTest, device_type: DeviceType, dtype: torch.dtype, backend: Backend) -> bool:
+    # 1. Hardware availability
+    if not device_type.is_available():
+        return True
+
+    # 2. Backend support for hardware
+    if not backend.is_supported(device_type):
+        return True
+
+    # 3. Explicit per-opttest skip
+    if test_type in set(opttest.skip_tests):
+        return True
+
+    # 4. Respect per-test dtype constraints if provided
+    if opttest.only_dtypes and dtype not in opttest.only_dtypes:
+        return True
+
+    # 5. Gradient release and accumulation are GPU-only tests
+    if test_type in (OptTestType.gradient_release, OptTestType.accumulation) and device_type == DeviceType.cpu:
+        return True
+
+    # 6. bfloat16 is not supported on MPS
+    if (
+        device_type == DeviceType.gpu
+        and dtype == torch.bfloat16
+        and not (torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available()))
+        and (hasattr(torch.backends, "mps") and torch.backends.mps.is_available())
+    ):
+        return True
+
+    # 7. Skip bfloat16 on CPU for most optimizers; allow anyadam exception via opttest.any_precision
+    if device_type == DeviceType.cpu and dtype == torch.bfloat16 and not opttest.any_precision:
+        return True
+
+    # 8. Skip foreach for gradient release and accumulation tests
+    if test_type != OptTestType.normal and backend == Backend.foreach:
+        return True
+
+    return False
+
+
+def _param_value(param: ParameterSet) -> object:
+    return param.values[0]
+
+
+def _param_id(param: ParameterSet) -> str:
+    return param.id or str(param.values[0])
+
+
+def _build_params(test_type: OptTestType) -> list[ParameterSet]:
+    if test_type == OptTestType.normal:
+        device_params = DEVICE_PARAMS
+        dtype_params = DTYPE_PARAMS
+    else:
+        device_params = [pytest.param(DeviceType.gpu, marks=pytest.mark.gpu, id=DeviceType.gpu.value)]
+        dtype_params = [pytest.param(torch.float32, marks=pytest.mark.float32, id="float32")]
+        dims_params = SUBSET_DIMS
+
+    params: list[ParameterSet] = []
+    for opt_param in OPTIMIZERS:
+        for device_param in device_params:
+            if test_type == OptTestType.normal:
+                dims_params = GPU_FULL_DIMS if _param_value(device_param) == DeviceType.gpu else CPU_DIMS
+            for dtype_param in dtype_params:
+                for backend_param in BACKEND_PARAMS:
+                    for dims_param in dims_params:
+                        if _should_skip(
+                            test_type,
+                            _param_value(opt_param),
+                            _param_value(device_param),
+                            _param_value(dtype_param),
+                            _param_value(backend_param),
+                        ):
+                            continue
+                        param_id = "-".join(
+                            [
+                                _param_id(dims_param),
+                                _param_id(backend_param),
+                                _param_id(dtype_param),
+                                _param_id(device_param),
+                                _param_id(opt_param),
+                            ]
+                        )
+                        params.append(
+                            pytest.param(
+                                _param_value(opt_param),
+                                _param_value(device_param),
+                                _param_value(dtype_param),
+                                _param_value(backend_param),
+                                _param_value(dims_param),
+                                id=param_id,
+                                marks=list(opt_param.marks + device_param.marks + dtype_param.marks + backend_param.marks),
+                            )
+                        )
+    return params
+
+
+def _get_device(device_type: DeviceType, request: pytest.FixtureRequest) -> torch.device:
+    if device_type == DeviceType.gpu:
+        return torch.device(request.getfixturevalue("gpu_device"))
+    else:
+        return torch.device("cpu")
+
+
+@pytest.mark.parametrize("opttest, device_type, dtype, backend, dims", _build_params(OptTestType.normal))
+def test_normal(opttest, device_type, dtype, backend, dims, request):
+    run_test(opttest, _get_device(device_type, request), dtype, backend, OptTestType.normal, dims=dims)
+
+
+@pytest.mark.parametrize("opttest, device_type, dtype, backend, dims", _build_params(OptTestType.gradient_release))
+def test_gradient_release(opttest, device_type, dtype, backend, dims, request):
+    run_test(opttest, _get_device(device_type, request), dtype, backend, OptTestType.gradient_release, dims=dims)
+
+
+@pytest.mark.parametrize("opttest, device_type, dtype, backend, dims", _build_params(OptTestType.accumulation))
+def test_accumulation(opttest, device_type, dtype, backend, dims, request):
+    run_test(opttest, _get_device(device_type, request), dtype, backend, OptTestType.accumulation, dims=dims)
diff --git a/tests/to_low_precision_test.py b/tests/test_to_low_precision.py
similarity index 100%
rename from tests/to_low_precision_test.py
rename to tests/test_to_low_precision.py
diff --git a/tests/weight_decay_test.py b/tests/test_weight_decay.py
similarity index 100%
rename from tests/weight_decay_test.py
rename to tests/test_weight_decay.py