diff --git a/.github/workflows/auto-release.yml b/.github/workflows/auto-release.yml index 8c34a51..924a0d3 100644 --- a/.github/workflows/auto-release.yml +++ b/.github/workflows/auto-release.yml @@ -20,12 +20,12 @@ jobs: should-release: ${{ steps.check.outputs.should_release }} steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Need full history to compare versions - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -85,10 +85,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -112,10 +112,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -128,7 +128,7 @@ jobs: run: poetry build - name: Upload build artifacts - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: dist path: dist/ @@ -139,7 +139,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Create and push tag run: | @@ -159,10 +159,10 @@ jobs: url: https://pypi.org/project/datacheck-cli/ steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -185,7 +185,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3456cea..7049878 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -37,10 +37,10 @@ jobs: needs: [lint] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.10" @@ -53,7 +53,7 @@ jobs: run: poetry build - name: Upload artifacts - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: dist path: dist/ diff --git a/.github/workflows/pr-version-check.yml b/.github/workflows/pr-version-check.yml index 761d2fc..d4c697e 100644 --- a/.github/workflows/pr-version-check.yml +++ b/.github/workflows/pr-version-check.yml @@ -11,12 +11,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Need full history to compare with base branch - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3ea7251..a7c2559 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,10 +14,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -40,10 +40,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: 
actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -56,7 +56,7 @@ jobs: run: poetry build - name: Upload build artifacts - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: dist path: dist/ @@ -69,10 +69,10 @@ jobs: url: https://pypi.org/project/datacheck/ steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" @@ -94,7 +94,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for changelog @@ -134,7 +134,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v4 - name: Extract version from tag id: version diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index fa98c53..03dde64 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -20,10 +20,10 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.11" @@ -44,7 +44,7 @@ jobs: - name: Upload Bandit report if: always() - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: bandit-report path: bandit-report.json @@ -56,10 +56,10 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.11" @@ -86,7 +86,7 @@ jobs: - name: Upload Safety report if: always() - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: safety-report path: safety-report.json @@ -94,7 +94,7 @@ jobs: - name: Upload pip-audit report if: always() - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@v4 with: name: pip-audit-report path: pip-audit-report.json @@ -106,7 +106,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -129,7 +129,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v6 + uses: actions/checkout@v4 - name: Initialize CodeQL uses: github/codeql-action/init@v3 diff --git a/.gitignore b/.gitignore index 1093d8b..a384462 100644 --- a/.gitignore +++ b/.gitignore @@ -151,4 +151,9 @@ init_db.py examples/ .claude/ -.datacheck/ \ No newline at end of file +.datacheck/ + +# Test suite — exclude runtime artifacts +testing/venv/ +testing/csv/results/ +testing/parquet/results/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f5d08b..2b39358 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,7 +51,6 @@ DataCheck v2.0.0 is the first major release under the new product vision: a focu - All data loaders now use `dtype_backend="pyarrow"` for Arrow-backed DataFrames - CSV loader uses `engine="pyarrow"` for 2-5x faster parsing - Parquet loader skips Arrow-to-NumPy conversion (~30% faster) - - DuckDB, Delta Lake, and Avro loaders also use Arrow backend - 2-5x memory reduction for string-heavy datasets - **CLI Support for `sum_equals` Rule** diff --git a/README.md b/README.md index 61ae2dc..b628774 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ DataCheck Logo

-DataCheck — Data Validation Engine
+DataCheck - A Linter for Data Pipelines
 CI

-DataCheck is a data quality validation engine for data engineers. Define validation rules in a YAML config and data sources in a separate sources file, then automatically validate data across files, databases, and cloud warehouses. +**DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure. -DataCheck provides the `datacheck` Command-Line Interface (CLI) and a Python API, which you can use to validate data, profile quality, and detect schema changes. These operations can be executed locally during development, embedded programmatically within your data pipelines (Airflow, Dagster, Prefect, etc.), or integrated into CI/CD workflows. View the [Documentation](https://squrtech.github.io/datacheck/) for more details. +``` +Your data source → [DataCheck rules] → exit 0: pipeline continues + → exit 1: pipeline stops +``` + +View the [Documentation](https://squrtech.github.io/datacheck/) for full details. + +## Mental Model + +Code has linters. +Infrastructure has policy enforcement. +Data pipelines need gates. + +DataCheck is that gate. + +### Why DataCheck? + +Most teams detect bad data after the fact - broken dashboards, wrong reports, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships. + +- **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done +- **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing. +- **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse +- **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere +- **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators + +### Validate Where Data Lives + +For databases, DataCheck executes validation as aggregate SQL inside your warehouse. -### Highlights +- No data pulled into pandas +- No row transfer +- No separate compute layer +- Single aggregate `SELECT` per rule set -- Define validation rules in YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud) -- Run checks on CSV, Parquet, Delta Lake, Avro, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, and more -- Use 27+ built-in data quality rules for null checks, numeric ranges, patterns, timestamps, email/phone/URL validation, and cross-column checks -- Profile data quality with automatic scoring, outlier detection, and rule suggestions -- Detect schema evolution with compatibility levels (COMPATIBLE, WARNING, BREAKING) -- Extend with custom rules using the `@custom_rule` plugin decorator +Validation happens where the data already lives. + +### Why not observability? + +DataCheck is **not** a data observability platform. It does not provide dashboards, trend analysis, anomaly detection, or SaaS backends. Those tools answer "what happened?" - DataCheck answers "does this data meet our rules right now?" Enforcement happens at the gate; investigation happens after. + +### What DataCheck Is Not + +- Not a monitoring dashboard +- Not anomaly detection +- Not a SaaS platform +- Not a data catalog + +It is an enforcement layer. ### Demo

DataCheck Quickstart Demo
- Install DataCheck, generate an ecommerce config with sample data, and run validation — all in one go. + Install DataCheck, generate an ecommerce config with sample data, and run validation - all in one go.

## Setup @@ -62,24 +100,24 @@ pip install datacheck-cli[mysql] # MySQL pip install datacheck-cli[snowflake] # Snowflake pip install datacheck-cli[bigquery] # BigQuery pip install datacheck-cli[redshift] # Redshift -pip install datacheck-cli[cloud] # S3, GCS, Azure Blob +pip install datacheck-cli[s3] # S3 pip install datacheck-cli[all] # All data sources ``` ## Quickstart -The examples below show minimal configurations. To see detailed logs, add `--verbose` or `-v` to any command. +To see detailed logs on any command, add `--verbose` or `-v`. ### Create a config -Use `datacheck config init` to generate a config from a template. Add `--with-sample-data` to also generate a sample CSV file so you can test validation immediately: +**Option 1 - Start from a template:** ```bash datacheck config init --with-sample-data datacheck config init --template ecommerce --with-sample-data ``` -Or create a `.datacheck.yaml` file manually. The config defines both the data source and the validation rules. +**Option 2 - Write manually.** The config defines both the data source and the validation rules. ```yaml # .datacheck.yaml @@ -101,11 +139,6 @@ checks: not_null: true min: 0 max: 10000 - - - name: email_check - column: email - rules: - email_valid: true ``` DataCheck auto-discovers config files in this order: `.datacheck.yaml` → `.datacheck.yml` → `datacheck.yaml` → `datacheck.yml`. To specify a config explicitly, use the `--config` flag. @@ -113,30 +146,63 @@ DataCheck auto-discovers config files in this order: `.datacheck.yaml` → `.dat ### Run validation ```bash -datacheck validate +datacheck validate # auto-discover config +datacheck validate data.csv # direct file datacheck validate --config checks.yaml +echo $? # 1 if any error-severity rule fails ``` -| Parameter | Required | Description | -|-----------|----------|-------------| -| `-c, --config` | No | Path to config file (auto-discovered if not specified) | -| `--source` | No | Named source from `sources.yaml` | -| `--sources-file` | No | Path to sources YAML file | -| `-t, --table` | No | Database table name (for database sources) | -| `-w, --where` | No | WHERE clause for filtering (for database sources) | -| `-q, --query` | No | Custom SQL query (alternative to --table) | -| `-o, --output` | No | Save results to a JSON file (terminal output is always shown) | -| `--csv-export` | No | Export failure details as CSV | -| `--sample-rate` | No | Random sample fraction (0.0–1.0) | -| `--sample-count` | No | Fixed sample size | -| `--parallel` | No | Enable multi-core execution | -| `--verbose, -v` | No | Enable detailed logging | +**Data source** + +| Option | Short | Description | +|--------|-------|-------------| +| `[DATA_SOURCE]` | | Positional: file path or connection string | +| `--config` | `-c` | Path to config file (auto-discovered if not set) | +| `--source` | | Named source from `sources.yaml` | +| `--sources-file` | | Path to sources YAML file | +| `--table` | `-t` | Database table name | +| `--where` | `-w` | WHERE clause for filtering | +| `--query` | `-q` | Custom SQL query (alternative to `--table`) | +| `--schema` | `-s` | Schema/dataset name (databases and warehouses) | +| `--warehouse` | | Snowflake warehouse name | +| `--credentials` | | Path to credentials file (e.g., BigQuery service account JSON) | +| `--region` | | Cloud region (Redshift IAM auth) | +| `--cluster` | | Cluster identifier (Redshift IAM auth) | +| `--iam-auth` | | Use IAM authentication (Redshift) | + +**Output** + +| Option | Short | Description | 
+|--------|-------|-------------| +| `--output` | `-o` | Save results to file | +| `--format` | `-f` | Output format: `json` (default), `sarif`, `markdown`, `csv` | +| `--csv-export` | | Export failure details as CSV | +| `--suggestions` / `--no-suggestions` | | Show actionable fix suggestions (default: on) | + +**Execution** + +| Option | Short | Description | +|--------|-------|-------------| +| `--parallel` | | Enable multi-core execution | +| `--workers` | | Number of worker processes (default: CPU count) | +| `--chunk-size` | | Rows per chunk for parallel processing (default: 100000) | +| `--progress` / `--no-progress` | | Show progress bar (default: on) | +| `--slack-webhook` | | Slack webhook URL for result notifications | + +**Logging** + +| Option | Short | Description | +|--------|-------|-------------| +| `--verbose` | `-v` | Set log level to DEBUG | +| `--log-level` | | `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | +| `--log-format` | | `console` (default) or `json` | +| `--log-file` | | Path to log file (enables rotation) | ## Data Source Configuration File-based data sources are defined inline under `data_source` in your config. For databases and cloud storage, define named sources in a separate `sources.yaml` file and reference them. -### CSV / Parquet / Avro +### CSV / Parquet ```yaml data_source: @@ -153,25 +219,11 @@ data_source: path: ./data/orders.parquet ``` -### SQLite / DuckDB - -```yaml -data_source: - type: sqlite - path: ./data/analytics.db -``` - -### Delta Lake - -```yaml -data_source: - type: delta - path: ./data/delta-table -``` - ### Databases (PostgreSQL, Snowflake, BigQuery, etc.) -For database connections, use **named sources** in a `sources.yaml` file. The inline `data_source` config only supports file-based sources (csv, parquet, avro, delta, duckdb, sqlite). +For database connections, use **named sources** in a `sources.yaml` file. The inline `data_source` config only supports file-based sources (csv, parquet). + +> **SQL pushdown:** database checks run as a single aggregate `SELECT` per rule - no rows are transferred to the validator. Validation happens inside your warehouse. ```yaml # sources.yaml @@ -225,7 +277,7 @@ checks: not_null: true ``` -### Cloud Storage (S3, GCS, Azure) +### Cloud Storage (S3) Access cloud files via named sources in `sources.yaml`: @@ -245,7 +297,7 @@ sources: datacheck validate --source s3_data --sources-file sources.yaml ``` -### Named Sources (continued) +### Switching Sources at Runtime Switch sources at runtime: @@ -292,62 +344,128 @@ sources: Use `datacheck config env` to list all variables referenced in a config and their current values. -## Profile Data Quality +## CI/CD Integration -Generate a data quality profile with summary statistics, quality scores, and automatic rule suggestions. The data source can be provided directly, read from your config, or loaded from a named source. +DataCheck is built for pipelines. Rules fail hard and fast - no soft warnings that let bad data slip through unnoticed. -```bash -# Direct file path -datacheck profile data.csv +### Exit codes + +| Code | Meaning | +|------|---------| +| `0` | All rules passed (or only warning/info severity failures) | +| `1` | One or more error-severity rules failed | +| `2` | Configuration error | +| `3` | Data loading error | +| `4` | Unexpected error | -# Auto-discover config (looks for .datacheck.yaml, datacheck.yaml, etc.) -datacheck profile +Rules can have `severity: error` (default), `severity: warning`, or `severity: info`. 
Only `error`-severity failures cause exit code `1` and stop the pipeline. -# Explicit config file -datacheck profile --config checks.yaml +### GitHub Actions (with SARIF to Security tab) -# Named source from sources file -datacheck profile --source production_db --sources-file sources.yaml +Results appear as annotations on PRs in the GitHub Security tab via SARIF 2.1.0: + +```yaml +# .github/workflows/data-quality.yml +name: Data Quality Gate +on: [push, pull_request] + +permissions: + contents: read + security-events: write # Required for SARIF upload + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml ``` -| Parameter | Required | Description | -|-----------|----------|-------------| -| `DATA_SOURCE` | No | Data source: file path, connection string, or omit when using config/sources | -| `-c, --config` | No | Path to config file with data_source or sources_file defined | -| `--source` | No | Named source from sources.yaml | -| `--sources-file` | No | Path to sources YAML file | -| `--outlier-method` | No | Outlier detection method: `zscore` (default) or `iqr` | -| `--format` | No | Output format: `terminal`, `json`, or `markdown` | -| `-o, --output` | No | Write output to a file | +Or without the GitHub Action - generates SARIF and uploads it directly: + +```yaml + - name: Install DataCheck + run: pip install datacheck-cli + + - name: Run data quality gate + run: datacheck validate -c .datacheck.yaml --format sarif --output results.sarif + + - name: Upload SARIF to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: results.sarif +``` + +### Apache Airflow + +Use the built-in Airflow operators to gate DAG tasks on data quality: + +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +validate_orders = DataCheckOperator( + task_id="validate_orders", + config_path="/config/orders.datacheck.yaml", + source_name="production_db", + table="orders", + min_pass_rate=100.0, # Fail if any rule fails + fail_on_error=True, +) +``` + +The operator raises `AirflowException` when validation fails, halting the DAG at the gate. + +### Any CI runner + +Works with any CI system that respects exit codes: ```bash -datacheck profile # Full profile -datacheck profile --format json -o profile.json # Export as JSON -datacheck profile --source analytics_wh --sources-file sources.yaml # Profile a named source +pip install datacheck-cli +datacheck validate -c .datacheck.yaml +# exits 1 if any error-severity rule fails ``` -## Detect Schema Changes +## Enforce Schema Contracts -Capture a baseline schema and compare future data against it to detect column additions, removals, type changes, and nullable changes. The data source can be provided directly, read from your config, or loaded from a named source. +Capture a schema baseline and compare future data against it. Detects column additions, removals, type changes, and nullable changes. Use `--fail-on-breaking` to exit 1 on breaking changes. The data source can be provided directly, read from your config, or loaded from a named source. 
```bash # Auto-discover config or use named source -datacheck schema capture # Save current schema as baseline -datacheck schema compare # Compare against baseline +datacheck schema capture # Save current schema as baseline +datacheck schema compare # Compare - reports changes, exit 0 +datacheck schema compare --fail-on-breaking # Compare - exit 1 on breaking changes # Direct file path datacheck schema capture data.csv -datacheck schema compare data.csv +datacheck schema compare data.csv --fail-on-breaking # Named source datacheck schema capture --source production_db --sources-file sources.yaml # Other schema commands -datacheck schema show # Display detected schema -datacheck schema list # List saved baselines -datacheck schema history # View capture history +datacheck schema show # Display saved baseline +datacheck schema list # List saved baselines +datacheck schema history # View capture history ``` +`schema compare` options: + +| Option | Short | Description | +|--------|-------|-------------| +| `[DATA_SOURCE]` | | Positional: file path or connection string | +| `--config` | `-c` | Path to config file | +| `--source` | | Named source from `sources.yaml` | +| `--sources-file` | | Path to sources YAML file | +| `--table` | `-t` | Database table name | +| `--baseline` | `-b` | Name of baseline to compare against (default: `baseline`) | +| `--baseline-dir` | | Directory containing baselines (default: `.datacheck/schemas`) | +| `--rename-threshold` | | Similarity threshold for rename detection (default: 0.8) | +| `--fail-on-breaking` | | Exit 1 if breaking changes are detected | +| `--format` | `-f` | Output format: `terminal` (default) or `json` | + ## Python API Use DataCheck programmatically within your pipelines: @@ -363,28 +481,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}") for result in summary.get_failed_results(): print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)") -``` -## CI/CD Integration - -DataCheck uses standard exit codes for automation: - -| Code | Meaning | -|------|---------| -| `0` | All rules passed (or only warning/info severity failures) | -| `1` | Some error-severity rules failed | -| `2` | Configuration error | -| `3` | Data loading error | -| `4` | Unexpected error | - -Rules can have `severity: error` (default), `severity: warning`, or `severity: info`. Only error-severity failures cause exit code 1. 
- -```yaml -# GitHub Actions -- name: Validate Data - run: | - pip install datacheck-cli - datacheck validate --output results.json +if not summary.all_passed: + raise ValueError("Data quality gate failed - halting pipeline") ``` ## Available Rules @@ -392,51 +491,18 @@ Rules can have `severity: error` (default), `severity: warning`, or `severity: i | Category | Rules | |----------|-------| | Null & Uniqueness | `not_null`, `unique`, `unique_combination` | -| Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` | +| Numeric | `min`, `max`, `range`, `boolean` | | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` | -| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`), `business_days_only` | -| Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` | +| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) | | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` | -| Custom | `custom` — user-defined functions via `@custom_rule` decorator | - -## Custom Rules - -Create a plugin file with custom validation functions using the `@custom_rule` decorator. The function receives a `pd.Series` and optional parameters, and returns a boolean `pd.Series` (True = valid). - -```python -# custom_rules.py -from datacheck.plugins.decorators import custom_rule -import pandas as pd - -@custom_rule -def is_business_email(column: pd.Series, allowed_domains: list) -> pd.Series: - domains = column.dropna().str.split("@").str[1] - return domains.isin(allowed_domains) -``` - -Reference the plugin in your config: - -```yaml -plugins: - - ./custom_rules.py - -checks: - - name: email_check - column: email - rules: - custom: - rule: is_business_email - params: - allowed_domains: ["company.com"] -``` ## Roadmap -DataCheck v2.0.1 includes smart config auto-generation (regex pattern inference, cross-column `sum_equals` detection, semantic rule suggestion, type-aware profiling), batch error reporting for config validation, connection pre-validation for database sources, and international phone number support. Here's what's next: +What's coming next: -- **Enhanced CI/CD examples** — Starter workflows for GitHub Actions, GitLab CI, and Jenkins. -- **Streaming validation** — Validate large datasets without loading everything into memory. -- **Notification integrations** — Slack, email, and webhook alerts on validation failures. +- **Data Contracts format** - `--format datacontract` aligned with the [datacontract.com](https://datacontract.com) open spec. +- **dbt integration** - generate DataCheck rules directly from your dbt schema YAML. +- **Streaming validation** - chunk-based ingestion for 100M+ row datasets without loading into memory. ## Development @@ -444,7 +510,6 @@ DataCheck v2.0.1 includes smart config auto-generation (regex pattern inference, git clone https://github.com/squrtech/datacheck.git cd datacheck poetry install -poetry run pytest ``` See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. @@ -454,10 +519,11 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. 
- [Documentation](https://squrtech.github.io/datacheck/) - [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference) - [Python API Reference](https://squrtech.github.io/datacheck/#python-api) +- [Philosophy](docs/philosophy.md) - [PyPI](https://pypi.org/project/datacheck-cli/) - [Issues](https://github.com/squrtech/datacheck/issues) - [Changelog](CHANGELOG.md) ## License -Apache License 2.0 — see [LICENSE](LICENSE) for details. +Apache License 2.0 - see [LICENSE](LICENSE) for details. diff --git a/README_PYPI.md b/README_PYPI.md index 71d1036..9485db0 100644 --- a/README_PYPI.md +++ b/README_PYPI.md @@ -1,20 +1,24 @@ -# DataCheck — Data Validation Engine +# DataCheck - A Linter for Data Pipelines +[![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/) [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/) -DataCheck is a data quality validation engine for data engineers. Define validation rules in a YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud), then automatically validate data across files, databases, and cloud warehouses. +**DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure. -DataCheck provides the `datacheck` Command-Line Interface (CLI) and a Python API, which you can use to validate data, profile quality, and detect schema changes. These operations can be executed locally during development, embedded programmatically within your data pipelines (Airflow, Dagster, Prefect, etc.), or integrated into CI/CD workflows. +``` +Your data source → [DataCheck rules] → exit 0: pipeline continues + → exit 1: pipeline stops +``` -### Highlights +Most teams detect bad data after the fact - broken reports, wrong numbers, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships. -- Define validation rules in YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud) -- Run checks on CSV, Parquet, Delta Lake, Avro, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, and more -- Use 27+ built-in data quality rules for null checks, numeric ranges, patterns, timestamps, email/phone/URL validation, and cross-column checks -- Profile data quality with automatic scoring, outlier detection, and rule suggestions -- Detect schema evolution with compatibility levels (COMPATIBLE, WARNING, BREAKING) -- Extend with custom rules using the `@custom_rule` plugin decorator +- **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done +- **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing. 
+- **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse +- **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere +- **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators ## Installation @@ -30,20 +34,20 @@ pip install datacheck-cli[mysql] # MySQL pip install datacheck-cli[snowflake] # Snowflake pip install datacheck-cli[bigquery] # BigQuery pip install datacheck-cli[redshift] # Redshift -pip install datacheck-cli[cloud] # S3, GCS, Azure Blob +pip install datacheck-cli[s3] # S3 pip install datacheck-cli[all] # All data sources ``` ## Quickstart -Use `datacheck config init` to generate a config from a template. Add `--with-sample-data` to also generate a sample CSV file so you can test validation immediately: +**Option 1 - Start from a template:** ```bash datacheck config init --with-sample-data datacheck config init --template ecommerce --with-sample-data ``` -Or create a `.datacheck.yaml` config file manually with your data source and validation rules: +**Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules: ```yaml data_source: @@ -64,16 +68,67 @@ checks: min: 0 max: 10000 - - name: email_check - column: email - rules: - email_valid: true ``` Run validation: ```bash -datacheck validate +datacheck validate # auto-discover config +datacheck validate data.csv # direct file +datacheck validate --config checks.yaml +echo $? # 1 if any error-severity rule fails +``` + +## CI/CD Integration + +### GitHub Actions (with SARIF to Security tab) + +```yaml +# .github/workflows/data-quality.yml +name: Data Quality Gate +on: [push, pull_request] + +permissions: + contents: read + security-events: write + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml +``` + +Or generate SARIF manually and upload to the GitHub Security tab: + +```yaml + - name: Run data quality gate + run: | + pip install datacheck-cli + datacheck validate -c .datacheck.yaml --format sarif --output results.sarif + + - name: Upload SARIF + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: results.sarif +``` + +### Apache Airflow + +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +validate_orders = DataCheckOperator( + task_id="validate_orders", + config_path="/config/orders.datacheck.yaml", + source_name="production_db", + table="orders", + fail_on_error=True, +) ``` ## Database and Cloud Sources @@ -118,22 +173,13 @@ source: production_db table: orders ``` -## Profile Data Quality +## Enforce Schema Contracts ```bash -datacheck profile # Auto-discover config -datacheck profile data.csv # Direct file path -datacheck profile --source production_db --sources-file sources.yaml # Named source -datacheck profile --format json -o profile.json # Export as JSON -``` - -## Detect Schema Changes - -```bash -datacheck schema capture # Auto-discover config +datacheck schema capture # Save current schema as baseline datacheck schema capture data.csv # Direct file path datacheck schema capture --source production_db --sources-file sources.yaml # Named source -datacheck schema compare # Compare against baseline +datacheck schema compare # Compare against baseline - fails if schema changed ``` ## Python API @@ -148,6 +194,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}") for result in 
summary.get_failed_results(): print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)") + +if not summary.all_passed: + raise ValueError("Data quality gate failed - halting pipeline") ``` ## Available Rules @@ -155,12 +204,10 @@ for result in summary.get_failed_results(): | Category | Rules | |----------|-------| | Null & Uniqueness | `not_null`, `unique`, `unique_combination` | -| Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` | +| Numeric | `min`, `max`, `range`, `boolean` | | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` | -| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`), `business_days_only` | -| Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` | +| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) | | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` | -| Custom | `custom` — user-defined functions via `@custom_rule` decorator | ## Links @@ -171,4 +218,4 @@ for result in summary.get_failed_results(): ## License -Apache License 2.0 — Copyright 2026 Squrtech +Apache License 2.0 - Copyright 2026 Squrtech diff --git a/SECURITY.md b/SECURITY.md index b40a786..54879c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -235,8 +235,6 @@ Core dependencies: - pyarrow (Parquet support) - pyyaml (configuration) - boto3 (AWS S3) - optional -- google-cloud-storage (GCS) - optional -- azure-storage-blob (Azure) - optional ## Updates and Patches diff --git a/airflow-provider/.github/workflows/test.yml b/airflow-provider/.github/workflows/test.yml new file mode 100644 index 0000000..9ddda5b --- /dev/null +++ b/airflow-provider/.github/workflows/test.yml @@ -0,0 +1,115 @@ +name: Test Provider + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + # ── Test: install and import ────────────────────────────────────────────────── + test-import: + name: Install & import (Python ${{ matrix.python-version }}, Airflow ${{ matrix.airflow-version }}) + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + airflow-version: ["2.6.0", "2.9.0", "2.10.0"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install provider and Airflow + run: | + pip install -q "apache-airflow==${{ matrix.airflow-version }}" \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-${{ matrix.airflow-version }}/constraints-${{ matrix.python-version }}.txt" + pip install -q . 
+ + - name: Import operators + run: | + python -c " + from airflow_provider_datacheck.operators.datacheck import ( + DataCheckOperator, + DataCheckSchemaOperator, + ) + print('DataCheckOperator:', DataCheckOperator) + print('DataCheckSchemaOperator:', DataCheckSchemaOperator) + print('Import OK') + " + + - name: Verify get_provider_info + run: | + python -c " + from airflow_provider_datacheck import get_provider_info + info = get_provider_info() + assert info['package-name'] == 'apache-airflow-provider-datacheck', 'Wrong package-name' + assert 'operators' in info, 'Missing operators key' + assert len(info['operators']) == 2, f'Expected 2 operators, got {len(info[\"operators\"])}' + print('get_provider_info OK:', info) + " + + # ── Test: provider.yaml is valid YAML ───────────────────────────────────────── + test-provider-yaml: + name: Validate provider.yaml + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install PyYAML + run: pip install -q pyyaml + + - name: Validate provider.yaml + run: | + python -c " + import yaml + with open('provider.yaml') as f: + data = yaml.safe_load(f) + required = ['package-name', 'name', 'description', 'versions', 'operators'] + for field in required: + assert field in data, f'Missing required field: {field}' + assert data['package-name'] == 'apache-airflow-provider-datacheck' + assert len(data['operators']) > 0, 'No operators defined' + print('provider.yaml is valid') + print('Operators:', [op['python-modules'] for op in data['operators']]) + " + + # ── Test: example DAGs are importable ──────────────────────────────────────── + test-example-dags: + name: Validate example DAGs + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + pip install -q "apache-airflow>=2.9.0" \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.11.txt" + pip install -q . + + - name: Parse example DAGs + run: | + python -c "import ast, pathlib + for dag_file in pathlib.Path('example_dags').glob('*.py'): + src = dag_file.read_text() + ast.parse(src) + print(f'Syntax OK: {dag_file.name}') + " diff --git a/airflow-provider/LICENSE b/airflow-provider/LICENSE new file mode 100644 index 0000000..70173f1 --- /dev/null +++ b/airflow-provider/LICENSE @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2026 Squrtech + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/airflow-provider/README.md b/airflow-provider/README.md new file mode 100644 index 0000000..2617b69 --- /dev/null +++ b/airflow-provider/README.md @@ -0,0 +1,225 @@ +
+DataCheck
+Apache Airflow Provider — DataCheck
+PyPI version
+Downloads
+License
+ +Data quality validation operators for Apache Airflow. Define rules in YAML, validate files, +databases, and cloud warehouses, and gate your pipelines on quality thresholds. + +--- + +## Installation + +```bash +pip install apache-airflow-provider-datacheck +``` + +For database and cloud sources, install with the relevant connector extra: + +```bash +pip install apache-airflow-provider-datacheck[postgresql] +pip install apache-airflow-provider-datacheck[snowflake] +pip install apache-airflow-provider-datacheck[bigquery] +pip install apache-airflow-provider-datacheck[s3] +pip install apache-airflow-provider-datacheck[all] # all connectors +``` + +--- + +## Operators + +### `DataCheckOperator` + +Runs DataCheck validation from a YAML config against any data source. + +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +validate = DataCheckOperator( + task_id="validate_orders", + config_path="/config/checks/orders.yaml", + sources_file="/config/sources.yaml", + source_name="production_db", + table="orders", + where="created_at >= '{{ ds }}'", # Jinja templating supported + min_pass_rate=95.0, # fail if < 95% of rules pass + fail_on_error=True, + push_results=True, # results pushed to XCom +) +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `config_path` | str | required | Path to `.datacheck.yaml` validation config | +| `file_path` | str | None | Path to a data file (CSV, Parquet) | +| `sources_file` | str | None | Path to `sources.yaml` (for databases/cloud) | +| `source_name` | str | None | Named source from `sources.yaml` | +| `table` | str | None | Database table name | +| `where` | str | None | SQL WHERE clause for filtering | +| `query` | str | None | Custom SQL query (alternative to `table`) | +| `parallel` | bool | False | Enable multi-core execution | +| `workers` | int | None | Number of worker processes | +| `min_pass_rate` | float | 0.0 | Minimum rule pass rate % (0 = disabled) | +| `fail_on_error` | bool | True | Raise `AirflowException` on failure | +| `push_results` | bool | True | Push results to XCom | + +**XCom keys pushed:** `passed` (bool), `pass_rate` (float), `validation_results` (dict) + +--- + +### `DataCheckSchemaOperator` + +Enforces schema contracts against a saved baseline - fails if breaking changes are detected. On first run, captures the baseline automatically. 
+ +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckSchemaOperator + +schema_check = DataCheckSchemaOperator( + task_id="schema_check", + sources_file="/config/sources.yaml", + source_name="production_db", + table="orders", + baseline_name="orders_baseline", + fail_on_breaking=True, +) +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `file_path` | str | None | Path to a data file | +| `sources_file` | str | None | Path to `sources.yaml` | +| `source_name` | str | None | Named source from `sources.yaml` | +| `table` | str | None | Database table name | +| `query` | str | None | Custom SQL query (alternative to `table`) | +| `baseline_name` | str | `"baseline"` | Name for the schema baseline | +| `baseline_dir` | str | `".datacheck/schemas"` | Directory to store baselines | +| `fail_on_breaking` | bool | True | Fail on BREAKING schema changes | +| `push_results` | bool | True | Push results to XCom | + +**XCom keys pushed:** `schema_compatible` (bool), `schema_results` (dict with change details) + +> **Tip:** For large tables, use `query` with a `LIMIT` instead of `table` — schema detection only needs a sample of rows to infer column types, so loading the full table is unnecessary. +> +> ```python +> query="SELECT * FROM orders LIMIT 1000" +> ``` + +**Compatibility levels:** +- `COMPATIBLE` — safe additions (new nullable column, index added) +- `WARNING` — nullable changed, type widened +- `BREAKING` — column removed, type narrowed, required column added + +--- + +## Quickstart + +### 1. Define your validation config + +```yaml +# /config/checks/orders.yaml +sources_file: /config/sources.yaml +source: production_db +table: orders + +checks: + - name: order_id_check + column: order_id + rules: + not_null: true + unique: true + + - name: amount_check + column: amount + rules: + not_null: true + min: 0 + max: 1000000 +``` + +### 2. Define your sources + +```yaml +# /config/sources.yaml +sources: + production_db: + type: postgresql + host: ${DB_HOST} + port: ${DB_PORT:-5432} + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} +``` + +### 3. 
Create your DAG + +```python +from datetime import datetime +from airflow import DAG +from airflow_provider_datacheck.operators.datacheck import ( + DataCheckOperator, + DataCheckSchemaOperator, +) + +with DAG( + dag_id="daily_data_quality", + start_date=datetime(2026, 1, 1), + schedule="@daily", + catchup=False, +): + schema_check = DataCheckSchemaOperator( + task_id="schema_check", + sources_file="/config/sources.yaml", + source_name="production_db", + table="orders", + baseline_name="orders_baseline", + fail_on_breaking=True, + ) + + validate = DataCheckOperator( + task_id="validate_orders", + config_path="/config/checks/orders.yaml", + where="created_at >= '{{ ds }}'", + min_pass_rate=95.0, + ) + + schema_check >> validate +``` + +--- + +## Available Extras + +| Extra | Installs | Use for | +|-------|----------|---------| +| `postgresql` | `psycopg2-binary`, `sqlalchemy` | PostgreSQL | +| `mysql` | `mysql-connector-python`, `sqlalchemy` | MySQL | +| `mssql` | `pyodbc`, `sqlalchemy` | SQL Server | +| `snowflake` | `snowflake-connector-python` | Snowflake | +| `bigquery` | `google-cloud-bigquery`, `google-auth` | BigQuery | +| `redshift` | `boto3`, `psycopg2-binary`, `sqlalchemy` | Redshift | +| `s3` | `boto3` | S3 file sources | +| `cloud` | `boto3` | S3 file sources (alias) | +| `databases` | PostgreSQL + MySQL + MSSQL | All SQL databases | +| `warehouses` | Snowflake + BigQuery + Redshift | All warehouses | +| `all` | Everything | All connectors | + +--- + +## Links + +- [DataCheck on PyPI](https://pypi.org/project/datacheck-cli/) +- [DataCheck GitHub](https://github.com/squrtech/datacheck) +- [Documentation](https://squrtech.github.io/datacheck/) +- [Available Rules](https://squrtech.github.io/datacheck/#available-rules) +- [Report an Issue](https://github.com/squrtech/datacheck/issues) + +## License + +Apache License 2.0 — Copyright 2026 Squrtech diff --git a/airflow-provider/airflow_provider_datacheck/__init__.py b/airflow-provider/airflow_provider_datacheck/__init__.py new file mode 100644 index 0000000..e6b7e62 --- /dev/null +++ b/airflow-provider/airflow_provider_datacheck/__init__.py @@ -0,0 +1,15 @@ +"""Apache Airflow provider for DataCheck - enforce validation rules in Airflow DAGs.""" + + +def get_provider_info() -> dict: + """Return provider metadata required by Airflow's provider discovery.""" + return { + "package-name": "apache-airflow-provider-datacheck", + "name": "DataCheck", + "description": "Data quality validation operators for Apache Airflow pipelines.", + "versions": ["1.0.0"], + "operators": [ + "airflow_provider_datacheck.operators.datacheck.DataCheckOperator", + "airflow_provider_datacheck.operators.datacheck.DataCheckSchemaOperator", + ], + } diff --git a/airflow-provider/airflow_provider_datacheck/operators/__init__.py b/airflow-provider/airflow_provider_datacheck/operators/__init__.py new file mode 100644 index 0000000..f588fa2 --- /dev/null +++ b/airflow-provider/airflow_provider_datacheck/operators/__init__.py @@ -0,0 +1 @@ +"""DataCheck operators for Apache Airflow.""" diff --git a/airflow-provider/airflow_provider_datacheck/operators/datacheck.py b/airflow-provider/airflow_provider_datacheck/operators/datacheck.py new file mode 100644 index 0000000..f1d1d6f --- /dev/null +++ b/airflow-provider/airflow_provider_datacheck/operators/datacheck.py @@ -0,0 +1,14 @@ +"""DataCheck operators — re-exported from datacheck-cli. 
+ +The full operator implementation lives in ``datacheck.airflow.operators`` +inside the ``datacheck-cli`` package (installed as a dependency). +This module re-exports them at the standard provider path so Airflow +can discover and load them. +""" + +from datacheck.airflow.operators import DataCheckOperator, DataCheckSchemaOperator + +__all__ = [ + "DataCheckOperator", + "DataCheckSchemaOperator", +] diff --git a/airflow-provider/example_dags/example_schema_dag.py b/airflow-provider/example_dags/example_schema_dag.py new file mode 100644 index 0000000..0260146 --- /dev/null +++ b/airflow-provider/example_dags/example_schema_dag.py @@ -0,0 +1,79 @@ +"""Example DAG: schema evolution monitoring with DataCheckSchemaOperator. + +Demonstrates: +- Capturing a schema baseline on first run +- Comparing schema on subsequent runs +- Handling COMPATIBLE, WARNING, and BREAKING changes differently +- Monitoring multiple tables in parallel +""" + +from datetime import datetime, timedelta + +from airflow import DAG +from airflow.operators.python import PythonOperator + +from airflow_provider_datacheck.operators.datacheck import DataCheckSchemaOperator + +default_args = { + "owner": "data-engineering", + "retries": 0, +} + + +def _log_schema_results(table: str, **context): + """Log schema comparison results from XCom.""" + results = context["ti"].xcom_pull(task_ids=f"schema_{table}", key="schema_results") + if not results: + return + + if results["mode"] == "capture": + print(f"[{table}] Baseline captured — {len(results['columns'])} columns") + return + + level = results.get("compatibility_level", "COMPATIBLE") + changes = results.get("total_changes", 0) + breaking = results.get("breaking_changes", 0) + + print(f"[{table}] Schema check: {level} — {changes} change(s), {breaking} breaking") + + for change in results.get("changes", []): + print(f" [{change['compatibility']}] {change['message']}") + + +# --------------------------------------------------------------------------- +# Monitor multiple warehouse tables in parallel +# --------------------------------------------------------------------------- + +TABLES_TO_MONITOR = ["orders", "customers", "products", "inventory"] + +with DAG( + dag_id="datacheck_schema_monitor", + description="Monitor schema evolution across warehouse tables", + start_date=datetime(2026, 1, 1), + schedule="@daily", + default_args=default_args, + catchup=False, + tags=["schema", "datacheck"], +) as dag: + + for table in TABLES_TO_MONITOR: + + schema_check = DataCheckSchemaOperator( + task_id=f"schema_{table}", + sources_file="/config/sources.yaml", + source_name="production_db", + table=table, + baseline_name=f"{table}_baseline", + baseline_dir="/config/schemas", + fail_on_breaking=True, # break the DAG on breaking schema changes + push_results=True, + ) + + log_results = PythonOperator( + task_id=f"log_{table}_results", + python_callable=_log_schema_results, + op_kwargs={"table": table}, + trigger_rule="all_done", # run even if schema_check fails + ) + + schema_check >> log_results diff --git a/airflow-provider/example_dags/example_validate_dag.py b/airflow-provider/example_dags/example_validate_dag.py new file mode 100644 index 0000000..562dceb --- /dev/null +++ b/airflow-provider/example_dags/example_validate_dag.py @@ -0,0 +1,109 @@ +"""Example DAG: daily validation gate with DataCheckOperator. 
+ +Demonstrates: +- Validating a date-partitioned Parquet file using Jinja templating +- Validating a database table with a WHERE clause +- Using XCom to branch on validation results +- Chaining schema check → validation +""" + +from datetime import datetime, timedelta + +from airflow import DAG +from airflow.operators.python import BranchPythonOperator, PythonOperator + +from airflow_provider_datacheck.operators.datacheck import ( + DataCheckOperator, + DataCheckSchemaOperator, +) + +default_args = { + "owner": "data-engineering", + "retries": 1, + "retry_delay": timedelta(minutes=5), +} + +# --------------------------------------------------------------------------- +# Example 1 — Validate a date-partitioned file +# --------------------------------------------------------------------------- + +with DAG( + dag_id="datacheck_validate_file", + description="Validate daily order export with DataCheck", + start_date=datetime(2026, 1, 1), + schedule="@daily", + default_args=default_args, + catchup=False, + tags=["data-quality", "datacheck"], +) as file_dag: + + # Schema check first — catches structural changes before validation runs + schema_check = DataCheckSchemaOperator( + task_id="schema_check", + file_path="/data/orders/orders_{{ ds }}.parquet", + baseline_name="orders_baseline", + baseline_dir="/config/schemas", + fail_on_breaking=True, + ) + + # Validate quality rules from config + validate = DataCheckOperator( + task_id="validate_orders", + config_path="/config/checks/orders.yaml", + file_path="/data/orders/orders_{{ ds }}.parquet", + min_pass_rate=95.0, # fail if fewer than 95% of rules pass + fail_on_error=True, + push_results=True, # results available via XCom + ) + + schema_check >> validate + + +# --------------------------------------------------------------------------- +# Example 2 — Validate a database table with branching on result +# --------------------------------------------------------------------------- + +def _branch_on_quality(**context): + """Branch downstream based on validation pass rate.""" + passed = context["ti"].xcom_pull(task_ids="validate_db", key="passed") + return "notify_success" if passed else "notify_failure" + + +with DAG( + dag_id="datacheck_validate_database", + description="Validate production database table with quality gate", + start_date=datetime(2026, 1, 1), + schedule="@daily", + default_args=default_args, + catchup=False, + tags=["data-quality", "datacheck", "postgresql"], +) as db_dag: + + validate_db = DataCheckOperator( + task_id="validate_db", + config_path="/config/checks/orders.yaml", + sources_file="/config/sources.yaml", + source_name="production_db", + table="orders", + where="created_at >= '{{ ds }}'", # only validate today's rows + parallel=True, + fail_on_error=False, # don't fail — branch instead + push_results=True, + ) + + branch = BranchPythonOperator( + task_id="branch_on_quality", + python_callable=_branch_on_quality, + ) + + notify_success = PythonOperator( + task_id="notify_success", + python_callable=lambda **_: print("Data quality passed!"), + ) + + notify_failure = PythonOperator( + task_id="notify_failure", + python_callable=lambda **_: print("Data quality failed — alerting team."), + ) + + validate_db >> branch >> [notify_success, notify_failure] diff --git a/airflow-provider/provider.yaml b/airflow-provider/provider.yaml new file mode 100644 index 0000000..ab4a3bb --- /dev/null +++ b/airflow-provider/provider.yaml @@ -0,0 +1,17 @@ +package-name: apache-airflow-provider-datacheck +name: DataCheck +description: Data 
quality validation operators for Apache Airflow pipelines. + Validate files, databases, Snowflake, BigQuery, and more using YAML rules. + Detect schema evolution with compatibility levels. +homepage: https://squrtech.github.io/datacheck/ +versions: + - 1.0.0 + +operators: + - integration-name: DataCheck + python-modules: + - airflow_provider_datacheck.operators.datacheck.DataCheckOperator + - airflow_provider_datacheck.operators.datacheck.DataCheckSchemaOperator + +connection-types: [] +hook-class-names: [] diff --git a/airflow-provider/pyproject.toml b/airflow-provider/pyproject.toml new file mode 100644 index 0000000..a642060 --- /dev/null +++ b/airflow-provider/pyproject.toml @@ -0,0 +1,70 @@ +[tool.poetry] +name = "apache-airflow-provider-datacheck" +version = "1.0.0" +description = "Enforce DataCheck validation rules in Apache Airflow. Gate pipelines on data quality for files, databases, Snowflake, BigQuery, and more." +authors = ["Squrtech "] +readme = "README.md" +license = "Apache-2.0" +homepage = "https://github.com/squrtech/datacheck" +repository = "https://github.com/squrtech/datacheck" +keywords = [ + "airflow", "data-linter", "data-validation", "data-engineering", + "pipeline", "etl", "snowflake", "bigquery", "postgresql", "schema-contracts", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Quality Assurance", + "Environment :: Plugins", + "Framework :: Apache Airflow", + "Framework :: Apache Airflow :: Provider", +] +packages = [{include = "airflow_provider_datacheck"}] + +[tool.poetry.urls] +"Documentation" = "https://squrtech.github.io/datacheck/" +"Bug Tracker" = "https://github.com/squrtech/datacheck/issues" +"Changelog" = "https://github.com/squrtech/datacheck/releases" + +[tool.poetry.dependencies] +python = ">=3.10,<4.0" +apache-airflow = ">=2.6.0" +datacheck-cli = ">=2.1.0,<3.0.0" + +# Connector extras — mirror datacheck-cli extras so users can do: +# pip install apache-airflow-provider-datacheck[postgresql] +psycopg2-binary = { version = ">=2.9.9,<3.0.0", optional = true } +mysql-connector-python = { version = ">=8.2.0,<10.0.0", optional = true } +pyodbc = { version = ">=5.0.1,<6.0.0", optional = true } +sqlalchemy = { version = ">=2.0.23,<3.0.0", optional = true } +boto3 = { version = ">=1.34.0,<2.0.0", optional = true } +snowflake-connector-python = { version = ">=3.0.0,<4.0.0", optional = true } +google-cloud-bigquery = { version = ">=3.0.0,<4.0.0", optional = true } +google-auth = { version = ">=2.0.0,<3.0.0", optional = true } + +[tool.poetry.extras] +postgresql = ["psycopg2-binary", "sqlalchemy"] +postgres = ["psycopg2-binary", "sqlalchemy"] +mysql = ["mysql-connector-python", "sqlalchemy"] +mssql = ["pyodbc", "sqlalchemy"] +databases = ["psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy"] +s3 = ["boto3"] +cloud = ["boto3"] +snowflake = ["snowflake-connector-python"] +bigquery = ["google-cloud-bigquery", "google-auth"] +redshift = ["boto3", "psycopg2-binary", "sqlalchemy"] +warehouses = ["snowflake-connector-python", "google-cloud-bigquery", "google-auth", "boto3", "psycopg2-binary", "sqlalchemy"] +all = [ + "psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy", + "boto3", + "snowflake-connector-python", 
"google-cloud-bigquery", "google-auth", +] + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/blog/2026-02-23-designing-fail-fast-data-pipelines.md b/blog/2026-02-23-designing-fail-fast-data-pipelines.md new file mode 100644 index 0000000..0870cd6 --- /dev/null +++ b/blog/2026-02-23-designing-fail-fast-data-pipelines.md @@ -0,0 +1,458 @@ +# Designing Fail-Fast Data Pipelines in GitHub Actions and Airflow + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +The Airflow DAG had a validation task. It had been in production for six months. The team believed it was gating their pipeline. It was not. + +A developer had added a `trigger_rule='all_done'` to the transform task four months earlier to handle an optional upstream branch. One line. It meant the transform task would run regardless of whether the validation task upstream had succeeded or failed. The gate was structurally present but functionally bypassed. Every validation failure for four months had reported to the task logs, been counted in the DAG run summary, and been silently ignored. + +This class of problem is not rare. It appears consistently across teams that have added validation steps to their pipelines without designing the surrounding pipeline topology for fail-fast behavior. The validation tool does its job. The orchestration configuration undoes it. + +Fail-fast pipeline design is not about adding validation steps. It is about the structural decisions in GitHub Actions and Airflow that determine whether a gate actually holds when it fires. + +--- + +## The Three Structural Requirements of a Real Gate + +Before examining the two environments, it is worth being precise about what "fail-fast" requires structurally: + +**Propagation.** A validation failure must cause the pipeline to stop. This means the failure must be visible to the orchestrator as a task failure (Airflow) or step failure (GitHub Actions), and subsequent tasks/steps must be configured to require the validation task's success. + +**Non-bypass.** No configuration path should allow downstream work to proceed when validation has failed. Every `trigger_rule`, `continue-on-error`, `if:` condition, and `needs:` dependency that touches validation must be reviewed for whether it can create a bypass. + +**Correct retry semantics.** A data quality failure (`exit 1`) should not be retried. The data is bad. Retrying the validation task against the same bad data produces the same failure. Retries are appropriate for transient infrastructure failures (`exit 3`) — not for content violations. Misconfigured retry policies can give the appearance of enforcement while actually delaying and eventually swallowing failures. + +With these requirements in mind, the specific failure modes in each environment become clear. + +--- + +## GitHub Actions: The Structural Bypass Patterns + +**`continue-on-error: true`** is the most common bypass in GitHub Actions pipelines: + +```yaml +# DO NOT DO THIS +- name: Validate data + continue-on-error: true # ← This step can fail without failing the job + run: datacheck validate -c .datacheck.yaml +``` + +`continue-on-error: true` allows the step to exit non-zero without marking the job as failed. Subsequent steps run. The gate is gone. This setting is sometimes added for debug visibility — "I want to see the output even if it fails" — and never removed. The result is a validation step that is structurally present and functionally inert. 
+ +The correct configuration: omit `continue-on-error` entirely, or set it explicitly to `false`. The default behavior in GitHub Actions is correct — a non-zero step exit fails the step, which fails the job. + +**Missing `needs:` on the deployment job** is the second common bypass: + +```yaml +# Two jobs — but load runs regardless of validate result +jobs: + validate: + runs-on: ubuntu-latest + steps: + - run: datacheck validate -c .datacheck.yaml + + load: + runs-on: ubuntu-latest # ← No needs: — runs in parallel, not after validate + steps: + - run: python load_to_warehouse.py +``` + +Without `needs: [validate]`, the `load` job runs in parallel with `validate`. It does not wait for validation to succeed. The correct structure requires the dependency to be explicit: + +```yaml +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install DataCheck + run: pip install datacheck-cli[postgresql] + - name: Validate + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + run: datacheck validate -c checks/orders.yaml + + load: + needs: [validate] # ← load only runs if validate job succeeds + runs-on: ubuntu-latest + steps: + - run: python load_to_warehouse.py +``` + +`needs: [validate]` creates the dependency. GitHub Actions will not start the `load` job if the `validate` job failed. This is the correct gate topology. + +--- + +## GitHub Actions: Multi-Stage Pipeline Design + +For pipelines with multiple validation phases — validate raw, transform, validate mart — the job dependency graph must encode the sequence: + +```yaml +jobs: + validate-raw: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: pip install -q datacheck-cli[postgresql] + - name: Validate raw layer + env: + DB_HOST: ${{ secrets.STAGING_DB_HOST }} + DB_PASSWORD: ${{ secrets.STAGING_DB_PASSWORD }} + run: | + datacheck schema compare --source staging_db --table orders_raw + datacheck validate -c checks/orders-raw.yaml --source staging_db + + transform: + needs: [validate-raw] # Only runs if raw validation passed + runs-on: ubuntu-latest + steps: + - run: dbt run --models staging + + validate-mart: + needs: [transform] # Only runs if transform completed + runs-on: ubuntu-latest + steps: + - name: Validate mart layer + run: datacheck validate -c checks/orders-mart.yaml --source staging_db + + promote-to-production: + needs: [validate-mart] # Only runs if mart validation passed + environment: production # ← GitHub Environment protection rules apply here + runs-on: ubuntu-latest + steps: + - run: python promote_to_prod.py +``` + +The `environment: production` on the `promote-to-production` job enables GitHub Environment protection rules: required reviewers, deployment branch restrictions, and environment-specific secrets. The combination of the `needs:` dependency chain and the environment gate means production promotion requires all validation layers to succeed and may require manual approval. 
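+
+One related bypass worth auditing: a job-level `if:` condition that uses `always()` replaces the implicit `success()` check, so the declared `needs:` dependency no longer guarantees the gate holds. A minimal sketch of the anti-pattern:
+
+```yaml
+# STILL A BYPASS: the if: condition overrides the needs: success requirement
+load:
+  needs: [validate]
+  if: always()      # load runs even when the validate job failed
+  runs-on: ubuntu-latest
+  steps:
+    - run: python load_to_warehouse.py
+```
+
+The same audit applies to conditions such as `!cancelled()`: anything that removes the implicit `success()` requirement removes the gate.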
+ +--- + +## GitHub Actions: Matrix Validation for Multi-Table Pipelines + +When a pipeline validates multiple independent tables, a matrix strategy runs them in parallel and fails fast if any table fails: + +```yaml +jobs: + validate-tables: + runs-on: ubuntu-latest + strategy: + fail-fast: true # ← Cancel remaining matrix jobs if any fails + matrix: + table: + - name: orders + config: checks/orders.yaml + - name: customers + config: checks/customers.yaml + - name: products + config: checks/products.yaml + - name: events + config: checks/events.yaml + + steps: + - uses: actions/checkout@v4 + - run: pip install -q datacheck-cli[postgresql] + - name: Validate ${{ matrix.table.name }} + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + run: | + datacheck validate \ + -c ${{ matrix.table.config }} \ + --source production_db \ + --format sarif \ + --output ${{ matrix.table.name }}-results.sarif + - uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: ${{ matrix.table.name }}-results.sarif + + load-all: + needs: [validate-tables] # Requires ALL matrix jobs to succeed + runs-on: ubuntu-latest + steps: + - run: python load_pipeline.py +``` + +`fail-fast: true` on the matrix strategy means: if the `orders` validation fails, cancel the `customers`, `products`, and `events` validations that are still running. The job fails. `load-all` never starts. Without `fail-fast: true`, all matrix jobs run to completion even when one has failed — useful for collecting all failure reports, but potentially misleading about whether the full pipeline gate has failed. + +The choice between `fail-fast: true` and `fail-fast: false` in a matrix depends on whether you want to stop all parallel work on first failure (to conserve resources) or collect all failures across all tables before surfacing the result. Both are valid; the choice should be deliberate. + +--- + +## Airflow: The `trigger_rule` Failure Mode + +Airflow's `trigger_rule` parameter determines when a task becomes eligible to run relative to its upstream dependencies. The default is `all_success` — a task runs only when all upstream tasks have succeeded. This is the correct behavior for a gate. + +Any `trigger_rule` other than `all_success` on a task downstream of a validation gate is a potential bypass: + +```python +# SILENT GATE BYPASS +transform = PythonOperator( + task_id="transform_orders", + python_callable=transform_orders, + trigger_rule="all_done", # ← Runs regardless of validate_raw success or failure +) +``` + +`all_done` means "run when all upstream tasks have finished, regardless of their outcome." This is appropriate for cleanup tasks, notification tasks, and tasks that must run even when upstream work fails. It is never appropriate for a task that should be gated on validation success. + +Common `trigger_rule` values and their implications for gate design: + +| `trigger_rule` | Behavior | Appropriate for gate? 
| +|---|---|---| +| `all_success` (default) | Run only when ALL upstream tasks succeeded | Yes — this is the gate behavior | +| `all_done` | Run when ALL upstream tasks finished (any outcome) | No — bypasses failed validation | +| `one_success` | Run when ANY one upstream task succeeded | No — bypasses if validation fails but another task succeeds | +| `all_failed` | Run only when ALL upstream tasks failed | No — only for failure handling | +| `none_failed` | Run when no upstream tasks failed (success OR skipped) | Conditional — valid if skip is intentional | + +The correct pattern: + +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path="/config/orders-raw.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=True, + retries=0, # ← No retries — data quality failures are not transient +) + +transform = PythonOperator( + task_id="transform_orders", + python_callable=transform_orders, + # trigger_rule is omitted — defaults to all_success + # transform only runs when validate_raw has succeeded +) +``` + +--- + +## Airflow: Retry Policy for Validation Tasks + +A validation task configured with `retries=2` and `retry_delay=timedelta(minutes=5)` will retry a data quality failure twice before marking the task as failed. Each retry runs DataCheck against the same data. Each retry produces the same failure. The pipeline is delayed 15 minutes and reaches the same dead end. + +Retries are appropriate for tasks that fail due to transient conditions: network timeouts, warehouse connection drops, temporary unavailability. DataCheck's exit codes distinguish these: exit `3` (data loading error) may indicate a transient infrastructure failure worth retrying. Exit `1` (rule failure) indicates a data content violation that retrying will not fix. + +```python +from datetime import timedelta + +validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path="/config/orders-raw.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=True, + retries=0, # Content failures: no retries + execution_timeout=timedelta(minutes=15), # Fail if validation hangs +) +``` + +If the underlying infrastructure commonly has transient failures, a retry policy can be configured with a short delay and a low count — but the expectation should be that exit `1` on any retry attempt still fails the task immediately. Some teams implement this by wrapping the DataCheck call in a shell script that inspects the exit code and exits `1` immediately on content failures without waiting for retry: + +```bash +#!/bin/bash +datacheck validate -c "$CONFIG_PATH" +EXIT=$? +if [ $EXIT -eq 1 ]; then + echo "Data quality failure — not retrying" >&2 + exit 1 # Will be caught by Airflow and treated as immediate failure +elif [ $EXIT -ge 2 ]; then + echo "Infrastructure or config error — may retry" >&2 + exit $EXIT +fi +``` + +--- + +## Airflow: `on_failure_callback` for Operational Visibility + +When a validation gate fires in production, the failure needs to be surfaced immediately and with enough context to route it correctly. Airflow's `on_failure_callback` runs a Python callable when the task fails, allowing the failure to trigger notifications, log structured context, or initiate remediation workflows. 
+ +```python +def validation_failure_callback(context): + task_id = context["task_instance"].task_id + dag_id = context["dag"].dag_id + run_id = context["run_id"] + log_url = context["task_instance"].log_url + + message = ( + f"Data quality gate failed\n" + f"DAG: {dag_id}\n" + f"Task: {task_id}\n" + f"Run: {run_id}\n" + f"Logs: {log_url}" + ) + + # Post to Slack, PagerDuty, or internal alerting + requests.post( + os.environ["SLACK_WEBHOOK_URL"], + json={"text": message, "channel": "#data-quality-alerts"}, + ) + + +validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path="/config/orders-raw.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=True, + retries=0, + on_failure_callback=validation_failure_callback, +) +``` + +The `on_failure_callback` does not affect the gate behavior — the task is still failed, the downstream tasks still do not run. It adds operational visibility: the failure is loud, attributed, and routable. The team sees the failure immediately without polling the Airflow UI. + +--- + +## Airflow: Branching After Validation + +Some pipelines need to route differently based on validation outcome rather than simply stopping. A quarantine pattern routes failing batches to an error table for investigation while allowing the pipeline to continue with clean data: + +```python +from airflow.operators.python import BranchPythonOperator + +def route_by_validation(**context): + ti = context["task_instance"] + # DataCheckOperator pushes summary JSON to XCom + validation_result = ti.xcom_pull(task_ids="validate_raw", key="summary") + if validation_result and validation_result.get("all_passed"): + return "transform_orders" + else: + return "quarantine_failed_batch" + +validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path="/config/orders-raw.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=False, # ← Don't fail the task; let branch routing handle it +) + +route = BranchPythonOperator( + task_id="route_by_validation", + python_callable=route_by_validation, +) + +transform = PythonOperator(task_id="transform_orders", ...) +quarantine = PythonOperator(task_id="quarantine_failed_batch", ...) + +validate_raw >> route >> [transform, quarantine] +``` + +Note the `fail_on_error=False` here. When using `BranchPythonOperator` routing, the validation task should not fail the DAG — it should surface the result via XCom for the branch to read. This is the appropriate pattern when the downstream intent is quarantine-and-continue rather than halt-and-fix. + +--- + +## Environment-Specific Validation Configs + +Development and staging environments often have different data characteristics — smaller volumes, synthetic data, incomplete referential integrity. Enforcing production-level rules in staging blocks development work on legitimate data that does not meet production constraints. 
+ +```yaml +# checks/orders.dev.yaml — permissive, unblocking +checks: + - name: order_id_not_null + column: order_id + rules: + not_null: true + severity: warning # Advisory in dev + + - name: amount_valid + column: amount + rules: + min: 0 + severity: warning +``` + +```yaml +# checks/orders.prod.yaml — strict, enforcing +checks: + - name: order_id_not_null + column: order_id + rules: + not_null: true + severity: error # Blocks in production + + - name: amount_valid + column: amount + rules: + not_null: true + type: float + min: 0.01 + max: 1000000.00 + severity: error +``` + +In GitHub Actions, the environment-specific config is selected by the workflow: + +```yaml +- name: Validate + run: | + CONFIG="checks/orders.${{ vars.ENVIRONMENT }}.yaml" + datacheck validate -c "$CONFIG" --source production_db +``` + +In Airflow, the config path is parameterized via environment variable or Airflow Variable: + +```python +import os + +validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path=f"/config/orders.{os.getenv('PIPELINE_ENV', 'prod')}.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=(os.getenv("PIPELINE_ENV") == "prod"), +) +``` + +Production is strict. Staging catches regressions without blocking on data that is legitimately absent or different in the non-production environment. + +--- + +## Engineering Takeaways + +- **`trigger_rule='all_done'` silently bypasses validation gates.** Any Airflow task downstream of a validation gate with `trigger_rule` set to anything other than `all_success` (the default) will run regardless of validation outcome. Audit every `trigger_rule` in DAGs that contain validation tasks. + +- **`continue-on-error: true` in GitHub Actions removes the gate entirely.** A step with `continue-on-error: true` can exit non-zero without failing the job. This configuration should never appear on a validation step. The correct behavior is the default: omit the option. + +- **Missing `needs:` on deployment jobs creates parallel execution, not sequential gating.** A `load` job without `needs: [validate]` runs in parallel with validation, not after it. The dependency must be explicit. Every job that should be blocked by a validation failure must declare that dependency. + +- **Validation tasks should have `retries=0`.** Data quality failures (`exit 1`) are not transient. Retrying validation against the same bad data wastes time and reaches the same conclusion. Infrastructure failures (`exit 3`) may warrant retries — but this requires distinguishing exit codes at the retry policy level, not applying a blanket retry count. + +- **`fail-fast: true` on matrix validation jobs stops all parallel validations when any one fails.** This conserves compute and surfaces the first failure quickly. `fail-fast: false` collects all failures before reporting — useful when you want to see the full picture across all tables. Both are deliberate choices; the default matters. + +- **`on_failure_callback` makes the gate loud.** A failed validation task that quietly marks the DAG run as failed is not operationally visible enough in production. The callback routes the failure to the right alerting channel with task context, log URLs, and attribution — immediately, not when someone next opens the Airflow UI. 
+ +- **`BranchPythonOperator` enables quarantine-and-continue as an alternative to halt-and-fix.** When the correct response to a validation failure is routing the bad batch to an error table rather than halting the pipeline, the branch pattern gives the DAG explicit routing logic based on validation outcome. This is a different policy decision from `fail_on_error=True` — both are correct for different scenarios. + +- **Environment-specific configs express different enforcement postures per stage.** Production rules enforce at `severity: error`. Staging rules may enforce at `severity: warning` for constraints that production data satisfies but synthetic staging data does not. The enforcement posture is a policy decision encoded in config, not in orchestration logic. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + + +LinkedIn Post + +An Airflow DAG had a validation task. It had been in production for six months. The team believed it was gating their pipeline. + +Someone had added trigger_rule='all_done' to the transform task four months earlier to handle an optional upstream branch. One line. It meant the transform task would run regardless of whether validation succeeded or failed. Every validation failure for four months had logged, been counted in the DAG summary, and been silently ignored. + +The validation tool did its job. The orchestration configuration undid it. + +"A gate that doesn't propagate isn't a gate. It's a log entry." + +The structural failure modes come up consistently across both GitHub Actions and Airflow: + +continue-on-error: true on the validation step removes the gate entirely +Missing needs: on downstream jobs creates parallel execution, not sequential gating +retries=2 on a validation task retries bad data against itself — three identical failures, fifteen minutes later +Fail-fast pipeline design is not about adding validation steps. It's about auditing every trigger_rule, continue-on-error, needs:, and retry policy that touches a validation task — and confirming that none of them create a path where downstream work proceeds after the gate has fired. \ No newline at end of file diff --git a/blog/2026-02-23-deterministic-validation-vs-statistical-anomaly-detection.md b/blog/2026-02-23-deterministic-validation-vs-statistical-anomaly-detection.md new file mode 100644 index 0000000..ba23941 --- /dev/null +++ b/blog/2026-02-23-deterministic-validation-vs-statistical-anomaly-detection.md @@ -0,0 +1,251 @@ +# Deterministic Validation vs Statistical Anomaly Detection + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +Consider a column `payment_amount` with a business constraint: values must be between $0.01 and $50,000. A refund processing bug produces a batch containing negative values. You have two mechanisms that might catch it. + +The first executes: + +```sql +SELECT COUNT(*) AS failed_count +FROM payments +WHERE payment_amount < 0.01 OR payment_amount > 50000 +``` + +The second computes a z-score of the current batch's `payment_amount` distribution against a 30-day rolling historical baseline, and fires if the z-score exceeds a configured threshold. + +The first approach catches every negative value in the batch, with certainty, on every run, from day one of the pipeline's existence. 
+ +The second approach may or may not catch it — depending on whether negative values have appeared before, what the variance in the historical distribution looks like, how the threshold is calibrated, and whether the current batch is large enough to shift the distribution signal. + +The first approach is not "simpler" in a pejorative sense. It knows something the second approach does not: that the valid range is explicitly $0.01 to $50,000. That knowledge came from an engineer who wrote it down. The second approach does not have that knowledge — it only knows what the historical data looked like. + +This is the core distinction between deterministic validation and statistical anomaly detection. It is not a question of sophistication. It is a question of which errors each approach is structurally capable of catching, and why. + +--- + +## What Each Approach Actually Computes + +To understand the trade-offs, start with the algorithm. + +**Deterministic validation** evaluates a predicate against the data. For a `not_null` rule: + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE customer_id IS NULL +``` + +For an `allowed_values` rule: + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE status NOT IN ('pending', 'confirmed', 'shipped', 'cancelled') + AND status IS NOT NULL +``` + +For a `range` rule: + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE amount < 0 OR amount > 100000 +``` + +The output of each query is an integer: the number of rows that violated the constraint. The mathematical operation is set membership or inequality evaluation — there is no statistical component. The result depends only on the current data and the rule definition. + +**Statistical anomaly detection** computes deviation from a learned distribution. A common implementation using z-scores in SQL looks like this: + +```sql +WITH historical_stats AS ( + SELECT + AVG(daily_null_rate) AS mean_rate, + STDDEV(daily_null_rate) AS stddev_rate + FROM ( + SELECT + run_date, + SUM(CASE WHEN customer_id IS NULL THEN 1.0 ELSE 0.0 END) / COUNT(*) AS daily_null_rate + FROM orders_history + GROUP BY run_date + ) daily +), +current_batch AS ( + SELECT + SUM(CASE WHEN customer_id IS NULL THEN 1.0 ELSE 0.0 END) / COUNT(*) AS current_null_rate + FROM orders +) +SELECT + current_null_rate, + mean_rate, + stddev_rate, + (current_null_rate - mean_rate) / NULLIF(stddev_rate, 0) AS z_score +FROM current_batch +CROSS JOIN historical_stats +``` + +The output is a continuous score: how many standard deviations the current batch's null rate deviates from the historical mean. Whether this constitutes a failure depends on a separately configured threshold — commonly 2.5 or 3.0 standard deviations. The result depends on the current data, the historical data used to compute the baseline, and the threshold value. + +Note the structural difference: deterministic validation requires the current data and a rule. Anomaly detection requires the current data, a historical dataset, a model or formula, and a threshold. Each of these additional inputs is a source of variance in the output. + +--- + +## The Error Class Taxonomy + +These two approaches are suited to different categories of data error. Conflating the categories is where most teams go wrong. + +**Known constraint violations** are errors against explicit, pre-defined business rules. A value is null when the schema says it must not be. A status value is outside the allowed set. A foreign key references a record that does not exist. 
A timestamp is in the future when the domain requires it to be in the past. An ID does not match the expected UUID format. + +These errors are binary by definition. A customer ID either exists or it does not. An order status either belongs to the defined enum or it does not. There is no statistical ambiguity — only an engineer who knows the constraint. + +Deterministic validation is the correct tool for known constraint violations. It catches them with perfect sensitivity: if the constraint is violated, the count is non-zero. It catches them with perfect specificity: a count of zero means the constraint holds for every row, not that the violation was too subtle to detect. + +**Unknown pattern deviations** are anomalies against a data's expected statistical behavior — deviations you did not know to express as an explicit constraint. A volume that is 60% lower than typical. A cardinality explosion in a column that normally has stable cardinality. A correlation between two columns that has broken. A p99 latency value that is statistically inconsistent with recent history. + +These anomalies cannot be caught by deterministic rules because they require comparison against historical context. You do not know the "normal" volume of your pipeline without observing it over time. You cannot write a deterministic rule for "row count consistent with historical distribution" — that rule is, by definition, a statistical computation. + +Statistical anomaly detection is the correct tool for unknown pattern deviations. It surfaces signals you did not know to check for. It provides coverage for a class of problems that is simply not expressible as explicit predicates. + +The critical insight: **these domains are largely non-overlapping.** Applying anomaly detection to known constraint violations does not give you better coverage — it gives you a noisier, less reliable version of coverage you could have gotten with a three-line predicate. + +--- + +## Reproducibility as a First-Class Guarantee + +Deterministic validation is reproducible by construction: the same input data with the same rules produces the same output, always. This property has concrete engineering implications that become visible at scale. + +**Debugging.** When a validation run fails, you can reproduce the failure exactly by re-running the same command against the same data. The failure is not a product of model state, threshold calibration, or historical distribution — it is a direct consequence of the data and the rule. You can trace from the failure to the offending rows without understanding any model internals. + +**Environment parity.** Deterministic rules work identically in development, staging, and production. The predicate `amount < 0` produces the same result in every environment for the same input data. An anomaly model trained on production traffic does not have a valid baseline in a staging environment — it will fire on staging data that looks anomalous relative to production patterns, and miss production-scale anomalies that are too subtle to appear in lower-traffic staging runs. + +**Auditability.** A validation result of "3,412 rows violated the range constraint on `payment_amount`" is a fully auditable finding. An auditor can understand it, reproduce it, and verify it without any knowledge of the validation tool's internals. 
"The anomaly model returned a score of 0.73 which exceeded the threshold of 0.60 based on a 30-day rolling IQR baseline for the `payment_amount` distribution" is not auditable in the same sense — reproducing it requires access to the same 30 days of historical data, the same model implementation, and the same threshold configuration. + +In regulated industries — payments, healthcare, financial reporting — auditability is not optional. Deterministic rules with explicit, versioned YAML configs produce an audit trail that is a diff in a source control system. The rule was added in commit `a3f8c2d` on a specific date. The rule changed in commit `f91b447` three months later. The reviewer approved it in a pull request. That is an auditable history. + +--- + +## The Confounder Problem + +Statistical models are susceptible to confounders in the data — external factors that correlate with the metric being measured and produce systematic patterns that look anomalous relative to an unconditional baseline. + +**Seasonality.** Null rates, row counts, and value distributions often follow weekly or monthly cycles. A null rate of 0.1% on a Tuesday might be normal, while the same rate on a Saturday might be two standard deviations above the weekend mean. An unconditional anomaly model fires on the Saturday rate. An engineer investigates and finds nothing wrong. The model is tuned. The useful signal is reduced. + +**Promotional events.** A marketing campaign that drives 4x normal order volume creates a distribution shift across dozens of metrics. The anomaly model fires on volume, on cardinality, on value distributions. None of these are quality failures. They are expected consequences of the event. The model requires either explicit event calendars fed as features or manual suppression during known events. + +**Data migrations.** When an upstream source migrates to a new system with different ID formats, different precision, or different encoding, the historical baseline becomes structurally invalid. The new data distribution is entirely consistent and correct — but it is inconsistent with the old distribution that the model was trained on. + +Deterministic rules are immune to these confounders. The predicate `status NOT IN ('pending', 'confirmed', 'shipped', 'cancelled')` evaluates identically on a Tuesday and a Saturday, during a campaign and outside one, before and after an upstream migration. The constraint is about what values are valid, not about what values are typical. + +```yaml +checks: + - name: order_status_valid + column: status + rules: + not_null: true + allowed_values: [pending, confirmed, shipped, cancelled, refunded] + severity: error + + - name: order_amount_valid + column: amount + rules: + not_null: true + type: float + min: 0.01 + max: 50000.00 + severity: error + + - name: customer_id_format + column: customer_id + rules: + not_null: true + regex: '^CUST-[0-9]{8}$' + severity: error + + - name: created_at_valid + column: created_at + rules: + not_null: true + no_future_timestamps: true + severity: error +``` + +These rules do not care about the day of the week. They do not care about seasonal patterns, campaign volume, or upstream migrations. They care about whether each row satisfies the constraint. + +--- + +## The Legibility Gap + +Deterministic validation rules are, by design, human-readable. + +A rule that reads `min: 0.01` is legible to every engineer on the team. It can be reviewed in a pull request. It can be explained to a new team member in seconds. 
Its behavior in every possible input scenario is fully predictable from the rule definition alone. Changing the rule requires a code review. Deleting the rule leaves a record in source control. + +An anomaly detection model's decision boundary is not legible in the same sense. "This batch failed because the z-score of the null rate was 2.83 standard deviations above the 90-day mean" requires understanding the historical baseline, the z-score calculation, and the threshold calibration to evaluate. You cannot review that decision in a pull request. You cannot reproduce it without the same historical data. + +This is not an indictment of anomaly detection — it is a description of an inherent trade-off. Models are powerful precisely because they can capture patterns that are too complex to express as explicit rules. That power comes at the cost of legibility. For the subset of quality checks that can be expressed as explicit rules — which is the majority of enforcement-critical checks — the legibility cost is unnecessary. + +```bash +# The entire enforcement decision is visible in one command: +datacheck validate -c checks/orders.datacheck.yaml + + PASS order_status_valid (0 failures / 2,341,887 rows) + FAIL order_amount_valid (1,203 failures / 2,341,887 rows) + PASS customer_id_format (0 failures / 2,341,887 rows) + PASS created_at_valid (0 failures / 2,341,887 rows) + + Rules: 4 total 3 passed 1 failed + Exit code: 1 +``` + +The output is self-explanatory. 1,203 rows in the current batch have an `amount` value that falls outside the valid range. No model internals. No threshold explanation. No historical context required to understand the failure. + +--- + +## Where Anomaly Detection Genuinely Wins + +Anomaly detection has a domain where it provides coverage that deterministic validation cannot: error classes you did not know to check for. + +A volume anomaly that drops row count by 40% is not expressible as a deterministic rule unless you know the expected row count range in advance. An unexpected correlation between `discount_rate` and `customer_segment` breaking is not expressible as a constraint rule. A p95 value for processing latency trending 3x higher than typical is not a constraint violation — it is a statistical signal. + +These are real data quality signals. They represent value. The correct response is to use anomaly detection for this class of problem and deterministic validation for the class of problems described above — not to use anomaly detection as a universal substitute for explicit rules. + +A mature data platform uses both. Deterministic rules at the enforcement gate enforce what you know. Anomaly detection in the monitoring layer surfaces what you did not know to check. The division of responsibility maps cleanly to the error taxonomy: known constraint violations belong to deterministic validation; unknown pattern deviations belong to statistical analysis. + +--- + +## Engineering Takeaways + +- **Deterministic validation and anomaly detection compute fundamentally different things.** Validation evaluates a predicate — a binary function of the current data against an explicit rule. Anomaly detection estimates deviation from a learned distribution — a continuous function of the current data against historical state. The outputs have different properties and are suited to different purposes. + +- **Known constraint violations are always better caught by deterministic rules.** If you know a column should not be null, write `not_null: true`. 
The rule catches it with 100% sensitivity on every run. Using anomaly detection for known constraints trades guaranteed coverage for probabilistic coverage — with no upside. + +- **Reproducibility is a first-class guarantee of deterministic validation.** The same data with the same rules produces the same result everywhere — in development, staging, and production. Anomaly models depend on external state (training data, weights, thresholds) that varies across environments and over time. + +- **Deterministic rules are auditable by design; model decisions are not.** A YAML rule reviewed in a pull request produces an audit trail in source control. A model decision that depends on a 30-day rolling baseline does not produce a comparable audit record. For regulated environments, this difference is operationally significant. + +- **Statistical confounders systematically degrade anomaly models on data that follows expected business patterns.** Seasonality, campaigns, and migrations produce distribution shifts that are legitimate and expected. Deterministic predicates are immune to confounders — a constraint violation is a constraint violation regardless of the day or the traffic profile. + +- **Anomaly detection is the correct tool for unknown pattern deviations, not for known constraint enforcement.** Volume anomalies, unexpected correlation breaks, and cardinality explosions are legitimately hard to express as explicit rules. These are the use cases anomaly detection was built for. Applying it to null checks and format validation is using a tool outside its designed domain. + +- **The combined architecture is not a compromise — it is the precise mapping of each tool to its error class.** Deterministic rules at the gate for constraints you can express. Statistical analysis in the monitoring layer for patterns you cannot predict. The boundary between them follows the boundary between known and unknown — which is the most natural division available. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + + +LinkedIn Post + +Most teams think of deterministic validation and anomaly detection as competing approaches to data quality. They're not competing — they operate on different error classes and produce different guarantees. + +Deterministic validation evaluates a predicate: is this value in the allowed set, is this column null, does this ID match the expected format. The output is a count. Same data, same rule, same result — every time, in every environment. + +Anomaly detection estimates deviation from a learned distribution. It requires historical state — training data, model weights, a calibrated threshold. Change the training window, get a different answer for the same input data. + +"If you know a column should never be null, write not_null: true. Using a statistical model to catch a known constraint is trading a guaranteed predicate for a probabilistic approximation — with no upside." + +The domain split is clean: deterministic rules for constraints you can express, statistical analysis for patterns you cannot predict. Volume anomalies and unexpected correlation breaks are legitimately hard to encode as explicit rules — that's what anomaly detection was built for. Null checks and format validation are not. + +The mistake isn't using anomaly detection. It's using it for the wrong error class. 
\ No newline at end of file
diff --git a/blog/2026-02-23-exit-codes-as-contracts-in-cicd-for-data-pipelines.md b/blog/2026-02-23-exit-codes-as-contracts-in-cicd-for-data-pipelines.md
new file mode 100644
index 0000000..5426b11
--- /dev/null
+++ b/blog/2026-02-23-exit-codes-as-contracts-in-cicd-for-data-pipelines.md
@@ -0,0 +1,388 @@
+# Exit Codes as Contracts in CI/CD for Data Pipelines
+
+*Published 2026-02-23 by the DataCheck engineering team*
+
+---
+
+A process exits. One integer is written to the process table. The parent process reads it. If it is zero, the next command runs. If it is non-zero, the next command does not.
+
+This mechanism is fifty years old. It works on every operating system that has ever run a CI pipeline. It requires no network connection, no shared database, no external service, no registration. It is the lowest-common-denominator interface for process orchestration — which is precisely why it is the most reliable one.
+
+Every CI system ever built — Jenkins, GitHub Actions, CircleCI, Buildkite, GitLab CI — treats a non-zero exit code as a job failure. Every shell ever written interprets `&&` as "run the next command only if the previous one exited zero." Every orchestrator that runs DAG tasks reads the exit code of the process it spawned to determine whether the task succeeded.
+
+Data pipeline enforcement built on exit codes inherits all of this — for free, without integration work, across every environment the pipeline runs in. The exit code is not just a return value. It is a contract between the validation tool and every system that might ever invoke it.
+
+---
+
+## What the Contract Says
+
+A contract has terms. The exit code contract for a data validation tool has specific terms that matter for how a pipeline is designed around it.
+
+DataCheck exits with one of five codes, each carrying a distinct semantic meaning:
+
+| Code | Meaning | Downstream implication |
+|------|---------|----------------------|
+| `0` | All error-severity rules passed | Pipeline may proceed |
+| `1` | One or more error-severity rules failed | Pipeline must stop; data does not meet its contract |
+| `2` | Configuration error | Pipeline must stop; the validation config itself is malformed |
+| `3` | Data loading error | Pipeline must stop; the data source was unreachable |
+| `4` | Unexpected internal error | Pipeline must stop; investigate the validator |
+
+Exit code `1` and exit code `3` carry different information. Both fail the pipeline, but they indicate different problems requiring different responses. Exit `1` means the data arrived but is bad. A retry of the same pipeline run will produce the same bad data until the upstream issue is fixed. Exit `3` means the data source was unavailable. A retry may succeed if the outage was transient.
+
+A pipeline that inspects exit codes can route these cases differently:
+
+```bash
+#!/bin/bash
+# No set -e here: the exit code is captured and inspected explicitly below
+
+datacheck validate -c checks/orders.datacheck.yaml
+VALIDATION_EXIT=$?
+ +case $VALIDATION_EXIT in + 0) + echo "Validation passed — proceeding to load" + python load_to_warehouse.py + ;; + 1) + echo "Data quality failure — data does not meet contract" >&2 + # Do not retry — the data is bad, not the infrastructure + exit 1 + ;; + 2) + echo "Configuration error — alerting platform team" >&2 + curl -X POST "$SLACK_WEBHOOK" \ + -d '{"text": "DataCheck config error on orders pipeline"}' + exit 2 + ;; + 3) + echo "Data source unavailable — scheduling retry" >&2 + # Signal to the scheduler that this is a transient failure + exit 75 # EX_TEMPFAIL — conventional retry signal on some systems + ;; +esac +``` + +Most pipelines will not need this level of exit code inspection. The common case — `set -e` and let any non-zero exit halt the script — is correct for most enforcement gates. But the semantic distinction exists and is available when the pipeline design requires it. + +--- + +## Severity as the Policy Layer + +The exit code contract has one configurable dimension: which rule failures count as exit `1` and which do not. This is controlled by the `severity` field on each rule. + +```yaml +checks: + - name: order_id_not_null + column: order_id + rules: + not_null: true + severity: error # Failure → exit 1. Pipeline stops. + + - name: email_format + column: email + rules: + regex: '^[^@]+@[^@]+\.[^@]+$' + severity: warning # Failure → reported, but exit remains 0. + + - name: phone_populated + column: phone + rules: + not_null: true + severity: info # Failure → logged only. No output to stderr. +``` + +DataCheck computes the exit code after evaluating all rules: + +- Any `error`-severity rule failure → exit `1` +- Only `warning` or `info` failures, no `error` failures → exit `0` +- All rules pass → exit `0` + +Severity is not a quality judgment about the rule. It is a policy decision about whether a given rule failure should stop the pipeline. Both the `error` rule and the `warning` rule are deterministic — they either pass or fail, with the same certainty. The difference is what the pipeline does when they fail. + +This makes severity the mechanism for expressing enforcement policy in config, not in orchestration logic. You do not need a separate decision layer that reads validation results and decides whether to stop the pipeline — the exit code already encodes that decision, and the severity settings on each rule are the policy that drives it. + +--- + +## Shell Composability + +The exit code is what makes a validation tool composable with any Unix toolchain. Because DataCheck is a subprocess that exits with a standard code, it can participate in any shell composition pattern without modification. + +**Sequential enforcement with `&&`:** + +```bash +# Each step only runs if the previous step exited 0 +datacheck validate -c checks/raw.yaml \ + && dbt run --models staging \ + && datacheck validate -c checks/staging.yaml \ + && dbt run --models marts \ + && datacheck validate -c checks/marts.yaml \ + && python publish.py +``` + +If any DataCheck validation fails, the chain stops at that point. The subsequent dbt run and all downstream steps are skipped. The pipeline halts exactly where the contract was violated. 
+ +**Script-level enforcement with `set -e`:** + +```bash +#!/bin/bash +set -e # Exit the script on any non-zero exit code +set -o pipefail # Extend to pipeline failures + +datacheck validate -c checks/orders.yaml +python transform_orders.py +datacheck validate -c checks/orders_mart.yaml +python publish_orders_mart.py +``` + +`set -e` means every DataCheck invocation in the script is implicitly an enforcement gate. If any validation fails, the script exits immediately — the remaining commands do not run. This is equivalent to explicit `&&` chaining but removes the repetition. + +**Makefile dependency chains:** + +```makefile +validate-raw: + datacheck validate -c checks/raw.yaml + +transform: validate-raw + dbt run --models staging + +validate-staging: transform + datacheck validate -c checks/staging.yaml + +publish: validate-staging + python publish.py +``` + +Make enforces the dependency graph: `publish` depends on `validate-staging`, which depends on `transform`, which depends on `validate-raw`. A non-zero exit from any target prevents dependent targets from running. This is the same sequential enforcement, expressed as a declarative graph rather than an imperative script. + +All three patterns work without any DataCheck-specific integration. The shell, Make, and any other Unix toolchain that respects exit codes enforce the gate automatically. + +--- + +## How Orchestrators Consume Exit Codes + +Each orchestration environment reads exit codes through its own abstraction, but the underlying mechanism is the same. + +**Apache Airflow BashOperator:** + +```python +from airflow.operators.bash import BashOperator + +validate_orders = BashOperator( + task_id="validate_orders", + bash_command="datacheck validate -c /config/orders.yaml", +) +``` + +Airflow's `BashOperator` spawns a subprocess and waits for it to exit. If the subprocess exits with a non-zero code, the operator raises `AirflowException`. Airflow marks the task as failed. Downstream tasks that depend on `validate_orders` are skipped. The DAG run is marked failed. Airflow's retry policy applies if configured. + +The DataCheck Airflow operator wraps this pattern with validation-specific options: + +```python +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +validate_orders = DataCheckOperator( + task_id="validate_orders", + config_path="/config/orders.yaml", + source_name="production_db", + table="orders", + fail_on_error=True, +) +``` + +`fail_on_error=True` maps to the exit code contract: if DataCheck exits `1`, raise `AirflowException`. `fail_on_error=False` maps to running DataCheck in warning-only mode — the task reports results but does not fail the DAG regardless of exit code. + +**Kubernetes Jobs:** + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: validate-orders +spec: + template: + spec: + containers: + - name: datacheck + image: python:3.12-slim + command: + - sh + - -c + - | + pip install -q datacheck-cli[postgresql] && \ + datacheck validate -c /config/orders.yaml + env: + - name: DB_HOST + valueFrom: + secretKeyRef: + name: db-credentials + key: host + restartPolicy: Never # Don't retry data quality failures + backoffLimit: 0 # No retries — a failed validation is not a transient error +``` + +Kubernetes reads the container's exit code. Exit non-zero marks the Job as failed. `restartPolicy: Never` with `backoffLimit: 0` is the correct policy for data quality failures — unlike infrastructure failures, a bad dataset does not fix itself on retry. 
The exit code semantics of DataCheck directly inform the Kubernetes Job policy. + +**Argo Workflows:** + +```yaml +- name: validate-and-load + dag: + tasks: + - name: validate-raw + template: datacheck-validate + arguments: + parameters: + - name: config + value: checks/raw.yaml + + - name: transform + dependencies: [validate-raw] + template: dbt-run + + - name: validate-mart + dependencies: [transform] + template: datacheck-validate + arguments: + parameters: + - name: config + value: checks/mart.yaml + + - name: publish + dependencies: [validate-mart] + template: publish-data +``` + +Argo's DAG task dependencies resolve based on task success status — which is derived from exit codes. A failed `validate-raw` task prevents `transform` from starting. This is the same DAG enforcement pattern as Airflow, but driven through Argo's declarative workflow definition. + +--- + +## The Canary Approach to Rule Introduction + +One practical implication of the severity-to-exit-code mapping: you can introduce new validation rules without immediately blocking the pipeline, observe their behavior in production, and promote them to enforcement when confident they are correct. + +The pattern is deliberate severity staging: + +**Stage 1 — Observe.** Add the rule at `severity: warning`. The pipeline runs unchanged. The rule reports failures to the output, but exit code remains `0` when only warnings fire. You learn whether the rule produces false positives against live production data. + +```yaml +# Stage 1: observe behavior without blocking +- name: amount_precision + column: amount + rules: + regex: '^\d+\.\d{2}$' # Require exactly 2 decimal places + severity: warning +``` + +**Stage 2 — Evaluate.** Run the pipeline for several cycles. If the rule never fires, it is either correct or the constraint is already consistently met. If it fires frequently on legitimate data, the rule is wrong and needs adjustment. + +**Stage 3 — Enforce.** Change `severity: error`. The next pipeline run where this rule fails will exit `1`. + +```yaml +# Stage 3: enforce after validation in production +- name: amount_precision + column: amount + rules: + regex: '^\d+\.\d{2}$' + severity: error +``` + +This severity promotion is a single-line diff in the config file. It produces a clear audit trail in source control: the rule was introduced as a warning on one date, promoted to error on another, with the reason visible in the commit message and PR review. + +The exit code contract makes this migration path safe: the validation tool never accidentally enforces a rule that is not yet at `severity: error`. The mapping from severity to exit code is explicit and stable. + +--- + +## The Dual-Channel Output Pattern + +When DataCheck runs with SARIF output, two independent channels carry information: + +```bash +datacheck validate -c checks/orders.yaml \ + --format sarif \ + --output results.sarif +# exit code: 0 or 1 (enforcement) +# results.sarif: human-readable annotation data (reporting) +``` + +The exit code drives pipeline enforcement. The SARIF file drives annotation — GitHub Security tab, IDE integration, human-readable reports. These channels are independent. The enforcement decision (proceed or stop) is made by the parent process reading the exit code. The annotation experience is driven by the SARIF file being uploaded to GitHub's security endpoint. + +This separation matters: you can configure DataCheck to write SARIF and still let the exit code gate the pipeline. The reporting output does not influence the enforcement output. 
A pipeline that generates a SARIF report and still exits `1` blocks the merge and annotates the PR simultaneously. + +```yaml +# .github/workflows/data-quality.yml +- name: Validate + run: | + datacheck validate -c .datacheck.yaml \ + --format sarif --output results.sarif + # Non-zero exit fails this step → fails the job → blocks merge + +- name: Upload annotations + uses: github/codeql-action/upload-sarif@v3 + if: always() # Upload even if the previous step failed + with: + sarif_file: results.sarif +``` + +`if: always()` ensures the SARIF upload runs even when DataCheck exits `1`. The enforcement (blocked merge) and the annotation (inline PR comments showing which rules failed) both happen. Neither depends on the other. + +--- + +## Why Not Exceptions? + +The DataCheck Python API raises `ValueError` when validation fails, which is the correct interface for Python callers: + +```python +from datacheck import ValidationEngine + +engine = ValidationEngine(config_path=".datacheck.yaml") +summary = engine.validate() + +if not summary.all_passed: + raise ValueError(f"Validation failed: {summary.failed_rules} rules failed") +``` + +The CLI translates validation outcomes to exit codes instead of exceptions — not because exceptions are wrong, but because exceptions are a Python-specific mechanism that does not exist in the environments where pipeline enforcement operates. + +A Bash script cannot catch a Python exception. GitHub Actions cannot route on a Python exception. Kubernetes cannot set job completion policy based on a Python exception. Airflow can catch exceptions from Python operators, but the `BashOperator` — and the DataCheck CLI it invokes — uses exit codes. + +The exit code is the interface that all of these environments share. It is the result of fifty years of convergence on a universal process communication mechanism. A data validation CLI that exits with meaningful codes is a tool that works everywhere pipeline execution happens — without any environment-specific integration. + +--- + +## Engineering Takeaways + +- **Exit codes are a bilateral contract.** The validation tool commits to specific exit code semantics; the caller commits to enforcement based on those semantics. The contract requires no network, no shared state, and no registration — it is carried in the process table. + +- **Exit code `1` and exit code `3` are distinct failure modes requiring different pipeline responses.** Exit `1` means data failed its contract — a retry will not fix it, the data is bad. Exit `3` means the data source was unreachable — a retry may succeed. Pipelines that inspect exit codes can route these cases to different recovery paths. + +- **Severity is the policy layer that maps rule failures to exit code determination.** `severity: error` rules contribute to exit `1`. `severity: warning` rules do not. Severity is not a quality judgment — it is an explicit policy decision about what should stop the pipeline, encoded in config and reviewed in PRs. + +- **Shell composability (`&&`, `set -e`, Makefile dependencies) requires no integration code.** Because DataCheck exits with standard POSIX codes, it participates correctly in any Unix pipeline composition pattern without modification. The shell is the orchestrator for the simple cases. + +- **Different orchestrators consume exit codes through different abstractions, but the underlying mechanism is identical.** Airflow raises `AirflowException`. Kubernetes marks the Job failed. Argo marks the task failed. All of them read the process exit code. 
The exit code contract works across all of them without DataCheck needing to know which one is running it. + +- **The canary pattern — warning before error — is the safe way to introduce new rules to a production pipeline.** Add the rule at `severity: warning`, observe its behavior over multiple pipeline runs, then promote to `severity: error`. The severity promotion is a single-line diff that produces an auditable trail in source control. + +- **Enforcement and reporting are independent channels.** The exit code gates the pipeline. The SARIF file annotates the PR. Using `if: always()` on the upload step ensures annotations appear even when the gate fails. Neither channel depends on the other. + +- **Exit codes outlast any specific orchestration technology.** Airflow gets replaced. CI providers change. Kubernetes versions deprecate APIs. The POSIX exit code contract has been stable since the 1970s and is honored by every process scheduler ever written. A tool that uses exit codes as its enforcement interface will work with whatever runs it, now and in the future. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + +LinkedIn Post + +A process exits. One integer is written to the process table. The parent process reads it. Everything else — GitHub Actions failing a job, Airflow marking a task failed, Kubernetes marking a Job failed, Bash skipping the next command — flows from that one integer. + +This mechanism is fifty years old. It works identically on every CI system ever built, every shell ever written, every orchestrator ever deployed. And it is exactly the right interface for data pipeline enforcement. + +The interesting design decision is what different exit codes mean. Exit 1 means the data failed its validation contract — a retry won't fix it, the data is bad. Exit 3 means the data source was unreachable — a retry might succeed. These are different failure modes requiring different pipeline responses, and a tool that expresses them as different codes lets orchestrators route them correctly. + +"An exit code is a contract. The tool commits to specific semantics; the caller commits to enforcement. No network, no shared state, no registration required." + +There's also a migration pattern here: introducing new validation rules at severity: warning before promoting them to severity: error. The rule fires and reports, but exit remains 0. You observe behavior over production pipeline runs, then flip one YAML field to enforce. The promotion is a single-line diff with a clear audit trail. + +The universality is the feature. Tools that depend on webhooks, APIs, or platform-specific plugins work with one orchestrator. Exit codes work with all of them. \ No newline at end of file diff --git a/blog/2026-02-23-if-code-must-pass-ci-data-should-too.md b/blog/2026-02-23-if-code-must-pass-ci-data-should-too.md new file mode 100644 index 0000000..0af09fb --- /dev/null +++ b/blog/2026-02-23-if-code-must-pass-ci-data-should-too.md @@ -0,0 +1,305 @@ +# If Code Must Pass CI, Data Should Too + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +Every software team runs CI on code. No one debates whether a PR needs to pass tests and linting before it merges. The gate is structural — the deployment step does not run if tests fail. That constraint is enforced by the pipeline, not by convention. + +The data those systems produce and consume? 
In most engineering organizations, it ships with zero gates. + +This asymmetry is the source of a specific class of incident that is hard to attribute, slow to debug, and completely preventable. + +--- + +## The Structural Gap + +When an engineer changes an API response schema, the client tests catch it. When an engineer removes a function, the type checker fails. When an engineer introduces a calculation error, unit tests flag it. The feedback loop is tight: push, CI runs, fail fast, fix before merge. + +The data equivalent rarely exists. A data engineer updates an ETL job and a column that was `float` is now emitted as `string` — because a library changed, because implicit coercion happened somewhere, because the upstream source system changed its format. The ETL code change ships. The downstream consumer parses the column numerically. A sum aggregation returns `NaN`. A financial report silently zeroes out. The incident surfaces two days later when a user notices the numbers don't match. + +The code tests passed. Nothing in CI caught it. Because data was not part of CI. + +The problem is not that engineers do not care about data quality. It is structural: there is no gate. Data contracts change silently alongside code changes because nothing enforces them at the point of change. + +--- + +## What Code CI Actually Does + +Strip away the tooling and CI does one thing: it runs a set of rules against an artifact, reports whether the rules passed or failed, and returns an exit code. The exit code is the contract. `0` means proceed. Non-zero means stop. + +Everything else — PR annotations, build badges, Slack notifications — is UI around that exit code. + +Linters enforce style and correctness rules before code merges. Static analysis enforces type safety. Tests enforce behavioral contracts. Each tool runs as a subprocess, writes output to stdout, and exits with a code. The CI orchestrator does not care which tool ran — it only reads the exit code. + +This mechanism is simple, composable, and universal. It works with any tool that respects POSIX exit conventions. And it is exactly the mechanism that data validation needs. The tooling gap is not conceptual — it is that data validation tools have historically been built for dashboards, not for pipelines. The primary interface was a UI, not an exit code. + +--- + +## Where the Gate Lives + +The critical question is not whether to add a data gate — it is where in the pipeline to place it. + +**Before ingestion.** Validation runs against source data before it is loaded into the warehouse. If the source fails, the load does not happen. This is the cleanest position for catching upstream schema drift, but it requires access to the source at pipeline time. + +**After load, before transformation.** The gate runs against the raw table after ingestion but before dbt, before aggregations, before any downstream consumer touches it. This is the most common production-viable position and catches both source issues and load bugs. + +**After transformation, before serving.** The gate runs against the final artifact — the mart table, the parquet export, the API dataset — immediately before it is published. This catches transformation bugs that gating on raw data would miss. + +Production pipelines need gates at both positions 2 and 3. The key property in either case: the gate runs before the next stage can proceed — not as a parallel monitor that fires alerts after the fact, but as a sequential dependency that blocks progress. 
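A minimal sketch of what gates at positions 2 and 3 look like in sequence; the config paths and script names here are illustrative, not prescribed:

```bash
#!/bin/bash
set -e  # any non-zero exit halts the script

# Gate at position 2: raw table, after load, before transformation
datacheck validate -c checks/orders_raw.datacheck.yaml

dbt run --models orders_mart

# Gate at position 3: final artifact, after transformation, before serving
datacheck validate -c checks/orders_mart.datacheck.yaml

python publish_orders_mart.py
```

A failure at either gate stops the script before the next stage runs; the enforcement is the ordering itself.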
+ +--- + +## Config as a Versioned Contract + +The second requirement is treating validation rules as code. Not as dashboard configurations, not as UI settings, not as metadata in a catalog — as text files that live in the repository, get reviewed in pull requests, and evolve alongside the code that produces the data. + +```yaml +# checks/orders.datacheck.yaml + +data_source: + type: postgresql + # connection via sources.yaml + +checks: + - name: order_id_integrity + column: order_id + rules: + not_null: true + unique: true + + - name: order_total_valid + column: order_total + rules: + not_null: true + type: float + min: 0 + max: 1000000 + + - name: status_constrained + column: status + rules: + not_null: true + allowed_values: [pending, confirmed, shipped, cancelled, refunded] + + - name: created_at_fresh + column: created_at + rules: + not_null: true + max_age: 7d +``` + +This file belongs in the same repository as the ETL code that produces the `orders` table. When an engineer changes the ETL, the diff includes any validation config change. The PR reviewer sees both. If the ETL now emits `order_total` as a string, the reviewer sees that the `type: float` rule was removed or modified — that is a conversation before merge, not an incident two days later. + +Without the config in the repository, there is no signal that the data contract changed. It is an invisible breaking change dressed as a code change. + +--- + +## Schema Contracts as Breaking Change Detection + +One pattern deserves explicit treatment: schema contracts. + +A schema contract captures the shape of a dataset at a known-good state — column names, types, nullability — and compares every subsequent run against that baseline. Any structural deviation fails the gate. + +```bash +# Run once against a known-good state. Commit the output file. +datacheck schema capture --source production_db --sources-file sources.yaml + +# In CI on every subsequent run: +datacheck schema compare --source production_db --sources-file sources.yaml +``` + +When the comparison fails: + +``` +FAIL schema_compare: Column 'order_total' type changed: float -> varchar +FAIL schema_compare: Column 'tax_rate' removed +PASS schema_compare: Column 'order_id' unchanged (integer, NOT NULL) +PASS schema_compare: Column 'status' unchanged (varchar, NOT NULL) + +Exit code: 1 +``` + +The pipeline stops. The engineer who triggered the upstream change gets a clear failure at the point of change — not after downstream consumers have already ingested corrupted data. + +This is the data equivalent of API breaking change detection. In code, you use a type system or a schema registry. For data artifacts, the equivalent is a captured baseline compared on every pipeline run. The baseline file lives in the repository alongside the validation config — both are versioned, both are reviewed, both encode what the contract is expected to be. + +--- + +## The GitHub Actions Integration + +Wiring this into GitHub Actions is direct. The key design choice: trigger on every push that touches ETL code, dbt models, or validation config — not just on a schedule. 
+ +```yaml +# .github/workflows/data-quality.yml +name: Data Quality Gate + +on: + push: + paths: + - 'etl/**' + - 'dbt/**' + - 'checks/**' + - 'schema-baselines/**' + pull_request: + paths: + - 'etl/**' + - 'checks/**' + +permissions: + contents: read + security-events: write + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install DataCheck + run: pip install datacheck-cli[postgresql] + + - name: Run data quality gate + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + run: | + datacheck validate -c checks/orders.datacheck.yaml \ + --format sarif --output results.sarif + + - name: Upload SARIF to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: results.sarif +``` + +The SARIF upload produces annotations directly on the pull request diff in the GitHub Security tab. Not in a separate dashboard, not in an email. On the PR, at the point of change. Exit code `1` from DataCheck causes the validation step to fail, which fails the job, which blocks the merge. That is the gate. + +--- + +## The Airflow DAG Pattern + +For orchestrated pipelines, the gate is structural: the validation task must succeed before the downstream task is eligible to run. The DAG encodes the dependency explicitly. + +```python +from airflow import DAG +from airflow.operators.python import PythonOperator +from airflow_provider_datacheck.operators.datacheck import DataCheckOperator + +with DAG("orders_pipeline", schedule_interval="@hourly", ...) as dag: + + extract = PythonOperator(task_id="extract_orders", ...) + + validate_raw = DataCheckOperator( + task_id="validate_raw_orders", + config_path="/config/orders-raw.datacheck.yaml", + source_name="staging_db", + table="orders_raw", + fail_on_error=True, + ) + + transform = PythonOperator(task_id="transform_orders", ...) + + validate_final = DataCheckOperator( + task_id="validate_orders_mart", + config_path="/config/orders-mart.datacheck.yaml", + source_name="production_db", + table="orders_mart", + fail_on_error=True, + ) + + serve = PythonOperator(task_id="publish_to_consumers", ...) + + extract >> validate_raw >> transform >> validate_final >> serve +``` + +If `validate_raw_orders` raises `AirflowException`, Airflow marks it failed and `transform_orders` never starts. If `validate_orders_mart` fails, `publish_to_consumers` never runs. The DAG graph is the enforcement mechanism — bad data cannot reach the next stage because the task that would move it there is blocked by a failed upstream dependency. + +This is structurally identical to how CI gates a deployment. The deployment step only runs if tests pass. The downstream task only runs if validation passes. + +--- + +## What Runs Against the Database + +For database sources, it is worth being explicit about what executes. DataCheck does not pull rows into the validation process. For a `not_null` rule on a million-row table: + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE order_total IS NULL +``` + +One row comes back. The validation host never sees the actual data. For a `type: float` check: + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE order_total IS NOT NULL + AND order_total::text !~ '^-?[0-9]+(\.[0-9]+)?$' +``` + +Again, a single integer. No rows transferred. No egress cost. Validation runs at warehouse speed, inside your existing compute, without a separate validation tier. 
+ +This has security and compliance implications beyond performance. In regulated environments — finance, healthcare, PII-heavy infrastructure — data leaving the warehouse boundary is an audit event. Aggregate `COUNT` queries never expose row-level data. The validation result tells you how many rows failed a rule, not which rows or what values they contained. + +--- + +## What This Gate Catches + +Gating data in CI catches a specific set of failure modes that no other mechanism addresses before data moves downstream: + +**Type regressions.** An ETL update changes a column from `float` to `string` through implicit coercion. The `type: float` rule fails on the first post-deploy pipeline run, before any downstream aggregation processes the column. + +**Schema drift from upstream sources.** A source system removes a column, renames a field, or changes nullability. Schema comparison catches the structural deviation before downstream consumers encounter an unexpected shape. + +**Value set violations.** A source system adds a new enum value — `"on_hold"` — that the downstream status mapping does not handle. The `allowed_values` rule catches it before the unmapped value produces a silent `NULL` in downstream joins. + +**Referential breaks.** `orders.customer_id` references customer IDs that were deleted from the customers table. A `foreign_key_exists` rule catches the orphaned references before the broken join propagates into aggregations. + +**Temporal staleness.** A scheduled extraction job fails silently and the table stops updating. A `max_age: 24h` rule fails the next morning's pipeline run before downstream consumers serve stale data. + +None of these require statistical anomaly detection. None require training data or ML inference. They require explicit rules, written by engineers, enforced at the gate. The rules are deterministic — they produce the same result on every run, in every environment, at any scale. + +--- + +## Engineering Takeaways + +- **The exit code is the contract.** `0` means data passed its rules. `1` means at least one error-severity rule failed. Any CI system, orchestrator, or shell script that reads POSIX exit codes can enforce a data gate without custom integration code. + +- **Validation config belongs in the repository.** Rules that live outside the codebase are invisible to reviewers. A data contract change that is not in the PR diff is an invisible breaking change. Versioning the config alongside the ETL makes contract changes reviewable before they ship. + +- **Gate position determines what you catch.** Gating before ingestion catches source schema drift. Gating after load catches ETL bugs. Gating after transformation catches modeling errors. A single gate is not sufficient for a production pipeline. + +- **Schema comparison is breaking change detection for data.** Capture a baseline against a known-good state, commit the file, compare on every run. Structural deviations fail the gate. This should be a standard practice, not an advanced configuration. + +- **SQL pushdown keeps validation inside the warehouse boundary.** A single aggregate `SELECT` returns counts, not rows. No data leaves the warehouse. No egress cost, no PII exposure, no additional compute tier. The validation runs where the data already lives. + +- **Severity is an explicit engineering decision, not a tunable threshold.** `severity: error` stops the pipeline. `severity: warning` surfaces the issue without blocking. Both are written in config and reviewed in PRs. 
Neither is controlled by a model that learned what is "normal" from historical distributions. + +- **The DAG dependency graph is the enforcement mechanism.** A validation task that must succeed before a downstream task can run is not a monitoring job — it is a sequential dependency that blocks the next stage. The enforcement is in the topology, not in alerting logic. + +- **Data CI and data observability are complementary, not competing.** After the gate passes, trend visibility, anomaly alerting, and historical context for investigation all still matter. Data CI answers "does this batch meet its rules right now?" Observability answers "what happened over time?" The gate enforces the contract at the point of ingestion. Observability investigates what happened after. Both have a role, and they are separate tools solving separate problems. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + + + +LinkedIn Post + +Code can't ship without passing CI. Data ships constantly with no gate at all. + +That asymmetry explains a specific class of incident: an ETL job changes a float column to string, the code tests pass, and a financial aggregation silently returns NaN two days before anyone notices. + +The mechanism CI uses is not complicated — run rules against an artifact, return an exit code, block if non-zero. That's it. Every linter, static analyzer, and test runner works this way. The same mechanism applies directly to data validation. + +What's missing is the config living in the repository alongside the ETL code that produces the data. When validation rules are versioned with the pipeline, a data contract change shows up in the PR diff. The reviewer sees it. It's a conversation before merge, not an incident after. + +"A gate you can't trust is not a gate. A gate that doesn't exist isn't even that." + +DataCheck: deterministic data validation with POSIX exit codes, designed to run inside your existing pipeline compute. pip install datacheck-cli \ No newline at end of file diff --git a/blog/2026-02-23-schema-contracts-vs-semantic-contracts.md b/blog/2026-02-23-schema-contracts-vs-semantic-contracts.md new file mode 100644 index 0000000..1d9b50c --- /dev/null +++ b/blog/2026-02-23-schema-contracts-vs-semantic-contracts.md @@ -0,0 +1,342 @@ +# Schema Contracts vs Semantic Contracts in Modern Data Systems + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +A payments pipeline ran cleanly for two weeks after a source system migration. Schema validation passed on every run. Column names matched. Types matched. Nullability matched. The monitoring showed green. + +The problem was that the upstream system had changed its convention for `transaction_amount`: it now emitted values in cents instead of dollars. The column was still `NUMERIC NOT NULL`. The values were still positive numbers. Every structural check passed. Every semantic contract was broken. + +Revenue figures were off by two orders of magnitude. The error was discovered when a finance reconciliation showed a 100x discrepancy between the warehouse aggregation and the source system's ledger. Two weeks of pipeline runs. Clean schema validation throughout. + +Schema contracts and semantic contracts are different things. Conflating them — or worse, treating schema compliance as the complete contract — is how this class of incident happens. 
+ +--- + +## Two Different Questions + +A data contract answers one of two fundamentally different questions: + +**"Is the shape of this data what I expect?"** This is the schema contract. It covers column names, data types, nullability, cardinality hints, and structural organization. A schema contract violation means the data's container has changed — a column was removed, renamed, or retyped. The consumer may not be able to parse the data at all. + +**"Are the values inside this data what I expect?"** This is the semantic contract. It covers the meaning of values: valid ranges, format conventions, allowed value sets, temporal constraints, cross-column relationships, and population-level invariants. A semantic contract violation means the data arrived in a parseable form but contains incorrect or invalid content. + +These questions require different mechanisms to answer. Schema comparison detects structural drift. Validation rules enforce semantic correctness. A pipeline that only runs schema comparison is answering the first question and ignoring the second. Most data quality incidents live in the second category. + +--- + +## The Contract Hierarchy + +There are five levels of data contract, each expressing a different class of expectation: + +**Level 1 — Structural.** Column names, data types, nullability. The container. Schema comparison tools operate at this level. DataCheck's `schema capture/compare` operates here: it detects column additions, removals, type changes, and nullability changes against a versioned baseline. + +**Level 2 — Syntactic.** Format constraints on values within their containers. A `varchar` column might store email addresses, UUIDs, ISO 8601 dates, E.164 phone numbers, or free text. The type says nothing about the format. Syntactic contracts express what the value looks like: `regex`, `allowed_values`, `date_format_valid`, `type` (for string-typed numeric fields). These are detectable by inspecting individual values without any cross-row context. + +**Level 3 — Semantic.** Value constraints that express domain meaning. A transaction amount must be positive. A timestamp cannot be in the future. A percentage must be between 0 and 1. A record cannot be more than 48 hours old. These constraints require knowledge of the domain — they cannot be derived from the data type alone. + +**Level 4 — Referential.** Cross-table relationships. `orders.customer_id` must reference a valid row in `customers`. A `product_id` in a sales fact table must exist in the products dimension. These constraints require evaluating the current table against another table — the relationship must hold, not just the individual values. + +**Level 5 — Aggregate.** Population-level invariants. The sum of `line_item_total` across all rows for an `order_id` must equal the `order_total` on the order header. Each `(user_id, subscription_plan)` combination must be unique in the active subscriptions table. These constraints are invisible at the row level — they only exist as properties of the full dataset. + +Database schema DDL enforces Level 1 structurally and can partially enforce Level 4 via `FOREIGN KEY` constraints. Levels 2, 3, and 5 are entirely outside what the database schema can express concisely and reliably in production systems. + +--- + +## Encoding Drift: The Failure Mode Schema Checking Cannot Catch + +Encoding drift is the class of data quality failure where the schema is preserved but the convention for filling it is changed. 
The container is intact; the meaning of its contents has shifted. + +**Units.** The source system changes `transaction_amount` from dollars to cents. Schema: `NUMERIC(18,2) NOT NULL`. Values are still numeric, still positive, still non-null. Downstream aggregations produce values 100x too large. Schema comparison returns clean. + +**Timezone convention.** The upstream ETL shifts from emitting UTC timestamps to emitting local-time timestamps without modifying the column. Schema: `TIMESTAMP NOT NULL`. Values are still timestamps, still parseable, still within plausible date ranges. Consumers that assume UTC are now off by a timezone offset — 4 to 9 hours depending on geography and DST. Every timestamp-based join and time-series aggregation is wrong. + +**ID format convention.** The source system migrates from integer-based customer IDs to UUID-based IDs. The warehouse column was already `VARCHAR(64) NOT NULL` to accommodate the integers as strings. The new UUIDs are also strings. Schema passes. But downstream joins that parse the old format (`CUST-001234`) fail silently for all new records, producing NULLs where matches should exist. + +In all three cases, a semantic rule would have caught the drift immediately: + +```yaml +checks: + - name: transaction_amount_valid + column: transaction_amount + rules: + type: float + min: 0.01 + max: 1000000.00 # $1M maximum — if values suddenly exceed this, investigate + severity: error + + - name: created_at_is_recent_utc + column: created_at + rules: + not_null: true + no_future_timestamps: true + max_age: 48h # Fails if timestamps are hours ahead (timezone drift) + severity: error + + - name: customer_id_format + column: customer_id + rules: + not_null: true + regex: '^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$' + severity: error +``` + +If the source system had been emitting dollar amounts and the semantic contract said `max: 1000000.00`, a batch where amounts suddenly jumped 100x would fail the rule on the first run after the migration. If the timestamp convention changed and UTC timestamps were expected, `no_future_timestamps` and `max_age` rules would catch values that were hours ahead of current time. If the customer ID format was pinned to a regex matching UUIDs, integer-format IDs would fail the constraint the first time they appeared. + +Schema comparison would have caught a type change from `NUMERIC` to `BIGINT` or `VARCHAR`. It would not have caught any of these convention changes. + +--- + +## Semantic Drift: When Value Sets Expand Without Notice + +A subtler failure mode is semantic drift — the gradual expansion of a value set that is constrained in the consumer but not enforced at the source. + +An `order_status` column starts with three values: `pending`, `confirmed`, `shipped`. The consumer's ETL has a `CASE` statement that maps these to internal status codes. The source system is under active development. Six months later, the source team adds `on_hold` and `partially_fulfilled` to support new fulfillment workflows. They document it in an internal wiki. The change does not trigger a schema change — the column is still `VARCHAR NOT NULL`. No schema alert fires. + +The new values flow downstream. The consumer's `CASE` statement has no branch for `on_hold` or `partially_fulfilled`. It returns `NULL` for all rows with these statuses. Aggregations that depend on status counts are now undercounting certain states. The error is silent. 
+ +An `allowed_values` rule catches this on the first pipeline run containing the new status values: + +```yaml +checks: + - name: order_status_constrained + column: order_status + rules: + not_null: true + allowed_values: [pending, confirmed, shipped] + severity: error +``` + +``` +FAIL order_status_constrained (847 failures / 2,341,887 rows) +Exit code: 1 +``` + +The pipeline stops. The 847 rows with `on_hold` status are counted and flagged. The consumer team learns that a new status value has appeared before it has been incorporated into the downstream mapping. The `allowed_values` rule becomes a communication mechanism between producer and consumer, enforced at the gate rather than discovered in a post-incident review. + +This is the key property of semantic contract enforcement: the producer's change to an undeclared value triggers an immediate, visible failure in the consumer's pipeline — not a silent downstream corruption days later. + +--- + +## Cross-Column Constraints: The Purely Semantic Layer + +Some of the most important data quality constraints have no representation in database schema DDL. They are purely semantic — properties of the relationship between values across columns, or across rows within a dataset. + +**Temporal ordering constraints.** An order cannot ship before it is confirmed. `shipped_at` must be greater than `confirmed_at`. + +```sql +SELECT COUNT(*) AS failed_count +FROM orders +WHERE shipped_at IS NOT NULL + AND confirmed_at IS NOT NULL + AND shipped_at < confirmed_at +``` + +No database schema primitive expresses this. A `CHECK` constraint could in theory, but cross-column `CHECK` constraints on timestamps are rarely used in distributed warehouse environments due to the lack of portability and enforcement overhead. + +**Financial integrity constraints.** The sum of line item totals must equal the order total. + +```yaml +checks: + - name: order_total_integrity + column: order_total + rules: + sum_equals: + group_by: order_id + sum_column: line_item_total + tolerance: 0.01 # Allow 1-cent floating-point rounding + severity: error +``` + +The generated SQL: + +```sql +SELECT COUNT(*) AS failed_count +FROM ( + SELECT order_id + FROM orders + GROUP BY order_id + HAVING ABS(SUM(line_item_total) - MAX(order_total)) > 0.01 +) AS violations +``` + +**Uniqueness across combinations.** A customer should not have two active subscriptions for the same plan simultaneously. + +```yaml +checks: + - name: no_duplicate_active_subscriptions + column: customer_id + rules: + unique_combination: + columns: [customer_id, subscription_plan] + where: "status = 'active'" + severity: error +``` + +These constraints represent business invariants — properties that must hold true for the data to be correct, independent of what any schema specification says. They are not derivable from column types, not expressible in DDL, and not catchable by schema comparison. They require explicit semantic rule specification. + +--- + +## The Inadequacy of Database Schema as a Contract + +Database DDL is a contract with the storage engine, not a contract with consumers. 
It enforces: + +- That columns exist with the declared type +- That NOT NULL columns contain values +- That UNIQUE columns contain distinct values (within the table) +- That FOREIGN KEY columns reference existing rows in another table (within the database) + +It does not enforce: + +- That values within the declared type are in a valid range +- That string values follow a particular format convention +- That the encoding convention for numeric values has not changed +- That enum-like string columns contain only the values the consumer expects +- That cross-column temporal ordering holds +- That financial aggregations are internally consistent + +`CHECK` constraints can express some of these, but in practice they are rarely used in production distributed data systems. They have limited cross-column support, no cross-row support, inconsistent behavior across warehouse engines, and create migration complexity. The industry standard for data teams working with Snowflake, BigQuery, and Redshift is: use DDL for structural definition, and enforce value contracts separately. + +DataCheck externalizes this enforcement into a YAML file that lives in the repository alongside the pipeline code. The contract is readable, reviewable, versionable, and portable — it works the same way against PostgreSQL, Snowflake, BigQuery, a CSV file, or a Parquet extract. + +--- + +## The Layered Validation Pattern + +In practice, both contract levels should be enforced, in sequence: + +```bash +# Layer 1: structural contract check +# Detects: added/removed columns, type changes, nullability changes +datacheck schema compare \ + --source production_db \ + --sources-file sources.yaml + +# Layer 2: semantic contract enforcement (only runs if Layer 1 passes) +# Enforces: format constraints, value ranges, cross-column invariants, aggregate rules +datacheck validate \ + -c checks/orders.datacheck.yaml \ + --source production_db +``` + +The sequence matters. If schema comparison fails — a column was removed, a type changed — the semantic validation results may be misleading. A rule expecting `amount` as a `float` will behave differently if the column is now `varchar`. Running semantic validation only on structurally sound data avoids validating against a corrupt schema baseline. + +```bash +#!/bin/bash +set -e + +echo "Layer 1: structural contract check" +datacheck schema compare --source production_db --sources-file sources.yaml + +echo "Layer 2: semantic contract enforcement" +datacheck validate -c checks/orders.datacheck.yaml --source production_db + +echo "Both contracts satisfied — proceeding to load" +python transform_and_load.py +``` + +The `set -e` means a failure at either layer halts the script. Layer 1 failures indicate structural breaks that need platform team attention. Layer 2 failures indicate value contract violations that need data investigation. 
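If the two layers should page different owners, the same script can branch on which layer failed. A sketch under the assumption of two separate webhooks; `$PLATFORM_WEBHOOK` and `$DATA_TEAM_WEBHOOK` are illustrative names, not DataCheck features:

```bash
#!/bin/bash

# Layer 1: structural contract. A failure here is a schema break for the platform team.
if ! datacheck schema compare --source production_db --sources-file sources.yaml; then
  curl -X POST "$PLATFORM_WEBHOOK" -d '{"text": "orders: structural contract changed"}'
  exit 1
fi

# Layer 2: semantic contract. A failure here is a value violation for the data owners.
if ! datacheck validate -c checks/orders.datacheck.yaml --source production_db; then
  curl -X POST "$DATA_TEAM_WEBHOOK" -d '{"text": "orders: semantic contract violated"}'
  exit 1
fi

python transform_and_load.py
```

Either branch still exits non-zero, so the gate semantics are unchanged; only the notification target differs.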
+ +A complete semantic contract for the same orders table: + +```yaml +# checks/orders.datacheck.yaml + +checks: + # Level 2: Syntactic + - name: order_id_format + column: order_id + rules: + not_null: true + regex: '^ORD-[0-9]{10}$' + + # Level 3: Semantic + - name: order_amount_valid + column: order_total + rules: + not_null: true + type: float + min: 0.01 + max: 1000000.00 + + - name: status_constrained + column: order_status + rules: + not_null: true + allowed_values: [pending, confirmed, shipped, cancelled, refunded] + + - name: timestamps_valid + column: created_at + rules: + not_null: true + no_future_timestamps: true + max_age: 7d + + # Level 4: Referential + - name: customer_exists + column: customer_id + rules: + not_null: true + foreign_key_exists: + table: customers + column: id + + # Level 5: Aggregate + - name: line_items_sum_to_total + column: order_total + rules: + sum_equals: + group_by: order_id + sum_column: line_item_total + tolerance: 0.01 + + - name: no_duplicate_orders + column: order_id + rules: + unique_combination: + columns: [customer_id, order_id] +``` + +Schema comparison covers Level 1. This config covers Levels 2 through 5. Together they answer both questions: is the shape correct, and are the values correct. + +--- + +## Engineering Takeaways + +- **Schema compliance is a necessary condition for data correctness, not a sufficient one.** A table that passes schema validation can contain completely incorrect values — wrong units, wrong format conventions, out-of-range numbers, broken referential relationships. Schema compliance means the container is intact. It says nothing about the contents. + +- **Encoding drift is the failure mode that schema checking structurally cannot catch.** A units change, a timezone convention change, or an ID format convention change preserves the column type while breaking the semantics. The only mechanism that catches encoding drift is an explicit semantic rule that knows what valid values look like — a range, a regex, or a temporal constraint. + +- **Semantic drift is the silent failure mode of missing `allowed_values` rules.** Upstream enum sets expand. New status values appear. Consumer mappings produce NULLs for unmapped values. Schema comparison returns clean because no structural change occurred. An `allowed_values` rule catches the new value on its first appearance. + +- **Cross-column constraints are purely semantic and have no schema representation.** Temporal ordering between columns (`shipped_at > confirmed_at`), financial integrity (`sum of line items = order total`), and combination uniqueness cannot be expressed in database DDL in a portable, production-viable way. They require explicit semantic rule specification. + +- **The five contract levels require two enforcement mechanisms.** `datacheck schema compare` answers "has the structural shape changed?" `datacheck validate` answers "do the values meet their semantic contract?" These are different commands answering different questions. Both need to run. + +- **Run schema comparison before semantic validation.** If the structural contract is broken — a column was removed or retyped — semantic validation against the changed structure may produce misleading results. Layer 1 passes before Layer 2 runs. + +- **Database DDL is a contract with the storage engine, not with consumers.** It enforces type and nullability. It does not enforce value ranges, format conventions, or cross-column invariants. 
Externalizing the semantic contract into versioned YAML that lives in the repository makes it reviewable, portable, and enforceable across environments — including environments where you do not control the DDL. + +- **Semantic contracts capture producer-consumer agreements that live outside the schema.** When a source team adds a new enum value, it should surface as a validation failure in the consumer's pipeline — not as a silent NULL in a downstream join. The `allowed_values` rule is the communication interface between producer and consumer, enforced at the gate. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + +LinkedIn Post + +A payments pipeline ran clean for two weeks after a source system migration. Schema validation passed every run. Column names matched, types matched, nullability matched. Green across the board. + +The upstream system had changed its transaction_amount convention from dollars to cents. The column was still NUMERIC NOT NULL. Values were still positive numbers. Every schema check passed. Revenue figures were off by 100x. + +Schema contracts and semantic contracts are different questions. "Is the shape of this data what I expect?" is a schema question. "Are the values inside correct?" is a semantic question. Most pipelines only answer the first one. + +"Schema compliance means the container is intact. It says nothing about the contents." + +The failure modes that live in the gap: encoding drift (units, timezone conventions, ID format conventions that change without a type change), semantic drift (upstream enum sets expanding without notifying consumers), and cross-column invariants that have no representation in database schema DDL at all. + +A CASE statement that maps order_status to internal codes silently returns NULL for on_hold status values that the source team added last month. The column is still VARCHAR NOT NULL. Nothing in schema validation catches it. An allowed_values rule catches it on the first run. + +Schema comparison catches structural breaks. Semantic rules catch value contract violations. Both questions need to be answered, by different mechanisms, in sequence. \ No newline at end of file diff --git a/blog/2026-02-23-sql-pushdown-as-a-validation-strategy.md b/blog/2026-02-23-sql-pushdown-as-a-validation-strategy.md new file mode 100644 index 0000000..b5324bc --- /dev/null +++ b/blog/2026-02-23-sql-pushdown-as-a-validation-strategy.md @@ -0,0 +1,289 @@ +# SQL Pushdown as a Validation Strategy: Architecture and Trade-offs + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +A data engineering team ran their quality checks against a Snowflake table containing 800 million rows. The check was whether the `transaction_id` column was null. Their tool fetched all 800 million rows to an EC2 validation host, loaded them into a pandas DataFrame, ran `.isnull().sum()`, and returned a number. + +The fetch took 11 minutes. The instance required 64 GB of RAM. The egress cost was approximately $6 per run. They ran it twice a day. + +The query that answers the same question: + +```sql +SELECT COUNT(*) AS null_count +FROM transactions +WHERE transaction_id IS NULL +``` + +Execution time in Snowflake: under one second. Cost: a fraction of one credit. The answer is identical. The architectural decision is not. 
+ +--- + +## The Principle: Move Computation to Data + +SQL pushdown is a specific instance of a general principle in distributed systems: move computation to the data, not data to the computation. + +When data lives in a warehouse — Snowflake, BigQuery, Redshift, PostgreSQL — it is already co-located with a query engine that is specifically designed and optimized for aggregate computation on that data. Column-oriented warehouses are built around the assumption that you will be running aggregate queries on large datasets. Their storage formats, compression strategies, and MPP execution engines are optimized for exactly this pattern. + +Pulling data out of the warehouse to validate it in an external Python process is an anti-pattern: it moves gigabytes across a network to use a slower compute environment for an operation the warehouse could have executed in seconds. + +SQL pushdown for validation means: express the validation logic as a SQL predicate, execute the predicate inside the warehouse, and receive only the aggregate result. The validator never sees the rows. It receives an integer — the count of rows that violated the constraint. + +--- + +## Three Execution Architectures + +There are three common approaches to running data validation against a warehouse table. They differ in what moves across the network and what the validator actually executes. + +**Architecture 1: Pull-to-Python** + +```python +# Validation tool fetches rows to an external process +df = pd.read_sql("SELECT * FROM orders", engine) +null_count = df["customer_id"].isnull().sum() +amount_violations = ((df["amount"] < 0) | (df["amount"] > 100000)).sum() +``` + +The entire table transfers across the network. Memory requirement on the validation host scales linearly with table size. Data leaves the warehouse boundary on every run. Validation performance is bottlenecked by the network and the external host's compute, not the warehouse's query engine. + +**Architecture 2: Push-Aggregate (SQL Pushdown)** + +```sql +SELECT COUNT(*) AS null_count +FROM orders +WHERE customer_id IS NULL +``` + +Zero row transfer. One integer returned. Runs at warehouse speed. Data never leaves the warehouse boundary. Validation performance is bottlenecked by query execution — which is what the warehouse is built for. + +**Architecture 3: Sampling** + +```python +df = pd.read_sql( + "SELECT * FROM orders TABLESAMPLE BERNOULLI (1)", engine +) +null_count = df["customer_id"].isnull().sum() +``` + +Transfers a fraction of the table. Reduces the cost of pull-to-Python. But introduces sampling error: a constraint violation affecting 0.1% of rows has a significant probability of not appearing in a 1% sample. For an enforcement gate where the contract is binary — pass or fail — sampling is the wrong execution model. The gate may report pass on data that is failing. + +For enforcement, Architecture 2 is the only correct choice. It returns exact counts, not estimates, and costs orders of magnitude less. + +--- + +## The Single Aggregate SELECT Optimization + +The most important optimization in SQL pushdown validation is batching multiple rule checks into one query — one table scan, multiple results. 
+ +A naive implementation runs one query per rule: + +```sql +-- Query 1 +SELECT COUNT(*) FROM orders WHERE customer_id IS NULL; + +-- Query 2 +SELECT COUNT(*) FROM orders WHERE amount < 0 OR amount > 100000; + +-- Query 3 +SELECT COUNT(*) FROM orders +WHERE status NOT IN ('pending', 'confirmed', 'shipped', 'cancelled'); + +-- Query 4 +SELECT COUNT(*) FROM orders WHERE created_at > NOW(); +``` + +This scans the table four times and pays query startup overhead four times. The same result is available from a single pass: + +```sql +SELECT + SUM(CASE WHEN customer_id IS NULL + THEN 1 ELSE 0 END) AS customer_id_nulls, + SUM(CASE WHEN amount < 0 OR amount > 100000 + THEN 1 ELSE 0 END) AS amount_violations, + SUM(CASE WHEN status NOT IN ( + 'pending', 'confirmed', 'shipped', 'cancelled') + THEN 1 ELSE 0 END) AS status_violations, + SUM(CASE WHEN created_at > NOW() + THEN 1 ELSE 0 END) AS future_timestamps +FROM orders +``` + +One table scan. Four rule results. On a column-oriented warehouse, the efficiency gains come from two properties: + +**Column projection.** Columnar storage reads only the columns referenced in the query. On a 200-column `orders` table where the validation config checks 4 columns, the warehouse reads approximately 4/200 of the stored data. The remaining 196 columns are never touched. This applies equally to the single-query and multi-query approaches — but the single query pays the per-query overhead once. + +**Query startup cost.** Each query on a warehouse cluster requires parsing, planning, and worker allocation. For an MPP system managing compute concurrency, a validation run with 20 rules that executes as 1 query consumes one query slot. The same run as 20 queries can hit concurrency limits, queue behind other workloads, and pay the startup cost twenty times. At scale, this difference shows up in pipeline latency. + +On BigQuery, billing is based on bytes scanned. Multiple `CASE WHEN` expressions on the same columns do not increase bytes scanned — the columns are read once regardless of how many expressions reference them. A 20-rule validation config over 4 columns costs the same as a 1-rule validation, provided they reference the same columns. + +--- + +## WHERE Clauses and Partition Pruning + +Validation runs against large historical tables benefit substantially from partition targeting. A daily-partitioned table with three years of history contains over 1,000 partitions. Without a partition filter, a COUNT query scans all of them. With one, it scans one. + +DataCheck supports this via the `--where` flag: + +```bash +datacheck validate \ + -c checks/orders.datacheck.yaml \ + --source production_db \ + --table orders \ + --where "DATE(created_at) = CURRENT_DATE" +``` + +The generated queries become: + +```sql +SELECT + SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) AS customer_id_nulls, + SUM(CASE WHEN amount < 0 OR amount > 100000 THEN 1 ELSE 0 END) AS amount_violations +FROM orders +WHERE DATE(created_at) = CURRENT_DATE +``` + +The warehouse query planner prunes all partitions except today's. The validation run scans one day's data instead of three years. On Snowflake, this is the difference between consuming one credit and consuming a hundred. On BigQuery, it is the difference between scanning 10 MB and scanning 20 GB. + +This optimization is only available because validation runs as SQL inside the warehouse. 
A pull-to-Python approach that adds a WHERE clause to its fetch query reduces data transfer — but it still transfers matching rows rather than returning a single aggregate. The optimization is partial. SQL pushdown makes it complete. + +--- + +## Egress Cost and Security Arithmetic + +The cost difference between pull-to-Python and SQL pushdown is not academic. For a mid-sized production table: + +- 100 million rows, 20 columns, ~200 bytes per row = **20 GB** +- AWS inter-region data transfer: $0.09/GB +- Pull-to-Python cost per run: **$1.80** +- At 4 validation runs per day: **$2,628 per year**, for one table + +SQL pushdown returns a single row of integers — approximately 100 bytes. The cost is negligible. For a data platform validating a dozen production tables multiple times per day, the egress arithmetic at scale favors pushdown by several orders of magnitude. + +The security implication has a different character. In a pull-to-Python approach, every validation run transmits the raw contents of the table to the validation host — including PII, financial fields, and any regulated data. That host is now in scope for your data security policy, your SOC 2 audit, and any HIPAA or PCI-DSS compliance review. Its memory, logs, and temporary storage become potential exposure surfaces for the data it processed. + +In a SQL pushdown approach, the validation host receives integers. It never sees column values. The warehouse boundary is the data security boundary. The validator knows that 3,412 rows violated a constraint — it does not know which rows or what their `customer_id` values were. This is not a limitation; it is a deliberate architectural property. + +--- + +## What SQL Pushdown Cannot Do + +Being precise about the limitations: + +**Cross-database foreign key checks.** A referential integrity constraint where the parent table lives in a different database — or a different warehouse system entirely — cannot be evaluated as a single pushed query. DataCheck handles single-warehouse foreign key validation via SQL pushdown. Cross-system referential integrity requires fetching one side, which is a partial pull. + +**Row-level failure details.** Pushdown returns counts, not rows. When a validation run reports 3,412 violations, you know how many — not which ones. Retrieving the offending rows is a separate warehouse query: + +```sql +SELECT order_id, customer_id, amount, status +FROM orders +WHERE amount < 0 OR amount > 100000 +ORDER BY created_at DESC +LIMIT 100 +``` + +This is the correct separation: the enforcement gate uses counts to make a pass/fail decision; investigation of failures uses direct warehouse queries. The validation tool's job is the gate, not the forensics. + +**Regex portability.** PostgreSQL supports `~` for regex matching. MySQL uses `REGEXP`. Snowflake uses `REGEXP_LIKE`. SQL Server has limited native regex support. DataCheck normalizes regex rules to the correct dialect per backend, but complex patterns — lookaheads, named groups, possessive quantifiers — may not be portable across all supported warehouse types. For maximum portability, keep regex patterns simple or use the Python API for regex-heavy validation on problematic backends. + +**Warehouse credit consumption.** Validation queries run on warehouse compute. On Snowflake and BigQuery, high-frequency validation pipelines will register in your credit and billing usage. 
The cost per run is typically small — a single aggregate SELECT against a partitioned table costs fractions of a credit — but it is not zero and should appear in capacity planning. This is a real cost, not a reason to avoid pushdown, but it should be quantified rather than ignored. + +--- + +## Custom SQL for Constraints YAML Cannot Express + +Some constraints require SQL expressiveness beyond what predicate-per-column rules can capture. DataCheck accepts a custom SQL query via `--query`. The query should return the rows that constitute a failure. DataCheck wraps it in a COUNT: + +```bash +# No customer should have more than 5 active subscriptions +datacheck validate \ + --source production_db \ + --check-name max_active_subscriptions \ + --query " + SELECT customer_id + FROM subscriptions + WHERE status = 'active' + GROUP BY customer_id + HAVING COUNT(*) > 5 + " +``` + +DataCheck executes: + +```sql +SELECT COUNT(*) AS failed_count +FROM ( + SELECT customer_id + FROM subscriptions + WHERE status = 'active' + GROUP BY customer_id + HAVING COUNT(*) > 5 +) AS violations +``` + +The result is still a single integer. The execution is still inside the warehouse. The exit code is still `1` if the count is non-zero. The pushdown property holds regardless of the complexity of the user-supplied query. + +This pattern handles aggregation-based constraints, cross-column join conditions, and any validation logic that requires SQL expressiveness beyond the YAML rule vocabulary. + +--- + +## File-Based Validation: The Equivalent Pattern + +For CSV and Parquet files, there is no warehouse to push to. DataCheck loads the file into an in-process engine and applies predicate logic. The architectural principle remains: push computation into the optimized engine, not a Python loop. + +The difference between vectorized evaluation and row iteration matters at scale: + +```python +# Row iteration: O(n) Python overhead per row, per rule +failed = 0 +for _, row in df.iterrows(): + if row["amount"] < 0 or row["amount"] > 100000: + failed += 1 + +# Vectorized: C-level SIMD execution, near-constant Python overhead +failed = ((df["amount"] < 0) | (df["amount"] > 100000)).sum() +``` + +For a 10 million row CSV, row iteration takes seconds per rule. Vectorized evaluation takes milliseconds. For a 20-rule validation config, the difference is a 60-second run versus a 3-second run. + +The underlying mechanism — expressing the validation as a predicate over a column rather than a condition on each row — is the same principle as SQL pushdown. The optimization target is the same: avoid Python-level loop overhead by delegating computation to the engine that is built to do it. + +--- + +## Engineering Takeaways + +- **Move computation to data, not data to computation.** SQL pushdown is a direct instantiation of this distributed systems principle. A COUNT predicate runs at warehouse speed. Pulling rows to validate externally fights against the warehouse's architecture rather than working with it. + +- **Batch multiple rules into a single aggregate SELECT.** `CASE WHEN` expressions inside a single query execute in one table scan. Twenty rules as twenty separate queries pays query startup overhead twenty times. On partitioned tables with high rule counts, this difference is measurable in both latency and warehouse credit consumption. + +- **Use WHERE clauses to enable partition pruning.** `--where "DATE(created_at) = CURRENT_DATE"` limits the warehouse scan to today's partition on a daily-partitioned table. 
The cost of validating a three-year historical table drops to the cost of validating one day's data. This only works because validation runs inside the warehouse's query planner. + +- **The egress cost is real and compounds with table count and frequency.** At $0.09/GB, a 20 GB table validated four times per day costs over $2,600 per year in transfer alone. SQL pushdown returns approximately 100 bytes per run. The cost difference is not marginal — it is structural. + +- **The warehouse boundary is the data security boundary.** SQL pushdown means the validation host receives integers, not rows. PII, financial data, and regulated fields never leave the warehouse. The validator cannot leak what it never received. This is an architectural property, not a configuration option. + +- **Pushdown returns counts, not rows.** The gate knows 3,412 rows failed — not which rows or what their values were. Investigation of failures is a separate warehouse query. This separation is correct: enforcement and forensics are different jobs, and conflating them by pulling rows into the validation host compromises the security property without improving the enforcement. + +- **Custom SQL extends pushdown to constraints YAML cannot express.** Aggregation-based constraints, cross-column join conditions, and HAVING clauses are outside the predicate-per-column rule model. Wrapping a user-supplied query in a COUNT preserves the pushdown property — the complex logic runs inside the warehouse, and the validator still receives only an integer. + +- **Sampling is not a valid substitute for pushdown in enforcement contexts.** A 1% sample misses constraint violations affecting less than ~5% of rows with meaningful probability. For a binary enforcement gate, the only acceptable false negative rate is zero. Pushdown provides exact counts at the cost of zero additional data transfer. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + + +LinkedIn Post + +A team was running data quality checks against an 800M-row Snowflake table. Their tool fetched all 800 million rows to an EC2 host to check whether a column was null. 11 minutes. 64 GB RAM. $6 in egress per run, twice a day. + +The query that answers the same question takes under a second and returns one integer. + +This is not a subtle optimization. It is the difference between working with a warehouse's architecture and fighting against it. MPP systems like Snowflake and BigQuery are built to execute aggregate queries at scale. Pulling rows out to validate them externally moves computation away from the engine that's optimized for it. + +The less obvious optimization is batching: twenty validation rules don't need twenty queries. A single SELECT with CASE WHEN expressions executes in one table scan, one round trip, one query slot. On columnar warehouses, it scans only the columns referenced — regardless of how many CASE WHEN branches reference them. + +"The validation host should receive integers, not rows. What it never received, it cannot leak." + +There's also a security property here that often goes unnoticed. When validation runs inside the warehouse, PII and regulated data never leave the warehouse boundary. The validator knows 3,412 rows violated a constraint — not which rows or what their values were. That's an architectural property, not a configuration option. 
\ No newline at end of file diff --git a/blog/2026-02-23-why-observability-is-not-enough-for-data-enforcement.md b/blog/2026-02-23-why-observability-is-not-enough-for-data-enforcement.md new file mode 100644 index 0000000..195a4a3 --- /dev/null +++ b/blog/2026-02-23-why-observability-is-not-enough-for-data-enforcement.md @@ -0,0 +1,244 @@ +# Why Observability Is Not Enough for Data Enforcement + +*Published 2026-02-23 by the DataCheck engineering team* + +--- + +The alert fired at 11:47 PM. The pipeline had run at 8:30 PM. For three hours, the downstream mart had been serving aggregations built on a column that had silently coerced from `numeric` to `varchar` in an upstream ETL change. The anomaly detection model flagged the null rate deviation. The alert went to PagerDuty. The on-call engineer woke up, confirmed the issue, and started the rollback at 12:15 AM. + +The observability platform worked exactly as designed. The problem is that "working as designed" still meant three hours of bad data in production, a manual rollback, and a downstream reprocessing job that ran until 4 AM. + +Observability is not enforcement. This distinction is architectural, not philosophical. Understanding why requires looking at how observability systems are built, what they optimize for, and what they fundamentally cannot do. + +--- + +## The Telemetry Chain Is Asynchronous by Design + +Observability systems are built around an asynchronous data collection architecture. Your pipeline runs, emits metrics or events to a collection endpoint, those events are buffered and ingested by the observability platform, processing and aggregation happen in the background, alert conditions are evaluated on the stored data, and notifications are dispatched if thresholds are breached. + +This chain is deliberately async. Async collection means your pipeline does not wait for the observability backend to process each event before proceeding. Async processing means the platform can absorb bursts. Async alerting means alert evaluation can run on aggregated windows rather than per-event. These are correct architectural choices for an observability system. + +But they mean the system is structurally incapable of sitting in the critical path of your pipeline. By the time the alert fires, your pipeline has already finished running. The data is already wherever your pipeline put it. + +An enforcement gate has the opposite requirement. It must be synchronous. It must be in the critical path. Its entire purpose is to prevent the next stage from running if the current stage produced bad data. You cannot retrofit a synchronous enforcement point onto an asynchronous observation system — they are solving different problems at different points in time. + +--- + +## Probabilistic Outputs Cannot Make Binary Routing Decisions + +The second architectural incompatibility is in the nature of the output. + +Anomaly detection produces a signal that is inherently probabilistic. "This value is 3.2 standard deviations from the 30-day rolling mean." "The null rate is elevated at 94th percentile of historical distribution." "Row count is anomalous with 87% confidence." These are useful signals for investigation — they tell you something changed, and they give you a severity indication. + +A pipeline gate needs a different kind of answer entirely. It needs a binary predicate: pass or fail, proceed or stop. 
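+
+The contrast is easy to see side by side. A small pandas illustration (the column name, the baseline numbers, and the 3.0 cut-off are placeholders for the sketch, not the output of any particular platform):
+
+```python
+import pandas as pd
+
+df = pd.DataFrame({"customer_id": [101, 102, None, 104]})
+
+# Probabilistic signal: a score that becomes a decision only after
+# someone chooses a threshold, and then keeps re-choosing it.
+baseline_mean, baseline_std = 0.001, 0.0005      # learned from history
+null_rate = df["customer_id"].isna().mean()
+anomalous = (null_rate - baseline_mean) / baseline_std > 3.0   # why 3.0?
+
+# Deterministic predicate: the same binary answer on every run,
+# with no sensitivity dial to drift.
+passed = not df["customer_id"].isna().any()
+```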
You cannot route a pipeline on a probability score without converting it to a threshold, and every threshold is a judgment call that must be made in advance, maintained over time, and tuned when it produces noise. + +Consider what happens to that threshold in practice. The anomaly model fires on Monday mornings because weekend data has a different volume profile. The on-call engineer adjusts the threshold. The model fires when a marketing campaign runs and order volume spikes 4x. The threshold gets widened. Six months in, the threshold that was set to catch a 10% null rate increase now lets through a 35% increase before alerting because the band was widened incrementally to suppress noise. + +This is not a failure of the observability platform. It is the expected behavior of a probabilistic system operated by engineers who rationally respond to false positives by tuning them away. The problem is that each tuning decision slightly weakens the gate, and the degradation is invisible until a bad batch slips through. + +A deterministic rule has no threshold to tune. The rule `not_null: true` either passes or fails. The rule `allowed_values: [pending, confirmed, shipped]` either passes or fails. There is no sensitivity dial, no window size to adjust, no training data to go stale. The predicate is the same on every run. + +--- + +## The Baseline Cold-Start Problem + +Statistical anomaly detection requires a baseline. It needs to know what "normal" looks like before it can identify what is "abnormal." This requirement creates a specific class of blind spots that occur exactly when bad data is most likely to enter. + +**New pipelines.** A new pipeline has no history. You cannot train an anomaly model on data that does not exist yet. From day one through the first weeks or months of operation, the model is either not running or operating on an insufficient baseline. These early runs are often the highest-risk period — the pipeline is new, the data sources are not fully understood, the transformation logic has not been battle-tested. + +**After schema changes.** When a column is added, removed, renamed, or changes type, the historical baseline for that column is no longer valid. A model trained on a `float` distribution does not have meaningful anomaly thresholds for `varchar`. After the change, you need to wait for the model to establish a new baseline — during which time the column has no effective anomaly coverage. + +**After data migrations.** When a source system migrates and the data characteristics change structurally — different ID formats, different value ranges, different cardinality — the old baseline misrepresents the new normal. Every value that is now legitimately different from the old distribution looks anomalous. The model fires constantly. Engineers tune down the sensitivity. The gate degrades. + +Deterministic rules do not have a cold-start period. A validation config written on day one of a pipeline enforces exactly the same rules as one written eighteen months in. A `regex` rule that validates UUID format enforces UUID format whether the column contains 100 rows or 100 million rows, whether it is one day old or three years old. 
+ +```yaml +checks: + - name: event_id_format + column: event_id + rules: + not_null: true + regex: '^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$' + + - name: event_type_constrained + column: event_type + rules: + not_null: true + allowed_values: [click, view, purchase, refund, signup] + + - name: amount_bounds + column: amount_usd + rules: + type: float + min: 0.01 + max: 99999.99 +``` + +These rules work on the first run. They work the same way on every subsequent run. There is no warm-up period, no baseline to rebuild after a schema change, no sensitivity to tune after a traffic spike. + +--- + +## Alert Fatigue as Systemic Gate Degradation + +There is a well-documented operational failure mode in alert-driven systems: when alerts fire frequently enough, engineers adapt by treating them as lower-priority signals. + +This adaptation is individually rational. If a data quality alert fires 40 times per week and 38 of those firings are benign — expected seasonality, acceptable distribution shifts, pipeline restarts — a skilled engineer will quickly learn to evaluate context before acting. The alert has trained them to distinguish signal from noise. The consequence is that when a real quality issue fires, it enters a queue of other alerts that are also being evaluated for context. Response time increases. The alert is acknowledged rather than acted on immediately. + +Pipeline enforcement does not have this failure mode. When a validation task in an Airflow DAG fails, the downstream task does not run. There is no alert to acknowledge, no threshold to tune, no on-call queue to compete with. The pipeline is stopped. The engineer sees a failed task. The only path forward is to fix the underlying issue. + +```python +# The task dependency IS the enforcement mechanism. +# No alert routing, no on-call, no threshold tuning. +extract >> validate_raw >> transform >> validate_final >> serve +``` + +If `validate_raw` fails, `transform` does not run. The pipeline is in a known, visible failed state. There is no ambiguity about whether this is signal or noise — the pipeline is down. That visibility is a feature, not a limitation. + +Compare this to the observability-as-enforcement pattern some teams use: an alert fires, which triggers a webhook, which pauses the pipeline, which notifies the on-call engineer, who then reviews the alert, confirms it is actionable, and manually resumes or kills the pipeline. That is four systems and three human decisions in the critical path of what should be a binary gate. Each link adds latency and failure surface. + +--- + +## Out-of-Band Observers and In-Path Gates + +The architectural distinction generalizes: observability is out-of-band; enforcement is in-path. + +An out-of-band observer receives copies of data and events. It runs alongside the system being observed. It can fail without affecting the pipeline. If your observability backend goes down, your pipelines continue running — which is the correct behavior for a monitoring system. + +An in-path gate sits in the critical path of execution. It must run. Its success or failure determines whether the next step proceeds. If the gate fails to execute, that itself is a failure — the pipeline should not proceed when its quality check is unavailable. + +This is not a subtle distinction. A system that can be bypassed without affecting pipeline execution is not a gate. It is a monitor. 
Calling it a gate is a category error that produces a false sense of security: teams believe their pipeline is protected by the quality check, but the check runs in a lane that does not intersect the pipeline's execution path. + +DataCheck is in-path by construction. It runs as a step in your pipeline. Its exit code is what the orchestrator reads to decide whether to proceed. + +```bash +# In a shell pipeline: the next command only runs if datacheck exits 0 +datacheck validate -c checks/events.datacheck.yaml && load_to_warehouse.sh +``` + +```yaml +# In a Makefile: explicit dependency chain +validate: + datacheck validate -c checks/events.datacheck.yaml + +load: validate + python load_to_warehouse.py +``` + +If `datacheck validate` exits `1`, `load_to_warehouse.sh` does not execute. The shell `&&` operator is the enforcement mechanism. No additional tooling required. + +Exit code semantics: + +| Code | Meaning | +|------|---------| +| `0` | All rules passed — pipeline may proceed | +| `1` | One or more error-severity rules failed — pipeline must stop | +| `2` | Configuration error — cannot proceed | +| `3` | Data loading error — cannot proceed | + +--- + +## What Observability Is Actually For + +None of this is an argument against observability. Observability tools solve problems that enforcement tools cannot. + +**Trend analysis.** A null rate that is 0.2% today and was 0.1% last week is not a failure — it might not even be notable. But a null rate that has climbed from 0.05% to 0.8% over six months is a signal worth investigating. Trend data requires time-series storage and historical context that a per-run validation tool does not provide. + +**Unknown unknowns.** Deterministic rules enforce what you know to check. An anomaly detection system can surface patterns you did not know to look for — unusual distributions, unexpected correlations, cardinality explosions. These are valuable discovery signals. + +**Post-incident investigation.** After a quality issue, you want to know when it started, how many runs were affected, and how the metrics evolved. Observability platforms are built for this kind of historical query. A validation tool records pass/fail per run — not the history needed for detailed incident analysis. + +**SLA monitoring.** "This pipeline has run successfully within the last 4 hours" is a monitoring question, not a validation question. DataCheck can check `max_age` on a timestamp column to catch stale data within a run — but SLA-level uptime monitoring belongs in your observability layer. + +The cleaner architecture uses both: deterministic validation rules as the synchronous enforcement gate, observability for trend analysis and post-hoc investigation. They operate at different points in the pipeline lifecycle and answer different questions. + +--- + +## The Enforcement Layer + +What the enforcement layer needs to provide is conceptually simple: explicit rules, evaluated deterministically, with a binary output that can be consumed by any pipeline orchestrator. 
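+
+In the YAML config, these are declared the same way as the rules shown earlier in this post, with one addition: a freshness rule on the timestamp column. A minimal sketch; the exact value syntax for `max_age` is an assumption for illustration, not documented behavior:
+
+```yaml
+checks:
+  - name: created_at_fresh
+    column: created_at
+    rules:
+      not_null: true
+      max_age: 4h   # assumed syntax: no row in the validated batch older than four hours
+```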
+ +In practice, for a database-backed pipeline, a `not_null` check executes as: + +```sql +SELECT COUNT(*) AS failed_count +FROM events +WHERE event_id IS NULL +``` + +An `allowed_values` check: + +```sql +SELECT COUNT(*) AS failed_count +FROM events +WHERE event_type NOT IN ('click', 'view', 'purchase', 'refund', 'signup') + AND event_type IS NOT NULL +``` + +A `max_age` check, confirming the table has been updated within the last 4 hours: + +```sql +SELECT COUNT(*) AS failed_count +FROM events +WHERE created_at < NOW() - INTERVAL '4 hours' +``` + +Each query returns a single integer — the number of rows that violated the rule. Zero means pass. Non-zero means fail. The entire validation run produces a single exit code. No metrics to store, no baselines to maintain, no training data to refresh. + +Running validation: + +```bash +datacheck validate -c checks/events.datacheck.yaml + + PASS event_id_format (0 failures / 1,847,293 rows) + PASS event_type_constrained (0 failures / 1,847,293 rows) + FAIL amount_bounds (3,412 failures / 1,847,293 rows) + PASS created_at_fresh (0 failures / 1,847,293 rows) + +Rules: 4 total 3 passed 1 failed +Exit code: 1 +``` + +The pipeline stops. The 3,412 rows that violated the `amount_bounds` rule are identified by count. The next stage does not run. No three-hour detection window. No on-call page at midnight. No rollback at 4 AM. + +--- + +## Engineering Takeaways + +- **Observability is asynchronous by design; enforcement requires synchronous in-path execution.** These are architectural incompatibilities. An async telemetry chain cannot be retrofitted into a synchronous pipeline gate — the data has already moved by the time the alert evaluates. + +- **Probabilistic anomaly scores cannot make binary routing decisions reliably.** Any threshold applied to a probability output will be tuned over time in response to false positives. That tuning incrementally weakens the gate. Deterministic predicates have no threshold to erode. + +- **Anomaly detection is least reliable precisely when you need enforcement most.** New pipelines, post-schema-change periods, and post-migration states have no valid historical baseline. Deterministic rules work from run one with no warm-up period. + +- **Alert fatigue degrades the gate over time; pipeline failure does not.** Engineers rationally learn to evaluate alert context before acting, which increases response time for real issues. A failed pipeline task produces a clear, unambiguous blocked state with no equivalent path toward being ignored. + +- **An out-of-band observer that can be bypassed is a monitor, not a gate.** If the quality check runs in a lane that does not intersect pipeline execution, it provides no enforcement guarantee. Enforcement requires being in the critical path. + +- **SQL pushdown means enforcement is a COUNT query, not a data scan.** A single aggregate SELECT returns one row — the number of violations. No data leaves the warehouse. No rows are transferred. The validation cost is proportional to the query, not the table size. + +- **Observability and enforcement answer different questions at different points in time.** "Does this batch meet its rules right now?" is an enforcement question. "How has this metric trended over the last 90 days?" is an observability question. Treating the second system as a substitute for the first leaves the pipeline unprotected during the window between ingestion and alert evaluation. 
+ +- **The correct architecture uses both.** Deterministic validation at the gate answers the binary question before data moves. Observability downstream answers the trend and anomaly questions after data has passed the gate. Neither is a substitute for the other. + +--- + +*DataCheck is an open-source deterministic validation engine. Install: `pip install datacheck-cli`. Source: [github.com/squrtech/datacheck](https://github.com/squrtech/datacheck).* + + + + +LinkedIn Post + +Data observability platforms are built to be asynchronous by design. Your pipeline runs, telemetry is emitted, it's collected in the background, processed, aggregated, and then an alert fires. + +By the time that alert fires, your data is already wherever the pipeline sent it. + +That's not a flaw in observability — it's the correct architecture for trend analysis, anomaly discovery, and historical investigation. But it means observability cannot be your quality gate. An async system cannot sit synchronously in the critical path of pipeline execution. + +The second problem is probabilistic output. Anomaly detection gives you scores and distributions. A pipeline gate needs a binary predicate. Every threshold you apply to that score will be tuned over time in response to false positives — and each tuning decision slightly weakens the gate. After six months, the threshold that was meant to catch a 10% null rate increase is letting through 30%. + +"An out-of-band observer that can be bypassed is a monitor, not a gate." + +The enforcement layer is a different architectural slot: deterministic rules, evaluated in-path, with a POSIX exit code that blocks the next stage if any rule fails. No alert to acknowledge. No threshold to tune. The pipeline either proceeds or it does not. + +Both layers belong in a mature data platform. They just answer different questions at different points in time. 
\ No newline at end of file diff --git a/datacheck/__init__.py b/datacheck/__init__.py index e15171b..4bf1840 100644 --- a/datacheck/__init__.py +++ b/datacheck/__init__.py @@ -1,4 +1,4 @@ -"""DataCheck - Lightweight data quality validation CLI tool.""" +"""DataCheck - A linter for data pipelines.""" from datacheck.engine import ValidationEngine from datacheck.exceptions import ( @@ -12,11 +12,9 @@ ValidationError, ) from datacheck.loader import ( - AvroLoader, CSVLoader, DataLoader, - DeltaLakeLoader, - DuckDBLoader, + DatabaseLoader, LoaderFactory, ParquetLoader, ) @@ -26,18 +24,8 @@ SchemaComparator, SchemaDetector, ) -from datacheck.profiling import DataProfiler -from datacheck.profiling.models import ColumnProfile, DatasetProfile -from datacheck.profiling.outliers import OutlierDetector, OutlierMethod -from datacheck.profiling.quality import QualityScorer -from datacheck.profiling.suggestions import RuleSuggester -from datacheck.profiling.formatters import ( - JsonFormatter, - MarkdownFormatter, - TerminalFormatter, -) -__version__ = "2.0.2" +__version__ = "2.1.0" __author__ = "Squrtech" __email__ = "contact@squrtech.com" @@ -58,9 +46,7 @@ "DataLoader", "CSVLoader", "ParquetLoader", - "DuckDBLoader", - "DeltaLakeLoader", - "AvroLoader", + "DatabaseLoader", "LoaderFactory", # Engine "ValidationEngine", @@ -71,15 +57,4 @@ "SchemaDetector", "SchemaComparator", "BaselineManager", - # Profiling - "DataProfiler", - "ColumnProfile", - "DatasetProfile", - "OutlierDetector", - "OutlierMethod", - "QualityScorer", - "RuleSuggester", - "JsonFormatter", - "MarkdownFormatter", - "TerminalFormatter", ] diff --git a/datacheck/airflow/__init__.py b/datacheck/airflow/__init__.py index 4dce2b7..0fd7f34 100644 --- a/datacheck/airflow/__init__.py +++ b/datacheck/airflow/__init__.py @@ -1,10 +1,10 @@ """Airflow integration for DataCheck. -Provides two operators for integrating DataCheck data quality -validation into Airflow pipelines: +Provides two operators for enforcing DataCheck validation rules +in Airflow pipelines: -- DataCheckOperator: Validate data against configured rules -- DataCheckSchemaOperator: Detect schema changes against baselines +- DataCheckOperator: Enforce validation rules against configured data sources +- DataCheckSchemaOperator: Enforce schema contracts against saved baselines For complex workflows, you can also use the CLI via BashOperator. """ diff --git a/datacheck/airflow/operators.py b/datacheck/airflow/operators.py index 69ea7f7..0c91b79 100644 --- a/datacheck/airflow/operators.py +++ b/datacheck/airflow/operators.py @@ -1,9 +1,9 @@ """Airflow operators for DataCheck validation. -Provides two operators for running data quality checks in Airflow DAGs: +Provides two operators for enforcing validation rules in Airflow DAGs: -- DataCheckOperator: Validate data against configured rules -- DataCheckSchemaOperator: Detect schema changes against baselines +- DataCheckOperator: Enforce validation rules against configured data sources +- DataCheckSchemaOperator: Enforce schema contracts against saved baselines """ from __future__ import annotations @@ -103,17 +103,15 @@ class DataCheckOperator(BaseOperator): Attributes: config_path: Path to the DataCheck validation config YAML - file_path: Path to a data file (CSV, Parquet, Avro, Delta, etc.) 
+ file_path: Path to a data file (CSV, Parquet) sources_file: Path to named sources YAML file source_name: Named source to validate table: Database table name override where: SQL WHERE clause for filtering query: Custom SQL query (alternative to table) - sample_rate: Random sample fraction (0.0-1.0) parallel: Enable multi-core validation workers: Number of worker processes min_pass_rate: Minimum rule pass rate to succeed (0-100) - min_quality_score: Minimum quality score to succeed (0-100) fail_on_error: Whether to fail the Airflow task on validation failure push_results: Whether to push results to XCom """ @@ -141,11 +139,9 @@ def __init__( table: str | None = None, where: str | None = None, query: str | None = None, - sample_rate: float | None = None, parallel: bool = False, workers: int | None = None, min_pass_rate: float = 0.0, - min_quality_score: float = 0.0, fail_on_error: bool = True, push_results: bool = True, **kwargs, @@ -154,17 +150,15 @@ def __init__( Args: config_path: Path to DataCheck validation config YAML (required) - file_path: Path to data file (CSV, Parquet, Avro, Delta, etc.) + file_path: Path to data file (CSV, Parquet) sources_file: Path to sources YAML file (overrides config) source_name: Named source from sources.yaml table: Database table name (for database sources) where: WHERE clause for filtering (for database sources) query: Custom SQL query (alternative to table) - sample_rate: Random sample fraction (0.0-1.0) parallel: Enable parallel execution workers: Number of worker processes (default: CPU count) min_pass_rate: Minimum pass rate percentage (0-100, 0 = disabled) - min_quality_score: Minimum quality score (0-100, 0 = disabled) fail_on_error: Whether to raise AirflowException on failure push_results: Whether to push results to XCom **kwargs: Additional arguments passed to BaseOperator @@ -177,11 +171,9 @@ def __init__( self.table = table self.where = where self.query = query - self.sample_rate = sample_rate self.parallel = parallel self.workers = workers self.min_pass_rate = min_pass_rate - self.min_quality_score = min_quality_score self.fail_on_error = fail_on_error self.push_results = push_results @@ -219,10 +211,7 @@ def execute(self, context: dict[str, Any]) -> dict[str, Any]: try: if self.file_path: # File-based validation - summary = engine.validate_file( - self.file_path, - sample_rate=self.sample_rate, - ) + summary = engine.validate_file(self.file_path) elif self.source_name or engine.config.source: # Named source validation summary = engine.validate_sources( @@ -230,16 +219,12 @@ def execute(self, context: dict[str, Any]) -> dict[str, Any]: table=self.table, where=self.where, query=self.query, - sample_rate=self.sample_rate, ) elif engine.config.data_source is not None: # Inline data_source from config config_dir = Path(self.config_path).parent source_path = config_dir / engine.config.data_source.path - summary = engine.validate_file( - str(source_path), - sample_rate=self.sample_rate, - ) + summary = engine.validate_file(str(source_path)) else: raise AirflowException( "No data source specified. 
Provide file_path, " @@ -258,9 +243,8 @@ def execute(self, context: dict[str, Any]) -> dict[str, Any]: ) # Check thresholds - has_thresholds = self.min_pass_rate > 0 or self.min_quality_score > 0 + has_thresholds = self.min_pass_rate > 0 met_pass_rate = pass_rate >= self.min_pass_rate - met_quality = pass_rate >= self.min_quality_score # Build results results = { @@ -278,7 +262,6 @@ def execute(self, context: dict[str, Any]) -> dict[str, Any]: if has_thresholds: results["met_pass_rate_threshold"] = met_pass_rate - results["met_quality_threshold"] = met_quality # Push to XCom if self.push_results: @@ -301,11 +284,6 @@ def execute(self, context: dict[str, Any]) -> dict[str, Any]: f"Pass rate {pass_rate:.1f}% below threshold " f"{self.min_pass_rate}%" ) - if not met_quality: - raise AirflowException( - f"Quality score {pass_rate:.1f} below threshold " - f"{self.min_quality_score}" - ) else: # Strict mode: fail if any error-severity rule failed if not summary.all_passed: @@ -328,8 +306,7 @@ class DataCheckSchemaOperator(BaseOperator): baseline. If no baseline exists, captures one automatically. Data is loaded using DataCheck's LoaderFactory (supports CSV, - Parquet, Avro, Delta Lake, DuckDB, SQLite) or from named sources - for database connections. + Parquet) or from named sources for database connections. Examples: Compare file schema against baseline:: @@ -339,7 +316,7 @@ class DataCheckSchemaOperator(BaseOperator): file_path="/data/orders_{{ ds }}.parquet", baseline_name="orders", fail_on_breaking=True, - ) + ) # Supports CSV, Parquet, or named database sources Compare database table schema:: @@ -356,6 +333,7 @@ class DataCheckSchemaOperator(BaseOperator): sources_file: Path to named sources YAML file source_name: Named source to check table: Database table name + query: Custom SQL query (alternative to table) baseline_name: Name for the schema baseline baseline_dir: Directory to store baseline files fail_on_breaking: Whether to fail on breaking schema changes @@ -367,6 +345,7 @@ class DataCheckSchemaOperator(BaseOperator): "sources_file", "source_name", "table", + "query", "baseline_name", ) template_ext: Sequence[str] = (".yaml", ".yml") @@ -380,6 +359,7 @@ def __init__( sources_file: str | None = None, source_name: str | None = None, table: str | None = None, + query: str | None = None, baseline_name: str = "baseline", baseline_dir: str = ".datacheck/schemas", fail_on_breaking: bool = True, @@ -389,10 +369,11 @@ def __init__( """Initialize DataCheckSchemaOperator. Args: - file_path: Path to data file (CSV, Parquet, Avro, Delta, etc.) + file_path: Path to data file (CSV, Parquet) sources_file: Path to sources YAML file source_name: Named source from sources.yaml table: Database table name (for database sources) + query: Custom SQL query (alternative to table) baseline_name: Name for the schema baseline (default: "baseline") baseline_dir: Directory to store baseline files fail_on_breaking: Whether to raise AirflowException on breaking changes @@ -404,6 +385,7 @@ def __init__( self.sources_file = sources_file self.source_name = source_name self.table = table + self.query = query self.baseline_name = baseline_name self.baseline_dir = baseline_dir self.fail_on_breaking = fail_on_breaking @@ -434,7 +416,7 @@ def _load_data(self) -> pd.DataFrame: f"Source '{self.source_name}' not found. 
" f"Available: {', '.join(sorted(sources.keys()))}" ) - return load_source_data(sources[self.source_name], table=self.table) + return load_source_data(sources[self.source_name], table=self.table, query=self.query) raise AirflowException( "No data source specified. Provide file_path, " diff --git a/datacheck/cli/__init__.py b/datacheck/cli/__init__.py index 0e102b0..b658810 100644 --- a/datacheck/cli/__init__.py +++ b/datacheck/cli/__init__.py @@ -7,7 +7,7 @@ app = typer.Typer( name="datacheck", - help="Lightweight data quality validation CLI tool", + help="A linter for data pipelines. Enforce validation rules in CI, Airflow, and beyond.", add_completion=False, ) @@ -22,22 +22,21 @@ def version() -> None: @app.callback(invoke_without_command=True) def main(ctx: typer.Context) -> None: - """DataCheck - Lightweight data quality validation CLI tool. + """DataCheck - A linter for data pipelines. - Run 'datacheck validate ' to validate a data file. + Run 'datacheck validate' to enforce validation rules against a data source. Run 'datacheck --help' for more information. """ if ctx.invoked_subcommand is None: - console.print("[bold]DataCheck[/bold] - Data Quality Validation") + console.print("[bold]DataCheck[/bold] - A Linter for Data Pipelines") console.print(f"Version: {__version__}") console.print() console.print("Usage: datacheck [COMMAND] [OPTIONS]") console.print() console.print("Commands:") - console.print(" validate Validate data file against configured rules") - console.print(" profile Generate data quality profile for a dataset") + console.print(" validate Enforce validation rules against a data source") console.print(" config Configuration management commands") - console.print(" schema Schema evolution detection commands") + console.print(" schema Enforce schema contracts against a baseline") console.print(" version Display version information") console.print() console.print("Run 'datacheck [COMMAND] --help' for more information on a command.") @@ -46,7 +45,6 @@ def main(ctx: typer.Context) -> None: # Import submodules to register commands on app. # These must come AFTER app and console are defined to avoid circular imports. 
import datacheck.cli.validate # noqa: E402, F401 -import datacheck.cli.profile # noqa: E402, F401 # Register sub-apps (also triggers module-level command registration) from datacheck.cli.schema import schema_app # noqa: E402 diff --git a/datacheck/cli/config.py b/datacheck/cli/config.py index c152049..78f4d46 100644 --- a/datacheck/cli/config.py +++ b/datacheck/cli/config.py @@ -108,7 +108,6 @@ def config_init( console.print(f"[green]OK:[/green] Sample data generated: {data_filename} ({sample_rows} rows)") console.print("\n[dim]Edit the config file to customize validation rules.[/dim]") - console.print("[dim]To generate config from data, use: datacheck config generate [/dim]") raise typer.Exit(code=0) @@ -371,170 +370,6 @@ def config_templates() -> None: raise typer.Exit(code=4) from e -@config_app.command("generate") -def config_generate( - data_source: str = typer.Argument( - ..., - help="Data source to analyze (file path)", - ), - output: str = typer.Option( - "datacheck.yaml", - "--output", - "-o", - help="Output config file path", - ), - confidence: str = typer.Option( - "medium", - "--confidence", - "-c", - help="Minimum confidence for rules (low, medium, high)", - ), - name: str | None = typer.Option( - None, - "--name", - "-n", - help="Dataset name (default: derived from filename)", - ), - force: bool = typer.Option( - False, - "--force", - "-f", - help="Overwrite existing config file", - ), -) -> None: - """Generate configuration from data analysis. - - Analyzes the data file and generates validation rules based on - detected patterns, types, and statistics. - - Examples: - datacheck config generate data.csv - datacheck config generate data.csv --confidence high - datacheck config generate data.csv -o custom.yaml - - Exit codes: - 0 - Config generated successfully - 1 - Output file exists (use --force) - 3 - Data loading error - 4 - Unexpected error - """ - try: - from pathlib import Path - from datacheck.config import ConfigGenerator - - output_path = Path(output) - - # Check if file exists - if output_path.exists() and not force: - console.print( - f"[red]Error:[/red] Config file '{output}' already exists. 
" - f"Use --force to overwrite.", - style="red", - ) - raise typer.Exit(code=1) - - console.print(f"[cyan]Analyzing data:[/cyan] {data_source}") - - generator = ConfigGenerator() - - try: - dataset_name = name or Path(data_source).stem - result = generator.generate_from_file( - data_source, - confidence_threshold=confidence, - return_profile=True, - ) - assert isinstance(result, tuple) - config, profile = result - generator.save_config(config, output_path) - - except Exception as e: - console.print(f"[red]Data Load Error:[/red] {e}", style="red") - raise typer.Exit(code=3) from e - - console.print(f"[green]OK:[/green] Config generated: {output}") - - # Show summary - checks = config.get("checks", []) - metadata = config.get("metadata", {}) - - console.print("\n[bold]Generated Configuration Summary:[/bold]") - console.print(f" Dataset: {metadata.get('description', dataset_name)}") - console.print(f" Source rows: {metadata.get('source_rows', 'N/A'):,}") - console.print(f" Source columns: {metadata.get('source_columns', 'N/A')}") - console.print(f" Quality score: {metadata.get('quality_score', 'N/A')}") - console.print(f" Checks generated: {len(checks)}") - - if checks: - console.print("\n[bold]Checks:[/bold]") - for check in checks[:10]: - rules = list(check.get("rules", {}).keys()) - desc = check.get("description", "") - col = check.get("column") or ", ".join(check.get("columns", [])) - console.print(f" - {col}: {', '.join(rules)}") - if desc: - console.print(f" [dim]{desc}[/dim]") - if len(checks) > 10: - console.print(f" ... and {len(checks) - 10} more") - - # Rule distribution - rule_categories = { - "Type": {"type", "not_null", "unique"}, - "Range": {"min", "max", "mean_between", "percentile_range"}, - "Format": { - "regex", "email_valid", "phone_valid", "url_valid", - "date_format", "json_valid", "length", - }, - "Statistical": { - "std_dev_less_than", "z_score_outliers", - }, - "Temporal": { - "timestamp_range", "no_future_timestamps", "business_days_only", - }, - "Cross-column": {"sum_equals", "unique_combination"}, - } - category_counts: dict[str, int] = {} - total_rules = 0 - excluded_count = 0 - for check in checks: - for rule_name in check.get("rules", {}): - total_rules += 1 - for cat, rule_set in rule_categories.items(): - if rule_name in rule_set: - category_counts[cat] = category_counts.get(cat, 0) + 1 - break - else: - category_counts["Other"] = category_counts.get("Other", 0) + 1 - excluded_count += len(check.get("_excluded_rules", {})) - - if category_counts: - parts = [f"{cat}: {cnt}" for cat, cnt in category_counts.items() if cnt > 0] - console.print("\n[bold]Rule Distribution:[/bold]") - console.print(f" {' | '.join(parts)}") - summary_line = f" {total_rules} rules generated" - if excluded_count > 0: - summary_line += f" ({excluded_count} below threshold excluded)" - console.print(summary_line) - - # Data quality notes - improvements = generator.suggest_improvements(profile) - if improvements: - console.print("\n[bold]Data Quality Notes:[/bold]") - for imp in improvements[:5]: - console.print( - f" - {imp['column']}: {imp['detail']} " - f"[dim]— {imp['recommendation']}[/dim]" - ) - - raise typer.Exit(code=0) - - except typer.Exit: - raise - except Exception as e: - console.print(f"[red]Error:[/red] {e}", style="red") - raise typer.Exit(code=4) from e - - @config_app.command("env") def config_env( config_path: str = typer.Argument( diff --git a/datacheck/cli/profile.py b/datacheck/cli/profile.py deleted file mode 100644 index af66c13..0000000 --- 
a/datacheck/cli/profile.py +++ /dev/null @@ -1,390 +0,0 @@ -"""Profile command for DataCheck CLI.""" - -from pathlib import Path - -import typer - -from datacheck.cli import app, console -from datacheck.exceptions import DataCheckError, DataLoadError -from datacheck.logging import configure_logging, get_logger, set_trace_id, generate_trace_id - - -@app.command() -def profile( - data_source: str | None = typer.Argument( - None, - help="Data source: file path, connection string, or omit when using config/sources" - ), - config: str | None = typer.Option( - None, - "--config", - "-c", - help="Path to config file with data_source defined", - ), - source: str | None = typer.Option( - None, - "--source", - help="Named source from sources.yaml", - ), - sources_file: str | None = typer.Option( - None, - "--sources-file", - help="Path to sources YAML file", - ), - table: str | None = typer.Option( - None, - "--table", - "-t", - help="Database table name (for database sources)", - ), - query: str | None = typer.Option( - None, - "--query", - "-q", - help="Custom SQL query (alternative to --table)", - ), - delta_version: int | None = typer.Option( - None, - "--delta-version", - help="Delta Lake version to load (time travel)", - ), - delta_timestamp: str | None = typer.Option( - None, - "--delta-timestamp", - help="Delta Lake timestamp (ISO 8601) to load data as of (time travel)", - ), - storage_options: str | None = typer.Option( - None, - "--storage-options", - help="JSON string of storage options for Delta Lake cloud access", - ), - output: str | None = typer.Option( - None, - "--output", - "-o", - help="Path to write profile report", - ), - output_format: str = typer.Option( - "terminal", - "--format", - "-f", - help="Output format: 'terminal', 'json', or 'markdown'", - ), - outlier_method: str = typer.Option( - "zscore", - "--outlier-method", - help="Outlier detection method: 'zscore' or 'iqr'", - ), - show_suggestions: bool = typer.Option( - True, - "--suggestions/--no-suggestions", - help="Show rule suggestions based on data profile", - ), - show_correlations: bool = typer.Option( - True, - "--correlations/--no-correlations", - help="Show correlation matrix for numeric columns", - ), - log_level: str = typer.Option( - "WARNING", - "--log-level", - help="Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL", - ), - log_format: str = typer.Option( - "console", - "--log-format", - help="Log format: 'console' (human-readable) or 'json' (machine-parseable)", - ), - log_file: str | None = typer.Option( - None, - "--log-file", - help="Path to log file (enables file logging with rotation)", - ), - verbose: bool = typer.Option( - False, - "--verbose", - "-v", - help="Enable verbose logging (sets log level to DEBUG)", - ), -) -> None: - """Generate data quality profile for a dataset. - - Analyzes data to provide statistical summaries, missing value analysis, - cardinality, outlier detection, quality scoring, and rule suggestions. 
- - Examples: - datacheck profile data.csv - datacheck profile data.csv --format json --output profile.json - datacheck profile data.csv --format markdown --output PROFILE.md - - Exit codes: - 0 - Profile generated successfully - 3 - Data loading error - 4 - Unexpected error - """ - # Configure logging - effective_log_level = "DEBUG" if verbose else log_level - configure_logging( - level=effective_log_level, - format_type=log_format, - log_file=log_file, - mask_sensitive=True, - ) - - # Generate trace ID for this profiling run - trace_id = generate_trace_id() - set_trace_id(trace_id) - - logger = get_logger(__name__) - logger.info( - "profiling_started", - extra={ - "trace_id": trace_id, - "data_source": data_source, - } - ) - - try: - # Load data - resolve source from config/sources/argument - from datacheck.loader import LoaderFactory - from datacheck.config import ConfigLoader - from datacheck.config.source import load_sources - from datacheck.connectors.factory import load_source_data - - # Parse storage options if provided - parsed_storage_options = None - if storage_options: - import json as json_module - try: - parsed_storage_options = json_module.loads(storage_options) - except json_module.JSONDecodeError as e: - console.print(f"[red]Error:[/red] Invalid --storage-options JSON: {e}", style="red") - raise typer.Exit(code=2) from e - - _status = console.status("[bold blue]Loading data...", spinner="dots") - _status.start() - - try: - df = None - resolved_source_name = None - - # Option 1: Named source from sources file - if source: - # Load sources file - if sources_file: - sources_path = Path(sources_file) - else: - # Try to find sources from config - if config: - config_data = ConfigLoader.load(config) - if config_data.sources_file: - sources_path = Path(config).parent / config_data.sources_file - else: - console.print( - "[red]Error:[/red] --source requires --sources-file or sources_file in config", - style="red", - ) - raise typer.Exit(code=2) - else: - console.print( - "[red]Error:[/red] --source requires --sources-file", - style="red", - ) - raise typer.Exit(code=2) - - sources = load_sources(sources_path) - if source not in sources: - console.print( - f"[red]Error:[/red] Source '{source}' not found. 
" - f"Available: {', '.join(sorted(sources.keys()))}", - style="red", - ) - raise typer.Exit(code=2) - - source_config = sources[source] - df = load_source_data(source_config, table=table, query=query) - resolved_source_name = source - logger.info("data_loaded", extra={"source_type": "named_source", "source": source}) - - # Option 2: Inline data_source from config - elif data_source is None and config: - config_data = ConfigLoader.load(config) - config_dir = Path(config).parent - if config_data.data_source: - source_path = config_dir / config_data.data_source.path - df = LoaderFactory.load( - str(source_path), - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) - resolved_source_name = str(source_path) - logger.info("data_loaded", extra={"source_type": "inline", "path": str(source_path)}) - elif config_data.sources_file and config_data.source: - # Use default source from config - sources_path = config_dir / config_data.sources_file - sources = load_sources(sources_path) - if config_data.source not in sources: - console.print( - f"[red]Error:[/red] Default source '{config_data.source}' not found", - style="red", - ) - raise typer.Exit(code=2) - source_config = sources[config_data.source] - df = load_source_data(source_config, table=table, query=query) - resolved_source_name = config_data.source - logger.info("data_loaded", extra={"source_type": "config_source", "source": config_data.source}) - else: - console.print( - "[red]Error:[/red] Config file has no data_source or sources_file defined", - style="red", - ) - raise typer.Exit(code=2) - - # Option 3: Auto-discover config file - elif data_source is None: - found_config = ConfigLoader.find_config() - if found_config: - config_data = ConfigLoader.load(found_config) - config_dir = found_config.parent - if config_data.data_source: - source_path = config_dir / config_data.data_source.path - df = LoaderFactory.load( - str(source_path), - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) - resolved_source_name = str(source_path) - logger.info("data_loaded", extra={"source_type": "auto_config", "path": str(source_path)}) - elif config_data.sources_file and config_data.source: - sources_path = config_dir / config_data.sources_file - sources = load_sources(sources_path) - if config_data.source in sources: - source_config = sources[config_data.source] - df = load_source_data(source_config, table=table, query=query) - resolved_source_name = config_data.source - logger.info("data_loaded", extra={"source_type": "auto_source", "source": config_data.source}) - else: - console.print( - f"[red]Error:[/red] Source '{config_data.source}' not found", - style="red", - ) - raise typer.Exit(code=2) - else: - console.print( - "[red]Error:[/red] No data source specified. " - "Provide a file path as argument, use --config with data_source, " - "or use --source with --sources-file.", - style="red", - ) - raise typer.Exit(code=2) - else: - console.print( - "[red]Error:[/red] No data source specified. 
" - "Provide a file path as argument, use --config, or use --source.", - style="red", - ) - raise typer.Exit(code=2) - - # Option 4: Direct data source argument - else: - logger.debug("loading_data", extra={"data_source": data_source}) - df = LoaderFactory.load( - data_source, - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) - resolved_source_name = data_source - logger.info("data_loaded", extra={"row_count": len(df), "column_count": len(df.columns)}) - - except DataLoadError as e: - _status.stop() - logger.error("data_load_failed", extra={"error": str(e)}) - console.print(f"[red]Data Load Error:[/red] {e}", style="red") - raise typer.Exit(code=3) from e - - _status.update("[bold blue]Profiling data...") - - # Generate profile - from datacheck.profiling import DataProfiler - from datacheck.profiling.outliers import OutlierMethod - - logger.debug("generating_profile") - - # Determine outlier method - method = OutlierMethod.IQR if outlier_method.lower() == "iqr" else OutlierMethod.ZSCORE - - profiler = DataProfiler(outlier_method=method) - - # Generate profile - if resolved_source_name and not resolved_source_name.startswith(("http", "s3", "gs", "az")): - dataset_name = Path(resolved_source_name).stem - else: - dataset_name = resolved_source_name or "dataset" - profile_result = profiler.profile(df, name=dataset_name) - _status.stop() - logger.info("profile_generated", extra={ - "columns_analyzed": len(profile_result.columns), - "quality_score": profile_result.overall_quality_score, - }) - - # Output based on format - if output_format == "json": - from datacheck.profiling.formatters import JsonFormatter - json_fmt = JsonFormatter( - pretty=True, - include_suggestions=show_suggestions, - include_correlations=show_correlations, - ) - if output: - json_fmt.save(profile_result, output) - console.print(f"[green]OK[/green] Profile written to {output}") - else: - console.print(json_fmt.format(profile_result)) - - elif output_format == "markdown": - from datacheck.profiling.formatters import MarkdownFormatter - md_fmt = MarkdownFormatter( - include_suggestions=show_suggestions, - include_correlations=show_correlations, - ) - if output: - md_fmt.save(profile_result, output) - console.print(f"[green]OK[/green] Profile written to {output}") - else: - console.print(md_fmt.format(profile_result)) - - else: # terminal - from datacheck.profiling.formatters import TerminalFormatter - term_fmt = TerminalFormatter( - console=console, - include_suggestions=show_suggestions, - include_correlations=show_correlations, - ) - term_fmt.format(profile_result) - - logger.info("profiling_completed", extra={"trace_id": trace_id, "exit_code": 0}) - raise typer.Exit(code=0) - - except typer.Exit: - raise - except DataLoadError as e: - logger.error("profiling_error", extra={"error_type": "DataLoadError", "error": str(e)}) - console.print(f"[red]Data Load Error:[/red] {e}", style="red") - raise typer.Exit(code=3) from e - except DataCheckError as e: - logger.error("profiling_error", extra={"error_type": "DataCheckError", "error": str(e)}) - console.print(f"[red]DataCheck Error:[/red] {e}", style="red") - raise typer.Exit(code=4) from e - except Exception as e: - logger.exception("unexpected_error", extra={"error_type": type(e).__name__, "error": str(e)}) - console.print(f"[red]Unexpected Error:[/red] {e}", style="red") - raise typer.Exit(code=4) from e diff --git a/datacheck/cli/schema.py b/datacheck/cli/schema.py index 31b9ba8..d20c582 100644 
--- a/datacheck/cli/schema.py +++ b/datacheck/cli/schema.py @@ -20,10 +20,10 @@ def _safe_encoding() -> bool: _TICK = "✓" if _safe_encoding() else "v" from datacheck.exceptions import DataLoadError -# Schema sub-app for schema evolution commands +# Schema sub-app for schema contract enforcement commands schema_app = typer.Typer( name="schema", - help="Schema evolution detection commands", + help="Enforce schema contracts - capture baselines and fail on breaking changes", ) @@ -34,9 +34,6 @@ def _resolve_data_source( sources_file: str | None, table: str | None = None, query: str | None = None, - delta_version: int | None = None, - delta_timestamp: str | None = None, - storage_options: str | None = None, ) -> tuple[pd.DataFrame, str]: """Resolve and load data from various source options. @@ -48,16 +45,6 @@ def _resolve_data_source( from datacheck.config.source import load_sources from datacheck.connectors.factory import load_source_data - # Parse storage options if provided - parsed_storage_options = None - if storage_options: - import json as json_module - try: - parsed_storage_options = json_module.loads(storage_options) - except json_module.JSONDecodeError as e: - console.print(f"[red]Error:[/red] Invalid --storage-options JSON: {e}", style="red") - raise typer.Exit(code=2) from e - df = None resolved_source_name = None @@ -103,14 +90,7 @@ def _resolve_data_source( config_dir = Path(config).parent if config_data.data_source: source_path = config_dir / config_data.data_source.path - df = LoaderFactory.load( - str(source_path), - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) + df = LoaderFactory.load(str(source_path), table=table, query=query) resolved_source_name = str(source_path) elif config_data.sources_file and config_data.source: # Use default source from config @@ -140,14 +120,7 @@ def _resolve_data_source( config_dir = found_config.parent if config_data.data_source: source_path = config_dir / config_data.data_source.path - df = LoaderFactory.load( - str(source_path), - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) + df = LoaderFactory.load(str(source_path), table=table, query=query) resolved_source_name = str(source_path) elif config_data.sources_file and config_data.source: sources_path = config_dir / config_data.sources_file @@ -180,14 +153,7 @@ def _resolve_data_source( # Option 4: Direct data source argument else: - df = LoaderFactory.load( - data_source, - table=table, - query=query, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, - ) + df = LoaderFactory.load(data_source, table=table, query=query) resolved_source_name = data_source return df, resolved_source_name @@ -233,21 +199,6 @@ def schema_capture( "-q", help="Custom SQL query (alternative to --table)", ), - delta_version: int | None = typer.Option( - None, - "--delta-version", - help="Delta Lake version to load (time travel)", - ), - delta_timestamp: str | None = typer.Option( - None, - "--delta-timestamp", - help="Delta Lake timestamp (ISO 8601) to load data as of", - ), - storage_options: str | None = typer.Option( - None, - "--storage-options", - help="JSON string of storage options for Delta Lake cloud access", - ), baseline_dir: str | None = typer.Option( None, "--baseline-dir", @@ -290,9 +241,6 @@ def schema_capture( sources_file=sources_file, table=table, query=query, - delta_version=delta_version, - 
delta_timestamp=delta_timestamp, - storage_options=storage_options, ) except DataLoadError as e: console.print(f"[red]Data Load Error:[/red] {e}", style="red") @@ -374,21 +322,6 @@ def schema_compare( "-q", help="Custom SQL query (alternative to --table)", ), - delta_version: int | None = typer.Option( - None, - "--delta-version", - help="Delta Lake version to load (time travel)", - ), - delta_timestamp: str | None = typer.Option( - None, - "--delta-timestamp", - help="Delta Lake timestamp (ISO 8601) to load data as of", - ), - storage_options: str | None = typer.Option( - None, - "--storage-options", - help="JSON string of storage options for Delta Lake cloud access", - ), baseline_dir: str | None = typer.Option( None, "--baseline-dir", @@ -456,9 +389,6 @@ def schema_compare( sources_file=sources_file, table=table, query=query, - delta_version=delta_version, - delta_timestamp=delta_timestamp, - storage_options=storage_options, ) except DataLoadError as e: console.print(f"[red]Data Load Error:[/red] {e}", style="red") diff --git a/datacheck/cli/validate.py b/datacheck/cli/validate.py index 3d894e5..e41e640 100644 --- a/datacheck/cli/validate.py +++ b/datacheck/cli/validate.py @@ -3,6 +3,8 @@ from pathlib import Path from typing import Any +import time + import typer import pandas as pd @@ -11,7 +13,6 @@ from datacheck.engine import ValidationEngine from datacheck.exceptions import ConfigurationError, DataCheckError, DataLoadError, ValidationError from datacheck.logging import configure_logging, get_logger, set_trace_id, generate_trace_id -from datacheck.output import JSONExporter def _load_from_warehouse( @@ -25,7 +26,6 @@ def _load_from_warehouse( region: str | None = None, cluster: str | None = None, iam_auth: bool = False, - sample_rate: float | None = None, ) -> pd.DataFrame: """Load data from a cloud data warehouse. @@ -40,7 +40,6 @@ def _load_from_warehouse( region: Cloud region cluster: Cluster identifier iam_auth: Use IAM authentication - sample_rate: Sample fraction Returns: DataFrame with loaded data @@ -106,7 +105,6 @@ def _load_from_warehouse( table_name=table, where=where, schema=schema, - sample_rate=sample_rate, ) return result else: @@ -115,81 +113,116 @@ def _load_from_warehouse( ) -def _generate_markdown_report(summary: Any) -> str: - """Generate a markdown report from validation summary. 
- - Args: - summary: ValidationSummary object - - Returns: - Markdown formatted string - """ - lines = [] +def _generate_markdown_report( + summary: Any, + source_info: str | None = None, + elapsed: float | None = None, +) -> str: + """Generate a markdown report from validation summary.""" + lines: list[str] = [] lines.append("# DataCheck Validation Report\n") - # Overall status - if summary.all_passed: - lines.append("**Status:** PASSED\n") - elif summary.has_errors: - lines.append("**Status:** ERRORS\n") + # Header metadata + if source_info: + lines.append(f"**Source:** {source_info} ") + if summary.all_passed and not summary.has_failures: + lines.append("**Status:** ✅ All checks passed ") + elif summary.all_passed: + lines.append("**Status:** ⚠️ Passed with warnings ") else: - lines.append("**Status:** FAILED\n") - - # Summary statistics - lines.append("## Summary\n") - lines.append("| Metric | Value |") - lines.append("|--------|-------|") - lines.append(f"| Total Rules | {summary.total_rules} |") - lines.append(f"| Passed | {summary.passed_rules} |") - lines.append(f"| Failed | {summary.failed_rules} |") + lines.append("**Status:** ❌ Validation failed ") + + counts_parts = [f"{summary.passed_rules} passed"] if summary.failed_errors > 0: - lines.append(f"| - Errors | {summary.failed_errors} |") + counts_parts.append(f"{summary.failed_errors} failed") if summary.failed_warnings > 0: - lines.append(f"| - Warnings | {summary.failed_warnings} |") + counts_parts.append(f"{summary.failed_warnings} warnings") if summary.failed_info > 0: - lines.append(f"| - Info | {summary.failed_info} |") - lines.append(f"| Execution Errors | {summary.error_rules} |") + counts_parts.append(f"{summary.failed_info} info") + if summary.error_rules > 0: + counts_parts.append(f"{summary.error_rules} execution errors") + + run_line = f"**Ran:** {summary.total_rules} checks" + if summary.total_rows > 0: + run_line += f" on {summary.total_rows:,} rows" + run_line += f" — {', '.join(counts_parts)}" + if elapsed is not None: + run_line += f". 
Took {elapsed:.2f}s" + lines.append(run_line + " ") + lines.append("") + + # All rules table + lines.append("## Results\n") + lines.append("| Result | Check | Column | Details | Severity |") + lines.append("|--------|-------|--------|---------|----------|") + + for result in summary.results: + check_label = result.check_name or result.rule_name + rule_type = result.rule_type or "" + check_display = f"{check_label} · {rule_type}" if rule_type else check_label + + if result.has_error: + result_icon = "❌ error" + detail = str(result.error)[:80].replace("|", "\\|") + elif result.passed: + result_icon = "✅ passed" + detail = "" + elif result.severity == "warning": + result_icon = "⚠️ warning" + failure_rate = (result.failed_rows / result.total_rows * 100) if result.total_rows > 0 else 0.0 + detail = f"{result.failed_rows:,}/{result.total_rows:,} ({failure_rate:.1f}%)" + elif result.severity == "info": + result_icon = "ℹ️ info" + failure_rate = (result.failed_rows / result.total_rows * 100) if result.total_rows > 0 else 0.0 + detail = f"{result.failed_rows:,}/{result.total_rows:,} ({failure_rate:.1f}%)" + else: + result_icon = "❌ failed" + failure_rate = (result.failed_rows / result.total_rows * 100) if result.total_rows > 0 else 0.0 + detail = f"{result.failed_rows:,}/{result.total_rows:,} ({failure_rate:.1f}%)" + + lines.append( + f"| {result_icon} | {check_display} | `{result.column}` | {detail} | {result.severity} |" + ) lines.append("") - # Failed rules details + # Failure details failed_results = summary.get_failed_results() if failed_results: - lines.append("## Failed Rules\n") + lines.append("## Failure Details\n") for result in failed_results: - check_name = result.check_name or result.rule_name + check_label = result.check_name or result.rule_name rule_type = result.rule_type or "unknown" failure_rate = (result.failed_rows / result.total_rows * 100) if result.total_rows > 0 else 0.0 - - lines.append(f"### {check_name}") - lines.append(f"- **Column:** {result.column}") - lines.append(f"- **Rule Type:** {rule_type}") + lines.append(f"### {check_label} · {rule_type} (`{result.column}`)") lines.append(f"- **Severity:** {result.severity}") - lines.append(f"- **Failed Rows:** {result.failed_rows}/{result.total_rows} ({failure_rate:.1f}%)") + lines.append(f"- **Rows failed:** {result.failed_rows:,} / {result.total_rows:,} ({failure_rate:.1f}%)") if result.failure_details and result.failure_details.sample_failures: - lines.append("\n**Sample Failures:**\n") - lines.append("| Row | Value | Reason |") - lines.append("|-----|-------|--------|") details = result.failure_details - for i, row_idx in enumerate(details.sample_failures[:5]): + samples = [] + for i in range(min(5, len(details.sample_failures))): value = details.sample_values[i] if i < len(details.sample_values) else "N/A" - reason = details.sample_reasons[i] if i < len(details.sample_reasons) else "N/A" - # Escape pipes in values - value_str = str(value).replace("|", "\\|")[:40] - reason_str = str(reason).replace("|", "\\|")[:60] - lines.append(f"| {row_idx} | {value_str} | {reason_str} |") + reason = details.sample_reasons[i] if i < len(details.sample_reasons) else "" + val_str = str(value).replace("|", "\\|")[:40] + reason_str = reason.replace("|", "\\|")[:60] if reason else "" + samples.append((details.sample_failures[i], val_str, reason_str)) + + lines.append("\n**Sample failures:**\n") + lines.append("| Row | Value | Reason |") + lines.append("|-----|-------|--------|") + for row_idx, val_str, reason_str in samples: + 
lines.append(f"| {row_idx} | {val_str} | {reason_str} |") lines.append("") - # Error rules details + # Execution errors error_results = summary.get_error_results() if error_results: - lines.append("## Rules with Errors\n") + lines.append("## Execution Errors\n") for result in error_results: - check_name = result.check_name or result.rule_name - lines.append(f"### {check_name}") - lines.append(f"- **Column:** {result.column}") - lines.append(f"- **Error:** {result.error}") - lines.append("") + check_label = result.check_name or result.rule_name + rule_type = result.rule_type or "" + lines.append(f"### {check_label} · {rule_type} (`{result.column}`)") + lines.append(f"\n```\n{result.error}\n```\n") return "\n".join(lines) @@ -265,71 +298,6 @@ def validate( "--iam-auth", help="Use IAM authentication (for Redshift)", ), - sample_rate: float | None = typer.Option( - None, - "--sample-rate", - help="Random sample rate (0.0 to 1.0)", - ), - sample_count: int | None = typer.Option( - None, - "--sample-count", - help="Number of rows to sample", - ), - top: int | None = typer.Option( - None, - "--top", - help="Validate only first N rows", - ), - stratify: str | None = typer.Option( - None, - "--stratify", - help="Column name for stratified sampling (requires --sample-count)", - ), - seed: int | None = typer.Option( - None, - "--seed", - help="Random seed for reproducible sampling", - ), - sample_strategy: str | None = typer.Option( - None, - "--sample-strategy", - help="Sampling strategy: random, stratified, time_based, error_focused, adaptive, reservoir", - ), - time_column: str | None = typer.Option( - None, - "--time-column", - help="Column for time-based sampling", - ), - start_date: str | None = typer.Option( - None, - "--start-date", - help="Start date for time-based sampling (ISO format)", - ), - end_date: str | None = typer.Option( - None, - "--end-date", - help="End date for time-based sampling (ISO format)", - ), - error_indicators: str | None = typer.Option( - None, - "--error-indicators", - help="Comma-separated conditions for error-focused sampling (e.g., 'age<0,price>10000')", - ), - delta_version: int | None = typer.Option( - None, - "--delta-version", - help="Delta Lake version to load (time travel)", - ), - delta_timestamp: str | None = typer.Option( - None, - "--delta-timestamp", - help="Delta Lake timestamp (ISO 8601) to load data as of (time travel)", - ), - storage_options: str | None = typer.Option( - None, - "--storage-options", - help="JSON string of storage options for Delta Lake cloud access (e.g., '{\"AWS_ACCESS_KEY_ID\": \"...\", \"AWS_SECRET_ACCESS_KEY\": \"...\"}')", - ), parallel: bool = typer.Option( False, "--parallel", @@ -359,7 +327,13 @@ def validate( None, "--output", "-o", - help="Save results to a JSON file (terminal output is always shown)", + help="Save results to a file (terminal output is always shown). Format is controlled by --format.", + ), + output_format: str = typer.Option( + "json", + "--format", + "-f", + help="Output format when using --output: json (default), sarif, markdown, csv", ), csv_export: str | None = typer.Option( None, @@ -393,7 +367,7 @@ def validate( help="Enable verbose logging (sets log level to DEBUG)", ), ) -> None: - """Validate data using specified rules. + """Enforce validation rules against a configured data source. Supports both file-based and database sources. 
@@ -485,32 +459,27 @@ def validate( _status.start() # Load and validate data + _start_time = time.monotonic() + _source_info: str | None = None try: # Source-based validation mode if source or engine.sources: + effective_source = source or engine.config.source or "" + effective_table = table or engine.config.table + _source_info = effective_source + if effective_table: + _source_info += f" → {effective_table}" + elif query: + _source_info += " (custom query)" logger.debug( "loading_from_source", - extra={"source": source or engine.config.source}, + extra={"source": effective_source}, ) - # Parse error indicators if provided - parsed_error_indicators = None - if error_indicators: - parsed_error_indicators = [ind.strip() for ind in error_indicators.split(",")] - summary = engine.validate_sources( source_name=source, table=table, where=where, query=query, - sample_rate=sample_rate, - sample_count=sample_count, - stratify=stratify, - seed=seed, - sample_strategy=sample_strategy, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=parsed_error_indicators, ) logger.info( "data_loaded", @@ -527,6 +496,7 @@ def validate( else: source_path = Path(inline_source.path) + _source_info = f"{source_path.name} ({inline_source.type})" logger.debug( "loading_inline_data_source", extra={"type": inline_source.type, "path": str(source_path)}, @@ -548,6 +518,11 @@ def validate( # Warehouse connection string mode elif data_source.startswith(("snowflake://", "bigquery://", "redshift://")): + _source_info = data_source.split("://")[0] + if table: + _source_info += f" → {table}" + elif query: + _source_info += " (custom query)" logger.debug("loading_data", extra={"data_source": data_source}) df = _load_from_warehouse( data_source, @@ -560,47 +535,21 @@ def validate( region=region, cluster=cluster, iam_auth=iam_auth, - sample_rate=sample_rate, ) logger.info("data_loaded", extra={"source_type": "warehouse", "row_count": len(df)}) summary = engine.validate_dataframe(df) # File/connection string mode else: + _source_info = Path(data_source).name if not data_source.startswith(("http://", "https://")) else data_source + if table: + _source_info += f" → {table}" logger.debug("loading_data", extra={"data_source": data_source}) - # Parse storage options if provided - parsed_storage_options = None - if storage_options: - import json as json_module - try: - parsed_storage_options = json_module.loads(storage_options) - except json_module.JSONDecodeError as e: - console.print(f"[red]Error:[/red] Invalid --storage-options JSON: {e}", style="red") - raise typer.Exit(code=2) from e - - # Parse error indicators if provided - parsed_error_indicators = None - if error_indicators: - parsed_error_indicators = [ind.strip() for ind in error_indicators.split(",")] - summary = engine.validate_file( data_source, table=table, where=where, query=query, - sample_rate=sample_rate, - sample_count=sample_count, - top=top, - stratify=stratify, - seed=seed, - sample_strategy=sample_strategy, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=parsed_error_indicators, - version=delta_version, - timestamp=delta_timestamp, - storage_options=parsed_storage_options, ) logger.info("data_loaded", extra={"source_type": "file", "data_source": data_source}) except DataLoadError as e: @@ -611,6 +560,8 @@ def validate( if _status: _status.stop() + _elapsed = time.monotonic() - _start_time + # Log validation results logger.info( "validation_completed", @@ -636,14 +587,41 @@ def 
validate( console=console, show_suggestions=suggestions, ) - terminal_reporter.report(summary) + terminal_reporter.report(summary, elapsed=_elapsed, source_info=_source_info) - # JSON output — save to file if --output specified + # File output — format controlled by --format flag if effective_output: from pathlib import Path as OutputPath OutputPath(effective_output).parent.mkdir(parents=True, exist_ok=True) - JSONExporter.export_summary(summary, output_path=effective_output, pretty=True) - console.print(f"[green]OK:[/green] Results saved to {effective_output}") + + fmt = output_format.lower().strip() + if fmt == "sarif": + from datacheck.reporting import SarifExporter + SarifExporter.export( + summary, + output_path=effective_output, + elapsed=_elapsed, + source_info=_source_info, + ) + elif fmt == "markdown": + OutputPath(effective_output).write_text( + _generate_markdown_report(summary, source_info=_source_info, elapsed=_elapsed), + encoding="utf-8", + ) + elif fmt == "csv": + from datacheck.reporting import CsvExporter + CsvExporter.export_failures(summary, output_path=effective_output) + else: + # Default: json — use JsonReporter for richer output + from datacheck.reporting.json_reporter import JsonReporter + JsonReporter().export( + summary, + output_path=effective_output, + source_info=_source_info, + elapsed=_elapsed, + ) + + console.print(f"[green]OK:[/green] Results saved to {effective_output} (format: {fmt})") # Export CSV if requested via CLI option if csv_export: diff --git a/datacheck/config/__init__.py b/datacheck/config/__init__.py index ed3fa93..8ec5098 100644 --- a/datacheck/config/__init__.py +++ b/datacheck/config/__init__.py @@ -8,7 +8,6 @@ ConfigLoader, NotificationsConfig, RuleConfig, - SamplingConfig, ValidationConfig, ) @@ -16,7 +15,6 @@ from datacheck.config.schema import CONFIG_SCHEMA from datacheck.config.validator import ConfigValidator from datacheck.config.parser import ConfigParser -from datacheck.config.generator import ConfigGenerator from datacheck.config.source import SourceConfig, load_sources from datacheck.config.templates import ( TEMPLATES_DIR, @@ -30,13 +28,11 @@ "ConfigLoader", "NotificationsConfig", "RuleConfig", - "SamplingConfig", "ValidationConfig", # New config management "CONFIG_SCHEMA", "ConfigValidator", "ConfigParser", - "ConfigGenerator", # Source management "SourceConfig", "load_sources", diff --git a/datacheck/config/generator.py b/datacheck/config/generator.py deleted file mode 100644 index 35de931..0000000 --- a/datacheck/config/generator.py +++ /dev/null @@ -1,513 +0,0 @@ -"""Auto-generate configuration from data profile.""" - -from __future__ import annotations - -from datetime import datetime -from pathlib import Path -from typing import Any - -import yaml - -import pandas as pd - -from datacheck.exceptions import ConfigurationError -from datacheck.profiling import DataProfiler -from datacheck.profiling.models import ColumnProfile, DatasetProfile - - -class ConfigGenerator: - """Generate validation config from data profile. - - Analyzes data to suggest validation rules based on - detected patterns and statistics. 
- - Example: - >>> generator = ConfigGenerator() - >>> config = generator.generate_from_dataframe(df) - >>> generator.save_config(config, "datacheck.yaml") - """ - - # Confidence levels for filtering suggestions - CONFIDENCE_LEVELS = {"low": 1, "medium": 2, "high": 3} - - def __init__(self) -> None: - """Initialize generator.""" - self.profiler = DataProfiler() - - def generate_from_dataframe( - self, - df: pd.DataFrame, - name: str = "dataset", - confidence_threshold: str = "medium", - include_metadata: bool = True, - ) -> dict[str, Any]: - """ - Generate config from DataFrame. - - Args: - df: DataFrame to analyze - name: Name for the dataset - confidence_threshold: Minimum confidence ("low", "medium", "high") - include_metadata: Include metadata section - - Returns: - Config dictionary - """ - # Validate confidence threshold - if confidence_threshold not in self.CONFIDENCE_LEVELS: - raise ConfigurationError( - f"Invalid confidence_threshold '{confidence_threshold}'. " - f"Must be one of: {', '.join(self.CONFIDENCE_LEVELS.keys())}" - ) - - # Profile data - profile = self.profiler.profile(df, name=name) - - # Generate config from profile - return self.generate_from_profile( - profile, - confidence_threshold=confidence_threshold, - include_metadata=include_metadata, - ) - - def generate_from_profile( - self, - profile: DatasetProfile, - confidence_threshold: str = "medium", - include_metadata: bool = True, - ) -> dict[str, Any]: - """ - Generate config from existing profile. - - Args: - profile: DatasetProfile from profiler - confidence_threshold: Minimum confidence level - include_metadata: Include metadata section - - Returns: - Config dictionary - """ - min_confidence = self.CONFIDENCE_LEVELS[confidence_threshold] - - # Generate single-column checks - checks = [] - for _col_name, col_profile in profile.columns.items(): - check = self._generate_check(col_profile, min_confidence) - if check and check.get("rules"): - checks.append(check) - - # Generate cross-column checks - for cc_rule in getattr(profile, "cross_column_rules", []): - cc_confidence = self.CONFIDENCE_LEVELS.get( - cc_rule.get("confidence", "low"), 1 - ) - if cc_confidence >= min_confidence: - col_names = cc_rule["columns"] - cc_check: dict[str, Any] = { - "name": f"cross_{'_'.join(col_names[:2])}_{cc_rule['rule']}", - "column": col_names[0], - "rules": {cc_rule["rule"]: cc_rule["params"]}, - "description": cc_rule.get("reason", "Cross-column rule"), - } - reason = cc_rule.get("reason") - if reason: - cc_check["_rule_reasons"] = {cc_rule["rule"]: reason} - checks.append(cc_check) - - # Build config - config: dict[str, Any] = {"version": "1.0"} - - if include_metadata: - config["metadata"] = { - "description": f"Auto-generated config for {profile.name}", - "created": datetime.now().isoformat(), - "generated_by": "datacheck", - "source_rows": profile.row_count, - "source_columns": profile.column_count, - "quality_score": profile.overall_quality_score, - } - - config["checks"] = checks - - config["reporting"] = { - "output_path": "./output", - "export_failures": True, - } - - return config - - def _generate_check( - self, col_profile: ColumnProfile, min_confidence: int - ) -> dict[str, Any] | None: - """ - Generate check for a column. 
- - Args: - col_profile: ColumnProfile - min_confidence: Minimum confidence level (1-3) - - Returns: - Check dictionary or None if no rules - """ - rules: dict[str, Any] = {} - rule_reasons: dict[str, str] = {} - excluded_rules: dict[str, Any] = {} - excluded_reasons: dict[str, str] = {} - - # Process suggestions from profiler - for suggestion in col_profile.suggestions: - sugg_confidence = self.CONFIDENCE_LEVELS.get( - suggestion.get("confidence", "low"), 1 - ) - - rule_name = suggestion["rule"] - params = suggestion.get("params") - reason = suggestion.get("reason", "") - - if sugg_confidence >= min_confidence: - if params is not None: - rules[rule_name] = params - else: - rules[rule_name] = True - - if reason: - rule_reasons[rule_name] = reason - else: - if params is not None: - excluded_rules[rule_name] = params - else: - excluded_rules[rule_name] = True - if reason: - excluded_reasons[rule_name] = reason - - if not rules: - return None - - # Create check with description - check: dict[str, Any] = { - "name": f"{col_profile.name}_check", - "column": col_profile.name, - "rules": rules, - } - - if rule_reasons: - check["_rule_reasons"] = rule_reasons - - if excluded_rules: - check["_excluded_rules"] = excluded_rules - check["_excluded_reasons"] = excluded_reasons - - # Build description from column characteristics - desc_parts: list[str] = [] - - if col_profile.null_percentage == 0: - desc_parts.append("Required field") - - if col_profile.unique_percentage >= 99: - desc_parts.append("unique identifier") - elif col_profile.unique_count <= 10 and col_profile.unique_count > 0: - desc_parts.append(f"{col_profile.unique_count} distinct values") - - inferred = getattr(col_profile, "inferred_type", None) - if inferred: - desc_parts.append(inferred) - - if desc_parts: - check["description"] = ", ".join(desc_parts) - - return check - - def generate_from_file( - self, - data_path: str | Path, - confidence_threshold: str = "medium", - return_profile: bool = False, - **load_kwargs: Any, - ) -> dict[str, Any] | tuple[dict[str, Any], DatasetProfile]: - """ - Generate config from data file. - - Args: - data_path: Path to data file (CSV, Parquet, etc.) - confidence_threshold: Minimum confidence level - return_profile: If True, also return the DatasetProfile - **load_kwargs: Additional kwargs for data loading - - Returns: - Config dictionary, or (config, profile) tuple if return_profile=True - """ - from datacheck.loader import LoaderFactory - - data_path = Path(data_path) - df = LoaderFactory.load(str(data_path), **load_kwargs) - name = data_path.stem - - # Determine source type from file extension - ext = data_path.suffix.lower().lstrip(".") - source_type_map = { - "csv": "csv", - "parquet": "parquet", - "pq": "parquet", - "json": "json", - "avro": "avro", - } - source_type = source_type_map.get(ext, "csv") - - if return_profile: - if confidence_threshold not in self.CONFIDENCE_LEVELS: - raise ConfigurationError( - f"Invalid confidence_threshold '{confidence_threshold}'. 
" - f"Must be one of: {', '.join(self.CONFIDENCE_LEVELS.keys())}" - ) - profile = self.profiler.profile(df, name=name) - config = self.generate_from_profile( - profile, confidence_threshold=confidence_threshold - ) - config["data_source"] = { - "type": source_type, - "path": f"./{data_path.name}", - } - return config, profile - - config = self.generate_from_dataframe( - df, name=name, confidence_threshold=confidence_threshold - ) - config["data_source"] = { - "type": source_type, - "path": f"./{data_path.name}", - } - return config - - def save_config( - self, - config: dict[str, Any], - output_path: str | Path, - add_comments: bool = True, - ) -> None: - """ - Save config to YAML file. - - Args: - config: Config dictionary - output_path: Output file path - add_comments: Add helpful comments to YAML - """ - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - - if add_comments: - yaml_content = self._generate_yaml_with_comments(config) - with open(output_path, "w", encoding="utf-8") as f: - f.write(yaml_content) - else: - with open(output_path, "w", encoding="utf-8") as f: - yaml.dump(config, f, default_flow_style=False, sort_keys=False) - - def _generate_yaml_with_comments(self, config: dict[str, Any]) -> str: - """ - Generate YAML with helpful comments. - - Args: - config: Config dictionary - - Returns: - YAML string with comments - """ - lines = [ - "# DataCheck Configuration", - "# Auto-generated - review and adjust as needed", - "#", - "# Documentation: https://github.com/Squrtech/datacheck", - "", - ] - - # Version - if "version" in config: - lines.append(f"version: '{config['version']}'") - lines.append("") - - # Metadata - if "metadata" in config: - lines.append("# Configuration metadata") - lines.append("metadata:") - for key, value in config["metadata"].items(): - if isinstance(value, str): - lines.append(f" {key}: '{value}'") - else: - lines.append(f" {key}: {value}") - lines.append("") - - # Data source - if "data_source" in config: - ds = config["data_source"] - lines.append("# Data source configuration") - lines.append("data_source:") - lines.append(f" type: {ds['type']}") - lines.append(f" path: '{ds['path']}'") - if "options" in ds and ds["options"]: - lines.append(" options:") - for key, value in ds["options"].items(): - if isinstance(value, str): - lines.append(f" {key}: '{value}'") - else: - lines.append(f" {key}: {value}") - lines.append("") - - # Checks - lines.append("# Validation checks") - lines.append("# Each check validates a single column with one or more rules") - lines.append("checks:") - - for check in config.get("checks", []): - lines.append(f" - name: {check['name']}") - - # Support both single-column and multi-column checks - if "columns" in check: - col_list = check["columns"] - lines.append(" columns:") - for c in col_list: - lines.append(f" - {c}") - elif "column" in check: - lines.append(f" column: {check['column']}") - - if "description" in check: - lines.append(f" description: '{check['description']}'") - - lines.append(" rules:") - rule_reasons = check.get("_rule_reasons", {}) - for rule_name, rule_value in check.get("rules", {}).items(): - reason = rule_reasons.get(rule_name, "") - comment = f" # {reason}" if reason else "" - self._render_rule_line(lines, rule_name, rule_value, comment) - - # Commented-out excluded rules (below confidence threshold) - excluded = check.get("_excluded_rules", {}) - excluded_reasons = check.get("_excluded_reasons", {}) - if excluded: - lines.append(" # --- Below confidence threshold 
---") - for rule_name, rule_value in excluded.items(): - reason = excluded_reasons.get(rule_name, "") - comment = f" # {reason}" if reason else "" - self._render_rule_line( - lines, rule_name, rule_value, comment, commented=True - ) - - lines.append("") - - # Reporting - if "reporting" in config: - lines.append("# Output configuration") - lines.append("reporting:") - for key, value in config["reporting"].items(): - if isinstance(value, bool): - lines.append(f" {key}: {str(value).lower()}") - elif isinstance(value, str): - lines.append(f" {key}: {value}") - else: - lines.append(f" {key}: {value}") - - return "\n".join(lines) - - @staticmethod - def _render_rule_line( - lines: list[str], - rule_name: str, - rule_value: object, - comment: str = "", - commented: bool = False, - ) -> None: - """Render a single rule as YAML line(s). - - Args: - lines: Output list to append to - rule_name: Rule name - rule_value: Rule value (bool, dict, list, str, or number) - comment: Inline comment string (e.g. " # reason text") - commented: If True, prefix all lines with "# " (excluded rules) - """ - prefix = " # " if commented else " " - sub_prefix = " # " if commented else " " - - if isinstance(rule_value, bool): - lines.append( - f"{prefix}{rule_name}: {str(rule_value).lower()}{comment}" - ) - elif isinstance(rule_value, dict): - if comment: - lines.append(f"{prefix}{rule_name}:{comment}") - else: - lines.append(f"{prefix}{rule_name}:") - for k, v in rule_value.items(): - if isinstance(v, str): - lines.append(f"{sub_prefix}{k}: '{v}'") - elif isinstance(v, list): - lines.append(f"{sub_prefix}{k}:") - item_prefix = " # " if commented else " " - for item in v: - if isinstance(item, str): - lines.append(f"{item_prefix}- '{item}'") - else: - lines.append(f"{item_prefix}- {item}") - else: - lines.append(f"{sub_prefix}{k}: {v}") - elif isinstance(rule_value, list): - if comment: - lines.append(f"{prefix}{rule_name}:{comment}") - else: - lines.append(f"{prefix}{rule_name}:") - for item in rule_value: - if isinstance(item, str): - lines.append(f"{sub_prefix}- '{item}'") - else: - lines.append(f"{sub_prefix}- {item}") - elif isinstance(rule_value, str): - lines.append(f"{prefix}{rule_name}: '{rule_value}'{comment}") - else: - lines.append(f"{prefix}{rule_name}: {rule_value}{comment}") - - def suggest_improvements( - self, profile: DatasetProfile - ) -> list[dict[str, Any]]: - """ - Suggest data quality improvements based on profile. 
- - Args: - profile: DatasetProfile - - Returns: - List of improvement suggestions - """ - suggestions = [] - - for col_name, col_profile in profile.columns.items(): - # High null percentage - if col_profile.null_percentage > 20: - suggestions.append({ - "column": col_name, - "issue": "High null percentage", - "detail": f"{col_profile.null_percentage:.1f}% null values", - "recommendation": "Consider adding not_null rule or investigate data quality", - }) - - # Potential duplicates for ID columns - if "id" in col_name.lower() and col_profile.unique_percentage < 100: - suggestions.append({ - "column": col_name, - "issue": "Non-unique ID column", - "detail": f"{100 - col_profile.unique_percentage:.1f}% duplicates", - "recommendation": "Add unique rule if column should be unique", - }) - - # Outliers detected - if col_profile.outlier_count > 0 and col_profile.outlier_percentage > 5: - suggestions.append({ - "column": col_name, - "issue": "High outlier percentage", - "detail": f"{col_profile.outlier_percentage:.1f}% outliers", - "recommendation": "Review outliers and consider adding range validation", - }) - - return suggestions - - -__all__ = ["ConfigGenerator"] diff --git a/datacheck/config/loader.py b/datacheck/config/loader.py index d5ebd56..fdd38a9 100644 --- a/datacheck/config/loader.py +++ b/datacheck/config/loader.py @@ -6,6 +6,7 @@ import yaml +from datacheck.config.schema import get_valid_rule_types from datacheck.exceptions import ConfigurationError @@ -48,24 +49,9 @@ def __post_init__(self) -> None: f"Must be one of: {', '.join(valid_severities)}" ) - # Validate rule types - valid_rule_types = { - # Basic rules - "not_null", "min", "max", "unique", "regex", - "allowed_values", "type", "length", "custom", - # Statistical rules - "mean_between", "std_dev_less_than", "percentile_range", - "z_score_outliers", "distribution_type", - # Freshness rules - "max_age", "timestamp_range", "no_future_timestamps", - "date_format_valid", "business_days_only", - # Format rules - "email_valid", "phone_valid", "url_valid", "json_valid", - # Relationship rules - "foreign_key_exists", "sum_equals", "unique_combination", - # Additional rules - "min_length", "max_length", "date_format", "date_range", - } + # Validate rule types — use the canonical list from schema.py so this + # stays in sync automatically when new rules are added. + valid_rule_types = set(get_valid_rule_types()) invalid_rules = set(self.rules.keys()) - valid_rule_types if invalid_rules: raise ConfigurationError( @@ -79,7 +65,7 @@ class DataSourceConfig: """Configuration for inline data source. Attributes: - type: Source type (csv, parquet, json, excel, delta) + type: Source type (csv, parquet) path: Path to the data file (relative to config file or absolute) options: Loader-specific options (e.g. encoding, delimiter for CSV) """ @@ -90,7 +76,7 @@ class DataSourceConfig: def __post_init__(self) -> None: """Validate data source configuration.""" - valid_types = ["csv", "parquet", "delta", "avro", "duckdb", "sqlite"] + valid_types = ["csv", "parquet"] if self.type not in valid_types: raise ConfigurationError( f"Invalid data source type '{self.type}'. " @@ -116,103 +102,6 @@ class ReportingConfig: failures_file: str | None = None -@dataclass -class SamplingConfig: - """Configuration for data sampling. 
- - Supports both basic and advanced sampling strategies: - - Basic methods: - - none: No sampling (default) - - random: Random sample by rate or count - - stratified: Proportional sample per group - - top: First N rows - - systematic: Every Nth row - - Advanced methods: - - time_based: Filter by date range - - error_focused: Oversample rows likely to fail - - adaptive: Dynamically adjust sampling based on error rate - - reservoir: Memory-efficient streaming sample - - Example YAML: - sampling: - method: stratified - stratify_by: region - count: 1000 - seed: 42 - """ - - # Basic fields - method: str = "none" - rate: float | None = None # For random sampling (0.0-1.0) - count: int | None = None # For random/stratified/top/reservoir - stratify_by: str | None = None # For stratified sampling - seed: int | None = None # For reproducibility - - # Time-based sampling fields - time_column: str | None = None # Column containing timestamps - start_date: str | None = None # ISO format date string - end_date: str | None = None # ISO format date string - - # Error-focused/adaptive sampling fields - error_indicators: list[str] | None = None # e.g., ["age < 0", "price > 100000"] - - # Systematic sampling fields - interval: int | None = None # For systematic: sample every Nth row - start: int = 0 # Starting index for systematic sampling - - def __post_init__(self) -> None: - """Validate sampling configuration.""" - valid_methods = [ - "none", "random", "stratified", "top", "systematic", - "time_based", "error_focused", "adaptive", "reservoir" - ] - if self.method not in valid_methods: - raise ConfigurationError( - f"Invalid sampling method '{self.method}'. " - f"Must be one of: {', '.join(valid_methods)}" - ) - - if self.method == "random": - if self.rate is None and self.count is None: - raise ConfigurationError( - "Random sampling requires either 'rate' or 'count'" - ) - - if self.method == "stratified": - if self.stratify_by is None: - raise ConfigurationError( - "Stratified sampling requires 'stratify_by' column" - ) - if self.count is None: - raise ConfigurationError( - "Stratified sampling requires 'count'" - ) - - if self.method == "top": - if self.count is None: - raise ConfigurationError("Top-N sampling requires 'count'") - - if self.method == "time_based": - if self.time_column is None: - raise ConfigurationError( - "Time-based sampling requires 'time_column'" - ) - - if self.method == "error_focused": - if self.error_indicators is None: - raise ConfigurationError( - "Error-focused sampling requires 'error_indicators' list" - ) - - if self.method == "reservoir": - if self.count is None: - raise ConfigurationError( - "Reservoir sampling requires 'count' (reservoir size)" - ) - - @dataclass class NotificationsConfig: """Configuration for validation notifications. 
@@ -247,8 +136,6 @@ class ValidationConfig: Attributes: checks: List of rule configurations - plugins: List of plugin file paths - sampling: Optional sampling configuration sources_file: Path to external sources YAML file source: Default source name for all checks table: Default table name for all checks @@ -258,8 +145,6 @@ class ValidationConfig: """ checks: list[RuleConfig] - plugins: list[str] | None = None - sampling: SamplingConfig | None = None sources_file: str | None = None source: str | None = None table: str | None = None @@ -269,9 +154,6 @@ class ValidationConfig: def __post_init__(self) -> None: """Validate configuration after initialization.""" - if not self.checks: - raise ConfigurationError("Configuration must contain at least one check") - # Check for duplicate rule names names = [check.name for check in self.checks] duplicates = [name for name in names if names.count(name) > 1] @@ -280,9 +162,6 @@ def __post_init__(self) -> None: f"Duplicate rule names found: {', '.join(set(duplicates))}" ) - # Initialize plugins list if None - if self.plugins is None: - self.plugins = [] class ConfigLoader: @@ -393,43 +272,6 @@ def load(config_path: str | Path) -> ValidationConfig: "Configuration has errors:\n - " + "\n - ".join(check_errors) ) - # Parse plugins (optional) - plugins = data.get("plugins", []) - if not isinstance(plugins, list): - raise ConfigurationError("'plugins' must be a list of file paths") - - # Parse sampling (optional) - sampling = None - if "sampling" in data: - sampling_data = data["sampling"] - if not isinstance(sampling_data, dict): - raise ConfigurationError("'sampling' must be a dictionary") - - # Parse error_indicators - can be list or comma-separated string - error_indicators = sampling_data.get("error_indicators") - if isinstance(error_indicators, str): - error_indicators = [i.strip() for i in error_indicators.split(",")] - - try: - sampling = SamplingConfig( - method=sampling_data.get("method", "none"), - rate=sampling_data.get("rate"), - count=sampling_data.get("count"), - stratify_by=sampling_data.get("stratify_by"), - seed=sampling_data.get("seed"), - # Advanced fields - time_column=sampling_data.get("time_column"), - start_date=sampling_data.get("start_date"), - end_date=sampling_data.get("end_date"), - error_indicators=error_indicators, - interval=sampling_data.get("interval"), - start=sampling_data.get("start", 0), - ) - except ConfigurationError: - raise - except Exception as e: - raise ConfigurationError(f"Error parsing sampling config: {e}") from e - # Parse source settings (optional) sources_file = data.get("sources_file") default_source = data.get("source") @@ -498,8 +340,6 @@ def load(config_path: str | Path) -> ValidationConfig: return ValidationConfig( checks=checks, - plugins=plugins, - sampling=sampling, sources_file=sources_file, source=default_source, table=default_table, @@ -541,7 +381,6 @@ def find_config() -> Path | None: "NotificationsConfig", "ReportingConfig", "RuleConfig", - "SamplingConfig", "ValidationConfig", "ConfigLoader", ] diff --git a/datacheck/config/sample_data.py b/datacheck/config/sample_data.py index b5be1b5..0464f94 100644 --- a/datacheck/config/sample_data.py +++ b/datacheck/config/sample_data.py @@ -1,375 +1,442 @@ """Sample data generators for DataCheck templates. -This module generates sample CSV data files that match the validation rules -defined in each configuration template. +Each generator produces realistic CSV data that matches the validation rules +in the corresponding template. 
The data is designed to: + +- Pass all validation checks when run with the matching template config +- Use realistic distributions (Gaussian, uniform) for numeric columns +- Demonstrate every major rule type across the six templates + +Default sample count is 1 000 rows. """ import csv import random import string -from datetime import datetime, timedelta +from datetime import date, timedelta from pathlib import Path from typing import Any +# --------------------------------------------------------------------------- +# Low-level helpers +# --------------------------------------------------------------------------- + +def _clamp(value: float, lo: float, hi: float) -> float: + return max(lo, min(hi, value)) + + +def _gauss(mu: float, sigma: float, lo: float = float("-inf"), hi: float = float("inf")) -> float: + """Clamped Gaussian sample.""" + return _clamp(random.gauss(mu, sigma), lo, hi) + def _random_string(length: int = 8, chars: str = string.ascii_uppercase + string.digits) -> str: - """Generate a random string.""" return "".join(random.choice(chars) for _ in range(length)) -def _random_email(domain: str = "example.com") -> str: - """Generate a random email address.""" - username = _random_string(8, string.ascii_lowercase) - return f"{username}@{domain}" +def _random_email(domains: tuple[str, ...] = ("gmail.com", "yahoo.com", "outlook.com", "company.com")) -> str: + first = _random_string(5, string.ascii_lowercase) + last = _random_string(5, string.ascii_lowercase) + return f"{first}.{last}@{random.choice(domains)}" -def _random_date(start_year: int = 2020, end_year: int = 2025) -> str: - """Generate a random date in YYYY-MM-DD format.""" - start = datetime(start_year, 1, 1) - end = datetime(end_year, 12, 31) - delta = end - start - random_days = random.randint(0, delta.days) - date = start + timedelta(days=random_days) - return date.strftime("%Y-%m-%d") +def _random_date(start: date, end: date) -> str: + delta = (end - start).days + return (start + timedelta(days=random.randint(0, delta))).strftime("%Y-%m-%d") -def _random_datetime(start_year: int = 2020, end_year: int = 2025) -> str: - """Generate a random datetime in YYYY-MM-DD HH:MM:SS format.""" - start = datetime(start_year, 1, 1) - end = datetime(end_year, 12, 31) - delta = end - start - random_seconds = random.randint(0, int(delta.total_seconds())) - dt = start + timedelta(seconds=random_seconds) - return dt.strftime("%Y-%m-%d %H:%M:%S") +def _random_datetime(start: date, end: date) -> str: + delta = (end - start).days + d = start + timedelta(days=random.randint(0, delta)) + h = random.randint(0, 23) + m = random.randint(0, 59) + s = random.randint(0, 59) + return f"{d.strftime('%Y-%m-%d')} {h:02d}:{m:02d}:{s:02d}" -def _random_iso_datetime(start_year: int = 2020, end_year: int = 2025) -> str: - """Generate a random ISO 8601 datetime.""" - start = datetime(start_year, 1, 1) - end = datetime(end_year, 12, 31) - delta = end - start - random_seconds = random.randint(0, int(delta.total_seconds())) - dt = start + timedelta(seconds=random_seconds) - return dt.strftime("%Y-%m-%dT%H:%M:%SZ") +def _random_uuid() -> str: + h = "0123456789abcdef" + def seg(n: int) -> str: + return "".join(random.choice(h) for _ in range(n)) + return f"{seg(8)}-{seg(4)}-4{seg(3)}-{random.choice('89ab')}{seg(3)}-{seg(12)}" -def _random_uuid() -> str: - """Generate a random UUID-like string.""" - return f"{_random_string(8, '0123456789abcdef')}-{_random_string(4, '0123456789abcdef')}-{_random_string(4, '0123456789abcdef')}-{_random_string(4, 
'0123456789abcdef')}-{_random_string(12, '0123456789abcdef')}" +def _next_business_day(d: date) -> date: + """Advance d until it lands on a weekday.""" + while d.weekday() >= 5: + d += timedelta(days=1) + return d -def _random_phone() -> str: - """Generate a random phone number.""" - return f"+1{random.randint(2000000000, 9999999999)}" +def _random_business_date(start: date, end: date) -> str: + delta = (end - start).days + d = start + timedelta(days=random.randint(0, delta)) + d = _next_business_day(d) + return d.strftime("%Y-%m-%d") -def _random_postal_code() -> str: - """Generate a random US postal code.""" - return f"{random.randint(10000, 99999)}" +# --------------------------------------------------------------------------- +# Template generators +# --------------------------------------------------------------------------- +def generate_basic_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the basic template. -def generate_basic_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the basic template.""" + Columns exercised: + id, name, email, created_at, status, age, score, is_verified + Rules demonstrated: + not_null, unique, type, positive, range, regex, + allowed_values, min_length, max_length, boolean, date_range, + no_future_timestamps + """ statuses = ["active", "inactive", "pending"] + today = date.today() + start = date(2022, 1, 1) data = [] for i in range(1, num_rows + 1): + # score ~ N(60, 15), clamped 0-100 + score = round(_gauss(60, 15, 0.0, 100.0), 2) + # age ~ uniform 18-80 + age = random.randint(18, 80) + data.append({ - "id": i, - "name": f"User {_random_string(6)}", - "email": _random_email(), - "created_at": _random_date(), - "status": random.choice(statuses), + "id": i, + "name": f"{_random_string(4, string.ascii_uppercase)}{_random_string(4, string.ascii_lowercase)}", + "email": _random_email(), + "created_at": _random_date(start, today - timedelta(days=1)), + "status": random.choice(statuses), + "age": age, + "score": score, + "is_verified": random.choice([True, False]), }) return data -def generate_ecommerce_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the ecommerce template.""" - order_statuses = ["pending", "confirmed", "processing", "shipped", "delivered", "cancelled", "refunded", "returned"] +def generate_ecommerce_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the ecommerce template. 
+ + Rules demonstrated: + not_null, unique, type, positive, non_negative, range, min, max, + regex, allowed_values, min_length, max_length, boolean, + no_future_timestamps, date_range, unique_combination + """ + order_statuses = ["pending", "confirmed", "processing", "shipped", "delivered", "cancelled", "refunded"] payment_methods = ["credit_card", "debit_card", "paypal", "bank_transfer", "cash_on_delivery", "gift_card"] currencies = ["USD", "EUR", "GBP", "CAD", "AUD"] - + today = date.today() + start = date(2022, 1, 1) data = [] - for _ in range(1, num_rows + 1): - quantity = random.randint(1, 10) - unit_price = round(random.uniform(9.99, 499.99), 2) - discount = random.randint(0, 25) - total_price = round(quantity * unit_price * (1 - discount / 100), 2) + for i in range(1, num_rows + 1): + quantity = random.randint(1, 50) + # unit_price ~ N(50, 15), clamped 0.99-499.99 + unit_price = round(_gauss(50, 15, 0.99, 499.99), 2) + discount_pct = round(random.uniform(0, 30), 2) # 0-30%, non_negative + total_price = round(quantity * unit_price * (1 - discount_pct / 100), 2) + order_dt = _random_datetime(start, today - timedelta(days=1)) data.append({ - "order_id": f"ORD-{_random_string(12)}", - "customer_id": f"CUST-{random.randint(10000, 99999)}", - "product_sku": f"{_random_string(3, string.ascii_uppercase)}-{random.randint(10000, 99999999)}", - "product_name": f"Product {_random_string(8)}", - "quantity": quantity, - "unit_price": unit_price, - "total_price": total_price, - "discount": discount, - "order_status": random.choice(order_statuses), - "payment_method": random.choice(payment_methods), - "shipping_address": f"{random.randint(100, 9999)} {_random_string(8)} Street, {_random_string(6)} City", - "postal_code": _random_postal_code(), - "order_date": _random_datetime(), - "customer_email": _random_email(), - "phone": _random_phone(), - "currency": random.choice(currencies), + "order_id": f"ORD-{i:08d}", + "customer_id": f"CUST-{random.randint(10000, 99999)}", + "product_sku": f"{_random_string(3, string.ascii_uppercase)}-{random.randint(10000, 99999)}", + "product_name": f"Product {_random_string(6, string.ascii_letters)}", + "quantity": quantity, + "unit_price": unit_price, + "total_price": total_price, + "discount_pct": discount_pct, + "order_status": random.choice(order_statuses), + "payment_method": random.choice(payment_methods), + "shipping_address": f"{random.randint(1, 9999)} {_random_string(8, string.ascii_letters)} St", + "postal_code": f"{random.randint(10000, 99999)}", + "order_date": order_dt, + "customer_email": _random_email(), + "currency": random.choice(currencies), + "is_gift": random.choice([True, False]), }) return data -def generate_healthcare_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the healthcare template.""" - genders = ["M", "F", "Other"] - blood_types = ["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"] - - # Common ICD-10 codes - diagnosis_codes = ["J06.9", "I10", "E11.9", "M54.5", "F32.9", "J45.909", "K21.0", "N39.0"] - procedure_codes = ["99213", "99214", "99215", "99203", "99204"] +def generate_finance_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the finance template. 
+ Rules demonstrated: + not_null, unique, type, range, regex, allowed_values, + max_age, boolean, no_future_timestamps, unique_combination + """ + tx_types = ["credit", "debit", "transfer", "payment", "refund", "withdrawal", "deposit", "fee"] + statuses = ["pending", "processing", "completed", "failed", "cancelled", "reversed"] + currencies = ["USD", "EUR", "GBP", "JPY", "CAD"] + today = date.today() + start = date(2023, 1, 1) # within 2 years for max_age check data = [] - for _ in range(1, num_rows + 1): - # Generate admission and discharge dates - admission_date = _random_date(2023, 2025) - admission_dt = datetime.strptime(admission_date, "%Y-%m-%d") - discharge_dt = admission_dt + timedelta(days=random.randint(1, 14)) - discharge_date = discharge_dt.strftime("%Y-%m-%d") - - # Generate DOB (patients aged 18-90) - birth_year = random.randint(1935, 2006) - dob = f"{birth_year}-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}" + for i in range(1, num_rows + 1): + tx_type = random.choice(tx_types) + # amount: credits positive, debits negative, mix ~ N(0, 5000) + raw = round(_gauss(0, 5000, -50000, 50000), 2) + if tx_type in ("credit", "deposit", "refund"): + amount = abs(raw) + elif tx_type in ("debit", "withdrawal", "fee", "payment"): + amount = -abs(raw) + else: + amount = raw + + tx_d = start + timedelta(days=random.randint(0, (today - start).days - 1)) + tx_date = _random_datetime(start, tx_d) + # settlement always on a business day, 1-3 days after transaction + settle_d = _next_business_day(tx_d + timedelta(days=random.randint(1, 3))) + + # risk_score ~ N(500, 150), clamped 0-1000 + risk_score = round(_gauss(500, 150, 0.0, 1000.0), 2) data.append({ - "patient_id": f"MRN-{random.randint(10000000, 999999999999)}", - "ssn": f"{random.randint(100, 999)}-{random.randint(10, 99)}-{random.randint(1000, 9999)}", - "date_of_birth": dob, - "gender": random.choice(genders), - "provider_npi": f"{random.randint(1000000000, 9999999999)}", - "diagnosis_code": random.choice(diagnosis_codes), - "secondary_diagnosis": random.choice(diagnosis_codes) if random.random() > 0.5 else "", - "procedure_code": random.choice(procedure_codes), - "admission_date": admission_date, - "discharge_date": discharge_date, - "facility_code": f"FAC-{_random_string(6, string.ascii_uppercase + string.digits)}", - "insurance_id": f"{_random_string(3, string.ascii_uppercase)}{random.randint(10000000, 999999999999999)}", - "blood_type": random.choice(blood_types), - "bp_systolic": random.randint(90, 180), - "bp_diastolic": random.randint(60, 110), - "heart_rate": random.randint(50, 120), - "temperature": round(random.uniform(97.0, 101.0), 1), - "medication_dosage": round(random.uniform(5, 500), 1), - "allergies": random.choice(["None", "Penicillin", "Sulfa", "Latex", "Peanuts", ""]), - "emergency_phone": _random_phone(), + "transaction_id": f"TXN-{i:010d}", + "account_id": f"ACC-{random.randint(100000, 999999)}", + "amount": amount, + "currency": random.choice(currencies), + "transaction_type": tx_type, + "status": random.choice(statuses), + "transaction_date": tx_date, + "settlement_date": settle_d.strftime("%Y-%m-%d"), + "risk_score": risk_score, + "is_flagged": random.choice([True, False, False, False, False]), # ~20% flagged + "merchant_id": f"MER-{random.randint(100000, 999999)}", + "batch_id": f"BATCH-{tx_d.strftime('%Y%m%d')}-{random.randint(100, 999)}", }) return data -def generate_finance_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the finance template.""" - transaction_types = 
["credit", "debit", "transfer", "payment", "refund", "withdrawal", "deposit", "fee"] - statuses = ["pending", "processing", "completed", "failed", "cancelled", "reversed"] - currencies = ["USD", "EUR", "GBP", "JPY", "CAD"] +def generate_healthcare_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the healthcare template. + Rules demonstrated: + not_null, unique, type, positive, range, regex, allowed_values, + min_length, max_length, timestamp_range, no_future_timestamps, + unique_combination, boolean + """ + genders = ["M", "F", "O"] + # ICD-10 codes + diagnoses = ["J06.9", "I10", "E11.9", "M54.5", "F32.9", "J45.909", "K21.0", "N39.0", + "R05", "Z00.00", "I25.10", "G43.909"] + procedures = ["99213", "99214", "99215", "99203", "99204", "99205"] + today = date.today() + start = date(2023, 1, 1) data = [] - for _ in range(1, num_rows + 1): - tx_type = random.choice(transaction_types) - amount = round(random.uniform(-10000, 50000), 2) - if tx_type in ["credit", "deposit", "refund"]: - amount = abs(amount) - elif tx_type in ["debit", "withdrawal", "fee", "payment"]: - amount = -abs(amount) + for i in range(1, num_rows + 1): + admission = start + timedelta(days=random.randint(0, (today - start).days - 14)) + discharge = admission + timedelta(days=random.randint(1, 14)) + + # vitals with realistic distributions + bp_sys = round(_gauss(120, 15, 70, 200)) + bp_dia = round(_gauss(80, 10, 40, 130)) + hr = round(_gauss(75, 12, 30, 200)) + temp_f = round(_gauss(98.6, 0.8, 95.0, 107.0), 1) - tx_date = _random_datetime(2023, 2025) - tx_dt = datetime.strptime(tx_date, "%Y-%m-%d %H:%M:%S") - settlement_dt = tx_dt + timedelta(days=random.randint(1, 5)) + birth_year = random.randint(1940, 2006) + dob = f"{birth_year}-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}" data.append({ - "transaction_id": f"TXN{_random_string(14, string.ascii_uppercase + string.digits)}", - "account_number": f"{random.randint(10000000, 99999999999999999)}", - "routing_number": f"{random.randint(100000000, 999999999)}", - "iban": f"DE{random.randint(10, 99)}{_random_string(4, string.ascii_uppercase)}{random.randint(1000000, 9999999)}", - "swift_code": f"{_random_string(4, string.ascii_uppercase)}{_random_string(2, string.ascii_uppercase)}{_random_string(2, string.ascii_uppercase + string.digits)}", - "amount": amount, - "currency": random.choice(currencies), - "exchange_rate": round(random.uniform(0.5, 2.0), 4), - "transaction_type": tx_type, - "status": random.choice(statuses), - "transaction_date": tx_date, - "settlement_date": settlement_dt.strftime("%Y-%m-%d"), - "customer_id": f"{_random_string(12, string.ascii_uppercase + string.digits)}", - "merchant_category_code": f"{random.randint(1000, 9999)}", - "card_last4": f"{random.randint(1000, 9999)}", - "balance": round(random.uniform(100, 100000), 2), - "interest_rate": round(random.uniform(0, 25), 2), - "risk_score": random.randint(0, 1000), - "is_fraud": random.choice([0, 0, 0, 0, 0, 0, 0, 0, 0, 1]), # 10% fraud - "reference_number": _random_string(20, string.ascii_uppercase + string.digits), - "batch_id": f"BATCH-{datetime.now().strftime('%Y%m%d')}-{random.randint(1000, 9999)}", + "patient_id": f"MRN-{i:08d}", + "date_of_birth": dob, + "gender": random.choice(genders), + "diagnosis_code": random.choice(diagnoses), + "procedure_code": random.choice(procedures), + "admission_date": admission.strftime("%Y-%m-%d"), + "discharge_date": discharge.strftime("%Y-%m-%d"), + "facility_id": f"FAC-{random.randint(100, 999)}", + "bp_systolic": bp_sys, 
+ "bp_diastolic": bp_dia, + "heart_rate": hr, + "temperature_f": temp_f, + "is_insured": random.choice([True, True, True, False]), # 75% insured + "provider_npi": f"{random.randint(1000000000, 9999999999)}", }) return data -def generate_saas_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the saas template.""" - plans = ["free", "starter", "professional", "business", "enterprise"] - subscription_statuses = ["active", "trialing", "past_due", "cancelled", "paused", "expired"] - roles = ["owner", "admin", "member", "viewer", "guest"] - account_statuses = ["active", "inactive", "suspended", "pending_verification"] - billing_cycles = ["monthly", "quarterly", "annual"] - timezones = ["America/New_York", "America/Los_Angeles", "Europe/London", "Asia/Tokyo", "Australia/Sydney"] - locales = ["en-US", "en-GB", "de-DE", "fr-FR", "ja-JP", "es-ES"] - event_types = ["page_view", "button_click", "form_submit", "api_call", "login", "logout"] +def generate_saas_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the saas template. + Rules demonstrated: + not_null, unique, type, positive, non_negative, boolean, + regex, allowed_values, min_length, max_length, + date_range, no_future_timestamps, max_age, + unique_combination + """ + plans = ["free", "starter", "professional", "business", "enterprise"] + statuses = ["active", "trialing", "past_due", "cancelled", "paused"] + roles = ["owner", "admin", "member", "viewer", "guest"] + today = date.today() + start = date(2021, 1, 1) + last_login_lo = today - timedelta(days=364) # within past year data = [] - for _ in range(1, num_rows + 1): + for _i in range(1, num_rows + 1): plan = random.choice(plans) - mrr = 0 if plan == "free" else random.randint(10, 5000) - seats = 1 if plan in ["free", "starter"] else random.randint(1, 100) - - created_at = _random_iso_datetime(2020, 2024) - created_dt = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ") - last_login_dt = created_dt + timedelta(days=random.randint(1, 365)) - trial_end_dt = created_dt + timedelta(days=14) + # mrr: 0 for free, else gauss(300, 200) clamped to [1, 5000] + mrr = 0 if plan == "free" else round(_gauss(300, 200, 1, 5000), 2) + seats = 1 if plan in ("free", "starter") else random.randint(2, 200) + # api_calls_30d: non_negative integer, gauss(5000, 3000) clamped [0, 100000] + api_calls = max(0, round(_gauss(5000, 3000, 0, 100000))) + # storage_gb: non_negative, gauss(20, 15) clamped [0, 500] + storage_gb = round(_gauss(20, 15, 0, 500), 2) + + created_dt = start + timedelta(days=random.randint(0, (today - start).days - 30)) + last_login = last_login_lo + timedelta(days=random.randint(0, 363)) + trial_end = (created_dt + timedelta(days=14)).strftime("%Y-%m-%d") data.append({ - "user_id": _random_uuid(), + "user_id": _random_uuid(), "organization_id": _random_uuid(), - "email": _random_email(), - "username": f"user_{_random_string(8, string.ascii_lowercase + string.digits)}", - "subscription_plan": plan, - "subscription_status": random.choice(subscription_statuses), - "role": random.choice(roles), - "account_status": random.choice(account_statuses), - "created_at": created_at, - "last_login_at": last_login_dt.strftime("%Y-%m-%dT%H:%M:%SZ"), - "billing_cycle": random.choice(billing_cycles), - "mrr": mrr, - "seat_count": seats, - "enabled_features": '["feature_a", "feature_b"]', - "api_key": _random_string(40, string.ascii_letters + string.digits), - "webhook_url": f"https://webhook.{_random_string(8, string.ascii_lowercase)}.com/callback", - 
"timezone": random.choice(timezones), - "locale": random.choice(locales), - "event_type": random.choice(event_types), - "session_id": _random_uuid(), - "ip_address": f"{random.randint(1, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 255)}", - "user_agent": f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/{random.randint(90, 120)}.0.0.0", - "referral_code": _random_string(8, string.ascii_uppercase + string.digits), - "trial_ends_at": trial_end_dt.strftime("%Y-%m-%d"), - "storage_used_bytes": random.randint(0, 10737418240), # Up to 10GB - "api_calls_count": random.randint(0, 1000000), + "email": _random_email(), + "username": f"usr_{_random_string(6, string.ascii_lowercase + string.digits)}", + "plan": plan, + "status": random.choice(statuses), + "role": random.choice(roles), + "created_at": created_dt.strftime("%Y-%m-%d"), + "last_login_at": last_login.strftime("%Y-%m-%d"), + "mrr": mrr, + "seat_count": seats, + "api_calls_30d": api_calls, + "storage_gb": storage_gb, + "is_active": random.choice([True, True, True, False]), # 75% active + "trial_end_date": trial_end, }) return data -def generate_iot_data(num_rows: int = 100) -> list[dict[str, Any]]: - """Generate sample data for the iot template.""" - device_statuses = ["online", "offline", "standby", "error", "maintenance"] - sensor_types = ["temperature", "humidity", "pressure", "motion", "light", "gas"] - quality_flags = ["good", "warning", "error"] - protocols = ["mqtt", "http", "https", "coap", "websocket"] +def generate_iot_data(num_rows: int = 1000) -> list[dict[str, Any]]: + """Generate sample data for the iot template. - data = [] - - for i in range(1, num_rows + 1): - timestamp = _random_iso_datetime(2024, 2025).replace("Z", "") - unix_ts = int(datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S").timestamp()) - - sensor_type = random.choice(sensor_types) - - # Generate realistic sensor values based on type - temp = round(random.uniform(15, 35), 2) if sensor_type == "temperature" else round(random.uniform(-10, 50), 2) - humidity = round(random.uniform(30, 80), 2) - pressure = round(random.uniform(980, 1050), 2) + Rules demonstrated: + not_null, unique (record_id), type, positive, non_negative, range, + regex, allowed_values, timestamp_range, no_future_timestamps, + unique_combination (device_id + timestamp) + """ + device_ids = [f"DEV-{_random_string(8, '0123456789ABCDEF')}" for _ in range(50)] + quality_flags = ["good", "warning", "error"] + sensor_types = ["temperature", "humidity", "pressure", "motion", "light", "gas"] + protocols = ["mqtt", "http", "coap", "websocket"] + today = date.today() + start = date(2024, 1, 1) + used_combos: set[tuple[str, str]] = set() + data: list[dict[str, Any]] = [] + i = 0 + + while len(data) < num_rows: + i += 1 + device_id = random.choice(device_ids) + ts_d = start + timedelta(days=random.randint(0, (today - start).days - 1)) + h = random.randint(0, 23) + m_min = random.randint(0, 59) + s_sec = random.randint(0, 59) + timestamp = f"{ts_d.strftime('%Y-%m-%d')} {h:02d}:{m_min:02d}:{s_sec:02d}" + + combo = (device_id, timestamp) + if combo in used_combos: + continue + used_combos.add(combo) + + # temperature ~ N(22, 5), clamped -10 to 50 + temperature = round(_gauss(22, 5, -10.0, 50.0), 2) + # humidity ~ uniform(20, 80) + humidity = round(random.uniform(20.0, 80.0), 2) + # pressure ~ N(1013, 20), clamped 900-1100 + pressure = round(_gauss(1013, 20, 900.0, 1100.0), 2) + # battery_level: positive int, 1-100 + battery = random.randint(1, 100) + # rssi: negative, -110 to -20 + 
rssi = random.randint(-110, -20) + # lat/lon within USA bounds + lat = round(random.uniform(24.0, 49.0), 6) + lon = round(random.uniform(-125.0, -66.0), 6) + alt = round(random.uniform(0.0, 3000.0), 1) data.append({ - "device_id": f"{_random_string(3, string.ascii_uppercase)}-{_random_string(12, '0123456789ABCDEF')}", - "sensor_id": f"SENS-{random.randint(1000, 99999999)}", - "mac_address": ":".join([f"{random.randint(0, 255):02X}" for _ in range(6)]), - "ip_address": f"{random.randint(10, 192)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}", - "firmware_version": f"{random.randint(1, 5)}.{random.randint(0, 15)}.{random.randint(0, 99)}", - "timestamp": timestamp, - "unix_timestamp": unix_ts, - "temperature": temp, - "humidity": humidity, - "pressure": pressure, - "battery_level": random.randint(10, 100), - "rssi": random.randint(-90, -30), - "latitude": round(random.uniform(25, 48), 6), - "longitude": round(random.uniform(-125, -70), 6), - "altitude": round(random.uniform(0, 3000), 2), - "speed": round(random.uniform(0, 30), 2), - "acceleration": round(random.uniform(-5, 5), 2), - "voltage": round(random.uniform(3.0, 5.0), 2), - "current": round(random.uniform(0.01, 2.0), 3), - "power": round(random.uniform(0.1, 10), 2), - "energy_kwh": round(random.uniform(0, 1000), 3), - "device_status": random.choice(device_statuses), - "sensor_type": sensor_type, + "record_id": i, + "device_id": device_id, + "sensor_id": f"SENS-{random.randint(1000, 9999)}", + "timestamp": timestamp, + "temperature": temperature, + "humidity": humidity, + "pressure": pressure, + "battery_level": battery, + "rssi": rssi, + "latitude": lat, + "longitude": lon, + "altitude": alt, "quality_flag": random.choice(quality_flags), - "error_code": random.choice(["OK", "OK", "OK", "OK", "ERR-001", "ERR-002", "ERR-100"]), - "sequence_number": i, - "message_size_bytes": random.randint(50, 2048), - "gateway_id": f"GW-{_random_string(10, string.ascii_uppercase + string.digits)}", - "protocol": random.choice(protocols), + "sensor_type": random.choice(sensor_types), + "protocol": random.choice(protocols), }) return data -# Mapping of template names to generator functions -GENERATORS = { - "basic": generate_basic_data, - "ecommerce": generate_ecommerce_data, - "healthcare": generate_healthcare_data, - "finance": generate_finance_data, - "saas": generate_saas_data, - "iot": generate_iot_data, +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + +GENERATORS: dict[str, Any] = { + "basic": generate_basic_data, + "ecommerce": generate_ecommerce_data, + "healthcare": generate_healthcare_data, + "finance": generate_finance_data, + "saas": generate_saas_data, + "iot": generate_iot_data, } -# Default filenames for each template -DEFAULT_FILENAMES = { - "basic": "data.csv", - "ecommerce": "orders.csv", - "healthcare": "patients.csv", - "finance": "transactions.csv", - "saas": "users.csv", - "iot": "sensor_data.csv", +DEFAULT_FILENAMES: dict[str, str] = { + "basic": "data.csv", + "ecommerce": "orders.csv", + "healthcare": "patients.csv", + "finance": "transactions.csv", + "saas": "users.csv", + "iot": "sensor_data.csv", +} + +DEFAULT_ROWS: dict[str, int] = { + "basic": 1000, + "ecommerce": 1000, + "healthcare": 1000, + "finance": 1000, + "saas": 1000, + "iot": 1000, } def generate_sample_data( template: str, output_path: Path | str, - num_rows: int = 100, + num_rows: int | None = None, ) -> Path: 
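# A minimal usage sketch of the registry and entry point defined above (the file
# paths and the explicit row count are illustrative, not taken from this change):
#
#     generate_sample_data("iot", "sample/" + DEFAULT_FILENAMES["iot"])   # falls back to DEFAULT_ROWS["iot"] == 1000 rows
#     generate_sample_data("saas", "users.csv", num_rows=250)             # an explicit count overrides the default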
"""Generate sample data for a template and save to CSV. Args: template: Template name (basic, ecommerce, healthcare, finance, saas, iot) output_path: Path where the CSV file will be saved - num_rows: Number of sample rows to generate + num_rows: Number of sample rows to generate (defaults to 1 000) Returns: Path to the generated CSV file Raises: - ValueError: If template is not recognized + ValueError: If template is not recognised """ if template not in GENERATORS: raise ValueError(f"Unknown template: {template}. Available: {', '.join(GENERATORS.keys())}") - generator = GENERATORS[template] - data = generator(num_rows) + if num_rows is None: + num_rows = DEFAULT_ROWS.get(template, 1000) + + data = GENERATORS[template](num_rows) output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/datacheck/config/schema.py b/datacheck/config/schema.py index 1be779d..7cbbd52 100644 --- a/datacheck/config/schema.py +++ b/datacheck/config/schema.py @@ -12,28 +12,15 @@ "regex", "allowed_values", "type", - "length", "min_length", "max_length", - "custom", - # Statistical rules - "mean_between", - "std_dev_less_than", - "percentile_range", - "z_score_outliers", - "distribution_type", # Freshness rules "max_age", "timestamp_range", + "date_range", "no_future_timestamps", "date_format_valid", "date_format", - "business_days_only", - # Format rules - "email_valid", - "phone_valid", - "url_valid", - "json_valid", # Relationship rules "foreign_key_exists", "sum_equals", @@ -41,7 +28,6 @@ # Range rules "range", "positive", - "negative", "non_negative", # Boolean rules "boolean", @@ -51,22 +37,20 @@ VALID_DATA_SOURCE_TYPES = [ "csv", "parquet", - "json", - "avro", - "deltalake", "postgresql", "mysql", + "mssql", "snowflake", "bigquery", "redshift", - "duckdb", + "s3", ] # Valid output formats VALID_OUTPUT_FORMATS = [ "terminal", "json", - "html", + "sarif", "markdown", "csv", ] @@ -171,37 +155,6 @@ }, }, }, - "plugins": { - "type": "array", - "description": "List of plugin file paths", - "items": {"type": "string"}, - }, - "sampling": { - "type": "object", - "description": "Data sampling configuration", - "properties": { - "method": { - "type": "string", - "enum": ["none", "random", "stratified", "top", "systematic"], - "default": "none", - }, - "rate": { - "type": "number", - "minimum": 0.0, - "maximum": 1.0, - }, - "count": { - "type": "integer", - "minimum": 1, - }, - "stratify_by": { - "type": "string", - }, - "seed": { - "type": "integer", - }, - }, - }, "reporting": { "type": "object", "description": "Output and reporting configuration", diff --git a/datacheck/config/source.py b/datacheck/config/source.py index 821cf08..e50548b 100644 --- a/datacheck/config/source.py +++ b/datacheck/config/source.py @@ -26,24 +26,18 @@ # File loaders "csv": ["path"], "parquet": ["path"], - "duckdb": ["path"], - "sqlite": ["path"], - "delta": ["path"], - "avro": ["path"], # Cloud storage "s3": ["bucket"], - "gcs": ["bucket"], - "azure": ["container"], } # Source types that are database connectors DATABASE_TYPES = {"postgresql", "mysql", "mssql", "snowflake", "bigquery", "redshift"} # Source types that are file-based loaders -FILE_TYPES = {"csv", "parquet", "duckdb", "sqlite", "delta", "avro"} +FILE_TYPES = {"csv", "parquet"} # Source types that are cloud storage -CLOUD_TYPES = {"s3", "gcs", "azure"} +CLOUD_TYPES = {"s3"} @dataclass diff --git a/datacheck/config/templates/basic.yaml b/datacheck/config/templates/basic.yaml index 7d860f5..426c078 100644 --- 
a/datacheck/config/templates/basic.yaml +++ b/datacheck/config/templates/basic.yaml @@ -1,73 +1,116 @@ # DataCheck Basic Configuration Template -# A simple starting point for data validation +# A comprehensive starting point covering all core rule categories # -# Usage: -# datacheck init --template basic -# datacheck validate data.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template basic --with-sample-data +# datacheck validate --config datacheck.yaml +# +# Rules demonstrated: +# Presence : not_null, unique +# Type : type (integer, numeric, string) +# Numeric : positive, range +# String : regex, allowed_values, min_length, max_length +# Boolean : boolean +# Temporal : no_future_timestamps, date_range version: "1.0" metadata: - description: "Basic data validation configuration" + description: "Basic data validation — covers all core rule types" template: "basic" + domain: "general" -# Data source configuration data_source: type: csv path: "./data.csv" options: encoding: "utf-8" -# Common validation checks checks: - # ID column validation - - name: id_check + # ─── Primary key ──────────────────────────────────────────────────────────── + + - name: id column: id - description: "Primary identifier must be unique and not null" + description: "Integer primary key — unique, positive, not null" rules: not_null: true unique: true + type: integer + positive: true + + # ─── String fields ────────────────────────────────────────────────────────── - # Name field validation - - name: name_check + - name: name_not_null column: name - description: "Name field must not be empty" + description: "Name must be present and within reasonable length" rules: not_null: true - min_length: 1 + min_length: 3 + max_length: 50 - # Email validation (if applicable) - - name: email_check + - name: email_format column: email description: "Valid email format" rules: - regex: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + not_null: true + regex: '^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$' - # Date field validation - - name: created_at_check - column: created_at - description: "Creation date must be valid" + # ─── Categorical ──────────────────────────────────────────────────────────── + + - name: status + column: status + description: "Status must be one of the allowed lifecycle values" rules: not_null: true - date_format: - format: "%Y-%m-%d" + allowed_values: [active, inactive, pending] - # Status field validation - - name: status_check - column: status - description: "Status must be a valid value" + - name: is_verified + column: is_verified + description: "Verification flag must be a boolean" rules: - allowed_values: - - active - - inactive - - pending + not_null: true + boolean: true -# Notifications (optional) -# notifications: -# slack_webhook: "${SLACK_WEBHOOK}" -# mention_on_failure: false + # ─── Numeric ──────────────────────────────────────────────────────────────── + + - name: age_range + column: age + description: "Age must be a positive integer between 18 and 100" + rules: + not_null: true + type: integer + positive: true + range: + min: 18 + max: 100 + + - name: score_range + column: score + description: "Score is a numeric value between 0 and 100" + rules: + not_null: true + type: numeric + range: + min: 0 + max: 100 + + # ─── Temporal ─────────────────────────────────────────────────────────────── + + - name: created_at_not_future + column: created_at + description: "Creation timestamp cannot be in the future" + rules: + not_null: true + no_future_timestamps: true + + - 
name: created_at_range + column: created_at + description: "All records must be within the system launch window" + rules: + date_range: + min: "2022-01-01" + max: "2030-12-31" -# Output configuration reporting: export_failures: true - output_path: "validation_results" + output_file: "validation_results" diff --git a/datacheck/config/templates/ecommerce.yaml b/datacheck/config/templates/ecommerce.yaml index 8d925f8..22eec62 100644 --- a/datacheck/config/templates/ecommerce.yaml +++ b/datacheck/config/templates/ecommerce.yaml @@ -1,102 +1,119 @@ # DataCheck E-commerce Configuration Template -# Validation rules for e-commerce data +# Advanced validation for retail / order-management data # -# Usage: -# datacheck init --template ecommerce -# datacheck validate orders.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template ecommerce --with-sample-data +# datacheck validate --config datacheck.yaml +# +# Rules demonstrated (covers 18+ rule types): +# Presence : not_null, unique +# Type : type (integer, numeric, string) +# Numeric : positive, non_negative, min, max, range +# String : regex, allowed_values, min_length, max_length +# Boolean : boolean +# Temporal : no_future_timestamps, date_range +# Cross-col : unique_combination version: "1.0" metadata: - description: "E-commerce data validation configuration" + description: "E-commerce order data validation — advanced template" template: "ecommerce" domain: "retail" -# Data source configuration data_source: type: csv path: "./orders.csv" options: encoding: "utf-8" -# E-commerce validation checks checks: - # Order ID validation - - name: order_id_check + # ─── Identifiers ──────────────────────────────────────────────────────────── + + - name: order_id column: order_id - description: "Unique order identifier" + description: "Each order has a unique, non-null ID in ORD-XXXXXXXX format" rules: not_null: true unique: true - min_length: 8 + regex: '^ORD-\d{8}$' - # Customer ID validation - - name: customer_id_check + - name: customer_id column: customer_id - description: "Customer reference" + description: "Customer reference in CUST-NNNNN format" rules: not_null: true - regex: "^CUST-[0-9]+$" + regex: '^CUST-\d{5}$' - # Product SKU validation - - name: sku_check + - name: product_sku column: product_sku - description: "Product SKU format" + description: "SKU format: 3 uppercase letters, dash, 5 digits" rules: not_null: true - regex: "^[A-Z]{2,4}-[0-9]{4,8}$" + regex: '^[A-Z]{3}-\d{5}$' + + - name: order_id_customer_unique + column: order_id + description: "Each (order_id, customer_id) combination must be unique" + rules: + unique_combination: + - order_id + - customer_id - # Product name validation - - name: product_name_check + # ─── Product / Pricing ────────────────────────────────────────────────────── + + - name: product_name column: product_name - description: "Product name must be present" + description: "Product name length must be between 3 and 120 characters" rules: not_null: true - min_length: 2 - max_length: 200 + min_length: 3 + max_length: 120 - # Quantity validation - - name: quantity_check + - name: quantity column: quantity - description: "Order quantity must be positive" + description: "Quantity is a positive integer, 1–50" rules: not_null: true type: integer - min: 1 - max: 10000 + positive: true + range: + min: 1 + max: 50 - # Unit price validation - - name: unit_price_check + - name: unit_price column: unit_price - description: "Valid unit price" + description: "Unit price must be positive and within realistic 
bounds" rules: not_null: true type: numeric + positive: true min: 0.01 - max: 1000000 + max: 500.00 - # Total price validation - - name: total_price_check + - name: total_price column: total_price - description: "Total price must be positive" + description: "Total price must be non-negative" rules: not_null: true type: numeric - min: 0 + non_negative: true - # Discount validation - - name: discount_check - column: discount - description: "Discount must be within valid range" + - name: discount_pct + column: discount_pct + description: "Discount percentage 0–30" rules: type: numeric - min: 0 - max: 100 + non_negative: true + range: + min: 0 + max: 30 + + # ─── Order Metadata ───────────────────────────────────────────────────────── - # Order status validation - - name: order_status_check + - name: order_status column: order_status - description: "Valid order status" + description: "Status must be one of the defined lifecycle values" rules: not_null: true allowed_values: @@ -107,12 +124,10 @@ checks: - delivered - cancelled - refunded - - returned - # Payment method validation - - name: payment_method_check + - name: payment_method column: payment_method - description: "Valid payment method" + description: "Accepted payment methods" rules: not_null: true allowed_values: @@ -120,65 +135,55 @@ checks: - debit_card - paypal - bank_transfer - - crypto - cash_on_delivery - gift_card - # Shipping address validation - - name: shipping_address_check - column: shipping_address - description: "Shipping address must be present" + - name: currency + column: currency + description: "ISO 4217 currency codes supported by the platform" rules: not_null: true - min_length: 10 + allowed_values: [USD, EUR, GBP, CAD, AUD] - # Postal code validation - - name: postal_code_check - column: postal_code - description: "Valid postal code format" + - name: is_gift + column: is_gift + description: "Gift flag must be a boolean True/False" rules: not_null: true - regex: "^[0-9]{5}(-[0-9]{4})?$|^[A-Z][0-9][A-Z]\\s?[0-9][A-Z][0-9]$" + boolean: true - # Order date validation - - name: order_date_check - column: order_date - description: "Valid order date" + # ─── Address / Contact ────────────────────────────────────────────────────── + + - name: postal_code + column: postal_code + description: "5-digit US ZIP code" rules: not_null: true - date_format: - format: "%Y-%m-%d %H:%M:%S" + regex: '^\d{5}$' - # Email validation - - name: customer_email_check + - name: customer_email column: customer_email - description: "Valid customer email" + description: "Valid email address" rules: - regex: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + regex: '^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$' - # Phone validation - - name: phone_check - column: phone - description: "Valid phone number" - rules: - regex: "^\\+?[0-9]{10,15}$" + # ─── Temporal ─────────────────────────────────────────────────────────────── - # Currency validation - - name: currency_check - column: currency - description: "Valid ISO currency code" + - name: order_date_not_future + column: order_date + description: "No future-dated orders" rules: not_null: true - allowed_values: - - USD - - EUR - - GBP - - CAD - - AUD - - JPY - - CNY - -# Output configuration + no_future_timestamps: true + + - name: order_date_range + column: order_date + description: "All orders must be on or after the platform launch date" + rules: + date_range: + min: "2022-01-01" + max: "2030-12-31" + reporting: export_failures: true - output_path: "ecommerce_validation" + output_file: 
"ecommerce_validation" diff --git a/datacheck/config/templates/finance.yaml b/datacheck/config/templates/finance.yaml index e572264..1d188fd 100644 --- a/datacheck/config/templates/finance.yaml +++ b/datacheck/config/templates/finance.yaml @@ -1,17 +1,25 @@ # DataCheck Finance Configuration Template -# Validation rules for financial/banking data +# Advanced validation for financial transaction data # -# Usage: -# datacheck init --template finance -# datacheck validate transactions.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template finance --with-sample-data +# datacheck validate --config datacheck.yaml # -# Note: This template includes checks for sensitive financial data. -# Ensure proper data handling and compliance with regulations. +# Rules demonstrated (covers 16+ rule types): +# Presence : not_null, unique +# Type : type (numeric, integer, string) +# Numeric : range +# String : regex, allowed_values, min_length +# Boolean : boolean +# Temporal : no_future_timestamps, max_age +# Cross-col : unique_combination +# +# Compliance: SOX, PCI-DSS, GDPR version: "1.0" metadata: - description: "Financial data validation configuration" + description: "Financial transaction data validation — advanced template" template: "finance" domain: "finance" compliance: @@ -19,89 +27,78 @@ metadata: - PCI-DSS - GDPR -# Data source configuration data_source: type: csv path: "./transactions.csv" options: encoding: "utf-8" -# Financial validation checks checks: - # Transaction ID validation - - name: transaction_id_check + # ─── Identifiers ──────────────────────────────────────────────────────────── + + - name: transaction_id column: transaction_id - description: "Unique transaction identifier" + description: "Unique transaction ID in TXN-NNNNNNNNNN format" rules: not_null: true unique: true - min_length: 10 + regex: '^TXN-\d{10}$' - # Account number validation - - name: account_number_check - column: account_number - description: "Bank account number" + - name: account_id + column: account_id + description: "Account reference in ACC-NNNNNN format" rules: not_null: true - regex: "^[0-9]{8,17}$" - metadata: - sensitivity: high - pci: true - - # Routing number validation - - name: routing_number_check - column: routing_number - description: "Bank routing number (ABA)" - rules: - regex: "^[0-9]{9}$" + regex: '^ACC-\d{6}$' - # IBAN validation (international) - - name: iban_check - column: iban - description: "International Bank Account Number" + - name: txn_account_unique + column: transaction_id + description: "Each (transaction_id, account_id) pair must be unique" rules: - regex: "^[A-Z]{2}[0-9]{2}[A-Z0-9]{4}[0-9]{7}([A-Z0-9]?){0,16}$" - metadata: - sensitivity: high - - # SWIFT/BIC validation - - name: swift_check - column: swift_code - description: "SWIFT/BIC code" + unique_combination: + - transaction_id + - account_id + + - name: merchant_id + column: merchant_id + description: "Merchant reference in MER-NNNNNN format" rules: - regex: "^[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}([A-Z0-9]{3})?$" + regex: '^MER-\d{6}$' - # Transaction amount validation - - name: amount_check + # ─── Financial Amounts ────────────────────────────────────────────────────── + + - name: amount_not_null column: amount - description: "Transaction amount" + description: "Every transaction must have an amount" rules: not_null: true type: numeric - # No min constraint - allows negative for debits - max: 1000000000 # $1 billion limit - # Currency validation - - name: currency_check - column: currency - description: "ISO 4217 
currency code" + - name: amount_range + column: amount + description: "Amount must be within defined limits (negative = debit)" rules: - not_null: true - regex: "^[A-Z]{3}$" + range: + min: -50000 + max: 50000 - # Exchange rate validation - - name: exchange_rate_check - column: exchange_rate - description: "Currency exchange rate" + # ─── Risk Score ───────────────────────────────────────────────────────────── + + - name: risk_score_range + column: risk_score + description: "Risk score 0–1000 (higher = more risk)" rules: + not_null: true type: numeric - min: 0.0001 - max: 10000 + range: + min: 0 + max: 1000 + + # ─── Categorical ──────────────────────────────────────────────────────────── - # Transaction type validation - - name: transaction_type_check + - name: transaction_type column: transaction_type - description: "Type of financial transaction" + description: "Valid transaction type" rules: not_null: true allowed_values: @@ -113,14 +110,10 @@ checks: - withdrawal - deposit - fee - - interest - - dividend - - adjustment - # Transaction status validation - - name: status_check + - name: status column: status - description: "Transaction status" + description: "Valid transaction status" rules: not_null: true allowed_values: @@ -130,103 +123,37 @@ checks: - failed - cancelled - reversed - - held - # Transaction date validation - - name: transaction_date_check - column: transaction_date - description: "Transaction timestamp" + - name: currency + column: currency + description: "Supported ISO 4217 currency codes" rules: not_null: true - date_format: - format: "%Y-%m-%d %H:%M:%S" - - # Settlement date validation - - name: settlement_date_check - column: settlement_date - description: "Settlement date" - rules: - date_format: - format: "%Y-%m-%d" + allowed_values: [USD, EUR, GBP, JPY, CAD] - # Customer ID validation - - name: customer_id_check - column: customer_id - description: "Customer identifier" + - name: is_flagged + column: is_flagged + description: "Fraud/compliance flag must be a boolean" rules: not_null: true - regex: "^[A-Z0-9]{8,20}$" + boolean: true - # Merchant category code (MCC) - - name: mcc_check - column: merchant_category_code - description: "Merchant Category Code" - rules: - regex: "^[0-9]{4}$" - - # Credit card last 4 (masked) - - name: card_last4_check - column: card_last4 - description: "Last 4 digits of card" - rules: - regex: "^[0-9]{4}$" - metadata: - pci: true - - # Balance validation - - name: balance_check - column: balance - description: "Account balance" - rules: - type: numeric + # ─── Temporal ─────────────────────────────────────────────────────────────── - # Interest rate validation - - name: interest_rate_check - column: interest_rate - description: "Interest rate (percentage)" - rules: - type: numeric - min: 0 - max: 100 - - # Risk score validation - - name: risk_score_check - column: risk_score - description: "Transaction risk score" - rules: - type: numeric - min: 0 - max: 1000 - - # Fraud flag validation - - name: fraud_flag_check - column: is_fraud - description: "Fraud indicator flag" - rules: - allowed_values: - - 0 - - 1 - - true - - false - - "True" - - "False" - - # Reference number validation - - name: reference_check - column: reference_number - description: "Transaction reference number" + - name: transaction_date_not_future + column: transaction_date + description: "Transactions cannot be future-dated" rules: - regex: "^[A-Z0-9]{10,30}$" + not_null: true + no_future_timestamps: true - # Batch ID validation - - name: batch_id_check - column: 
batch_id - description: "Processing batch identifier" + - name: transaction_date_freshness + column: transaction_date + description: "All transactions in this feed are within the past 2 years" + severity: warning rules: - regex: "^BATCH-[0-9]{8}-[0-9]{4}$" + max_age: "730d" -# Output configuration reporting: export_failures: true - output_path: "finance_validation" - # Note: Be careful with exports containing PCI/sensitive data + output_file: "finance_validation" diff --git a/datacheck/config/templates/healthcare.yaml b/datacheck/config/templates/healthcare.yaml index efbf899..866d092 100644 --- a/datacheck/config/templates/healthcare.yaml +++ b/datacheck/config/templates/healthcare.yaml @@ -1,218 +1,183 @@ # DataCheck Healthcare Configuration Template -# HIPAA-aware validation rules for healthcare/medical data +# Advanced validation for patient / clinical data # -# Usage: -# datacheck init --template healthcare -# datacheck validate patients.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template healthcare --with-sample-data +# datacheck validate --config datacheck.yaml # -# Note: This template includes checks for PHI (Protected Health Information) -# Ensure proper data handling and access controls are in place. +# Rules demonstrated (covers 15+ rule types): +# Presence : not_null, unique +# Type : type (integer, numeric, string) +# Numeric : positive, range +# String : regex, allowed_values, min_length, max_length +# Boolean : boolean +# Temporal : timestamp_range, no_future_timestamps +# Cross-col : unique_combination (patient × admission) +# +# Compliance: HIPAA, HITECH version: "1.0" metadata: - description: "Healthcare data validation configuration (HIPAA-aware)" + description: "Healthcare patient and clinical data validation — advanced template" template: "healthcare" domain: "healthcare" compliance: - HIPAA - HITECH -# Data source configuration data_source: type: csv path: "./patients.csv" options: encoding: "utf-8" -# Healthcare validation checks checks: - # Patient ID validation - - name: patient_id_check + # ─── Identifiers ──────────────────────────────────────────────────────────── + + - name: patient_id column: patient_id - description: "Unique patient identifier" + description: "Medical record number in MRN-NNNNNNNN format — unique" rules: not_null: true unique: true - regex: "^MRN-[0-9]{8,12}$" + regex: '^MRN-\d{8}$' + metadata: + sensitivity: phi - # SSN validation (PHI - handle with care) - - name: ssn_check - column: ssn - description: "Social Security Number format (PHI)" + - name: provider_npi + column: provider_npi + description: "10-digit NPI number" rules: - regex: "^[0-9]{3}-[0-9]{2}-[0-9]{4}$" - metadata: - phi: true - sensitivity: high + not_null: true + regex: '^\d{10}$' - # Date of birth validation - - name: dob_check + - name: patient_admission_unique + column: patient_id + description: "One record per patient per admission" + rules: + unique_combination: + - patient_id + - admission_date + + # ─── Demographics ──────────────────────────────────────────────────────────── + + - name: date_of_birth column: date_of_birth - description: "Valid date of birth" + description: "Valid date of birth (no future DOBs, minimum year 1920)" rules: not_null: true - date_format: - format: "%Y-%m-%d" - date_range: - min: "1900-01-01" - max: "today" + timestamp_range: + min: "1920-01-01" + max: "2010-12-31" metadata: - phi: true + sensitivity: phi - # Gender validation - - name: gender_check + - name: gender column: gender - description: "Valid gender value" - 
rules: - allowed_values: - - M - - F - - Male - - Female - - Other - - Unknown - - Non-binary - - # Provider NPI validation - - name: provider_npi_check - column: provider_npi - description: "National Provider Identifier" + description: "HL7-aligned gender codes" rules: not_null: true - regex: "^[0-9]{10}$" + allowed_values: [M, F, O] - # ICD-10 diagnosis code validation - - name: diagnosis_code_check + # ─── Clinical Codes ────────────────────────────────────────────────────────── + + - name: diagnosis_code column: diagnosis_code - description: "ICD-10 diagnosis code" + description: "ICD-10 code format (e.g. J06.9, I10, E11.9)" rules: not_null: true - regex: "^[A-Z][0-9]{2}(\\.[0-9A-Z]{1,4})?$" + regex: '^[A-Z]\d{2}(\.\d{1,4})?$' + min_length: 3 + max_length: 8 - # Secondary diagnosis validation - - name: secondary_diagnosis_check - column: secondary_diagnosis - description: "Secondary ICD-10 diagnosis code (optional)" + - name: procedure_code + column: procedure_code + description: "CPT procedure code — 5 digits" rules: - regex: "^[A-Z][0-9]{2}(\\.[0-9A-Z]{1,4})?$" + not_null: true + regex: '^\d{5}$' - # Procedure code validation - - name: procedure_code_check - column: procedure_code - description: "CPT procedure code" + - name: facility_id + column: facility_id + description: "Facility identifier in FAC-NNN format" rules: - regex: "^[0-9]{5}$" + not_null: true + regex: '^FAC-\d{3}$' - # Admission date validation - - name: admission_date_check + # ─── Admission / Discharge ─────────────────────────────────────────────────── + + - name: admission_date column: admission_date - description: "Valid admission date" + description: "Admission date is not in the future, within the data window" rules: not_null: true - date_format: - format: "%Y-%m-%d" + no_future_timestamps: true + timestamp_range: + min: "2023-01-01" + max: "2030-12-31" - # Discharge date validation - - name: discharge_date_check + - name: discharge_date column: discharge_date - description: "Valid discharge date (must be after admission)" - rules: - date_format: - format: "%Y-%m-%d" - # date_after: admission_date # Cross-column validation - - # Facility code validation - - name: facility_code_check - column: facility_code - description: "Healthcare facility identifier" + description: "Discharge date is not in the future" rules: not_null: true - regex: "^FAC-[A-Z0-9]{4,10}$" + no_future_timestamps: true - # Insurance ID validation - - name: insurance_id_check - column: insurance_id - description: "Insurance policy identifier" - rules: - regex: "^[A-Z]{2,4}[0-9]{8,15}$" + # ─── Vitals ────────────────────────────────────────────────────────────────── - # Blood type validation - - name: blood_type_check - column: blood_type - description: "Valid blood type" - rules: - allowed_values: - - A+ - - A- - - B+ - - B- - - AB+ - - AB- - - O+ - - O- - - Unknown - - # Vital signs - Blood pressure systolic - - name: bp_systolic_check + - name: bp_systolic column: bp_systolic - description: "Systolic blood pressure (mmHg)" + description: "Systolic blood pressure 70–200 mmHg" rules: + not_null: true type: integer - min: 60 - max: 250 + positive: true + range: + min: 70 + max: 200 - # Vital signs - Blood pressure diastolic - - name: bp_diastolic_check + - name: bp_diastolic column: bp_diastolic - description: "Diastolic blood pressure (mmHg)" + description: "Diastolic blood pressure 40–130 mmHg" rules: + not_null: true type: integer - min: 40 - max: 150 + positive: true + range: + min: 40 + max: 130 - # Vital signs - Heart rate - - name: 
heart_rate_check + - name: heart_rate column: heart_rate - description: "Heart rate (bpm)" + description: "Heart rate 30–200 bpm" rules: + not_null: true type: integer - min: 30 - max: 220 - - # Vital signs - Temperature - - name: temperature_check - column: temperature - description: "Body temperature (Fahrenheit)" - rules: - type: numeric - min: 95.0 - max: 108.0 + positive: true + range: + min: 30 + max: 200 - # Medication dosage validation - - name: medication_dosage_check - column: medication_dosage - description: "Medication dosage (positive value)" + - name: temperature_f + column: temperature_f + description: "Body temperature 95–107 °F" rules: + not_null: true type: numeric - min: 0 + range: + min: 95.0 + max: 107.0 - # Allergies validation - - name: allergies_check - column: allergies - description: "Patient allergies (text field)" - rules: - max_length: 1000 + # ─── Insurance ─────────────────────────────────────────────────────────────── - # Emergency contact phone - - name: emergency_phone_check - column: emergency_phone - description: "Emergency contact phone" + - name: is_insured + column: is_insured + description: "Insurance coverage flag must be boolean" rules: - regex: "^\\+?[0-9]{10,15}$" + not_null: true + boolean: true -# Output configuration reporting: export_failures: true - output_path: "healthcare_validation" - # Note: Be careful with failure exports containing PHI + output_file: "healthcare_validation" diff --git a/datacheck/config/templates/iot.yaml b/datacheck/config/templates/iot.yaml index e004dce..acc7dcf 100644 --- a/datacheck/config/templates/iot.yaml +++ b/datacheck/config/templates/iot.yaml @@ -1,299 +1,195 @@ # DataCheck IoT Configuration Template -# Validation rules for IoT sensor/device data +# Advanced validation for IoT sensor / telemetry data # -# Usage: -# datacheck init --template iot -# datacheck validate sensor_data.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template iot --with-sample-data +# datacheck validate --config datacheck.yaml +# +# Rules demonstrated (covers 14+ rule types): +# Presence : not_null, unique +# Type : type (numeric, integer, string) +# Numeric : positive, non_negative, range +# String : regex, allowed_values +# Temporal : no_future_timestamps, timestamp_range +# Cross-col : unique_combination (device × timestamp) version: "1.0" metadata: - description: "IoT/sensor data validation configuration" + description: "IoT sensor telemetry validation — advanced template" template: "iot" domain: "industrial" -# Data source configuration data_source: type: csv path: "./sensor_data.csv" options: encoding: "utf-8" -# IoT validation checks checks: - # Device ID validation - - name: device_id_check - column: device_id - description: "Unique device identifier" + # ─── Identifiers ──────────────────────────────────────────────────────────── + + - name: record_id + column: record_id + description: "Sequential record ID — unique, positive" rules: not_null: true - regex: "^[A-Z]{2,4}-[0-9A-F]{8,16}$" + unique: true + positive: true + type: integer - # Sensor ID validation - - name: sensor_id_check - column: sensor_id - description: "Sensor identifier within device" + - name: device_id + column: device_id + description: "Device ID in DEV-HHHHHHHH format" rules: not_null: true - regex: "^SENS-[0-9]{4,8}$" + regex: '^DEV-[0-9A-F]{8}$' - # MAC address validation - - name: mac_address_check - column: mac_address - description: "Device MAC address" + - name: sensor_id + column: sensor_id + description: "Sensor ID in SENS-NNNN 
format" rules: - regex: "^([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}$" + not_null: true + regex: '^SENS-\d{4}$' - # IP address validation - - name: ip_address_check - column: ip_address - description: "Device IP address" + - name: device_timestamp_unique + column: device_id + description: "Each device can only emit one reading per timestamp" rules: - regex: "^([0-9]{1,3}\\.){3}[0-9]{1,3}$" + unique_combination: + - device_id + - timestamp - # Firmware version validation - - name: firmware_check - column: firmware_version - description: "Device firmware version" - rules: - regex: "^[0-9]+\\.[0-9]+\\.[0-9]+(-[a-zA-Z0-9]+)?$" + # ─── Timestamp ────────────────────────────────────────────────────────────── - # Timestamp validation - - name: timestamp_check + - name: timestamp_not_future column: timestamp - description: "Measurement timestamp (ISO 8601)" + description: "Sensor readings cannot be future-dated" rules: not_null: true - date_format: - format: "%Y-%m-%dT%H:%M:%S" + no_future_timestamps: true - # Unix timestamp validation - - name: unix_timestamp_check - column: unix_timestamp - description: "Measurement Unix timestamp (seconds)" + - name: timestamp_range + column: timestamp + description: "All readings must be within the data-collection window" rules: - type: integer - min: 0 - max: 4102444800 # Year 2100 + timestamp_range: + min: "2024-01-01" + max: "2030-12-31" - # Temperature reading validation - - name: temperature_check + # ─── Temperature (Normal distribution) ───────────────────────────────────── + + - name: temperature_type column: temperature - description: "Temperature reading (Celsius)" + description: "Temperature in Celsius — numeric" rules: + not_null: true type: numeric - min: -273.15 # Absolute zero - max: 1000 + range: + min: -10.0 + max: 50.0 + + # ─── Humidity ──────────────────────────────────────────────────────────────── - # Humidity reading validation - - name: humidity_check + - name: humidity_range column: humidity - description: "Relative humidity (%)" + description: "Relative humidity 20–80 %" rules: + not_null: true type: numeric - min: 0 - max: 100 + non_negative: true + range: + min: 20.0 + max: 80.0 + + # ─── Pressure ─────────────────────────────────────────────────────────────── - # Pressure reading validation - - name: pressure_check + - name: pressure column: pressure - description: "Atmospheric pressure (hPa)" + description: "Atmospheric pressure 900–1100 hPa" rules: + not_null: true type: numeric - min: 300 - max: 1100 + positive: true + range: + min: 900.0 + max: 1100.0 + + # ─── Battery / Signal ─────────────────────────────────────────────────────── - # Battery level validation - - name: battery_check + - name: battery_level column: battery_level - description: "Battery level (%)" + description: "Battery level 1–100 % (positive integer)" rules: - type: numeric - min: 0 - max: 100 + not_null: true + type: integer + positive: true + range: + min: 1 + max: 100 - # Signal strength validation (RSSI) - - name: rssi_check + - name: rssi column: rssi - description: "Signal strength (dBm)" + description: "Signal strength -110 to -20 dBm (always negative)" rules: + not_null: true type: integer - min: -120 - max: 0 + range: + min: -110 + max: -20 - # GPS latitude validation - - name: latitude_check + # ─── GPS ──────────────────────────────────────────────────────────────────── + + - name: latitude column: latitude - description: "GPS latitude coordinate" + description: "Latitude in valid WGS-84 range" rules: + not_null: true type: numeric - min: -90 - max: 90 + 
range: + min: -90.0 + max: 90.0 - # GPS longitude validation - - name: longitude_check + - name: longitude column: longitude - description: "GPS longitude coordinate" + description: "Longitude in valid WGS-84 range" rules: + not_null: true type: numeric - min: -180 - max: 180 + range: + min: -180.0 + max: 180.0 - # Altitude validation - - name: altitude_check + - name: altitude column: altitude - description: "Altitude (meters above sea level)" + description: "Altitude in metres (non-negative)" rules: type: numeric - min: -500 # Dead Sea depression - max: 50000 # Higher than Everest, for aircraft + non_negative: true - # Speed validation - - name: speed_check - column: speed - description: "Speed (m/s)" - rules: - type: numeric - min: 0 - max: 1000 + # ─── Categorical ──────────────────────────────────────────────────────────── - # Acceleration validation - - name: acceleration_check - column: acceleration - description: "Acceleration (m/s²)" - rules: - type: numeric - min: -100 - max: 100 - - # Voltage reading validation - - name: voltage_check - column: voltage - description: "Voltage reading (V)" - rules: - type: numeric - min: 0 - max: 10000 - - # Current reading validation - - name: current_check - column: current - description: "Current reading (A)" - rules: - type: numeric - min: 0 - max: 1000 - - # Power reading validation - - name: power_check - column: power - description: "Power reading (W)" - rules: - type: numeric - min: 0 - max: 1000000 - - # Energy consumption validation - - name: energy_check - column: energy_kwh - description: "Energy consumption (kWh)" - rules: - type: numeric - min: 0 - - # Device status validation - - name: device_status_check - column: device_status - description: "Device operational status" + - name: quality_flag + column: quality_flag + description: "Reading quality classification" rules: not_null: true - allowed_values: - - online - - offline - - standby - - error - - maintenance - - booting + allowed_values: [good, warning, error] - # Sensor type validation - - name: sensor_type_check + - name: sensor_type column: sensor_type - description: "Type of sensor" + description: "Sensor category" rules: - allowed_values: - - temperature - - humidity - - pressure - - motion - - light - - sound - - gas - - vibration - - proximity - - gps - - accelerometer - - gyroscope - - magnetometer - - # Data quality flag - - name: quality_flag_check - column: quality_flag - description: "Data quality indicator" - rules: - allowed_values: - - good - - warning - - error - - unknown - - # Error code validation - - name: error_code_check - column: error_code - description: "Device/sensor error code" - rules: - regex: "^(ERR-[0-9]{3,6}|OK)$" - - # Sequence number validation - - name: sequence_check - column: sequence_number - description: "Message sequence number" - rules: - type: integer - min: 0 - - # Message size validation - - name: message_size_check - column: message_size_bytes - description: "Message payload size" - rules: - type: integer - min: 0 - max: 1048576 # 1 MB max - - # Gateway ID validation - - name: gateway_id_check - column: gateway_id - description: "Gateway/hub identifier" - rules: - regex: "^GW-[A-Z0-9]{8,12}$" + not_null: true + allowed_values: [temperature, humidity, pressure, motion, light, gas] - # Protocol validation - - name: protocol_check + - name: protocol column: protocol description: "Communication protocol" rules: - allowed_values: - - mqtt - - coap - - http - - https - - websocket - - lorawan - - zigbee - - bluetooth - - wifi + not_null: 
true + allowed_values: [mqtt, http, coap, websocket] -# Output configuration reporting: export_failures: true - output_path: "iot_validation" + output_file: "iot_validation" diff --git a/datacheck/config/templates/rules-reference.yaml b/datacheck/config/templates/rules-reference.yaml index 960eb0c..5b81905 100644 --- a/datacheck/config/templates/rules-reference.yaml +++ b/datacheck/config/templates/rules-reference.yaml @@ -5,8 +5,8 @@ # datacheck config init --template rules-reference # Then edit to keep only the rules you need. # -# Tip: Run 'datacheck config generate data.csv' to auto-generate -# a config with rules tailored to your data. +# Tip: Run 'datacheck config init --with-sample-data' to generate +# a starter config with sample data. version: "1.0" @@ -58,43 +58,6 @@ checks: min: 0 max: 10000 - - name: mean_between_example - column: score - description: "Validate that column mean falls within range" - rules: - mean_between: - min: 50.0 - max: 100.0 - - - name: std_dev_example - column: measurements - description: "Validate that standard deviation stays below threshold" - rules: - std_dev_less_than: 15.0 - - - name: percentile_range_example - column: salary - description: "Validate 25th and 75th percentile bounds" - rules: - percentile_range: - p25_min: 30000 - p25_max: 50000 - p75_min: 80000 - p75_max: 120000 - - - name: z_score_example - column: revenue - description: "Detect outliers by Z-score (default threshold: 3.0)" - rules: - z_score_outliers: 3.0 - - - name: distribution_example - column: test_scores - description: "Validate data follows expected distribution" - rules: - # Valid types: normal, uniform - distribution_type: normal - # ────────────────────────────────────────────────────────────── # STRING & PATTERN RULES # ────────────────────────────────────────────────────────────── @@ -114,24 +77,17 @@ checks: - inactive - pending - - name: length_example + - name: min_length_example column: username - description: "Validate string length (min and/or max)" + description: "Validate minimum string length" rules: - length: - min: 3 - max: 50 + min_length: 3 - # Shorthand for length: set min or max individually - # - name: min_length_example - # column: password - # rules: - # min_length: 8 - - # - name: max_length_example - # column: bio - # rules: - # max_length: 500 + - name: max_length_example + column: bio + description: "Validate maximum string length" + rules: + max_length: 500 # ────────────────────────────────────────────────────────────── # TEMPORAL / DATE RULES @@ -169,50 +125,6 @@ checks: # Supported units: m (minutes), h (hours), d (days), w (weeks) max_age: "24h" - - name: business_days_example - column: settlement_date - description: "Ensure dates fall on weekdays (Mon-Fri)" - rules: - business_days_only: true - - # ────────────────────────────────────────────────────────────── - # SEMANTIC VALIDATION - # ────────────────────────────────────────────────────────────── - - - name: email_example - column: email - description: "Validate email addresses (RFC 5322)" - rules: - email_valid: true - - - name: phone_example - column: phone - description: "Validate phone numbers" - rules: - # Simple (auto-detect country): - # phone_valid: true - # With country code: - phone_valid: - country_code: "US" - - - name: url_example - column: website - description: "Validate URLs" - rules: - # Simple (http/https only): - # url_valid: true - # With custom schemes: - url_valid: - schemes: - - http - - https - - - name: json_example - column: metadata - description: "Validate 
values are valid JSON" - rules: - json_valid: true - # ────────────────────────────────────────────────────────────── # CROSS-COLUMN / RELATIONSHIP RULES # ────────────────────────────────────────────────────────────── @@ -246,21 +158,6 @@ checks: - { code: "GB" } - { code: "DE" } - # ────────────────────────────────────────────────────────────── - # CUSTOM RULES - # ────────────────────────────────────────────────────────────── - - # - name: custom_rule_example - # column: email - # description: "User-defined validation via plugin" - # rules: - # custom: - # rule: "is_business_email" - # params: - # allowed_domains: - # - company.com - # - subsidiary.com - # ────────────────────────────────────────────────────────────── # COMBINING MULTIPLE RULES # ────────────────────────────────────────────────────────────── @@ -270,10 +167,8 @@ checks: description: "Multiple rules on one column — all must pass" rules: not_null: true - email_valid: true - length: - min: 5 - max: 254 + min_length: 5 + max_length: 254 # Notifications (optional) — send results to Slack # notifications: @@ -282,4 +177,4 @@ checks: reporting: export_failures: true - output_path: "validation_results" + output_file: "validation_results" diff --git a/datacheck/config/templates/saas.yaml b/datacheck/config/templates/saas.yaml index a2258e6..b2e3be8 100644 --- a/datacheck/config/templates/saas.yaml +++ b/datacheck/config/templates/saas.yaml @@ -1,264 +1,186 @@ # DataCheck SaaS Configuration Template -# Validation rules for SaaS platform data (users, subscriptions, events) +# Advanced validation for SaaS user / subscription data # -# Usage: -# datacheck init --template saas -# datacheck validate users.csv -c datacheck.yaml +# Quick start: +# datacheck config init --template saas --with-sample-data +# datacheck validate --config datacheck.yaml +# +# Rules demonstrated (covers 16+ rule types): +# Presence : not_null, unique +# Type : type (numeric, integer, string) +# Numeric : positive, non_negative, min, max +# String : regex, allowed_values, min_length, max_length +# Boolean : boolean +# Temporal : no_future_timestamps, date_range, max_age (warning) +# Cross-col : unique_combination version: "1.0" metadata: - description: "SaaS platform data validation configuration" + description: "SaaS platform user and subscription data validation — advanced template" template: "saas" domain: "technology" + compliance: + - GDPR -# Data source configuration data_source: type: csv path: "./users.csv" options: encoding: "utf-8" -# SaaS validation checks checks: - # User ID validation - - name: user_id_check + # ─── Identifiers ──────────────────────────────────────────────────────────── + + - name: user_id column: user_id - description: "Unique user identifier (UUID)" + description: "UUID v4 user identifier — unique and non-null" rules: not_null: true unique: true - regex: "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + regex: '^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$' - # Organization/Tenant ID validation - - name: org_id_check + - name: organization_id column: organization_id - description: "Organization/tenant identifier" + description: "UUID v4 organisation identifier" rules: not_null: true - regex: "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + regex: '^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$' - # Email validation - - name: email_check - column: email - description: "User email address" + - name: user_email_unique + column: user_id + description: 
"Each (user_id, email) pair must be unique" rules: - not_null: true - unique: true - regex: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" - metadata: - pii: true + unique_combination: + - user_id + - email - # Username validation - - name: username_check - column: username - description: "Unique username" - rules: - not_null: true - unique: true - min_length: 3 - max_length: 30 - regex: "^[a-zA-Z0-9_-]+$" + # ─── Contact ──────────────────────────────────────────────────────────────── - # Subscription plan validation - - name: plan_check - column: subscription_plan - description: "Subscription plan type" + - name: email + column: email + description: "Valid email address" rules: not_null: true - allowed_values: - - free - - starter - - professional - - business - - enterprise - - custom + regex: '^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$' - # Subscription status validation - - name: subscription_status_check - column: subscription_status - description: "Subscription status" + - name: username + column: username + description: "Username: 8–24 chars, starts with usr_" rules: not_null: true - allowed_values: - - active - - trialing - - past_due - - cancelled - - paused - - expired + min_length: 8 + max_length: 24 + regex: '^usr_[a-z0-9]{4,20}$' - # User role validation - - name: role_check - column: role - description: "User role in organization" - rules: - not_null: true - allowed_values: - - owner - - admin - - member - - viewer - - billing - - guest + # ─── Subscription ──────────────────────────────────────────────────────────── - # Account status validation - - name: account_status_check - column: account_status - description: "User account status" + - name: plan + column: plan + description: "Subscription tier" rules: not_null: true - allowed_values: - - active - - inactive - - suspended - - pending_verification - - deleted + allowed_values: [free, starter, professional, business, enterprise] - # Created at timestamp - - name: created_at_check - column: created_at - description: "Account creation timestamp" + - name: status + column: status + description: "Account lifecycle status" rules: not_null: true - date_format: - format: "%Y-%m-%dT%H:%M:%SZ" + allowed_values: [active, trialing, past_due, cancelled, paused] - # Last login timestamp - - name: last_login_check - column: last_login_at - description: "Last login timestamp" - rules: - date_format: - format: "%Y-%m-%dT%H:%M:%SZ" - - # Billing cycle validation - - name: billing_cycle_check - column: billing_cycle - description: "Billing cycle period" + - name: role + column: role + description: "User role within the organisation" rules: - allowed_values: - - monthly - - quarterly - - annual - - custom + not_null: true + allowed_values: [owner, admin, member, viewer, guest] - # Monthly recurring revenue - - name: mrr_check + - name: mrr column: mrr - description: "Monthly Recurring Revenue" + description: "Monthly recurring revenue (non-negative, free plan = 0)" rules: + not_null: true type: numeric - min: 0 + non_negative: true + max: 5000 - # Seats/licenses count - - name: seats_check + - name: seat_count column: seat_count - description: "Number of seats/licenses" + description: "Positive integer seat count" rules: + not_null: true type: integer - min: 1 - max: 10000 - - # Feature flags validation - - name: features_check - column: enabled_features - description: "Enabled feature flags (JSON array)" - rules: - max_length: 5000 + positive: true + max: 500 - # API key validation - - name: api_key_check - column: api_key - 
description: "API access key" - rules: - regex: "^[a-zA-Z0-9]{32,64}$" - metadata: - sensitivity: high - - # Webhook URL validation - - name: webhook_url_check - column: webhook_url - description: "Webhook callback URL" - rules: - regex: "^https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9._/-]*)?$" + # ─── Usage Metrics ────────────────────────────────────────────────────────── - # Timezone validation - - name: timezone_check - column: timezone - description: "User timezone" + - name: api_calls_30d + column: api_calls_30d + description: "30-day API call count — non-negative integer" rules: - regex: "^[A-Za-z]+/[A-Za-z_]+$" + not_null: true + type: integer + non_negative: true - # Locale validation - - name: locale_check - column: locale - description: "User locale/language" + - name: storage_gb + column: storage_gb + description: "Storage in GB — non-negative" rules: - regex: "^[a-z]{2}(-[A-Z]{2})?$" + not_null: true + type: numeric + non_negative: true + max: 500 - # Event type validation (for event logs) - - name: event_type_check - column: event_type - description: "Event type for analytics" - rules: - regex: "^[a-z]+(_[a-z]+)*$" + # ─── Boolean Flags ────────────────────────────────────────────────────────── - # Session ID validation - - name: session_id_check - column: session_id - description: "User session identifier" + - name: is_active + column: is_active + description: "Active flag must be a boolean" rules: - regex: "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + not_null: true + boolean: true - # IP address validation - - name: ip_address_check - column: ip_address - description: "Client IP address" - rules: - regex: "^([0-9]{1,3}\\.){3}[0-9]{1,3}$|^([0-9a-fA-F]{0,4}:){7}[0-9a-fA-F]{0,4}$" - metadata: - pii: true + # ─── Temporal ─────────────────────────────────────────────────────────────── - # User agent validation - - name: user_agent_check - column: user_agent - description: "Browser user agent string" + - name: created_at_not_future + column: created_at + description: "Account creation date cannot be in the future" rules: - max_length: 500 + not_null: true + no_future_timestamps: true - # Referral code validation - - name: referral_code_check - column: referral_code - description: "User referral code" + - name: created_at_range + column: created_at + description: "Platform launched 2021-01-01; no accounts before that" rules: - regex: "^[A-Z0-9]{6,10}$" + date_range: + min: "2021-01-01" + max: "2030-12-31" - # Trial end date validation - - name: trial_ends_check - column: trial_ends_at - description: "Trial period end date" + - name: last_login_not_future + column: last_login_at + description: "Last login cannot be in the future" rules: - date_format: - format: "%Y-%m-%d" + not_null: true + no_future_timestamps: true - # Storage usage validation - - name: storage_used_check - column: storage_used_bytes - description: "Storage usage in bytes" + - name: last_login_freshness + column: last_login_at + description: "Active users should have logged in within the past year" + severity: warning # some users may be inactive rules: - type: integer - min: 0 + max_age: "365d" - # API calls count - - name: api_calls_check - column: api_calls_count - description: "API calls count" + - name: trial_end_date + column: trial_end_date + description: "Trial end date is after account creation" rules: - type: integer - min: 0 + date_range: + min: "2021-01-15" + max: "2035-12-31" -# Output configuration reporting: export_failures: true - output_path: "saas_validation" + output_file: 
"saas_validation" diff --git a/datacheck/config/templates/sources.yaml b/datacheck/config/templates/sources.yaml index 8b55984..65c53d6 100644 --- a/datacheck/config/templates/sources.yaml +++ b/datacheck/config/templates/sources.yaml @@ -92,26 +92,6 @@ sources: # type: parquet # path: ./data/transactions.parquet - # ── DuckDB / SQLite ───────────────────────────────────────── - # local_duckdb: - # type: duckdb - # path: ./data/analytics.duckdb - # table: customers - - # ── Delta Lake ────────────────────────────────────────────── - # delta_table: - # type: delta - # path: s3://my-bucket/delta-tables/customers - # storage_options: - # AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} - # AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} - # AWS_REGION: ${AWS_REGION:-us-east-1} - - # ── Avro File ─────────────────────────────────────────────── - # avro_data: - # type: avro - # path: ./data/events.avro - # ── AWS S3 ────────────────────────────────────────────────── # s3_data: # type: s3 @@ -122,20 +102,4 @@ sources: # access_key: ${AWS_ACCESS_KEY_ID} # secret_key: ${AWS_SECRET_ACCESS_KEY} - # ── Google Cloud Storage ──────────────────────────────────── - # gcs_data: - # type: gcs - # bucket: ${GCS_BUCKET} - # prefix: data/ - # path: data/customers.csv - # project: ${GCP_PROJECT_ID} - # credentials_path: ${GCS_CREDENTIALS_PATH} - # ── Azure Blob Storage ───────────────────────────────────── - # azure_data: - # type: azure - # container: ${AZURE_CONTAINER} - # prefix: data/ - # path: data/customers.csv - # account_name: ${AZURE_ACCOUNT_NAME} - # account_key: ${AZURE_ACCOUNT_KEY} diff --git a/datacheck/connectors/azure.py b/datacheck/connectors/azure.py deleted file mode 100644 index 9404349..0000000 --- a/datacheck/connectors/azure.py +++ /dev/null @@ -1,310 +0,0 @@ -"""Azure Blob Storage connector for DataCheck.""" -import io -import re - -import pandas as pd - -from datacheck.connectors.cloud_base import CloudConnector, CloudFile -from datacheck.exceptions import AuthenticationError, ConfigurationError, ConnectionError, DataLoadError - - -class AzureConnector(CloudConnector): - """Azure Blob Storage connector.""" - - def __init__( - self, - container: str, - prefix: str = "", - account_name: str | None = None, - account_key: str | None = None, - connection_string: str | None = None, - sas_token: str | None = None, - ) -> None: - """Initialize Azure Blob connector. 
- - Args: - container: Azure container name - prefix: Path prefix (folder) - account_name: Azure storage account name - account_key: Azure storage account key - connection_string: Azure connection string (alternative auth) - sas_token: Shared Access Signature token (alternative auth) - """ - self.container = container - self.account_name = account_name - self.account_key = account_key - self.connection_string = connection_string - self.sas_token = sas_token - - # Use container as bucket for base class - super().__init__(container, prefix, region=account_name or "") - - # Initialize Azure client - self._client = self._create_client() - - def _validate_config(self) -> None: - """Validate Azure configuration.""" - if not self.container: - raise ConfigurationError("Azure container name is required") - - # Validate container name format - if not self._is_valid_container_name(self.container): - raise ConfigurationError(f"Invalid Azure container name: {self.container}") - - # Check authentication options - if not any([ - self.connection_string, - (self.account_name and self.account_key), - (self.account_name and self.sas_token), - ]): - raise ConfigurationError( - "Azure authentication required: provide connection_string, " - "or account_name with account_key or sas_token" - ) - - def _is_valid_container_name(self, name: str) -> bool: - """Validate Azure container name format. - - Args: - name: Container name to validate - - Returns: - True if valid, False otherwise - """ - # Azure container naming rules - # Must be 3-63 characters, lowercase letters, numbers, and hyphens - # Must start with letter or number - pattern = r"^[a-z0-9][a-z0-9\-]{1,61}[a-z0-9]$" - return bool(re.match(pattern, name)) - - def _create_client(self): - """Create Azure Blob service client. - - Returns: - azure.storage.blob.BlobServiceClient - - Raises: - AuthenticationError: If credentials not found - ConnectionError: If client creation fails - """ - try: - from azure.storage.blob import BlobServiceClient - from azure.core.exceptions import ClientAuthenticationError - except ImportError: - raise DataLoadError( - "Azure connector dependencies are not installed. " - "Install with: pip install 'datacheck[azure]'" - ) - - try: - if self.connection_string: - # Use connection string - return BlobServiceClient.from_connection_string(self.connection_string) - elif self.account_name and self.account_key: - # Use account name and key - account_url = f"https://{self.account_name}.blob.core.windows.net" - return BlobServiceClient( - account_url=account_url, - credential=self.account_key, - ) - elif self.account_name and self.sas_token: - # Use SAS token - account_url = f"https://{self.account_name}.blob.core.windows.net" - return BlobServiceClient( - account_url=account_url, - credential=self.sas_token, - ) - else: - raise AuthenticationError( - "Azure authentication required: provide connection_string, " - "or account_name with account_key or sas_token" - ) - - except ClientAuthenticationError as e: - raise AuthenticationError(f"Azure authentication failed: {e}") - except Exception as e: - raise ConnectionError(f"Failed to create Azure client: {e}") - - def list_files(self, pattern: str = "*") -> list[CloudFile]: - """List files in Azure container. 
- - Args: - pattern: Glob pattern to match files - - Returns: - List of CloudFile objects - - Raises: - ConnectionError: If container doesn't exist or access denied - AuthenticationError: If access is denied - """ - from azure.core.exceptions import ResourceNotFoundError, ClientAuthenticationError - - try: - container_client = self._client.get_container_client(self.container) - - # List all blobs with prefix - blobs = container_client.list_blobs(name_starts_with=self.prefix) - - # Convert to CloudFile objects - files = [] - for blob in blobs: - # Skip "directory" markers - if blob.name.endswith("/"): - continue - - files.append( - CloudFile( - path=blob.name, - size=blob.size or 0, - last_modified=blob.last_modified.isoformat() if blob.last_modified else "", - etag=blob.etag, - ) - ) - - # Filter by pattern - if pattern != "*": - file_paths = [f.path for f in files] - matched_paths = self._match_pattern(file_paths, pattern) - files = [f for f in files if f.path in matched_paths] - - return files - - except ResourceNotFoundError: - raise ConnectionError(f"Azure container does not exist: {self.container}") - except ClientAuthenticationError: - raise AuthenticationError(f"Access denied to Azure container: {self.container}") - except Exception as e: - raise ConnectionError(f"Azure error: {e}") - - def read_csv(self, path: str, **kwargs) -> pd.DataFrame: - """Read CSV file from Azure Blob. - - Args: - path: Blob name (file path) - **kwargs: Additional arguments for pd.read_csv - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - from azure.core.exceptions import ResourceNotFoundError - - try: - container_client = self._client.get_container_client(self.container) - blob_client = container_client.get_blob_client(path) - content = blob_client.download_blob().readall() - return pd.read_csv(io.BytesIO(content), **kwargs) - - except ResourceNotFoundError: - raise FileNotFoundError(f"File not found in Azure: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read CSV from Azure: {e}") - - def read_parquet(self, path: str, **kwargs) -> pd.DataFrame: - """Read Parquet file from Azure Blob. - - Args: - path: Blob name (file path) - **kwargs: Additional arguments for reading - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - try: - import pyarrow.parquet # noqa: F401 - check availability - except ImportError: - raise DataLoadError( - "pyarrow is required for reading Parquet files from Azure. " - "Install with: pip install pyarrow" - ) - - from azure.core.exceptions import ResourceNotFoundError - - try: - container_client = self._client.get_container_client(self.container) - blob_client = container_client.get_blob_client(path) - content = blob_client.download_blob().readall() - return pd.read_parquet(io.BytesIO(content), **kwargs) - - except ResourceNotFoundError: - raise FileNotFoundError(f"File not found in Azure: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read Parquet from Azure: {e}") - - def read_json(self, path: str, **kwargs) -> pd.DataFrame: - """Read JSON file from Azure Blob. 
- - Args: - path: Blob name (file path) - **kwargs: Additional arguments for pd.read_json - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - from azure.core.exceptions import ResourceNotFoundError - - try: - container_client = self._client.get_container_client(self.container) - blob_client = container_client.get_blob_client(path) - content = blob_client.download_blob().readall() - return pd.read_json(io.BytesIO(content), **kwargs) - - except ResourceNotFoundError: - raise FileNotFoundError(f"File not found in Azure: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read JSON from Azure: {e}") - - def file_exists(self, path: str) -> bool: - """Check if file exists in Azure Blob. - - Args: - path: Blob name (file path) - - Returns: - True if file exists, False otherwise - """ - try: - container_client = self._client.get_container_client(self.container) - blob_client = container_client.get_blob_client(path) - return blob_client.exists() - except Exception: - return False - - def load_file_size(self, path: str) -> int: - """Load file size in bytes. - - Args: - path: Blob name (file path) - - Returns: - File size in bytes - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If operation fails - """ - from azure.core.exceptions import ResourceNotFoundError - - try: - container_client = self._client.get_container_client(self.container) - blob_client = container_client.get_blob_client(path) - properties = blob_client.get_blob_properties() - return properties.size or 0 - - except ResourceNotFoundError: - raise FileNotFoundError(f"File not found in Azure: {path}") - except Exception as e: - raise ConnectionError(f"Failed to get file size: {e}") - diff --git a/datacheck/connectors/base.py b/datacheck/connectors/base.py index c4f20fb..f949188 100644 --- a/datacheck/connectors/base.py +++ b/datacheck/connectors/base.py @@ -50,7 +50,8 @@ def load_table( self, table_name: str, where: str | None = None, - limit: int | None = None + limit: int | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from a database table. @@ -58,6 +59,9 @@ def load_table( table_name: Name of the table to load where: Optional WHERE clause (without 'WHERE' keyword) limit: Optional row limit + columns: Optional set of column names to load. When provided, + generates SELECT col1, col2 instead of SELECT *. + Pass None to load all columns. Returns: DataFrame containing table data diff --git a/datacheck/connectors/bigquery.py b/datacheck/connectors/bigquery.py index 8105720..13155d8 100644 --- a/datacheck/connectors/bigquery.py +++ b/datacheck/connectors/bigquery.py @@ -142,6 +142,7 @@ def load_table( limit: int | None = None, dataset_id: str | None = None, sample_rate: float | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from BigQuery table. 
@@ -180,7 +181,11 @@ def load_table( full_table_name = f"`{self.project_id}.{effective_dataset}.{table_name}`" # Build query (table_name validated by _validate_table_name above) - query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 + if columns: + col_list = ", ".join(f"`{c}`" for c in sorted(columns)) + query_parts = [f"SELECT {col_list} FROM {full_table_name}"] # nosec B608 + else: + query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 # Add sampling clause (BigQuery specific) # TABLESAMPLE comes after FROM clause diff --git a/datacheck/connectors/factory.py b/datacheck/connectors/factory.py index 6d4c724..da235f5 100644 --- a/datacheck/connectors/factory.py +++ b/datacheck/connectors/factory.py @@ -105,6 +105,7 @@ def load_source_data( where: str | None = None, query: str | None = None, sample_rate: float | None = None, + limit: int | None = None, ) -> pd.DataFrame: """Load data from a source configuration. @@ -116,6 +117,7 @@ def load_source_data( where: WHERE clause (for database sources) query: Custom SQL query (for database sources) sample_rate: Sample rate (for warehouse sources that support it) + limit: Maximum rows to return (pushed down as SQL LIMIT for databases) Returns: DataFrame with loaded data @@ -125,7 +127,7 @@ def load_source_data( ConfigurationError: If source type is invalid """ if source.is_database: - return _load_from_database(source, table, where, query, sample_rate) + return _load_from_database(source, table, where, query, sample_rate, limit) if source.is_file: return _load_from_file(source, table, query) @@ -163,6 +165,7 @@ def _load_from_database( where: str | None, query: str | None, sample_rate: float | None, + limit: int | None = None, ) -> pd.DataFrame: """Load data from a database source.""" connector = create_connector(source) @@ -175,6 +178,8 @@ def _load_from_database( kwargs: dict[str, Any] = {"where": where} if sample_rate is not None and source.type in warehouse_types: kwargs["sample_rate"] = sample_rate + if limit is not None: + kwargs["limit"] = limit return connector.load_table(table, **kwargs) raise DataLoadError( f"Source '{source.name}' is a database source — " @@ -204,34 +209,6 @@ def _load_from_file( return ParquetLoader(conn["path"]).load() - if source.type in ("duckdb", "sqlite"): - from datacheck.loader import DuckDBLoader - - return DuckDBLoader( - conn["path"], - table_name=table or conn.get("table"), - query=query or conn.get("query"), - ).load() - - if source.type == "delta": - from datacheck.loader import DeltaLakeLoader - - return DeltaLakeLoader( - conn["path"], - version=conn.get("version"), - timestamp=conn.get("timestamp"), - columns=conn.get("columns"), - storage_options=conn.get("storage_options"), - ).load() - - if source.type == "avro": - from datacheck.loader import AvroLoader - - return AvroLoader( - conn["path"], - reader_schema=conn.get("reader_schema"), - ).load() - raise ConfigurationError(f"Unknown file source type: {source.type}") @@ -259,30 +236,6 @@ def _load_from_cloud(source: SourceConfig) -> pd.DataFrame: ) return s3.read_file(file_path) - if source.type == "gcs": - from datacheck.connectors.gcs import GCSConnector - - gcs = GCSConnector( - bucket=conn["bucket"], - prefix=conn.get("prefix", ""), - project=conn.get("project"), - credentials_path=conn.get("credentials_path"), - ) - return gcs.read_file(file_path) - - if source.type == "azure": - from datacheck.connectors.azure import AzureConnector - - az = AzureConnector( - container=conn["container"], - prefix=conn.get("prefix", ""), - 
account_name=conn.get("account_name"), - account_key=conn.get("account_key"), - connection_string=conn.get("connection_string"), - sas_token=conn.get("sas_token"), - ) - return az.read_file(file_path) - raise ConfigurationError(f"Unknown cloud source type: {source.type}") diff --git a/datacheck/connectors/gcs.py b/datacheck/connectors/gcs.py deleted file mode 100644 index 3b1b4da..0000000 --- a/datacheck/connectors/gcs.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Google Cloud Storage connector for DataCheck.""" -import io -import re - -import pandas as pd - -from datacheck.connectors.cloud_base import CloudConnector, CloudFile -from datacheck.exceptions import AuthenticationError, ConfigurationError, ConnectionError, DataLoadError - - -class GCSConnector(CloudConnector): - """Google Cloud Storage connector.""" - - def __init__( - self, - bucket: str, - prefix: str = "", - project: str | None = None, - credentials_path: str | None = None, - ) -> None: - """Initialize GCS connector. - - Args: - bucket: GCS bucket name - prefix: Path prefix (folder) - project: GCP project ID - credentials_path: Path to service account JSON file (optional) - """ - self.project = project - self.credentials_path = credentials_path - - super().__init__(bucket, prefix, region=project or "") - - # Initialize GCS client - self._client = self._create_client() - - def _validate_config(self) -> None: - """Validate GCS configuration.""" - if not self.bucket: - raise ConfigurationError("GCS bucket name is required") - - # Validate bucket name format - if not self._is_valid_bucket_name(self.bucket): - raise ConfigurationError(f"Invalid GCS bucket name: {self.bucket}") - - def _is_valid_bucket_name(self, name: str) -> bool: - """Validate GCS bucket name format. - - Args: - name: Bucket name to validate - - Returns: - True if valid, False otherwise - """ - # GCS bucket naming rules - # Must be 3-63 characters, lowercase, numbers, hyphens, underscores - # Must start and end with letter or number - pattern = r"^[a-z0-9][a-z0-9\-_\.]{1,61}[a-z0-9]$" - return bool(re.match(pattern, name)) - - def _create_client(self): - """Create GCS client. - - Returns: - google.cloud.storage.Client - - Raises: - AuthenticationError: If credentials not found - ConnectionError: If client creation fails - """ - try: - from google.cloud import storage - from google.auth.exceptions import DefaultCredentialsError - except ImportError: - raise DataLoadError( - "GCS connector dependencies are not installed. " - "Install with: pip install 'datacheck[gcs]'" - ) - - try: - if self.credentials_path: - # Use explicit credentials file - return storage.Client.from_service_account_json( - self.credentials_path, - project=self.project, - ) - else: - # Use Application Default Credentials - return storage.Client(project=self.project) - - except DefaultCredentialsError: - raise AuthenticationError( - "GCP credentials not found. Set GOOGLE_APPLICATION_CREDENTIALS " - "environment variable, or pass credentials_path parameter." - ) - except Exception as e: - raise ConnectionError(f"Failed to create GCS client: {e}") - - def list_files(self, pattern: str = "*") -> list[CloudFile]: - """List files in GCS bucket. 
- - Args: - pattern: Glob pattern to match files - - Returns: - List of CloudFile objects - - Raises: - ConnectionError: If bucket doesn't exist or access denied - AuthenticationError: If access is denied - """ - from google.api_core.exceptions import NotFound, Forbidden - - try: - bucket = self._client.bucket(self.bucket) - - # List all blobs with prefix - blobs = bucket.list_blobs(prefix=self.prefix) - - # Convert to CloudFile objects - files = [] - for blob in blobs: - # Skip "directory" markers (blobs ending with /) - if blob.name.endswith("/"): - continue - - files.append( - CloudFile( - path=blob.name, - size=blob.size or 0, - last_modified=blob.updated.isoformat() if blob.updated else "", - etag=blob.etag, - ) - ) - - # Filter by pattern - if pattern != "*": - file_paths = [f.path for f in files] - matched_paths = self._match_pattern(file_paths, pattern) - files = [f for f in files if f.path in matched_paths] - - return files - - except NotFound: - raise ConnectionError(f"GCS bucket does not exist: {self.bucket}") - except Forbidden: - raise AuthenticationError(f"Access denied to GCS bucket: {self.bucket}") - except Exception as e: - raise ConnectionError(f"GCS error: {e}") - - def read_csv(self, path: str, **kwargs) -> pd.DataFrame: - """Read CSV file from GCS. - - Args: - path: GCS blob name (file path) - **kwargs: Additional arguments for pd.read_csv - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - from google.api_core.exceptions import NotFound - - try: - bucket = self._client.bucket(self.bucket) - blob = bucket.blob(path) - content = blob.download_as_bytes() - return pd.read_csv(io.BytesIO(content), **kwargs) - - except NotFound: - raise FileNotFoundError(f"File not found in GCS: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read CSV from GCS: {e}") - - def read_parquet(self, path: str, **kwargs) -> pd.DataFrame: - """Read Parquet file from GCS. - - Args: - path: GCS blob name (file path) - **kwargs: Additional arguments for reading - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - try: - import pyarrow.parquet # noqa: F401 - check availability - except ImportError: - raise DataLoadError( - "pyarrow is required for reading Parquet files from GCS. " - "Install with: pip install pyarrow" - ) - - from google.api_core.exceptions import NotFound - - try: - bucket = self._client.bucket(self.bucket) - blob = bucket.blob(path) - content = blob.download_as_bytes() - return pd.read_parquet(io.BytesIO(content), **kwargs) - - except NotFound: - raise FileNotFoundError(f"File not found in GCS: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read Parquet from GCS: {e}") - - def read_json(self, path: str, **kwargs) -> pd.DataFrame: - """Read JSON file from GCS. 
- - Args: - path: GCS blob name (file path) - **kwargs: Additional arguments for pd.read_json - - Returns: - DataFrame with file contents - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If read fails - """ - from google.api_core.exceptions import NotFound - - try: - bucket = self._client.bucket(self.bucket) - blob = bucket.blob(path) - content = blob.download_as_bytes() - return pd.read_json(io.BytesIO(content), **kwargs) - - except NotFound: - raise FileNotFoundError(f"File not found in GCS: {path}") - except Exception as e: - raise ConnectionError(f"Failed to read JSON from GCS: {e}") - - def file_exists(self, path: str) -> bool: - """Check if file exists in GCS. - - Args: - path: GCS blob name (file path) - - Returns: - True if file exists, False otherwise - """ - try: - bucket = self._client.bucket(self.bucket) - blob = bucket.blob(path) - return blob.exists() - except Exception: - return False - - def load_file_size(self, path: str) -> int: - """Load file size in bytes. - - Args: - path: GCS blob name (file path) - - Returns: - File size in bytes - - Raises: - FileNotFoundError: If file doesn't exist - ConnectionError: If operation fails - """ - from google.api_core.exceptions import NotFound - - try: - bucket = self._client.bucket(self.bucket) - blob = bucket.blob(path) - blob.reload() # Fetch metadata - return blob.size or 0 - - except NotFound: - raise FileNotFoundError(f"File not found in GCS: {path}") - except Exception as e: - raise ConnectionError(f"Failed to get file size: {e}") - diff --git a/datacheck/connectors/mssql.py b/datacheck/connectors/mssql.py index 0d9973c..96aa18a 100644 --- a/datacheck/connectors/mssql.py +++ b/datacheck/connectors/mssql.py @@ -109,6 +109,7 @@ def load_table( where: str | None = None, limit: int | None = None, filters: dict[str, Any] | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from SQL Server table. @@ -119,6 +120,8 @@ def load_table( limit: Optional row limit (uses TOP in SQL Server) filters: Dictionary of column-value pairs for safe filtering. Example: {"status": "active", "age": 25} + columns: Optional set of column names to project (SELECT col1, col2). + Pass None to load all columns (SELECT *). Returns: DataFrame containing table data @@ -136,12 +139,17 @@ def load_table( params = [] # SQL Server uses TOP instead of LIMIT + top_clause = "" if limit: if not isinstance(limit, int) or limit <= 0: raise DataLoadError(f"Invalid limit: {limit}. Must be a positive integer.") - query_parts = [f"SELECT TOP {int(limit)} * FROM [{table_name}]"] # nosec B608 + top_clause = f"TOP {int(limit)} " + + if columns: + col_list = ", ".join(f"[{c}]" for c in sorted(columns)) + query_parts = [f"SELECT {top_clause}{col_list} FROM [{table_name}]"] # nosec B608 else: - query_parts = [f"SELECT * FROM [{table_name}]"] # nosec B608 + query_parts = [f"SELECT {top_clause}* FROM [{table_name}]"] # nosec B608 conditions = [] diff --git a/datacheck/connectors/mysql.py b/datacheck/connectors/mysql.py index db8218f..d533ad0 100644 --- a/datacheck/connectors/mysql.py +++ b/datacheck/connectors/mysql.py @@ -100,6 +100,7 @@ def load_table( where: str | None = None, limit: int | None = None, filters: dict[str, Any] | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from MySQL table. @@ -110,6 +111,8 @@ def load_table( limit: Optional row limit filters: Dictionary of column-value pairs for safe filtering. 
Example: {"status": "active", "age": 25} + columns: Optional set of column names to project (SELECT col1, col2). + Pass None to load all columns (SELECT *). Returns: DataFrame containing table data @@ -125,7 +128,11 @@ def load_table( try: # Build query with parameterization params = [] - query_parts = [f"SELECT * FROM `{table_name}`"] # nosec B608 + if columns: + col_list = ", ".join(f"`{c}`" for c in sorted(columns)) + query_parts = [f"SELECT {col_list} FROM `{table_name}`"] # nosec B608 + else: + query_parts = [f"SELECT * FROM `{table_name}`"] # nosec B608 conditions = [] diff --git a/datacheck/connectors/postgresql.py b/datacheck/connectors/postgresql.py index a4d6337..5a8b78f 100644 --- a/datacheck/connectors/postgresql.py +++ b/datacheck/connectors/postgresql.py @@ -79,6 +79,7 @@ def load_table( where: str | None = None, limit: int | None = None, filters: dict[str, Any] | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from PostgreSQL table. @@ -91,6 +92,8 @@ def load_table( filters: Dictionary of column-value pairs for safe filtering. Example: {"status": "active", "age": 25} This is the recommended way to filter data. + columns: Optional set of column names to project (SELECT col1, col2). + Pass None to load all columns (SELECT *). Returns: DataFrame containing table data @@ -110,7 +113,11 @@ def load_table( try: # Build query with parameterization params = [] - query_parts = [f'SELECT * FROM "{table_name}"'] # nosec B608 + if columns: + col_list = ", ".join(f'"{c}"' for c in sorted(columns)) + query_parts = [f'SELECT {col_list} FROM "{table_name}"'] # nosec B608 + else: + query_parts = [f'SELECT * FROM "{table_name}"'] # nosec B608 # Handle both where and filters (filters takes precedence) conditions = [] diff --git a/datacheck/connectors/redshift.py b/datacheck/connectors/redshift.py index 21f4869..f9dd8fe 100644 --- a/datacheck/connectors/redshift.py +++ b/datacheck/connectors/redshift.py @@ -205,6 +205,7 @@ def load_table( limit: int | None = None, schema: str | None = None, sample_rate: float | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from Redshift table. @@ -245,7 +246,11 @@ def load_table( full_table_name = table_name # Build query (table_name validated by _validate_table_name above) - query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 + if columns: + col_list = ", ".join(f'"{c}"' for c in sorted(columns)) + query_parts = [f'SELECT {col_list} FROM {full_table_name}'] # nosec B608 + else: + query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 # Build WHERE conditions conditions = [] diff --git a/datacheck/connectors/snowflake.py b/datacheck/connectors/snowflake.py index 8943ccb..b3a0889 100644 --- a/datacheck/connectors/snowflake.py +++ b/datacheck/connectors/snowflake.py @@ -152,6 +152,7 @@ def load_table( schema: str | None = None, sample_rate: float | None = None, sample_rows: int | None = None, + columns: set[str] | None = None, ) -> pd.DataFrame: """Load data from Snowflake table. 
@@ -199,7 +200,11 @@ def load_table( full_table_name = table_name # Build query (table_name validated by _validate_table_name above) - query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 + if columns: + col_list = ", ".join(f'"{c}"' for c in sorted(columns)) + query_parts = [f"SELECT {col_list} FROM {full_table_name}"] # nosec B608 + else: + query_parts = [f"SELECT * FROM {full_table_name}"] # nosec B608 # Add sampling clause (Snowflake specific) # SAMPLE clause comes before WHERE diff --git a/datacheck/engine.py b/datacheck/engine.py index 5fa5577..c7b24a7 100644 --- a/datacheck/engine.py +++ b/datacheck/engine.py @@ -11,8 +11,28 @@ from datacheck.exceptions import ConfigurationError, DataLoadError, ValidationError from datacheck.loader import LoaderFactory from datacheck.results import RuleResult, ValidationSummary -from datacheck.rules import RuleFactory, UniqueRule -from datacheck.sampling import DataSampler +from datacheck.rules import RuleFactory + + +def _collect_needed_columns(checks: list) -> set[str] | None: + """Return the set of column names needed by all checks. + + Handles multi-column rules (unique_combination, sum_equals) by + inspecting their rule parameter values. + """ + cols: set[str] = set() + for check in checks: + if check.column: + cols.add(check.column) + # Extract extra columns referenced in multi-column rule configs + for rule_type, rule_value in check.rules.items(): + if rule_type == "unique_combination" and isinstance(rule_value, list): + cols.update(str(c) for c in rule_value) + elif rule_type == "sum_equals" and isinstance(rule_value, dict): + for key in ("column_a", "column_b"): + if key in rule_value: + cols.add(str(rule_value[key])) + return cols or None class ValidationEngine: @@ -87,18 +107,6 @@ def __init__( if effective_sources_file: self._load_sources(effective_sources_file) - # Load plugins if specified - if self.config.plugins: - from datacheck.plugins.loader import PluginLoader - - loader = PluginLoader() - - for plugin_path in self.config.plugins: - try: - loader.load_from_file(plugin_path) - except Exception as e: - raise ConfigurationError(f"Failed to load plugin {plugin_path}: {e}") from e - def _load_sources(self, sources_file: str | Path) -> None: """Load named sources from a YAML file. @@ -127,13 +135,7 @@ def validate_file( Args: file_path: Path to the data file to validate - **loader_kwargs: Additional arguments passed to the data loader - May include sampling parameters: - - sample_rate: Random sample rate (0.0 to 1.0) - - sample_count: Number of rows to sample - - top: Validate only first N rows - - stratify: Column name for stratified sampling - - seed: Random seed for reproducibility + **loader_kwargs: Additional arguments passed to the data loader. 
Returns: ValidationSummary with aggregated results @@ -142,25 +144,10 @@ def validate_file( DataLoadError: If data cannot be loaded ValidationError: If validation fails unexpectedly """ - # Extract sampling parameters from loader_kwargs - sample_rate = loader_kwargs.pop("sample_rate", None) - sample_count = loader_kwargs.pop("sample_count", None) - top = loader_kwargs.pop("top", None) - stratify = loader_kwargs.pop("stratify", None) - seed = loader_kwargs.pop("seed", None) - # Advanced sampling parameters - sample_strategy = loader_kwargs.pop("sample_strategy", None) - time_column = loader_kwargs.pop("time_column", None) - start_date = loader_kwargs.pop("start_date", None) - end_date = loader_kwargs.pop("end_date", None) - error_indicators = loader_kwargs.pop("error_indicators", None) - - # Collect columns referenced by rules for Parquet column pruning + # Collect columns referenced by rules for column pruning (Parquet + CSV) file_str = str(file_path) - if file_str.endswith((".parquet", ".pq")) and "columns" not in loader_kwargs: - columns_needed = { - check.column for check in self.config.checks if check.column - } + if file_str.endswith((".parquet", ".pq", ".csv")) and "columns" not in loader_kwargs: + columns_needed = _collect_needed_columns(self.config.checks) if columns_needed: loader_kwargs["columns"] = sorted(columns_needed) @@ -172,21 +159,6 @@ def validate_file( except Exception as e: raise DataLoadError(f"Unexpected error loading data: {e}") from e - # Apply sampling (CLI arguments override config) - df = self._apply_sampling( - df, - sample_rate=sample_rate, - sample_count=sample_count, - top=top, - stratify=stratify, - seed=seed, - sample_strategy=sample_strategy, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=error_indicators, - ) - # Validate the loaded data summary = self.validate_dataframe(df) @@ -236,16 +208,7 @@ def validate_dataframe(self, df: pd.DataFrame) -> ValidationSummary: # Return early with error if rule creation fails return ValidationSummary(results=[error_result]) - # Check for UniqueRule - must disable parallel to ensure correctness - has_unique_rule = any(isinstance(rule, UniqueRule) for rule in all_rules) - use_parallel = self.parallel and len(df) > 10000 and not has_unique_rule - - if self.parallel and len(df) > 10000 and has_unique_rule: - warnings.warn( - "Parallel execution disabled for unique rule to ensure correctness.", - UserWarning, - stacklevel=2, - ) + use_parallel = self.parallel and len(df) > 10000 # Execute rules (parallel or sequential) if use_parallel: @@ -278,8 +241,8 @@ def validate_dataframe(self, df: pd.DataFrame) -> ValidationSummary: for result in results: # check_name contains the original check name check_name = result.check_name or result.rule_name - # Remove suffixes like _min, _max that factory may add - base_name = check_name.replace("_min", "").replace("_max", "") + # Remove only the trailing suffix that factory may add (_min, _max) + base_name = check_name.removesuffix("_min").removesuffix("_max") if base_name in severity_map: result.severity = severity_map[base_name] elif check_name in severity_map: @@ -297,15 +260,6 @@ def validate_sources( table: str | None = None, where: str | None = None, query: str | None = None, - sample_rate: float | None = None, - sample_count: int | None = None, - stratify: str | None = None, - seed: int | None = None, - sample_strategy: str | None = None, - time_column: str | None = None, - start_date: str | None = None, - end_date: str | None = None, - 
error_indicators: list[str] | None = None, ) -> ValidationSummary: """Validate data using named source definitions. @@ -317,15 +271,6 @@ def validate_sources( table: Table name override (overrides config table) where: WHERE clause for database sources query: Custom SQL query for database sources - sample_rate: Sample rate for database sources - sample_count: Number of rows to sample - stratify: Column for stratified sampling - seed: Random seed for reproducibility - sample_strategy: Advanced sampling strategy - time_column: Column for time-based sampling - start_date: Start date for time-based sampling - end_date: End date for time-based sampling - error_indicators: List of error indicator conditions Returns: ValidationSummary with aggregated results @@ -396,7 +341,7 @@ def validate_sources( ) if connection_errors: - raise ConfigurationError( + raise DataLoadError( "Source connectivity check failed:\n - " + "\n - ".join(connection_errors) ) @@ -418,44 +363,100 @@ def validate_sources( effective_table = check.table or default_table table_checks.setdefault(effective_table, []).append(check) - for tbl, tbl_checks in table_checks.items(): - try: - df = load_source_data( - source_config, - table=tbl, - where=where, - query=query, - sample_rate=sample_rate, - ) - except Exception as e: - # Create error results for all checks in this group - for check in tbl_checks: - all_results.append(RuleResult( - rule_name=check.name, - column=check.column, - passed=False, - total_rows=0, - error=f"Failed to load data from source '{src_name}': {e}", - )) - continue - - # Apply sampling and validate - df = self._apply_sampling( - df, - sample_rate=sample_rate, - sample_count=sample_count, - stratify=stratify, - seed=seed, - sample_strategy=sample_strategy, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=error_indicators, - ) - _total_rows += len(df) - _total_columns = max(_total_columns, len(df.columns)) - results = self._run_checks(df, tbl_checks) - all_results.extend(results) + # SQL aggregate pushdown — activates for all supported DB types + # when no custom --query is used. 
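+ # Pushable checks are translated into a single aggregate query and evaluated + # inside the database; the remaining checks fall back to loading only the + # columns they reference into pandas.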
+ from datacheck.sql_pushdown.dialects import get_dialect + _dialect = get_dialect(source_config.type) if not query else None + + if _dialect is not None: + from datacheck.connectors.factory import create_connector + from datacheck.sql_pushdown.builder import SqlAggregateBuilder + + _builder = SqlAggregateBuilder() + _connector = create_connector(source_config) + with _connector: + for tbl, tbl_checks in table_checks.items(): + try: + if tbl is None: + raise DataLoadError( + f"Source '{src_name}' is a database source — " + "either 'table' or 'query' must be specified" + ) + pushable, non_pushable = _builder.partition_checks( + tbl_checks, _dialect + ) + + # SQL pushdown — zero data transfer + if pushable: + _sql = _builder.build_query( + tbl, where, pushable, _dialect + ) + _pd_result = _connector.execute_query(_sql) + _pd_row: dict[str, Any] = {str(k): v for k, v in _pd_result.iloc[0].to_dict().items()} + _sql_results = _builder.parse_results(_pd_row, pushable) + all_results.extend(_sql_results) + if not non_pushable: + _total_rows += int(_pd_row.get("_total_rows") or 0) + + # Python path — only for non-pushable checks + if non_pushable: + if tbl: + _load_kw: dict[str, Any] = {"where": where} + _needed_cols = _collect_needed_columns(non_pushable) + if _needed_cols is not None: + _load_kw["columns"] = _needed_cols + df = _connector.load_table(tbl, **_load_kw) + else: + raise DataLoadError( + f"Source '{src_name}' is a database source — " + "either 'table' or 'query' must be specified" + ) + _total_rows += len(df) + _total_columns = max(_total_columns, len(df.columns)) + results = self._run_checks(df, non_pushable) + all_results.extend(results) + + except DataLoadError: + raise + except Exception as e: + for check in tbl_checks: + all_results.append(RuleResult( + rule_name=check.name, + column=check.column, + passed=False, + total_rows=0, + error=f"Failed to load data from source '{src_name}': {e}", + )) + continue + + else: + # Unsupported DB type for pushdown, or custom --query: load-all path + for tbl, tbl_checks in table_checks.items(): + try: + df = load_source_data( + source_config, + table=tbl, + where=where, + query=query, + ) + except DataLoadError: + raise # propagate so caller maps to exit code 3 + except Exception as e: + # Create error results for non-DataLoadError failures + for check in tbl_checks: + all_results.append(RuleResult( + rule_name=check.name, + column=check.column, + passed=False, + total_rows=0, + error=f"Failed to load data from source '{src_name}': {e}", + )) + continue + + _total_rows += len(df) + _total_columns = max(_total_columns, len(df.columns)) + results = self._run_checks(df, tbl_checks) + all_results.extend(results) else: # File/cloud sources — load once, run all checks try: @@ -471,23 +472,23 @@ def validate_sources( )) continue - df = self._apply_sampling( - df, - sample_rate=sample_rate, - sample_count=sample_count, - stratify=stratify, - seed=seed, - sample_strategy=sample_strategy, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=error_indicators, - ) _total_rows += len(df) _total_columns = max(_total_columns, len(df.columns)) results = self._run_checks(df, checks) all_results.extend(results) + # Apply severity from check config to results (same as validate_dataframe) + severity_map: dict[str, str] = {} + for check_config in self.config.checks: + severity_map[check_config.name] = check_config.severity + for result in all_results: + check_name = result.check_name or result.rule_name + base_name = 
check_name.removesuffix("_min").removesuffix("_max") + if base_name in severity_map: + result.severity = severity_map[base_name] + elif check_name in severity_map: + result.severity = severity_map[check_name] + summary = ValidationSummary( results=all_results, total_rows=_total_rows, @@ -575,303 +576,6 @@ def validate( assert df is not None return self.validate_dataframe(df) - def _apply_sampling( - self, - df: pd.DataFrame, - sample_rate: float | None = None, - sample_count: int | None = None, - top: int | None = None, - stratify: str | None = None, - seed: int | None = None, - sample_strategy: str | None = None, - time_column: str | None = None, - start_date: str | None = None, - end_date: str | None = None, - error_indicators: list[str] | None = None, - ) -> pd.DataFrame: - """Apply sampling to DataFrame. - - CLI arguments take precedence over config file settings. - - Args: - df: DataFrame to sample - sample_rate: Random sample rate (CLI argument) - sample_count: Number of rows to sample (CLI argument) - top: First N rows (CLI argument) - stratify: Column for stratified sampling (CLI argument) - seed: Random seed (CLI argument) - sample_strategy: Advanced sampling strategy (CLI argument) - time_column: Column for time-based sampling (CLI argument) - start_date: Start date for time-based sampling (CLI argument) - end_date: End date for time-based sampling (CLI argument) - error_indicators: List of error indicator conditions (CLI argument) - - Returns: - Sampled DataFrame (or original if no sampling configured) - - Raises: - DataLoadError: If sampling configuration is invalid - """ - # Check if advanced sampling strategy specified - if sample_strategy is not None: - return self._apply_advanced_sampling( - df, - sample_strategy=sample_strategy, - sample_count=sample_count, - stratify=stratify, - time_column=time_column, - start_date=start_date, - end_date=end_date, - error_indicators=error_indicators, - seed=seed, - ) - - # Check if any CLI sampling arguments provided - has_cli_sampling = any([ - sample_rate is not None, - sample_count is not None, - top is not None, - stratify is not None, - ]) - - # If CLI arguments provided, use them (override config) - if has_cli_sampling: - # Top-N sampling - if top is not None: - return DataSampler.top_n(df, top) - - # Stratified sampling - if stratify is not None: - if sample_count is None: - raise DataLoadError("--stratify requires --sample-count") - return DataSampler.stratified_sample(df, stratify, sample_count, seed=seed) - - # Random sampling - if sample_rate is not None or sample_count is not None: - return DataSampler.random_sample(df, rate=sample_rate, count=sample_count, seed=seed) - - # Otherwise, use config file sampling - return self._apply_config_sampling(df) - - def _apply_advanced_sampling( - self, - df: pd.DataFrame, - sample_strategy: str, - sample_count: int | None = None, - stratify: str | None = None, - time_column: str | None = None, - start_date: str | None = None, - end_date: str | None = None, - error_indicators: list[str] | None = None, - seed: int | None = None, - ) -> pd.DataFrame: - """Apply advanced sampling strategy. 
- - Args: - df: DataFrame to sample - sample_strategy: Strategy name (random, stratified, time_based, error_focused, adaptive, reservoir) - sample_count: Number of rows to sample - stratify: Column for stratified sampling - time_column: Column for time-based sampling - start_date: Start date for time-based sampling - end_date: End date for time-based sampling - error_indicators: List of error indicator conditions - seed: Random seed for reproducibility - - Returns: - Sampled DataFrame - - Raises: - DataLoadError: If required parameters are missing - """ - from datacheck.sampling import SamplerFactory, SamplingStrategy - - try: - strategy = SamplingStrategy(sample_strategy.lower()) - except ValueError: - valid_strategies = [s.value for s in SamplingStrategy] - raise DataLoadError( - f"Invalid sampling strategy: '{sample_strategy}'. " - f"Valid options: {', '.join(valid_strategies)}" - ) - - # Create sampler using factory - sampler = SamplerFactory.create(strategy) - - # Configure and sample based on strategy - if strategy == SamplingStrategy.RANDOM: - if sample_count is None: - sample_count = min(10000, len(df)) - return sampler.sample(df, n=sample_count, seed=seed) - - elif strategy == SamplingStrategy.STRATIFIED: - if stratify is None: - raise DataLoadError("--stratify column required for stratified sampling") - if sample_count is None: - sample_count = min(10000, len(df)) - return sampler.sample(df, n=sample_count, stratify_column=stratify, seed=seed) - - elif strategy == SamplingStrategy.TIME_BASED: - if time_column is None: - raise DataLoadError("--time-column required for time_based sampling") - return sampler.sample( - df, - time_column=time_column, - start_date=start_date, - end_date=end_date, - n=sample_count, - seed=seed, - ) - - elif strategy == SamplingStrategy.ERROR_FOCUSED: - if error_indicators is None: - raise DataLoadError( - "--error-indicators required for error_focused sampling. " - "Example: 'age<0,price>10000'" - ) - if sample_count is None: - sample_count = min(10000, len(df)) - return sampler.sample( - df, - n=sample_count, - error_indicators=error_indicators, - seed=seed, - ) - - elif strategy == SamplingStrategy.ADAPTIVE: - if sample_count is None: - sample_count = min(10000, len(df)) - return sampler.sample( - df, - n=sample_count, - error_indicators=error_indicators, - seed=seed, - ) - - elif strategy == SamplingStrategy.RESERVOIR: - if sample_count is None: - sample_count = min(10000, len(df)) - return sampler.sample(df, k=sample_count, seed=seed) - - return df - - def _apply_config_sampling(self, df: pd.DataFrame) -> pd.DataFrame: - """Apply sampling from config file. - - Supports all sampling methods: none, random, stratified, top, systematic, - time_based, error_focused, adaptive, reservoir. 
- - Args: - df: DataFrame to sample - - Returns: - Sampled DataFrame (or original if no sampling configured) - - Raises: - DataLoadError: If sampling configuration is invalid - """ - if self.config.sampling is None: - return df - - sampling_config = self.config.sampling - - # No sampling - if sampling_config.method == "none": - return df - - # Top-N sampling - if sampling_config.method == "top": - if sampling_config.count is None: - raise DataLoadError("Top-N sampling requires 'count' in config") - return DataSampler.top_n(df, sampling_config.count) - - # Stratified sampling - if sampling_config.method == "stratified": - if sampling_config.stratify_by is None: - raise DataLoadError("Stratified sampling requires 'stratify_by' in config") - if sampling_config.count is None: - raise DataLoadError("Stratified sampling requires 'count' in config") - return DataSampler.stratified_sample( - df, - sampling_config.stratify_by, - sampling_config.count, - seed=sampling_config.seed - ) - - # Random sampling - if sampling_config.method == "random": - return DataSampler.random_sample( - df, - rate=sampling_config.rate, - count=sampling_config.count, - seed=sampling_config.seed - ) - - # Systematic sampling - if sampling_config.method == "systematic": - # Use interval if provided, otherwise calculate from rate or use default - if sampling_config.interval is not None: - interval = sampling_config.interval - elif sampling_config.rate is not None and sampling_config.rate > 0: - interval = int(1.0 / sampling_config.rate) - else: - # Default to every 10th row - interval = 10 - return DataSampler.systematic_sample( - df, interval=interval, start=sampling_config.start - ) - - # Advanced sampling methods - use SamplerFactory - from datacheck.sampling import SamplerFactory, SamplingStrategy - - # Time-based sampling - if sampling_config.method == "time_based": - if sampling_config.time_column is None: - raise DataLoadError("Time-based sampling requires 'time_column' in config") - sampler = SamplerFactory.create(SamplingStrategy.TIME_BASED) - return sampler.sample( - df, - time_column=sampling_config.time_column, - start_date=sampling_config.start_date, - end_date=sampling_config.end_date, - n=sampling_config.count, - seed=sampling_config.seed, - ) - - # Error-focused sampling - if sampling_config.method == "error_focused": - if sampling_config.error_indicators is None: - raise DataLoadError( - "Error-focused sampling requires 'error_indicators' in config" - ) - sampler = SamplerFactory.create(SamplingStrategy.ERROR_FOCUSED) - sample_count = sampling_config.count or min(10000, len(df)) - return sampler.sample( - df, - n=sample_count, - error_indicators=sampling_config.error_indicators, - seed=sampling_config.seed, - ) - - # Adaptive sampling - if sampling_config.method == "adaptive": - sampler = SamplerFactory.create(SamplingStrategy.ADAPTIVE) - sample_count = sampling_config.count or min(10000, len(df)) - return sampler.sample( - df, - n=sample_count, - error_indicators=sampling_config.error_indicators, - seed=sampling_config.seed, - ) - - # Reservoir sampling - if sampling_config.method == "reservoir": - if sampling_config.count is None: - raise DataLoadError("Reservoir sampling requires 'count' in config") - sampler = SamplerFactory.create(SamplingStrategy.RESERVOIR) - return sampler.sample(df, k=sampling_config.count, seed=sampling_config.seed) - - return df __all__ = [ diff --git a/datacheck/loader.py b/datacheck/loader.py index 2a995ec..1a4b954 100644 --- a/datacheck/loader.py +++ b/datacheck/loader.py @@ 
-1,7 +1,5 @@ """Data loaders for various formats.""" -import re -import sqlite3 from abc import ABC, abstractmethod from pathlib import Path from typing import TYPE_CHECKING, Any @@ -13,30 +11,6 @@ if TYPE_CHECKING: from datacheck.connectors.base import DatabaseConnector -# Optional DuckDB import -try: - import duckdb - - HAS_DUCKDB = True -except ImportError: - HAS_DUCKDB = False - -# Optional Delta Lake import -try: - import deltalake - - HAS_DELTALAKE = True -except ImportError: - HAS_DELTALAKE = False - -# Optional Avro import -try: - import fastavro - - HAS_FASTAVRO = True -except ImportError: - HAS_FASTAVRO = False - class DataLoader(ABC): """Abstract base class for data loaders. @@ -94,6 +68,7 @@ def __init__( file_path: str | Path, encoding: str | None = None, delimiter: str = ",", + columns: list[str] | None = None, **kwargs: Any, ) -> None: """Initialize CSV loader. @@ -102,11 +77,13 @@ def __init__( file_path: Path to the CSV file encoding: File encoding (auto-detected if None) delimiter: CSV delimiter character + columns: Column subset to load (None = all columns) **kwargs: Additional arguments passed to pandas.read_csv """ super().__init__(file_path) self.encoding = encoding self.delimiter = delimiter + self.columns = columns self.kwargs = kwargs def _detect_encoding(self) -> str: @@ -144,23 +121,26 @@ def load(self) -> pd.DataFrame: """ try: encoding = self._detect_encoding() + usecols_kwarg = {"usecols": self.columns} if self.columns is not None else {} try: # Use PyArrow engine for faster CSV parsing + Arrow-backed dtypes - df: pd.DataFrame = pd.read_csv( + df: pd.DataFrame = pd.read_csv( # type: ignore[call-overload] self.file_path, encoding=encoding, delimiter=self.delimiter, dtype_backend="pyarrow", engine="pyarrow", + **usecols_kwarg, **self.kwargs, ) except Exception: # Fallback to default engine for exotic encodings or edge cases - df = pd.read_csv( + df = pd.read_csv( # type: ignore[call-overload] self.file_path, encoding=encoding, delimiter=self.delimiter, dtype_backend="pyarrow", + **usecols_kwarg, **self.kwargs, ) self._validate_dataframe(df) @@ -216,413 +196,6 @@ def load(self) -> pd.DataFrame: raise DataLoadError(f"Error loading Parquet file {self.file_path}: {e}") from e -class DuckDBLoader(DataLoader): - """Loader for DuckDB and SQLite database files.""" - - # Valid pattern for table names (alphanumeric, underscore, dot for schema.table) - _TABLE_NAME_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$") - - def __init__( - self, file_path: str | Path, table_name: str | None = None, query: str | None = None - ) -> None: - """Initialize DuckDB/SQLite loader. - - Args: - file_path: Path to the database file - table_name: Name of table to load (if query not provided) - query: SQL query to execute (takes precedence over table_name) - - Raises: - DataLoadError: If neither table_name nor query is provided - DataLoadError: If table_name contains invalid characters - """ - super().__init__(file_path) - if not table_name and not query: - raise DataLoadError("Either table_name or query must be provided") - # Validate table_name to prevent SQL injection - if table_name and not self._TABLE_NAME_PATTERN.match(table_name): - raise DataLoadError( - f"Invalid table name: {table_name}. " - "Table names must be alphanumeric with underscores, optionally with schema prefix." - ) - self.table_name = table_name - self.query = query - - def _is_sqlite(self) -> bool: - """Check if file is SQLite database. 
- - Returns: - True if file is SQLite, False otherwise - """ - try: - with open(self.file_path, "rb") as f: - header = f.read(16) - return header[:6] == b"SQLite" - except Exception: - return False - - def _build_query(self) -> str: - """Build SQL query from table name or use provided query. - - Returns: - SQL query string - """ - if self.query: - return self.query - return f'SELECT * FROM "{self.table_name}"' # nosec B608 - - def load(self) -> pd.DataFrame: - """Load data from database into DataFrame. - - Returns: - DataFrame containing database data - - Raises: - DataLoadError: If database cannot be loaded or DuckDB is not installed - EmptyDatasetError: If query returns no data - """ - query = self._build_query() - - try: - if self._is_sqlite(): - # Use sqlite3 for SQLite files - sqlite_conn = sqlite3.connect(str(self.file_path)) - try: - df = pd.read_sql_query(query, sqlite_conn, dtype_backend="pyarrow") - finally: - sqlite_conn.close() - else: - # Use DuckDB for DuckDB files - if not HAS_DUCKDB: - raise DataLoadError( - "DuckDB is not installed. Install it with: pip install 'datacheck[duckdb]'" - ) - duckdb_conn = duckdb.connect(str(self.file_path), read_only=True) - try: - df = duckdb_conn.execute(query).fetchdf() - finally: - duckdb_conn.close() - - self._validate_dataframe(df) - return df - - except EmptyDatasetError: - raise - except Exception as e: - raise DataLoadError( - f"Error loading database file {self.file_path}: {e}" - ) from e - - -class DeltaLakeLoader: - """Loader for Delta Lake tables with time travel and cloud storage support. - - Delta Lake is a directory-based format, so this loader works with table paths - rather than individual files. Supports reading from local filesystem, S3, GCS, - and Azure Blob Storage. - - Features: - - Time travel: Load specific versions or timestamps - - Cloud storage: S3, GCS, Azure with authentication - - Column selection: Load only specified columns - - Partitioning: Efficient reads with partition pruning - - Example: - >>> loader = DeltaLakeLoader("s3://bucket/delta-table", version=5) - >>> df = loader.load() - """ - - def __init__( - self, - table_path: str | Path, - version: int | None = None, - timestamp: str | None = None, - columns: list[str] | None = None, - storage_options: dict[str, str] | None = None, - **kwargs: Any, - ) -> None: - """Initialize Delta Lake loader. - - Args: - table_path: Path to Delta table (local path or cloud URI like s3://, gs://, az://) - version: Specific version to load (time travel) - timestamp: ISO 8601 timestamp to load data as of (time travel) - columns: List of columns to load (None for all) - storage_options: Cloud storage authentication options: - - S3: {"AWS_ACCESS_KEY_ID": "...", "AWS_SECRET_ACCESS_KEY": "...", "AWS_REGION": "..."} - - GCS: {"GOOGLE_SERVICE_ACCOUNT_KEY": "..."} or {"GOOGLE_SERVICE_ACCOUNT": "..."} - - Azure: {"AZURE_STORAGE_ACCOUNT_NAME": "...", "AZURE_STORAGE_ACCOUNT_KEY": "..."} - **kwargs: Additional arguments passed to DeltaTable - - Raises: - DataLoadError: If Delta Lake is not installed - DataLoadError: If both version and timestamp are specified - """ - if not HAS_DELTALAKE: - raise DataLoadError( - "Delta Lake is not installed. 
Install it with: pip install 'datacheck[deltalake]'" - ) - - if version is not None and timestamp is not None: - raise DataLoadError("Cannot specify both 'version' and 'timestamp' for time travel") - - self.table_path = str(table_path) - self.version = version - self.timestamp = timestamp - self.columns = columns - self.storage_options = storage_options or {} - self.kwargs = kwargs - - def _is_cloud_path(self) -> bool: - """Check if table path is a cloud URI. - - Returns: - True if path is S3, GCS, or Azure URI - """ - cloud_prefixes = ("s3://", "s3a://", "gs://", "az://", "abfs://", "abfss://") - return self.table_path.startswith(cloud_prefixes) - - def _validate_path(self) -> None: - """Validate that the Delta table path exists. - - Raises: - DataLoadError: If local path doesn't exist - """ - if not self._is_cloud_path(): - path = Path(self.table_path) - if not path.exists(): - raise DataLoadError(f"Delta table not found: {self.table_path}") - if not path.is_dir(): - raise DataLoadError(f"Delta table must be a directory: {self.table_path}") - # Check for _delta_log directory - if not (path / "_delta_log").exists(): - raise DataLoadError( - f"Invalid Delta table (missing _delta_log): {self.table_path}" - ) - - def load(self) -> pd.DataFrame: - """Load Delta table into DataFrame. - - Returns: - DataFrame containing Delta table data - - Raises: - DataLoadError: If table cannot be loaded - EmptyDatasetError: If table is empty - """ - self._validate_path() - - try: - # Build DeltaTable kwargs - dt_kwargs: dict[str, Any] = {} - if self.version is not None: - dt_kwargs["version"] = self.version - if self.storage_options: - dt_kwargs["storage_options"] = self.storage_options - - # Load the Delta table - dt = deltalake.DeltaTable(self.table_path, **dt_kwargs, **self.kwargs) - - # Handle timestamp-based time travel - if self.timestamp is not None: - dt.load_as_version(self.timestamp) - - # Convert to pandas DataFrame via Arrow for better performance - try: - arrow_table = dt.to_pyarrow_table(columns=self.columns) if self.columns else dt.to_pyarrow_table() - df = arrow_table.to_pandas(types_mapper=pd.ArrowDtype) - except Exception: - # Fallback to direct conversion - if self.columns: - df = dt.to_pandas(columns=self.columns) - else: - df = dt.to_pandas() - - # Validate not empty - if df.empty: - raise EmptyDatasetError(f"Delta table is empty: {self.table_path}") - - result: pd.DataFrame = df - return result - - except EmptyDatasetError: - raise - except DataLoadError: - raise - except Exception as e: - raise DataLoadError(f"Error loading Delta table {self.table_path}: {e}") from e - - def load_metadata(self) -> dict[str, Any]: - """Load Delta table metadata. - - Returns: - Dictionary containing table metadata (version, created_time, etc.) - - Raises: - DataLoadError: If metadata cannot be retrieved - """ - self._validate_path() - - try: - dt_kwargs: dict[str, Any] = {} - if self.version is not None: - dt_kwargs["version"] = self.version - if self.storage_options: - dt_kwargs["storage_options"] = self.storage_options - - dt = deltalake.DeltaTable(self.table_path, **dt_kwargs) - - return { - "version": dt.version(), - "file_uris": dt.file_uris(), - "schema": str(dt.schema().to_arrow()), - "metadata": dt.metadata(), - "protocol": dt.protocol(), - } - except Exception as e: - raise DataLoadError(f"Error getting Delta metadata {self.table_path}: {e}") from e - - def history(self, limit: int | None = None) -> list[dict[str, Any]]: - """Get Delta table history. 
- - Args: - limit: Maximum number of history entries to return - - Returns: - List of history entries (version, timestamp, operation, etc.) - - Raises: - DataLoadError: If history cannot be retrieved - """ - self._validate_path() - - try: - dt_kwargs: dict[str, Any] = {} - if self.storage_options: - dt_kwargs["storage_options"] = self.storage_options - - dt = deltalake.DeltaTable(self.table_path, **dt_kwargs) - history: list[dict[str, Any]] = dt.history(limit=limit) - - return history - except Exception as e: - raise DataLoadError(f"Error getting Delta history {self.table_path}: {e}") from e - - -class AvroLoader(DataLoader): - """Loader for Apache Avro files. - - Supports reading Avro files with optional schema validation and handles - compressed files (deflate, snappy, etc.) automatically. - - Example: - >>> loader = AvroLoader("data.avro") - >>> df = loader.load() - """ - - def __init__( - self, - file_path: str | Path, - reader_schema: dict[str, Any] | None = None, - **kwargs: Any, - ) -> None: - """Initialize Avro loader. - - Args: - file_path: Path to the Avro file - reader_schema: Optional Avro schema for schema evolution/projection - **kwargs: Additional arguments passed to fastavro.reader - - Raises: - DataLoadError: If fastavro is not installed - """ - if not HAS_FASTAVRO: - raise DataLoadError( - "fastavro is not installed. Install it with: pip install 'datacheck[avro]'" - ) - - super().__init__(file_path) - self.reader_schema = reader_schema - self.kwargs = kwargs - - def load(self) -> pd.DataFrame: - """Load Avro file into DataFrame. - - Returns: - DataFrame containing Avro data - - Raises: - DataLoadError: If Avro file cannot be loaded - EmptyDatasetError: If Avro file is empty - """ - try: - records = [] - with open(self.file_path, "rb") as f: - reader = fastavro.reader(f, reader_schema=self.reader_schema, **self.kwargs) - for record in reader: - records.append(record) - - if not records: - raise EmptyDatasetError(f"Avro file is empty: {self.file_path}") - - df = pd.DataFrame(records).convert_dtypes(dtype_backend="pyarrow") - self._validate_dataframe(df) - return df - - except EmptyDatasetError: - raise - except Exception as e: - raise DataLoadError(f"Error loading Avro file {self.file_path}: {e}") from e - - def load_schema(self) -> dict[str, Any]: - """Load the Avro file's schema. - - Returns: - Dictionary containing the Avro schema - - Raises: - DataLoadError: If schema cannot be read - """ - try: - with open(self.file_path, "rb") as f: - reader = fastavro.reader(f) - schema = reader.writer_schema - if isinstance(schema, dict): - return dict(schema) - raise DataLoadError(f"Unexpected schema type: {type(schema)}") - except DataLoadError: - raise - except Exception as e: - raise DataLoadError(f"Error reading Avro schema {self.file_path}: {e}") from e - - def validate_schema(self, expected_schema: dict[str, Any]) -> bool: - """Validate file schema against expected schema. 
- - Args: - expected_schema: The expected Avro schema to validate against - - Returns: - True if schemas match (field names and types) - - Raises: - DataLoadError: If schema validation fails - """ - try: - actual_schema = self.load_schema() - - # Extract field info for comparison - actual_fields = { - f["name"]: f["type"] for f in actual_schema.get("fields", []) - } - expected_fields = { - f["name"]: f["type"] for f in expected_schema.get("fields", []) - } - - return actual_fields == expected_fields - except Exception as e: - raise DataLoadError(f"Error validating Avro schema: {e}") from e - - class DatabaseLoader(DataLoader): """Loader for database sources.""" @@ -692,11 +265,11 @@ class LoaderFactory: """Factory for creating appropriate data loaders based on file format.""" @staticmethod - def create_loader(source: str | Path, **kwargs: Any) -> DataLoader | DeltaLakeLoader: + def create_loader(source: str | Path, **kwargs: Any) -> DataLoader: """Create appropriate loader based on source type. Args: - source: Data source (file path, connection string, or cloud URI) + source: Data source (file path or connection string) **kwargs: Additional arguments for specific loaders Returns: @@ -704,6 +277,7 @@ def create_loader(source: str | Path, **kwargs: Any) -> DataLoader | DeltaLakeLo Raises: DataLoadError: If source type cannot be determined + UnsupportedFormatError: If file format is not supported """ source_str = str(source) @@ -716,63 +290,31 @@ def create_loader(source: str | Path, **kwargs: Any) -> DataLoader | DeltaLakeLo query=kwargs.get("query") ) - # Check if it's a Delta Lake source (delta:// protocol or cloud storage) - delta_prefixes = ("delta://", "s3://", "s3a://", "gs://", "az://", "abfs://", "abfss://") - if source_str.startswith(delta_prefixes): - # Strip delta:// prefix if present, cloud paths remain as-is - if source_str.startswith("delta://"): - table_path = source_str[8:] # Remove "delta://" - else: - table_path = source_str - - # Extract Delta-specific kwargs - delta_kwargs = { - k: v for k, v in kwargs.items() - if k in ["version", "timestamp", "columns", "storage_options"] - } - return DeltaLakeLoader(table_path, **delta_kwargs) - - # Existing file-based logic + # File-based loaders source_path = Path(source) if not source_path.exists(): raise DataLoadError(f"File not found: {source}") - # Check if it's a Delta table directory - if source_path.is_dir() and (source_path / "_delta_log").exists(): - delta_kwargs = { - k: v for k, v in kwargs.items() - if k in ["version", "timestamp", "columns", "storage_options"] - } - return DeltaLakeLoader(source_path, **delta_kwargs) - - # Must be a file for remaining loaders if not source_path.is_file(): raise DataLoadError(f"Path is not a file: {source}") ext = source_path.suffix.lower() - # Filter out database-specific and delta-specific kwargs for file loaders + # Filter out non-file-loader kwargs file_kwargs = {k: v for k, v in kwargs.items() - if k not in ["table", "where", "query", "version", "timestamp", - "columns", "storage_options", "reader_schema"]} + if k not in ["table", "where", "query", "columns"]} if ext == ".csv": - return CSVLoader(source_path, **file_kwargs) + csv_columns = kwargs.get("columns") + return CSVLoader(source_path, columns=csv_columns, **file_kwargs) elif ext in [".parquet", ".pq"]: - # Pass columns for column pruning if provided parquet_columns = kwargs.get("columns") return ParquetLoader(source_path, columns=parquet_columns, **file_kwargs) - elif ext in [".db", ".sqlite", ".sqlite3", ".duckdb"]: - return 
DuckDBLoader(source_path, **file_kwargs) - elif ext == ".avro": - avro_kwargs = {k: v for k, v in kwargs.items() - if k in ["reader_schema"]} - return AvroLoader(source_path, **avro_kwargs) else: raise UnsupportedFormatError( f"Unsupported file format: {ext}. " - f"Supported formats: .csv, .parquet, .pq, .db, .duckdb, .sqlite, .sqlite3, .avro" + f"Supported formats: .csv, .parquet, .pq" ) @staticmethod @@ -799,9 +341,6 @@ def load(file_path: str | Path, **kwargs: Any) -> pd.DataFrame: "DataLoader", "CSVLoader", "ParquetLoader", - "DuckDBLoader", - "DeltaLakeLoader", - "AvroLoader", "DatabaseLoader", "LoaderFactory", ] diff --git a/datacheck/parallel/executor.py b/datacheck/parallel/executor.py index d59a83e..e557617 100644 --- a/datacheck/parallel/executor.py +++ b/datacheck/parallel/executor.py @@ -1,7 +1,7 @@ """Parallel execution engine for DataCheck.""" -from concurrent.futures import ProcessPoolExecutor, as_completed -from multiprocessing import Pool, cpu_count +from concurrent.futures import ThreadPoolExecutor, as_completed +from multiprocessing import cpu_count from typing import Any import pandas as pd @@ -27,11 +27,12 @@ class ParallelExecutor: - """Execute validation rules in parallel across multiple CPU cores. + """Execute validation rules in parallel using threads. - Splits data into chunks and processes each chunk in parallel, - then aggregates the results. Provides significant speedup for - large datasets on multi-core systems. + Splits data into chunks and processes each chunk concurrently with + ThreadPoolExecutor (zero pickle overhead, pandas/NumPy release GIL + during C-level operations). Aggregates results across chunks. + Provides significant speedup for large datasets. Features: - Progress tracking with Rich progress bar (if available) @@ -46,14 +47,14 @@ class ParallelExecutor: def __init__( self, workers: int | None = None, - chunk_size: int = 10000, + chunk_size: int = 100_000, show_progress: bool = True, ) -> None: """Initialize parallel executor. Args: - workers: Number of worker processes (default: CPU count) - chunk_size: Rows per chunk (default: 10000) + workers: Number of worker threads (default: CPU count) + chunk_size: Rows per chunk (default: 100_000) show_progress: Show progress bar during execution (default: True) """ self.workers = workers or cpu_count() @@ -89,9 +90,11 @@ def validate_parallel( # Prepare work items (chunk, rules pairs) work_items = [(chunk, rules) for chunk in chunks] - # Execute in parallel - with Pool(self.workers) as pool: - chunk_results = pool.starmap(self._validate_chunk, work_items) + # Execute in parallel using threads (zero pickle overhead) + with ThreadPoolExecutor(max_workers=self.workers) as executor: + chunk_results = list( + executor.map(lambda item: self._validate_chunk(*item), work_items) + ) # Aggregate results across chunks aggregated_results = self._aggregate_results(chunk_results, len(df)) @@ -126,7 +129,7 @@ def _validate_with_progress( ) as progress: task = progress.add_task("Validating", total=total) - with ProcessPoolExecutor(max_workers=self.workers) as executor: + with ThreadPoolExecutor(max_workers=self.workers) as executor: # Submit all tasks future_to_idx = { executor.submit(self._validate_chunk, chunk, rules): i @@ -175,8 +178,6 @@ def _chunk_dataframe(self, df: pd.DataFrame) -> list[pd.DataFrame]: def _validate_chunk(chunk: pd.DataFrame, rules: list[Any]) -> list[RuleResult]: """Validate a single chunk. - This is a static method so it can be pickled for multiprocessing. 
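The executor hunk above moves chunked validation from a multiprocessing Pool to ThreadPoolExecutor. A minimal standalone sketch of that pattern, assuming a trivial null-count check as a stand-in for DataCheck's real rule objects:

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

import pandas as pd


def null_count_rule(chunk: pd.DataFrame) -> int:
    # Stand-in "rule": pandas does this counting in C and releases the GIL,
    # which is why threads still parallelize well despite zero pickling.
    return int(chunk.isna().sum().sum())


def validate_parallel(df: pd.DataFrame, chunk_size: int = 100_000) -> int:
    # Split into row chunks, run each chunk in a worker thread, then aggregate.
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        per_chunk = list(executor.map(null_count_rule, chunks))
    return sum(per_chunk)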
- Args: chunk: DataFrame chunk to validate rules: List of validation rules diff --git a/datacheck/plugins/__init__.py b/datacheck/plugins/__init__.py deleted file mode 100644 index 57905ad..0000000 --- a/datacheck/plugins/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Custom rule plugin system for DataCheck.""" - -from datacheck.plugins.decorators import custom_rule, validate_custom_rule_signature -from datacheck.plugins.loader import PluginLoader -from datacheck.plugins.registry import RuleRegistry, get_global_registry - -__all__ = [ - "custom_rule", - "validate_custom_rule_signature", - "RuleRegistry", - "get_global_registry", - "PluginLoader", -] diff --git a/datacheck/plugins/decorators.py b/datacheck/plugins/decorators.py deleted file mode 100644 index 9e93c41..0000000 --- a/datacheck/plugins/decorators.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Decorators for custom validation rules.""" - -import functools -from collections.abc import Callable -from typing import Any - - -def custom_rule(func: Callable) -> Callable: - """Decorator to mark a function as a custom validation rule. - - Custom rules must accept a pandas Series as the first parameter and return - a pandas Series of boolean values (True = valid, False = invalid). - - Example: - >>> @custom_rule - ... def is_business_email(column: pd.Series, allowed_domains: list) -> pd.Series: - ... return column.str.endswith(tuple(allowed_domains)) - - >>> # Use in config - >>> checks: - ... - name: email_check - ... column: email - ... rules: - ... custom: - ... rule: is_business_email - ... params: - ... allowed_domains: ["company.com"] - - Args: - func: Function to be marked as a custom rule - - Returns: - Decorated function with metadata - """ - @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any) -> Any: - """Invoke the decorated custom rule function.""" - return func(*args, **kwargs) - - # Mark function as custom rule - wrapper._is_custom_rule = True # type: ignore - wrapper._rule_name = func.__name__ # type: ignore - wrapper._original_func = func # type: ignore - - return wrapper - - -def validate_custom_rule_signature(func: Callable) -> bool: - """Validate that a custom rule has the correct signature. - - Custom rules must: - - Accept a pandas Series as first parameter - - Return a pandas Series of booleans - - Accept **kwargs for additional parameters - - Args: - func: Function to validate - - Returns: - True if signature is valid - - Raises: - ValueError: If signature is invalid - """ - import inspect - - sig = inspect.signature(func) - params = list(sig.parameters.values()) - - if len(params) < 1: - raise ValueError( - f"Custom rule '{func.__name__}' must accept at least one parameter (column)" - ) - - # First parameter should be the column (Series) - first_param = params[0] - if first_param.annotation != inspect.Parameter.empty: - import pandas as pd - if first_param.annotation != pd.Series: - raise ValueError( - f"Custom rule '{func.__name__}' first parameter should be pd.Series" - ) - - return True diff --git a/datacheck/plugins/loader.py b/datacheck/plugins/loader.py deleted file mode 100644 index 8cc54fc..0000000 --- a/datacheck/plugins/loader.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Plugin loader for custom validation rules.""" - -import importlib.util -import inspect -import sys -from pathlib import Path -from typing import Any - -from datacheck.exceptions import ConfigurationError -from datacheck.plugins.registry import get_global_registry - - -class PluginLoader: - """Loads custom validation rules from Python files. 
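The decorator module removed above documents the custom-rule contract: the function takes a pandas Series as its first parameter and returns a boolean Series (True = valid). A minimal sketch of a rule meeting that contract; the domain list and sample data are illustrative only:

import pandas as pd


def is_business_email(column: pd.Series, allowed_domains: list[str]) -> pd.Series:
    # Contract from the removed plugin docs: Series in, boolean Series out.
    # na=False marks missing values as invalid rather than propagating NA.
    return column.astype("string").str.endswith(tuple(allowed_domains), na=False)


emails = pd.Series(["a@company.com", "b@gmail.com", None])
mask = is_business_email(emails, ["company.com"])
# mask -> True, False, False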
- - The loader scans Python files for functions decorated with @custom_rule - and registers them in the global rule registry. - - Example: - >>> loader = PluginLoader() - >>> loader.load_from_file("my_rules.py") - >>> # Rules from my_rules.py are now available - """ - - def __init__(self) -> None: - """Initialize plugin loader.""" - self.registry = get_global_registry() - self._loaded_modules: list[str] = [] - - def load_from_file(self, file_path: str) -> list[str]: - """Load custom rules from a Python file. - - Args: - file_path: Path to Python file containing custom rules - - Returns: - List of rule names that were loaded - - Raises: - ConfigurationError: If file cannot be loaded - """ - path = Path(file_path) - - if not path.exists(): - raise ConfigurationError(f"Plugin file not found: {file_path}") - - if not path.suffix == ".py": - raise ConfigurationError(f"Plugin file must be a Python file: {file_path}") - - try: - # Load module from file - module_name = f"datacheck_plugin_{path.stem}" - spec = importlib.util.spec_from_file_location(module_name, path) - - if spec is None or spec.loader is None: - raise ConfigurationError(f"Failed to load plugin: {file_path}") - - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - - # Find and register custom rules - loaded_rules = self._register_rules_from_module(module) - self._loaded_modules.append(module_name) - - return loaded_rules - - except Exception as e: - raise ConfigurationError(f"Error loading plugin {file_path}: {e}") from e - - def _register_rules_from_module(self, module: Any) -> list[str]: - """Register all custom rules from a module. - - Args: - module: Python module to scan for rules - - Returns: - List of registered rule names - """ - loaded_rules = [] - - for name, obj in inspect.getmembers(module): - if callable(obj) and hasattr(obj, "_is_custom_rule"): - rule_name = getattr(obj, "_rule_name", name) - - # Register rule if not already registered - if not self.registry.has_rule(rule_name): - self.registry.register(rule_name, obj) - loaded_rules.append(rule_name) - - return loaded_rules - - def load_from_directory(self, directory_path: str) -> list[str]: - """Load all custom rules from a directory. - - Args: - directory_path: Path to directory containing Python files - - Returns: - List of all loaded rule names - - Raises: - ConfigurationError: If directory cannot be accessed - """ - dir_path = Path(directory_path) - - if not dir_path.exists(): - raise ConfigurationError(f"Plugin directory not found: {directory_path}") - - if not dir_path.is_dir(): - raise ConfigurationError(f"Path is not a directory: {directory_path}") - - all_loaded_rules = [] - - # Load all .py files in directory - for py_file in dir_path.glob("*.py"): - if py_file.name.startswith("_"): - continue # Skip private files - - loaded_rules = self.load_from_file(str(py_file)) - all_loaded_rules.extend(loaded_rules) - - return all_loaded_rules diff --git a/datacheck/plugins/registry.py b/datacheck/plugins/registry.py deleted file mode 100644 index 3b789fa..0000000 --- a/datacheck/plugins/registry.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Registry for custom validation rules.""" - -from collections.abc import Callable -from typing import Any - -import pandas as pd - -from datacheck.exceptions import RuleDefinitionError - - -class RuleRegistry: - """Registry for storing and retrieving custom validation rules. 
- - The registry maintains a mapping of rule names to their implementations, - allowing custom rules to be loaded and executed dynamically. - - Example: - >>> registry = RuleRegistry() - >>> registry.register("my_rule", my_rule_func) - >>> rule_func = registry.get("my_rule") - """ - - def __init__(self) -> None: - """Initialize empty rule registry.""" - self._rules: dict[str, Callable] = {} - - def register(self, name: str, func: Callable) -> None: - """Register a custom rule. - - Args: - name: Name of the rule - func: Rule function - - Raises: - RuleDefinitionError: If rule name is already registered - """ - if name in self._rules: - raise RuleDefinitionError(f"Rule '{name}' is already registered") - - self._rules[name] = func - - def get(self, name: str) -> Callable | None: - """Get a registered rule by name. - - Args: - name: Name of the rule - - Returns: - Rule function or None if not found - """ - return self._rules.get(name) - - def has_rule(self, name: str) -> bool: - """Check if a rule is registered. - - Args: - name: Name of the rule - - Returns: - True if rule exists - """ - return name in self._rules - - def list_rules(self) -> list[str]: - """List all registered rule names. - - Returns: - List of rule names - """ - return list(self._rules.keys()) - - def clear(self) -> None: - """Clear all registered rules.""" - self._rules.clear() - - def execute_rule( - self, - rule_name: str, - column: pd.Series, - params: dict[str, Any] | None = None - ) -> pd.Series: - """Execute a custom rule. - - Args: - rule_name: Name of the rule to execute - column: Column data to validate - params: Optional parameters for the rule - - Returns: - Boolean series indicating valid rows - - Raises: - RuleDefinitionError: If rule not found or execution fails - """ - rule_func = self.get(rule_name) - - if rule_func is None: - raise RuleDefinitionError(f"Custom rule '{rule_name}' not found in registry") - - try: - if params: - result: pd.Series = rule_func(column, **params) - else: - result = rule_func(column) - return result - except Exception as e: - raise RuleDefinitionError(f"Error executing custom rule '{rule_name}': {e}") from e - - -# Global registry instance -_global_registry = RuleRegistry() - - -def get_global_registry() -> RuleRegistry: - """Get the global rule registry. 
- - Returns: - Global RuleRegistry instance - """ - return _global_registry diff --git a/datacheck/profiling/__init__.py b/datacheck/profiling/__init__.py deleted file mode 100644 index 12f27bf..0000000 --- a/datacheck/profiling/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Data profiling and analysis.""" - -from datacheck.profiling.models import ColumnProfile, DatasetProfile -from datacheck.profiling.outliers import OutlierDetector, OutlierMethod -from datacheck.profiling.profiler import DataProfiler -from datacheck.profiling.quality import QualityScorer -from datacheck.profiling.statistics import StatisticsCalculator -from datacheck.profiling.suggestions import RuleSuggester - -__all__ = [ - "DataProfiler", - "ColumnProfile", - "DatasetProfile", - "StatisticsCalculator", - "OutlierDetector", - "OutlierMethod", - "QualityScorer", - "RuleSuggester", -] diff --git a/datacheck/profiling/formatters/__init__.py b/datacheck/profiling/formatters/__init__.py deleted file mode 100644 index f087504..0000000 --- a/datacheck/profiling/formatters/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Profiling output formatters.""" - -from datacheck.profiling.formatters.json_formatter import JsonFormatter -from datacheck.profiling.formatters.markdown_formatter import MarkdownFormatter -from datacheck.profiling.formatters.terminal_formatter import TerminalFormatter - -__all__ = ["TerminalFormatter", "JsonFormatter", "MarkdownFormatter"] diff --git a/datacheck/profiling/formatters/json_formatter.py b/datacheck/profiling/formatters/json_formatter.py deleted file mode 100644 index f566287..0000000 --- a/datacheck/profiling/formatters/json_formatter.py +++ /dev/null @@ -1,141 +0,0 @@ -"""JSON formatting for profiles.""" - -import json -from pathlib import Path -from typing import Any - -from datacheck.profiling.models import DatasetProfile - - -class JsonFormatter: - """Format profile results as JSON.""" - - def __init__( - self, - pretty: bool = True, - indent: int = 2, - include_suggestions: bool = True, - include_correlations: bool = True, - ): - """ - Initialize formatter. - - Args: - pretty: Whether to format with indentation - indent: Indentation level for pretty printing - include_suggestions: Include rule suggestions in output - include_correlations: Include correlation matrix in output - """ - self.pretty = pretty - self.indent = indent if pretty else None - self.include_suggestions = include_suggestions - self.include_correlations = include_correlations - - def format(self, profile: DatasetProfile) -> str: - """ - Format profile as JSON string. - - Args: - profile: DatasetProfile to format - - Returns: - JSON string - """ - data = self._profile_to_dict(profile) - return json.dumps(data, indent=self.indent, default=str) - - def save(self, profile: DatasetProfile, path: str | Path) -> None: - """ - Save profile to JSON file. - - Args: - profile: DatasetProfile to save - path: Path to output file - """ - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - - with open(path, "w") as f: - f.write(self.format(profile)) - - def _profile_to_dict(self, profile: DatasetProfile) -> dict[str, Any]: - """ - Convert profile to dictionary. 
- - Args: - profile: DatasetProfile to convert - - Returns: - Dictionary representation - """ - result = { - "name": profile.name, - "created_at": profile.created_at.isoformat(), - "summary": { - "row_count": profile.row_count, - "column_count": profile.column_count, - "overall_quality_score": profile.overall_quality_score, - "completeness_percentage": profile.completeness_percentage, - "total_nulls": profile.total_nulls, - "total_duplicates": profile.total_duplicates, - "memory_usage_mb": profile.memory_usage_mb, - }, - "columns": { - name: self._column_to_dict(col) - for name, col in profile.columns.items() - }, - } - - if self.include_correlations: - result["correlations"] = profile.correlations - - return result - - def _column_to_dict(self, col: Any) -> dict[str, Any]: - """Convert column profile to dictionary.""" - result = { - "name": col.name, - "dtype": col.dtype, - "total_count": col.total_count, - "null_count": col.null_count, - "unique_count": col.unique_count, - "duplicate_count": col.duplicate_count, - "null_percentage": col.null_percentage, - "unique_percentage": col.unique_percentage, - "completeness": col.completeness, - "quality_score": col.quality_score, - "issues": col.issues, - } - - if self.include_suggestions: - result["suggestions"] = col.suggestions - - # Add numeric stats if present - if col.min_value is not None: - result.update({ - "min_value": col.min_value, - "max_value": col.max_value, - "mean": col.mean, - "median": col.median, - "std_dev": col.std_dev, - "percentile_25": col.percentile_25, - "percentile_75": col.percentile_75, - "outlier_count": col.outlier_count, - "outlier_percentage": col.outlier_percentage, - }) - - # Add datetime stats if present - if col.min_date is not None: - result.update({ - "inferred_type": col.inferred_type, - "min_date": col.min_date, - "max_date": col.max_date, - }) - - # Add top values - if col.top_values: - result["top_values"] = [ - {"value": str(v), "count": c} for v, c in col.top_values - ] - - return result diff --git a/datacheck/profiling/formatters/markdown_formatter.py b/datacheck/profiling/formatters/markdown_formatter.py deleted file mode 100644 index dec93b6..0000000 --- a/datacheck/profiling/formatters/markdown_formatter.py +++ /dev/null @@ -1,361 +0,0 @@ -"""Markdown formatting for profiles.""" - -from pathlib import Path - -from datacheck.profiling.models import ColumnProfile, DatasetProfile -from datacheck.profiling.quality import QualityScorer - - -class MarkdownFormatter: - """Format profile results as Markdown.""" - - def __init__( - self, - include_suggestions: bool = True, - include_correlations: bool = True, - ): - """ - Initialize formatter. - - Args: - include_suggestions: Include rule suggestions in output - include_correlations: Include correlation matrix in output - """ - self.include_suggestions = include_suggestions - self.include_correlations = include_correlations - - def format(self, profile: DatasetProfile) -> str: - """ - Format profile as Markdown string. 
- - Args: - profile: DatasetProfile to format - - Returns: - Markdown string - """ - lines = [] - - # Title - lines.append(f"# Data Profile: {profile.name}") - lines.append("") - lines.append(f"*Generated: {profile.created_at.strftime('%Y-%m-%d %H:%M:%S')}*") - lines.append("") - - # Summary - lines.extend(self._format_summary(profile)) - - # Quality Overview - lines.extend(self._format_quality_overview(profile)) - - # Column Overview table - lines.extend(self._format_column_overview(profile)) - - # Column Profiles - lines.extend(self._format_columns(profile)) - - # Correlations - if self.include_correlations and profile.correlations: - lines.extend(self._format_correlations(profile)) - - # Suggestions - if self.include_suggestions: - lines.extend(self._format_suggestions(profile)) - - # Recommendations - lines.extend(self._format_recommendations(profile)) - - return "\n".join(lines) - - def save(self, profile: DatasetProfile, path: str | Path) -> None: - """ - Save profile to Markdown file. - - Args: - profile: DatasetProfile to save - path: Path to output file - """ - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - - with open(path, "w", encoding="utf-8") as f: - f.write(self.format(profile)) - - def _format_summary(self, profile: DatasetProfile) -> list[str]: - """Format summary section.""" - grade = QualityScorer.get_quality_grade(profile.overall_quality_score) - - lines = [ - "## Summary", - "", - "| Metric | Value |", - "|--------|-------|", - f"| Rows | {profile.row_count:,} |", - f"| Columns | {profile.column_count} |", - f"| Overall Quality | {profile.overall_quality_score:.1f}/100 ({grade}) |", - f"| Completeness | {profile.completeness_percentage:.1f}% |", - f"| Total Nulls | {profile.total_nulls:,} |", - f"| Duplicate Rows | {profile.total_duplicates:,} |", - f"| Memory Usage | {profile.memory_usage_mb:.2f} MB |", - "", - ] - return lines - - def _format_quality_overview(self, profile: DatasetProfile) -> list[str]: - """Format quality overview section.""" - grade = QualityScorer.get_quality_grade(profile.overall_quality_score) - - lines = [ - "## Quality Overview", - "", - f"**Overall Grade: {grade}** ({profile.overall_quality_score:.1f}/100)", - "", - ] - - # Grade distribution - grade_counts: dict[str, int] = {"A": 0, "B": 0, "C": 0, "D": 0, "F": 0} - for col in profile.columns.values(): - g = QualityScorer.get_quality_grade(col.quality_score) - grade_counts[g] = grade_counts.get(g, 0) + 1 - - dist_parts = [] - for g in ("A", "B", "C", "D", "F"): - if grade_counts[g] > 0: - dist_parts.append(f"{grade_counts[g]} {g}") - if dist_parts: - lines.append(f"**Grade distribution:** {', '.join(dist_parts)}") - lines.append("") - - # Issues summary - all_issues = [] - for col in profile.columns.values(): - for issue in col.issues: - all_issues.append(f"- **{col.name}**: {issue}") - - if all_issues: - lines.append("### Issues Detected") - lines.append("") - lines.extend(all_issues[:20]) - if len(all_issues) > 20: - lines.append(f"- *... 
and {len(all_issues) - 20} more*") - lines.append("") - else: - lines.append("*No significant quality issues detected.*") - lines.append("") - - return lines - - def _format_column_overview(self, profile: DatasetProfile) -> list[str]: - """Format column overview summary table.""" - lines = [ - "## Column Overview", - "", - "| Column | Type | Dtype | Quality | Completeness | Nulls | Unique |", - "|--------|------|-------|---------|--------------|-------|--------|", - ] - - for col in profile.columns.values(): - grade = QualityScorer.get_quality_grade(col.quality_score) - lines.append( - f"| {col.name} " - f"| {col.column_type} " - f"| {col.dtype} " - f"| {col.quality_score:.0f} [{grade}] " - f"| {col.completeness:.0f}% " - f"| {col.null_count:,} ({col.null_percentage:.0f}%) " - f"| {col.unique_count:,} ({col.unique_percentage:.0f}%) |" - ) - - lines.append("") - return lines - - def _format_columns(self, profile: DatasetProfile) -> list[str]: - """Format columns section.""" - lines = [ - "## Column Details", - "", - ] - - for col in profile.columns.values(): - lines.extend(self._format_column(col)) - - return lines - - def _format_column(self, col: ColumnProfile) -> list[str]: - """Format single column.""" - grade = QualityScorer.get_quality_grade(col.quality_score) - - lines = [ - f"### {col.name}", - "", - "| Metric | Value |", - "|--------|-------|", - f"| Type | {col.column_type} (`{col.dtype}`) |", - f"| Quality | {col.quality_score:.1f}/100 ({grade}) |", - f"| Total Count | {col.total_count:,} |", - f"| Null Count | {col.null_count:,} ({col.null_percentage:.1f}%) |", - f"| Unique Count | {col.unique_count:,} ({col.unique_percentage:.1f}%) |", - f"| Completeness | {col.completeness:.1f}% |", - ] - - # Numeric stats - if col.min_value is not None: - lines.extend([ - f"| Min | {col.min_value:.4f} |", - f"| Max | {col.max_value:.4f} |", - f"| Mean | {col.mean:.4f} |", - f"| Median | {col.median:.4f} |", - f"| Std Dev | {col.std_dev:.4f} |", - f"| Q1 (25th) | {col.percentile_25:.4f} |", - f"| Q3 (75th) | {col.percentile_75:.4f} |", - f"| Outliers | {col.outlier_count} ({col.outlier_percentage:.1f}%) |", - ]) - - # Datetime stats - if col.min_date is not None: - lines.extend([ - f"| Min Date | {col.min_date} |", - f"| Max Date | {col.max_date} |", - ]) - - # String length stats - if col.str_length_min is not None: - lines.append( - f"| String Lengths | {col.str_length_min}-{col.str_length_max} " - f"(avg {col.str_length_mean}) |" - ) - - # Date format - if col.detected_date_format is not None: - lines.append(f"| Date Format | `{col.detected_date_format}` |") - - # Weekday only - if col.weekday_only is True: - lines.append("| Weekdays Only | Yes |") - - lines.append("") - - # Top values - if col.top_values: - lines.append("**Top Values:**") - for val, count in col.top_values[:5]: - lines.append(f"- `{val}`: {count:,}") - lines.append("") - - # Issues - if col.issues: - for issue in col.issues: - lines.append(f"> Warning: {issue}") - lines.append("") - - return lines - - def _format_correlations(self, profile: DatasetProfile) -> list[str]: - """Format correlations section.""" - lines = [ - "## Correlations", - "", - ] - - cols = list(profile.correlations.keys()) - if not cols: - lines.append("*No numeric columns for correlation analysis.*") - lines.append("") - return lines - - # Create markdown table - header = "| |" + "|".join(f" {c} " for c in cols) + "|" - separator = "|---|" + "|".join(["---:" for _ in cols]) + "|" - - lines.append(header) - lines.append(separator) - - for col1 in cols: 
- row = f"| **{col1}** |" - for col2 in cols: - if col1 == col2: - row += " 1.000 |" - else: - corr = profile.correlations.get(col1, {}).get(col2, 0) - if abs(corr) >= 0.7: - row += f" **{corr:.3f}** |" - else: - row += f" {corr:.3f} |" - lines.append(row) - - lines.append("") - return lines - - def _format_suggestions(self, profile: DatasetProfile) -> list[str]: - """Format suggestions section grouped by confidence.""" - lines = [ - "## Suggested Validation Rules", - "", - ] - - has_suggestions = False - for col in profile.columns.values(): - if not col.suggestions: - continue - - has_suggestions = True - lines.append(f"### {col.name}") - lines.append("") - - by_conf: dict[str, list] = {"high": [], "medium": [], "low": []} - for s in col.suggestions: - conf = s.get("confidence", "low") - by_conf.setdefault(conf, []).append(s) - - conf_labels = { - "high": "High Confidence", - "medium": "Medium Confidence", - "low": "Low Confidence", - } - for conf_level in ("high", "medium", "low"): - suggs = by_conf.get(conf_level, []) - if not suggs: - continue - lines.append(f"#### {conf_labels[conf_level]}") - lines.append("") - for sugg in suggs: - rule = sugg["rule"] - reason = sugg.get("reason", "") - params = sugg.get("params") - if params is not None and not isinstance(params, (dict, list)): - lines.append(f"- `{rule}`: {params} — *{reason}*") - elif params is not None: - lines.append(f"- `{rule}`: {params} — *{reason}*") - else: - lines.append(f"- `{rule}` — *{reason}*") - lines.append("") - - if not has_suggestions: - lines.append("*No rule suggestions.*") - lines.append("") - - return lines - - def _format_recommendations(self, profile: DatasetProfile) -> list[str]: - """Format recommendations section.""" - recommendations = QualityScorer.recommend(profile) - if not recommendations: - return [] - - lines = [ - "## Recommendations", - "", - "| Priority | Column | Issue | Action |", - "|----------|--------|-------|--------|", - ] - - for rec in recommendations[:15]: - lines.append( - f"| {rec['priority'].upper()} " - f"| {rec['column']} " - f"| {rec['issue']} " - f"| {rec['action']} |" - ) - - lines.append("") - return lines diff --git a/datacheck/profiling/formatters/terminal_formatter.py b/datacheck/profiling/formatters/terminal_formatter.py deleted file mode 100644 index 432c45e..0000000 --- a/datacheck/profiling/formatters/terminal_formatter.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Rich terminal formatting for profiles.""" - -from rich.console import Console -from rich.panel import Panel -from rich.table import Table - -from datacheck.profiling.models import ColumnProfile, DatasetProfile -from datacheck.profiling.quality import QualityScorer - - -class TerminalFormatter: - """Format profile results for terminal display using Rich.""" - - GRADE_COLORS = {"A": "green", "B": "green", "C": "yellow", "D": "red", "F": "red"} - - def __init__( - self, - console: Console | None = None, - include_suggestions: bool = True, - include_correlations: bool = True, - ): - """ - Initialize formatter. - - Args: - console: Rich console instance - include_suggestions: Include rule suggestions in output - include_correlations: Include correlation matrix in output - """ - self.console = console or Console() - self.include_suggestions = include_suggestions - self.include_correlations = include_correlations - - def format(self, profile: DatasetProfile) -> None: - """ - Display profile in terminal. 
- - Args: - profile: DatasetProfile to display - """ - self.console.print() - - # Header panel - self.console.print( - Panel.fit( - f"[bold]Data Profile: {profile.name}[/bold]", - border_style="blue", - ) - ) - self.console.print() - - # Dataset summary - self._print_summary(profile) - - # Column overview table - self._print_column_overview(profile) - - # Detailed column profiles - self._print_column_profiles(profile) - - # Correlations - if self.include_correlations and profile.correlations: - self._print_correlations(profile) - - # Quality summary - self._print_quality_summary(profile) - - def _print_summary(self, profile: DatasetProfile) -> None: - """Print dataset summary.""" - grade = QualityScorer.get_quality_grade(profile.overall_quality_score) - grade_color = self.GRADE_COLORS.get(grade, "white") - - table = Table( - title="Dataset Summary", - show_header=False, - title_style="bold", - padding=(0, 2), - ) - table.add_column("Metric", style="cyan") - table.add_column("Value", style="white") - - table.add_row("Rows", f"{profile.row_count:,}") - table.add_row("Columns", str(profile.column_count)) - quality_str = self._format_quality_score(profile.overall_quality_score) - table.add_row("Overall Quality", f"{quality_str} [{grade_color}]{grade}[/{grade_color}]") - table.add_row("Completeness", self._progress_bar(profile.completeness_percentage)) - table.add_row("Total Nulls", f"{profile.total_nulls:,}") - table.add_row("Duplicate Rows", f"{profile.total_duplicates:,}") - table.add_row("Memory Usage", f"{profile.memory_usage_mb:.2f} MB") - - self.console.print(table) - self.console.print() - - def _print_column_overview(self, profile: DatasetProfile) -> None: - """Print compact column overview table.""" - table = Table( - title="Column Overview", - title_style="bold", - padding=(0, 1), - ) - table.add_column("Column", style="bold white") - table.add_column("Type", style="cyan") - table.add_column("Quality", justify="right") - table.add_column("Completeness", justify="right") - table.add_column("Nulls", justify="right") - table.add_column("Unique", justify="right") - table.add_column("Issues", justify="center") - - for col in profile.columns.values(): - grade = QualityScorer.get_quality_grade(col.quality_score) - grade_color = self.GRADE_COLORS.get(grade, "white") - quality_str = f"[{grade_color}]{col.quality_score:.0f} [{grade}][/{grade_color}]" - - completeness_str = f"{col.completeness:.0f}%" - if col.completeness >= 95: - completeness_str = f"[green]{completeness_str}[/green]" - elif col.completeness >= 70: - completeness_str = f"[yellow]{completeness_str}[/yellow]" - else: - completeness_str = f"[red]{completeness_str}[/red]" - - null_str = f"{col.null_count:,} ({col.null_percentage:.0f}%)" - unique_str = f"{col.unique_count:,} ({col.unique_percentage:.0f}%)" - - issue_count = len(col.issues) - issue_str = f"[yellow]{issue_count}[/yellow]" if issue_count > 0 else "[dim]-[/dim]" - - table.add_row( - col.name, - col.column_type, - quality_str, - completeness_str, - null_str, - unique_str, - issue_str, - ) - - self.console.print(table) - self.console.print() - - def _print_column_profiles(self, profile: DatasetProfile) -> None: - """Print detailed column profiles.""" - self.console.print("[bold]Column Details[/bold]\n") - - for col_profile in profile.columns.values(): - self._print_column_profile(col_profile) - - def _print_column_profile(self, col: ColumnProfile) -> None: - """Print single column profile.""" - grade = QualityScorer.get_quality_grade(col.quality_score) - grade_color = 
self.GRADE_COLORS.get(grade, "white") - quality_str = self._format_quality_score(col.quality_score) - - # Simple heading line - self.console.print( - f" [bold]{col.name}[/bold] [dim]({col.column_type}, {col.dtype})[/dim] " - f"- {quality_str} [{grade_color}]{grade}[/{grade_color}]" - ) - - # Basic stats - self.console.print(f" Nulls: {col.null_count:,} ({col.null_percentage:.1f}%) " - f"Unique: {col.unique_count:,} ({col.unique_percentage:.1f}%) " - f"Completeness: {col.completeness:.1f}%") - - # Numeric stats - if col.min_value is not None: - self.console.print( - f" Range: [{col.min_value:.2f}, {col.max_value:.2f}] " - f"Mean: {col.mean:.2f} Median: {col.median:.2f} " - f"Std: {col.std_dev:.2f} " - f"Q1/Q3: {col.percentile_25:.2f}/{col.percentile_75:.2f}" - ) - if col.outlier_count > 0: - self.console.print( - f" [yellow]Outliers: {col.outlier_count} ({col.outlier_percentage:.1f}%)[/yellow]" - ) - - # Datetime stats - if col.min_date is not None: - self.console.print(f" Date range: {col.min_date} to {col.max_date}") - - # String length stats - if col.str_length_min is not None: - self.console.print( - f" Lengths: {col.str_length_min}-{col.str_length_max} " - f"(avg {col.str_length_mean})" - ) - - # Detected date format - if col.detected_date_format is not None: - self.console.print(f" Format: {col.detected_date_format}") - - # Weekday only flag - if col.weekday_only is True: - self.console.print(" [dim]Weekdays only[/dim]") - - # Top values - if col.top_values: - top_str = ", ".join( - f"'{val}' ({count})" for val, count in col.top_values[:5] - ) - self.console.print(f" [dim]Top: {top_str}[/dim]") - - # Issues - if col.issues: - for issue in col.issues: - self.console.print(f" [yellow]! {issue}[/yellow]") - - # Suggestions — show all grouped by confidence - if self.include_suggestions and col.suggestions: - by_conf: dict[str, list] = {"high": [], "medium": [], "low": []} - for s in col.suggestions: - conf = s.get("confidence", "low") - by_conf.setdefault(conf, []).append(s) - - has_any = any(by_conf.values()) - if has_any: - self.console.print(" [bold]Suggested rules:[/bold]") - conf_styles = { - "high": ("[green]HIGH[/green]", " "), - "medium": ("[yellow]MED[/yellow]", " "), - "low": ("[dim]LOW[/dim]", " "), - } - for conf_level in ("high", "medium", "low"): - style, pad = conf_styles[conf_level] - for s in by_conf.get(conf_level, []): - params = s.get("params") - reason = s.get("reason", "") - if params is not None and not isinstance(params, (dict, list)): - rule_str = f"{s['rule']}: {params}" - else: - rule_str = s["rule"] - reason_str = f" [dim]— {reason}[/dim]" if reason else "" - self.console.print( - f" {style}{pad}{rule_str}{reason_str}" - ) - - self.console.print() - - def _print_correlations(self, profile: DatasetProfile) -> None: - """Print correlation matrix.""" - self.console.print("\n[bold]Correlations[/bold]\n") - - cols = list(profile.correlations.keys()) - if not cols: - return - - table = Table(title="Correlation Matrix", title_style="bold") - table.add_column("", style="cyan") - - for col in cols: - table.add_column(col[:15], style="white", justify="right") - - for col1 in cols: - row = [col1[:15]] - for col2 in cols: - if col1 == col2: - row.append("[dim]1.000[/dim]") - else: - corr = profile.correlations.get(col1, {}).get(col2, 0) - if abs(corr) >= 0.7: - row.append(f"[red]{corr:.3f}[/red]") - elif abs(corr) >= 0.5: - row.append(f"[yellow]{corr:.3f}[/yellow]") - else: - row.append(f"{corr:.3f}") - table.add_row(*row) - - self.console.print(table) - - def 
_print_quality_summary(self, profile: DatasetProfile) -> None: - """Print quality summary.""" - self.console.print("\n[bold]Quality Summary[/bold]\n") - - # Grade distribution - grade_counts: dict[str, int] = {"A": 0, "B": 0, "C": 0, "D": 0, "F": 0} - for col in profile.columns.values(): - g = QualityScorer.get_quality_grade(col.quality_score) - grade_counts[g] = grade_counts.get(g, 0) + 1 - - grade_parts = [] - for g in ("A", "B", "C", "D", "F"): - if grade_counts[g] > 0: - color = self.GRADE_COLORS.get(g, "white") - grade_parts.append(f"[{color}]{grade_counts[g]} {g}[/{color}]") - if grade_parts: - self.console.print(f"Grade distribution: {', '.join(grade_parts)}") - self.console.print() - - # Collect issues with column names - all_issues: list[tuple[str, str]] = [] - for col in profile.columns.values(): - for issue in col.issues: - all_issues.append((col.name, issue)) - - if all_issues: - self.console.print(f"[yellow]Issues detected: {len(all_issues)}[/yellow]") - for col_name, issue in all_issues[:10]: - self.console.print(f" [cyan]{col_name}:[/cyan] {issue}") - if len(all_issues) > 10: - self.console.print(f" [dim]... and {len(all_issues) - 10} more[/dim]") - else: - self.console.print("[green]No significant quality issues detected.[/green]") - - # Low quality columns with score breakdown - sorted_cols = sorted(profile.columns.values(), key=lambda c: c.quality_score) - low_quality = [c for c in sorted_cols if c.quality_score < 80] - - if low_quality: - self.console.print("\n[bold]Columns needing attention:[/bold]") - for col in low_quality[:5]: - grade = QualityScorer.get_quality_grade(col.quality_score) - grade_color = self.GRADE_COLORS.get(grade, "white") - self.console.print( - f" {col.name}: " - f"{self._format_quality_score(col.quality_score)} " - f"[{grade_color}]{grade}[/{grade_color}]" - ) - breakdown = QualityScorer.score_breakdown(col) - for component, info in breakdown.items(): - score = info["score"] - max_score = info["max"] - detail = info["detail"] - if score < max_score: - self.console.print( - f" [dim]{component}: {score}/{max_score} — {detail}[/dim]" - ) - - # Recommendations - recommendations = QualityScorer.recommend(profile) - if recommendations: - self.console.print("\n[bold]Recommendations:[/bold]") - priority_styles = { - "high": "[red]HIGH[/red]", - "medium": "[yellow]MED[/yellow]", - "low": "[dim]LOW[/dim]", - } - for rec in recommendations[:10]: - pstyle = priority_styles.get(rec["priority"], rec["priority"]) - self.console.print( - f" {pstyle} {rec['column']}: {rec['action']} " - f"[dim]— {rec['issue']}[/dim]" - ) - - self.console.print() - - def _format_quality_score(self, score: float) -> str: - """Format quality score with color.""" - if score >= 90: - return f"[green]{score:.1f}/100[/green]" - elif score >= 70: - return f"[yellow]{score:.1f}/100[/yellow]" - else: - return f"[red]{score:.1f}/100[/red]" - - @staticmethod - def _progress_bar(percentage: float, width: int = 15) -> str: - """Create a progress bar string.""" - pct = max(0.0, min(100.0, percentage)) - filled = int((pct / 100) * width) - empty = width - filled - - if pct >= 90: - color = "green" - elif pct >= 70: - color = "yellow" - else: - color = "red" - - bar = "#" * filled + "-" * empty - return f"[{color}]{bar}[/{color}] {pct:.1f}%" diff --git a/datacheck/profiling/models.py b/datacheck/profiling/models.py deleted file mode 100644 index dd82acf..0000000 --- a/datacheck/profiling/models.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Data models for profiling results.""" - -from dataclasses import 
dataclass, field -from datetime import datetime -from typing import Any - - -@dataclass -class ColumnProfile: - """Profile for a single column.""" - - name: str - dtype: str - - # Counts - total_count: int = 0 - null_count: int = 0 - unique_count: int = 0 - duplicate_count: int = 0 - - # Percentages - null_percentage: float = 0.0 - unique_percentage: float = 0.0 - completeness: float = 100.0 - - # Statistics (for numeric columns) - min_value: float | None = None - max_value: float | None = None - mean: float | None = None - median: float | None = None - std_dev: float | None = None - percentile_25: float | None = None - percentile_75: float | None = None - - # Distribution - top_values: list[tuple[Any, int]] = field(default_factory=list) - value_distribution: dict[str, int] = field(default_factory=dict) - - # Outliers - outlier_count: int = 0 - outlier_percentage: float = 0.0 - outliers: list[Any] = field(default_factory=list) - - # Datetime stats - inferred_type: str | None = None - min_date: str | None = None - max_date: str | None = None - - # String length stats (for string/object columns) - str_length_min: int | None = None - str_length_max: int | None = None - str_length_mean: float | None = None - - # Datetime extras - detected_date_format: str | None = None - weekday_only: bool | None = None - - # Sample non-null values (for value-based rule detection) - sample_values: list[Any] = field(default_factory=list) - - # Quality - quality_score: float = 100.0 - issues: list[str] = field(default_factory=list) - suggestions: list[dict[str, Any]] = field(default_factory=list) - - @property - def column_type(self) -> str: - """Display-friendly column type derived from inferred_type.""" - return self.inferred_type or self.dtype - - def to_dict(self) -> dict[str, Any]: - """Convert to dictionary.""" - return { - "name": self.name, - "dtype": self.dtype, - "column_type": self.column_type, - "total_count": self.total_count, - "null_count": self.null_count, - "unique_count": self.unique_count, - "duplicate_count": self.duplicate_count, - "null_percentage": self.null_percentage, - "unique_percentage": self.unique_percentage, - "completeness": self.completeness, - "min_value": self.min_value, - "max_value": self.max_value, - "mean": self.mean, - "median": self.median, - "std_dev": self.std_dev, - "percentile_25": self.percentile_25, - "percentile_75": self.percentile_75, - "top_values": [(str(v), c) for v, c in self.top_values], - "outlier_count": self.outlier_count, - "outlier_percentage": self.outlier_percentage, - "inferred_type": self.inferred_type, - "min_date": self.min_date, - "max_date": self.max_date, - "str_length_min": self.str_length_min, - "str_length_max": self.str_length_max, - "str_length_mean": self.str_length_mean, - "detected_date_format": self.detected_date_format, - "weekday_only": self.weekday_only, - "quality_score": self.quality_score, - "issues": self.issues, - "suggestions": self.suggestions, - } - - -@dataclass -class DatasetProfile: - """Profile for entire dataset.""" - - name: str - row_count: int - column_count: int - created_at: datetime = field(default_factory=datetime.now) - - # Column profiles - columns: dict[str, ColumnProfile] = field(default_factory=dict) - - # Overall quality - overall_quality_score: float = 100.0 - - # Correlations (for numeric columns) - correlations: dict[str, dict[str, float]] = field(default_factory=dict) - - # Cross-column rules (sum_equals, unique_combination, etc.) 
- cross_column_rules: list[dict[str, Any]] = field(default_factory=list) - - # Summary - total_nulls: int = 0 - total_duplicates: int = 0 - completeness_percentage: float = 100.0 - memory_usage_mb: float = 0.0 - - def to_dict(self) -> dict[str, Any]: - """Convert to dictionary.""" - return { - "name": self.name, - "row_count": self.row_count, - "column_count": self.column_count, - "created_at": self.created_at.isoformat(), - "columns": {k: v.to_dict() for k, v in self.columns.items()}, - "overall_quality_score": self.overall_quality_score, - "correlations": self.correlations, - "cross_column_rules": self.cross_column_rules, - "total_nulls": self.total_nulls, - "total_duplicates": self.total_duplicates, - "completeness_percentage": self.completeness_percentage, - "memory_usage_mb": self.memory_usage_mb, - } - - @property - def column_names(self) -> list[str]: - """Get list of column names.""" - return list(self.columns.keys()) diff --git a/datacheck/profiling/outliers.py b/datacheck/profiling/outliers.py deleted file mode 100644 index deb0e23..0000000 --- a/datacheck/profiling/outliers.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Outlier detection methods.""" - -from enum import Enum -from typing import Any - -import numpy as np -import pandas as pd - - -class OutlierMethod(Enum): - """Outlier detection methods.""" - - ZSCORE = "zscore" - IQR = "iqr" - - -class OutlierDetector: - """Detect outliers in data.""" - - @staticmethod - def detect_zscore( - series: pd.Series, - threshold: float = 3.0, - ) -> tuple[list[Any], int, float]: - """ - Detect outliers using Z-score method. - - Args: - series: Numeric pandas Series - threshold: Z-score threshold (default: 3.0) - - Returns: - Tuple of (outlier_values, outlier_count, outlier_percentage) - """ - total_count = len(series) - non_null_count = int(series.notna().sum()) - - if non_null_count == 0: - return [], 0, 0.0 - - clean_series = series.dropna() - std = clean_series.std() - if pd.isna(std) or float(std) == 0: - return [], 0, 0.0 - - mean = clean_series.mean() - - # Calculate Z-scores - z_scores = np.abs((clean_series - mean) / std) - - # Find outliers - outlier_mask = z_scores > threshold - outliers = clean_series[outlier_mask].tolist() - outlier_count = len(outliers) - outlier_percentage = (outlier_count / total_count * 100) if total_count > 0 else 0.0 - - # Limit to 100 examples - return outliers[:100], outlier_count, round(outlier_percentage, 2) - - @staticmethod - def detect_iqr( - series: pd.Series, - multiplier: float = 1.5, - ) -> tuple[list[Any], int, float]: - """ - Detect outliers using IQR (Interquartile Range) method. 
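The removed OutlierDetector's z-score method, shown above, flags values lying more than the threshold number of standard deviations from the column mean (default 3.0). A minimal standalone sketch of that computation on made-up data:

import numpy as np
import pandas as pd

s = pd.Series([10.0] * 10 + [11.0] * 10 + [300.0])
clean = s.dropna()
z = np.abs((clean - clean.mean()) / clean.std())  # same rule as detect_zscore
outliers = clean[z > 3.0].tolist()
# outliers -> [300.0]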
- - Args: - series: Numeric pandas Series - multiplier: IQR multiplier (default: 1.5) - - Returns: - Tuple of (outlier_values, outlier_count, outlier_percentage) - """ - total_count = len(series) - non_null_count = int(series.notna().sum()) - - if non_null_count == 0: - return [], 0, 0.0 - - clean_series = series.dropna() - q1 = clean_series.quantile(0.25) - q3 = clean_series.quantile(0.75) - iqr = q3 - q1 - - if pd.isna(iqr) or float(iqr) == 0: - return [], 0, 0.0 - - lower_bound = q1 - (multiplier * iqr) - upper_bound = q3 + (multiplier * iqr) - - # Find outliers - outlier_mask = (clean_series < lower_bound) | (clean_series > upper_bound) - outliers = clean_series[outlier_mask].tolist() - outlier_count = len(outliers) - outlier_percentage = (outlier_count / total_count * 100) if total_count > 0 else 0.0 - - # Limit to 100 examples - return outliers[:100], outlier_count, round(outlier_percentage, 2) - - @staticmethod - def detect( - series: pd.Series, - method: OutlierMethod = OutlierMethod.ZSCORE, - threshold: float = 3.0, - iqr_multiplier: float = 1.5, - ) -> tuple[list[Any], int, float]: - """ - Detect outliers using specified method. - - Args: - series: Numeric pandas Series - method: Detection method (ZSCORE or IQR) - threshold: Z-score threshold - iqr_multiplier: IQR multiplier - - Returns: - Tuple of (outlier_values, outlier_count, outlier_percentage) - """ - if method == OutlierMethod.ZSCORE: - return OutlierDetector.detect_zscore(series, threshold) - else: - return OutlierDetector.detect_iqr(series, iqr_multiplier) diff --git a/datacheck/profiling/profiler.py b/datacheck/profiling/profiler.py deleted file mode 100644 index e90f517..0000000 --- a/datacheck/profiling/profiler.py +++ /dev/null @@ -1,605 +0,0 @@ -"""Data profiling and quality analysis.""" - -import logging -import re -from datetime import datetime as dt - -import pandas as pd - -logger = logging.getLogger(__name__) - -from datacheck.profiling.models import ColumnProfile, DatasetProfile -from datacheck.profiling.outliers import OutlierDetector, OutlierMethod -from datacheck.profiling.quality import QualityScorer -from datacheck.profiling.statistics import StatisticsCalculator -from datacheck.profiling.suggestions import RuleSuggester - - -class DataProfiler: - """Generate comprehensive data quality profiles. - - Analyzes DataFrames to provide: - - Column types and data types - - Statistical summaries (numeric columns) - - Missing value analysis - - Cardinality and uniqueness - - Outlier detection - - Quality scoring - - Rule suggestions - - Correlation analysis - - Example: - >>> profiler = DataProfiler() - >>> profile = profiler.profile(df, name="my_data") - >>> print(profile.overall_quality_score) - >>> for col in profile.columns.values(): - ... print(f"{col.name}: {col.inferred_type}, score={col.quality_score}") - """ - - def __init__( - self, - outlier_method: OutlierMethod = OutlierMethod.ZSCORE, - outlier_threshold: float = 3.0, - iqr_multiplier: float = 1.5, - ): - """ - Initialize profiler. 
- - Args: - outlier_method: Method for outlier detection (ZSCORE or IQR) - outlier_threshold: Threshold for Z-score outlier detection - iqr_multiplier: Multiplier for IQR outlier detection - """ - self.outlier_method = outlier_method - self.outlier_threshold = outlier_threshold - self.iqr_multiplier = iqr_multiplier - self.stats_calc = StatisticsCalculator() - self.outlier_detector = OutlierDetector() - self.quality_scorer = QualityScorer() - self.rule_suggester = RuleSuggester() - - def profile(self, df: pd.DataFrame, name: str = "dataset") -> DatasetProfile: - """ - Generate comprehensive profile for DataFrame. - - Args: - df: DataFrame to profile - name: Name for the dataset - - Returns: - DatasetProfile with complete analysis - """ - # Compute memory usage once (deep=True is expensive) - memory_bytes = df.memory_usage(deep=True).sum() - - # Initialize profile - dataset_profile = DatasetProfile( - name=name, - row_count=len(df), - column_count=len(df.columns), - memory_usage_mb=round(memory_bytes / 1024 / 1024, 2), - ) - - # Profile each column - for col_name in df.columns: - col_profile = self._profile_column(df[col_name], col_name) - dataset_profile.columns[col_name] = col_profile - - # Calculate correlations for numeric columns - dataset_profile.correlations = self.stats_calc.calculate_correlation_matrix(df) - - # Calculate totals - dataset_profile.total_nulls = sum( - col.null_count for col in dataset_profile.columns.values() - ) - dataset_profile.total_duplicates = int(df.duplicated().sum()) - - total_cells = len(df) * len(df.columns) - if total_cells > 0: - dataset_profile.completeness_percentage = round( - ((total_cells - dataset_profile.total_nulls) / total_cells) * 100, 2 - ) - else: - dataset_profile.completeness_percentage = 100.0 - - # Detect cross-column rules - dataset_profile.cross_column_rules = self._detect_cross_column_rules( - df, dataset_profile - ) - - # Calculate overall quality score - dataset_profile.overall_quality_score = self.quality_scorer.score_dataset( - dataset_profile - ) - - return dataset_profile - - def _profile_column(self, series: pd.Series, col_name: str) -> ColumnProfile: - """ - Profile a single column. 
- - Args: - series: Column data - col_name: Column name - - Returns: - ColumnProfile with complete analysis - """ - # Initialize profile with basic counts - counts = self.stats_calc.calculate_basic_counts(series) - - profile = ColumnProfile( - name=col_name, - dtype=str(series.dtype), - total_count=int(counts["total_count"]), - null_count=int(counts["null_count"]), - unique_count=int(counts["unique_count"]), - duplicate_count=int(counts["duplicate_count"]), - null_percentage=counts["null_percentage"], - unique_percentage=counts["unique_percentage"], - completeness=counts["completeness"], - ) - - # Value distribution - profile.top_values, profile.value_distribution = ( - self.stats_calc.calculate_value_counts(series) - ) - - # Type-specific analysis - if pd.api.types.is_bool_dtype(series): - profile.inferred_type = "boolean" - - elif pd.api.types.is_numeric_dtype(series): - # Distinguish integer vs float types - if pd.api.types.is_integer_dtype(series): - profile.inferred_type = "integer" - elif pd.api.types.is_float_dtype(series): - # Check if all non-null values are whole numbers - # (common when nulls force int->float promotion) - non_null = series.dropna() - if len(non_null) > 0 and (non_null == non_null.astype(int)).all(): - profile.inferred_type = "integer" - else: - profile.inferred_type = "numeric" - else: - profile.inferred_type = "numeric" - stats = self.stats_calc.calculate_numeric_stats(series) - profile.min_value = stats["min"] - profile.max_value = stats["max"] - profile.mean = stats["mean"] - profile.median = stats["median"] - profile.std_dev = stats["std_dev"] - profile.percentile_25 = stats["percentile_25"] - profile.percentile_75 = stats["percentile_75"] - - # Detect outliers - outliers, count, percentage = self.outlier_detector.detect( - series, - method=self.outlier_method, - threshold=self.outlier_threshold, - iqr_multiplier=self.iqr_multiplier, - ) - profile.outliers = outliers - profile.outlier_count = count - profile.outlier_percentage = percentage - - elif ( - pd.api.types.is_datetime64_any_dtype(series) - or str(series.dtype).startswith("timestamp") - ): - profile.inferred_type = "datetime" - non_null = series.dropna() - if len(non_null) > 0: - profile.min_date = str(non_null.min()) - profile.max_date = str(non_null.max()) - - # Detect date format from string representation of the - # native timestamps so that date_format rules can be suggested - str_samples = [str(v) for v in non_null.head(20).tolist()] - profile.detected_date_format = self._detect_date_format( - str_samples - ) - - elif self._is_datetime_string_column(series): - profile.inferred_type = "datetime" - parsed = pd.to_datetime(series, errors="coerce", format="mixed").dropna() - if len(parsed) > 0: - profile.min_date = str(parsed.min()) - profile.max_date = str(parsed.max()) - - # Detect date format from sample values - non_null_vals = series.dropna().head(20).tolist() - profile.detected_date_format = self._detect_date_format(non_null_vals) - - else: - profile.inferred_type = "categorical" - - # String length analysis (for string/object columns) - dtype_str = str(series.dtype).lower() - if dtype_str in ("object", "str") or dtype_str.startswith("string"): - non_null_str = series.dropna() - if len(non_null_str) > 0: - lengths = non_null_str.astype(str).str.len() - profile.str_length_min = int(lengths.min()) - profile.str_length_max = int(lengths.max()) - profile.str_length_mean = round(float(lengths.mean()), 2) - - # Weekday analysis (for datetime columns) - if profile.inferred_type == "datetime": - try: 
- if ( - pd.api.types.is_datetime64_any_dtype(series) - or str(series.dtype).startswith("timestamp") - ): - dt_values = series.dropna() - else: - dt_values = pd.to_datetime( - series, errors="coerce", format="mixed" - ).dropna() - if len(dt_values) > 0: - profile.weekday_only = bool((dt_values.dt.dayofweek < 5).all()) - except Exception: - logger.debug("Weekday analysis failed for column '%s'", series.name) - - # Sample values (for value-based rule detection) - non_null_sample = series.dropna() - if len(non_null_sample) > 0: - profile.sample_values = non_null_sample.head(50).tolist() - - # Quality scoring - profile.quality_score = self.quality_scorer.score_column(profile) - profile.issues = self.quality_scorer.identify_issues(profile) - - # Rule suggestions - profile.suggestions = self.rule_suggester.suggest_rules(profile) - - return profile - - def _detect_cross_column_rules( - self, df: pd.DataFrame, profile: DatasetProfile - ) -> list[dict]: - """Detect cross-column relationships (sum_equals, unique_combination). - - Args: - df: Original DataFrame - profile: DatasetProfile with column profiles already computed - - Returns: - List of cross-column rule dicts. - """ - rules: list[dict] = [] - if len(df) < 2: - return rules - - # --- sum_equals detection --- - numeric_cols = [ - name for name, cp in profile.columns.items() - if cp.inferred_type in ("numeric", "integer") - ] - # Only check if manageable number of columns (<=15 numeric) - if 3 <= len(numeric_cols) <= 15: - # Prioritize columns whose names suggest totals - total_keywords = {"total", "sum", "amount", "gross", "net"} - candidate_targets = [ - c for c in numeric_cols - if any(kw in c.lower() for kw in total_keywords) - ] - # If no name-based candidates, try all if <=10 numeric cols - if not candidate_targets and len(numeric_cols) <= 10: - candidate_targets = numeric_cols - - for target in candidate_targets: - others = [c for c in numeric_cols if c != target] - for i, col_a in enumerate(others): - for col_b in others[i + 1:]: - try: - mask = ( - df[col_a].notna() - & df[col_b].notna() - & df[target].notna() - ) - valid = mask.sum() - if valid < 5: - continue - sum_ab = df.loc[mask, col_a] + df.loc[mask, col_b] - target_vals = df.loc[mask, target] - # Check within 1% tolerance - denom = target_vals.abs().replace(0, 1) - close = ((sum_ab - target_vals).abs() / denom) < 0.01 - if close.sum() / valid >= 0.95: - rules.append({ - "rule": "sum_equals", - "columns": [target, col_a, col_b], - "params": { - "column_a": col_a, - "column_b": col_b, - }, - "confidence": "high", - "reason": ( - f"{col_a} + {col_b} = {target} " - f"(verified on {valid} rows)" - ), - }) - except Exception: - logger.debug("sum_equals check failed for %s + %s = %s", col_a, col_b, target) - - # --- unique_combination detection --- - cat_cols = [ - name for name, cp in profile.columns.items() - if cp.inferred_type == "categorical" - and cp.unique_count < 50 - and cp.unique_count > 1 - ] - if 2 <= len(cat_cols) <= 10: - for i, col_a in enumerate(cat_cols): - for col_b in cat_cols[i + 1:]: - try: - combo = df[[col_a, col_b]].dropna() - if len(combo) < 5: - continue - if not combo.duplicated().any(): - rules.append({ - "rule": "unique_combination", - "columns": [col_a, col_b], - "params": [col_a, col_b], - "confidence": "medium", - "reason": ( - f"Combination of {col_a} and {col_b} " - f"is unique across {len(combo)} rows" - ), - }) - except Exception: - logger.debug("unique_combination check failed for %s, %s", col_a, col_b) - - return rules - - COMMON_DATE_FORMATS = [ 
- "%Y-%m-%d %H:%M:%S", - "%Y-%m-%dT%H:%M:%S", - "%Y-%m-%dT%H:%M:%SZ", - "%Y-%m-%d", - "%m/%d/%Y", - "%d/%m/%Y", - "%m-%d-%Y", - "%d-%m-%Y", - "%Y/%m/%d", - "%d %b %Y", - "%d %B %Y", - "%b %d, %Y", - "%B %d, %Y", - "%m/%d/%Y %H:%M:%S", - "%d/%m/%Y %H:%M:%S", - "%Y-%m-%d %H:%M", - "%Y%m%d", - "%m/%d/%y", - "%d/%m/%y", - ] - - @staticmethod - def _detect_date_format( - sample_values: list, threshold: float = 0.8 - ) -> str | None: - """Detect the most likely date format from sample string values. - - Args: - sample_values: List of string date values to analyze - threshold: Minimum fraction of values that must match (0-1) - - Returns: - Detected format string or None - """ - if not sample_values: - return None - - str_values = [ - str(v).strip() - for v in sample_values - if v is not None and str(v).strip() - ] - if not str_values: - return None - - best_format = None - best_count = 0 - - for fmt in DataProfiler.COMMON_DATE_FORMATS: - count = 0 - for val in str_values: - try: - dt.strptime(val, fmt) - count += 1 - except (ValueError, TypeError): - continue # Value doesn't match this format - if count > best_count: - best_count = count - best_format = fmt - - if best_format and best_count >= len(str_values) * threshold: - return best_format - - # Fallback: dynamically infer format from value structure - return DataProfiler._infer_date_format(str_values, threshold) - - # Regex to split date strings into numeric/alpha segments and separators - _DATE_TOKEN_RE = re.compile(r"(\d+|[A-Za-z]+|[^A-Za-z0-9]+)") - - @staticmethod - def _infer_date_format( - str_values: list[str], threshold: float = 0.8 - ) -> str | None: - """Infer a date format string dynamically from sample values. - - Tokenizes values into segments and separators, then classifies - each numeric segment as year/month/day/hour/minute/second based - on value ranges across all samples. - - Args: - str_values: Non-empty list of date string values - threshold: Minimum fraction that must parse with inferred format - - Returns: - Format string (e.g. 
"%d.%m.%Y") or None - """ - if len(str_values) < 2: - return None - - # Tokenize all values - tokenized = [DataProfiler._DATE_TOKEN_RE.findall(v) for v in str_values] - - # All values must have the same token count - token_counts: dict[int, int] = {} - for tokens in tokenized: - n = len(tokens) - token_counts[n] = token_counts.get(n, 0) + 1 - - if not token_counts: - return None - - best_count = max(token_counts, key=token_counts.get) # type: ignore[arg-type] - if token_counts[best_count] < len(str_values) * threshold: - return None - - matching = [t for t in tokenized if len(t) == best_count] - if len(matching) < 2: - return None - - # Build format by classifying each token position - fmt_parts: list[str] = [] - has_year = False - has_month = False - has_day = False - - for pos in range(best_count): - seg_values = [t[pos] for t in matching] - - # Check if this is a separator (non-alphanumeric) - if all(not c.isalnum() for s in seg_values for c in s): - # All identical separators - if len(set(seg_values)) == 1: - fmt_parts.append(seg_values[0].replace("%", "%%")) - else: - return None - continue - - # Alpha segments (month names like "Jan", "January") - if all(s.isalpha() for s in seg_values): - sample_len = len(seg_values[0]) - if sample_len == 3: - fmt_parts.append("%b") - has_month = True - elif sample_len > 3: - fmt_parts.append("%B") - has_month = True - else: - return None # Can't classify - continue - - # Numeric segments — classify by value range - if not all(s.isdigit() for s in seg_values): - return None - - int_vals = [int(s) for s in seg_values] - min_v, max_v = min(int_vals), max(int_vals) - seg_len = len(seg_values[0]) # typical length - - # Year: 4-digit or 2-digit with values suggesting years - if seg_len == 4 and min_v >= 1900 and max_v <= 2100: - fmt_parts.append("%Y") - has_year = True - elif seg_len == 2 and not has_year and min_v >= 0 and max_v <= 99 and max_v > 31: - fmt_parts.append("%y") - has_year = True - # Hour (0-23), but only after date parts are found - elif has_year and has_month and has_day and min_v >= 0 and max_v <= 23: - fmt_parts.append("%H") - # Minute/Second (0-59) - elif has_year and has_month and has_day and min_v >= 0 and max_v <= 59: - fmt_parts.append("%M" if "%M" not in "".join(fmt_parts) else "%S") - # Month (1-12) - elif not has_month and min_v >= 1 and max_v <= 12: - fmt_parts.append("%m") - has_month = True - # Day (1-31) - elif not has_day and min_v >= 1 and max_v <= 31: - fmt_parts.append("%d") - has_day = True - else: - return None - - # Must have at least year, month, and day - fmt_str = "".join(fmt_parts) - if not (has_year and has_month and has_day): - return None - - # Validate: inferred format must parse >= threshold of original values - parse_count = 0 - for val in str_values: - try: - dt.strptime(val, fmt_str) - parse_count += 1 - except (ValueError, TypeError): - continue - if parse_count < len(str_values) * threshold: - return None - - return fmt_str - - @staticmethod - def _is_datetime_string_column( - series: pd.Series, - sample_size: int = 20, - threshold: float = 0.8, - ) -> bool: - """Check if an object-dtype column contains datetime strings. - - Samples non-null values and attempts to parse them as datetimes. 
- - Args: - series: Column data (expected to be object dtype) - sample_size: Number of non-null values to sample - threshold: Minimum fraction of successfully parsed values (0-1) - - Returns: - True if column likely contains datetime strings - """ - if series.dtype != "object" and not pd.api.types.is_string_dtype(series): - return False - - non_null = series.dropna() - if len(non_null) < 2: - return False - - sample = non_null.head(sample_size) - - # Reject version-like patterns (e.g., "5.0.48", "3.12.13") - # but NOT dot-separated dates (e.g., "15.01.2024", "2024.01.15") - version_pattern = re.compile(r"^\d{1,4}\.\d{1,4}\.\d{1,4}$") - version_matches = sum(1 for v in sample if version_pattern.match(str(v))) - if version_matches / len(sample) > 0.5: - # Before rejecting, check if these are actually dot-separated dates - date_formats_with_dots = ["%d.%m.%Y", "%m.%d.%Y", "%Y.%m.%d"] - dot_vals = [str(v) for v in sample if version_pattern.match(str(v))] - is_date = False - for fmt in date_formats_with_dots: - parsed_count = 0 - for v in dot_vals: - try: - dt.strptime(v, fmt) - parsed_count += 1 - except (ValueError, TypeError): - continue - if parsed_count >= len(dot_vals) * 0.8: - is_date = True - break - if not is_date: - return False - - try: - parsed = pd.to_datetime(sample, errors="coerce", format="mixed") - success_rate = parsed.notna().sum() / len(sample) - return bool(success_rate >= threshold) - except Exception: - return False - - -__all__ = ["DataProfiler"] diff --git a/datacheck/profiling/quality.py b/datacheck/profiling/quality.py deleted file mode 100644 index 2dadcc2..0000000 --- a/datacheck/profiling/quality.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Data quality scoring.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from datacheck.profiling.models import ColumnProfile, DatasetProfile - - -class QualityScorer: - """Calculate data quality scores.""" - - @staticmethod - def score_column(profile: ColumnProfile) -> float: - """ - Calculate quality score for a column (0-100). - - Scoring criteria: - - Completeness (no nulls): 0-40 points - - No outliers: 0-20 points - - Data validity: 0-20 points - - Consistency: 0-20 points - - Args: - profile: ColumnProfile to score - - Returns: - Quality score 0-100 - """ - score = 100.0 - - # Penalize for null values (up to -40 points) - # 0% nulls = 0 penalty, 100% nulls = -40 penalty - null_penalty = profile.null_percentage * 0.4 - score -= null_penalty - - # Penalize for outliers (up to -20 points) - # 0% outliers = 0 penalty, 10%+ outliers = -20 penalty - outlier_penalty = min(profile.outlier_percentage * 2, 20) - score -= outlier_penalty - - # Penalize for constant values (all same value) - if profile.unique_count == 1 and profile.total_count > 1: - score -= 10 - - # Bonus for high uniqueness in ID-like columns - if "id" in profile.name.lower(): - if profile.unique_percentage >= 99: - score = min(score + 5, 100) - elif profile.unique_percentage < 95: - score -= 10 # IDs should be mostly unique - - return max(0.0, round(score, 1)) - - @staticmethod - def score_dataset(profile: DatasetProfile) -> float: - """ - Calculate overall dataset quality score. 
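To make the penalty arithmetic of score_column concrete, a small worked example with invented numbers:

    # Hypothetical column: 10% nulls, 2% outliers, non-constant, not an ID column
    score = 100.0
    score -= 10 * 0.4        # null penalty, -4.0 -> 96.0
    score -= min(2 * 2, 20)  # outlier penalty (capped at 20), -4.0 -> 92.0
    # resulting quality_score: 92.0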
- - Args: - profile: DatasetProfile to score - - Returns: - Overall quality score 0-100 - """ - if not profile.columns: - return 100.0 - - column_scores = [col.quality_score for col in profile.columns.values()] - avg_score = sum(column_scores) / len(column_scores) - - # Additional dataset-level penalties - dataset_penalty = 0.0 - - # Penalize for high duplicate rows - if profile.row_count > 0: - dup_percentage = (profile.total_duplicates / profile.row_count) * 100 - if dup_percentage > 10: - dataset_penalty += min(dup_percentage * 0.5, 15) - - # Penalize for very low completeness - if profile.completeness_percentage < 50: - dataset_penalty += 10 - - final_score = avg_score - dataset_penalty - return max(0.0, round(final_score, 1)) - - @staticmethod - def identify_issues(profile: ColumnProfile) -> list[str]: - """ - Identify data quality issues. - - Args: - profile: ColumnProfile to analyze - - Returns: - List of issue descriptions - """ - issues = [] - - # High null percentage - if profile.null_percentage > 50: - issues.append(f"Very high null percentage: {profile.null_percentage:.1f}%") - elif profile.null_percentage > 20: - issues.append(f"High null percentage: {profile.null_percentage:.1f}%") - elif profile.null_percentage > 5: - issues.append(f"Moderate null percentage: {profile.null_percentage:.1f}%") - - # Outliers - if profile.outlier_percentage > 5: - issues.append(f"High outlier percentage: {profile.outlier_percentage:.1f}%") - elif profile.outlier_percentage > 1: - issues.append(f"Contains outliers: {profile.outlier_percentage:.1f}%") - - # Constant values - if profile.unique_count == 1 and profile.total_count > 1: - issues.append("All values are identical (constant column)") - - # Low uniqueness for ID columns - if "id" in profile.name.lower() and profile.unique_percentage < 95: - issues.append(f"Low uniqueness for ID column: {profile.unique_percentage:.1f}%") - - # Very high cardinality for categorical - if profile.unique_percentage > 95 and profile.unique_count > 100: - if profile.dtype == "object" or "str" in profile.dtype.lower(): - issues.append(f"Very high cardinality: {profile.unique_count:,} unique values") - - return issues - - @staticmethod - def score_breakdown(profile: ColumnProfile) -> dict[str, dict[str, Any]]: - """ - Get component-level quality score breakdown. - - Args: - profile: ColumnProfile to analyze - - Returns: - Dict with component scores, max points, and detail text. 
- """ - # Completeness component (max 40) - null_penalty = profile.null_percentage * 0.4 - completeness_score = round(40 - null_penalty, 1) - if profile.null_percentage == 0: - completeness_detail = "No null values" - else: - completeness_detail = ( - f"{profile.null_percentage:.1f}% nulls" - ) - - # Outlier component (max 20) - outlier_penalty = min(profile.outlier_percentage * 2, 20) - outlier_score = round(20 - outlier_penalty, 1) - if profile.outlier_percentage == 0: - outlier_detail = "No outliers" - else: - outlier_detail = ( - f"{profile.outlier_percentage:.1f}% outliers" - ) - - # Consistency component (max 20) - consistency_score = 20.0 - consistency_detail = "No issues" - if profile.unique_count == 1 and profile.total_count > 1: - consistency_score = 10.0 - consistency_detail = "Constant column (all values identical)" - - # Validity component (max 20) - validity_score = 20.0 - validity_detail = "No issues" - if "id" in profile.name.lower(): - if profile.unique_percentage >= 99: - validity_detail = "ID column with high uniqueness" - elif profile.unique_percentage < 95: - validity_score = 10.0 - validity_detail = ( - f"ID column with low uniqueness " - f"({profile.unique_percentage:.1f}%)" - ) - - return { - "completeness": { - "score": max(0, completeness_score), - "max": 40, - "detail": completeness_detail, - }, - "outliers": { - "score": max(0, outlier_score), - "max": 20, - "detail": outlier_detail, - }, - "consistency": { - "score": consistency_score, - "max": 20, - "detail": consistency_detail, - }, - "validity": { - "score": validity_score, - "max": 20, - "detail": validity_detail, - }, - } - - @staticmethod - def recommend(profile: DatasetProfile) -> list[dict[str, str]]: - """ - Generate prioritized data quality recommendations. - - Args: - profile: DatasetProfile to analyze - - Returns: - List of recommendations sorted by priority (high first). - Each dict has: priority, column, issue, action. - """ - priority_order = {"high": 0, "medium": 1, "low": 2} - recommendations: list[dict[str, str]] = [] - - for col_name, col in profile.columns.items(): - # Null percentage checks - if col.null_percentage > 20: - recommendations.append({ - "priority": "high", - "column": col_name, - "issue": f"{col.null_percentage:.1f}% null values", - "action": "Investigate missing data or add not_null rule", - }) - elif col.null_percentage > 5: - recommendations.append({ - "priority": "medium", - "column": col_name, - "issue": f"{col.null_percentage:.1f}% null values", - "action": "Review missing data patterns", - }) - - # Outlier checks - if col.outlier_percentage > 5: - recommendations.append({ - "priority": "medium", - "column": col_name, - "issue": f"{col.outlier_percentage:.1f}% outliers", - "action": "Review outliers; consider range validation", - }) - - # Constant column - if col.unique_count == 1 and col.total_count > 1: - recommendations.append({ - "priority": "low", - "column": col_name, - "issue": "All values are identical", - "action": "Consider removing constant column", - }) - - # ID column uniqueness - if "id" in col_name.lower() and col.unique_percentage < 95: - recommendations.append({ - "priority": "high", - "column": col_name, - "issue": ( - f"ID column with {col.unique_percentage:.1f}% uniqueness" - ), - "action": "Investigate duplicate IDs", - }) - - recommendations.sort(key=lambda r: priority_order.get(r["priority"], 9)) - return recommendations - - @staticmethod - def get_quality_grade(score: float) -> str: - """ - Get letter grade for quality score. 
- - Args: - score: Quality score 0-100 - - Returns: - Letter grade (A, B, C, D, F) - """ - if score >= 90: - return "A" - elif score >= 80: - return "B" - elif score >= 70: - return "C" - elif score >= 60: - return "D" - else: - return "F" diff --git a/datacheck/profiling/statistics.py b/datacheck/profiling/statistics.py deleted file mode 100644 index c660927..0000000 --- a/datacheck/profiling/statistics.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Statistical calculations for profiling.""" - -from typing import Any - -import pandas as pd - - -class StatisticsCalculator: - """Calculate statistics for data profiling.""" - - @staticmethod - def calculate_numeric_stats(series: pd.Series) -> dict[str, Any]: - """ - Calculate statistics for numeric column. - - Args: - series: Pandas Series (numeric) - - Returns: - Dict with statistics - """ - clean_series = series.dropna() - - if len(clean_series) == 0: - return { - "min": None, - "max": None, - "mean": None, - "median": None, - "std_dev": None, - "percentile_25": None, - "percentile_50": None, - "percentile_75": None, - } - - # Single-pass computation via describe() - desc = clean_series.describe(percentiles=[0.25, 0.5, 0.75]) - return { - "min": float(desc["min"]), - "max": float(desc["max"]), - "mean": round(float(desc["mean"]), 4), - "median": float(desc["50%"]), - "std_dev": round(float(desc["std"]), 4) if len(clean_series) > 1 else 0.0, - "percentile_25": float(desc["25%"]), - "percentile_50": float(desc["50%"]), - "percentile_75": float(desc["75%"]), - } - - @staticmethod - def calculate_value_counts( - series: pd.Series, - top_n: int = 10, - ) -> tuple[list[tuple[Any, int]], dict[str, int]]: - """ - Calculate value frequencies. - - Args: - series: Pandas Series - top_n: Number of top values to return - - Returns: - Tuple of (top_values_list, full_distribution_dict) - """ - value_counts = series.value_counts() - - # Top N values as list of tuples - top_values = [(val, int(count)) for val, count in value_counts.head(top_n).items()] - - # Distribution as dict (limit to top 100 for memory) - distribution = {str(k): int(v) for k, v in value_counts.head(100).items()} - - return top_values, distribution - - @staticmethod - def calculate_correlation_matrix(df: pd.DataFrame) -> dict[str, dict[str, float]]: - """ - Calculate correlation matrix for numeric columns. - - Args: - df: DataFrame with numeric columns - - Returns: - Nested dict: {col1: {col2: correlation, ...}, ...} - """ - # Use "number" string selector to capture both NumPy and Arrow numeric types - numeric_cols = df.select_dtypes(include=["number"]).columns.tolist() - - if len(numeric_cols) < 2: - return {} - - corr_matrix = df[numeric_cols].corr() - - # Convert to nested dict - result: dict[str, dict[str, float]] = {} - for col1 in corr_matrix.index: - result[col1] = {} - for col2 in corr_matrix.columns: - if col1 != col2: # Don't include self-correlation - corr_val = corr_matrix.loc[col1, col2] - # Handle NaN correlations - if pd.notna(corr_val): - result[col1][col2] = round(float(corr_val), 3) - - return result - - @staticmethod - def calculate_basic_counts(series: pd.Series) -> dict[str, int | float]: - """ - Calculate basic counts for a series. 
- - Args: - series: Pandas Series - - Returns: - Dict with counts - """ - total_count = len(series) - null_count = int(series.isnull().sum()) - unique_count = int(series.nunique()) - duplicate_count = total_count - unique_count - - null_percentage = (null_count / total_count * 100) if total_count > 0 else 0.0 - unique_percentage = (unique_count / total_count * 100) if total_count > 0 else 0.0 - completeness = 100.0 - null_percentage - - return { - "total_count": total_count, - "null_count": null_count, - "unique_count": unique_count, - "duplicate_count": duplicate_count, - "null_percentage": round(null_percentage, 2), - "unique_percentage": round(unique_percentage, 2), - "completeness": round(completeness, 2), - } diff --git a/datacheck/profiling/suggestions.py b/datacheck/profiling/suggestions.py deleted file mode 100644 index 5bfc449..0000000 --- a/datacheck/profiling/suggestions.py +++ /dev/null @@ -1,762 +0,0 @@ -"""Auto-suggest validation rules based on profile.""" - -from __future__ import annotations - -import json -import logging -import re -from datetime import datetime, timedelta -from collections.abc import Callable -from typing import TYPE_CHECKING, Any - -import pandas as pd - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from datacheck.profiling.models import ColumnProfile - - -# Column-name keyword rules for semantic type detection. -# Each entry maps column name keywords to a validation rule. -_NAME_BASED_RULES: list[dict[str, Any]] = [ - { - "rule": "email_valid", - "keywords": ["email", "mail", "e_mail"], - "confidence": "high", - "reason": "Column name suggests email addresses", - }, - { - "rule": "phone_valid", - "keywords": ["phone", "tel", "mobile", "cell"], - "confidence": "high", - "reason": "Column name suggests phone numbers", - }, - { - "rule": "url_valid", - "keywords": ["url", "link", "website", "href"], - "confidence": "high", - "reason": "Column name suggests URLs", - }, -] - - -class RuleSuggester: - """Suggest validation rules based on data profile. - - Analyzes column profiles to suggest applicable validation rules - with confidence levels and reasoning. Suggestions use parameter - formats compatible with the RuleFactory. - """ - - @staticmethod - def suggest_rules(profile: ColumnProfile) -> list[dict[str, Any]]: - """ - Suggest validation rules for a column. - - Args: - profile: ColumnProfile to analyze - - Returns: - List of suggested rules with confidence and reasoning. - Each dict has keys: rule, confidence, reason, and optionally params. 
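For reference, two entries in the shape this method returns; the values are invented, not taken from a real profile:

    {"rule": "not_null", "confidence": "high",
     "reason": "No null values detected (0.0%)"}
    {"rule": "max", "params": 150, "confidence": "medium",
     "reason": "Observed maximum: 150"}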
- """ - suggestions: list[dict[str, Any]] = [] - - # --- Null analysis --- - if profile.null_percentage < 1: - suggestions.append({ - "rule": "not_null", - "confidence": "high", - "reason": f"No null values detected ({profile.null_percentage:.1f}%)", - }) - elif profile.null_percentage < 5: - suggestions.append({ - "rule": "not_null", - "confidence": "medium", - "reason": f"Very few null values ({profile.null_percentage:.1f}%)", - }) - - # --- Uniqueness --- - if profile.unique_percentage >= 99.9: - suggestions.append({ - "rule": "unique", - "confidence": "high", - "reason": f"All values are unique ({profile.unique_percentage:.1f}%)", - }) - elif profile.unique_percentage >= 95: - suggestions.append({ - "rule": "unique", - "confidence": "medium", - "reason": f"Most values are unique ({profile.unique_percentage:.1f}%)", - }) - - # --- Type inference --- - inferred = getattr(profile, "inferred_type", None) - if inferred: - type_map = { - "integer": "int", - "numeric": "numeric", - "boolean": "bool", - "datetime": "date", - } - if inferred in type_map: - suggestions.append({ - "rule": "type", - "params": type_map[inferred], - "confidence": "high", - "reason": f"Column detected as {inferred}", - }) - - # --- Numeric rules --- - if profile.min_value is not None and profile.max_value is not None: - # Use IQR-based Tukey fences when outliers push raw bounds too wide - use_iqr = ( - profile.outlier_percentage > 5 - and profile.percentile_25 is not None - and profile.percentile_75 is not None - ) - - if use_iqr: - assert profile.percentile_75 is not None and profile.percentile_25 is not None - iqr = profile.percentile_75 - profile.percentile_25 - lower_fence = round(profile.percentile_25 - 1.5 * iqr, 2) - upper_fence = round(profile.percentile_75 + 1.5 * iqr, 2) - suggestions.append({ - "rule": "min", - "params": lower_fence, - "confidence": "high", - "reason": ( - f"Outlier-resistant lower bound (IQR method, " - f"{profile.outlier_percentage:.1f}% outliers excluded)" - ), - }) - suggestions.append({ - "rule": "max", - "params": upper_fence, - "confidence": "high", - "reason": ( - f"Outlier-resistant upper bound (IQR method, " - f"{profile.outlier_percentage:.1f}% outliers excluded)" - ), - }) - else: - # min - if profile.min_value >= 0: - suggestions.append({ - "rule": "min", - "params": 0, - "confidence": "high", - "reason": f"All values are non-negative (min: {profile.min_value})", - }) - else: - suggestions.append({ - "rule": "min", - "params": profile.min_value, - "confidence": "medium", - "reason": f"Observed minimum: {profile.min_value}", - }) - - # max - suggestions.append({ - "rule": "max", - "params": profile.max_value, - "confidence": "medium", - "reason": f"Observed maximum: {profile.max_value}", - }) - - # mean_between (numeric with sufficient data) - if ( - profile.mean is not None - and profile.std_dev is not None - and profile.std_dev > 0 - ): - margin = profile.std_dev * 2 - suggestions.append({ - "rule": "mean_between", - "params": { - "min": round(profile.mean - margin, 2), - "max": round(profile.mean + margin, 2), - }, - "confidence": "medium", - "reason": ( - f"Mean {profile.mean:.2f} ± 2 std devs " - f"({profile.std_dev:.2f})" - ), - }) - - # std_dev_less_than (numeric with variation) - if profile.std_dev is not None and profile.std_dev > 0: - threshold = round(profile.std_dev * 1.5, 2) - suggestions.append({ - "rule": "std_dev_less_than", - "params": threshold, - "confidence": "low", - "reason": f"Observed std dev: {profile.std_dev:.2f}, threshold at 1.5x", - }) - - # 
z_score_outliers (when outliers detected) - if profile.outlier_count > 0 and profile.outlier_percentage > 0: - suggestions.append({ - "rule": "z_score_outliers", - "params": 3.0, - "confidence": "medium", - "reason": ( - f"{profile.outlier_count} outliers detected " - f"({profile.outlier_percentage:.1f}%)" - ), - }) - - # percentile_range (numeric with percentiles) - if profile.percentile_25 is not None and profile.percentile_75 is not None: - iqr = profile.percentile_75 - profile.percentile_25 - if iqr > 0: - margin = iqr * 0.5 - suggestions.append({ - "rule": "percentile_range", - "params": { - "p25_min": round(profile.percentile_25 - margin, 2), - "p25_max": round(profile.percentile_25 + margin, 2), - "p75_min": round(profile.percentile_75 - margin, 2), - "p75_max": round(profile.percentile_75 + margin, 2), - }, - "confidence": "low", - "reason": ( - f"P25={profile.percentile_25:.2f}, " - f"P75={profile.percentile_75:.2f}, IQR={iqr:.2f}" - ), - }) - - # --- Categorical / allowed_values --- - if 2 <= profile.unique_count <= 10 and profile.top_values: - allowed_values = [val for val, _ in profile.top_values] - suggestions.append({ - "rule": "allowed_values", - "params": allowed_values, - "confidence": "high" if profile.unique_count <= 5 else "medium", - "reason": f"Only {profile.unique_count} unique values", - }) - - # --- String length rules --- - str_min = getattr(profile, "str_length_min", None) - str_max = getattr(profile, "str_length_max", None) - if str_min is not None and str_max is not None and str_min != str_max: - # Suggest length bounds with margin - margin = max(1, int((str_max - str_min) * 0.2)) - suggested_max = str_max + margin - suggestions.append({ - "rule": "length", - "params": {"min": max(1, str_min), "max": suggested_max}, - "confidence": "medium", - "reason": ( - f"String lengths range from {str_min} to {str_max}" - ), - }) - elif str_min is not None and str_min == str_max and str_min > 0: - # Fixed-length strings (codes, IDs) - suggestions.append({ - "rule": "length", - "params": {"min": str_min, "max": str_min}, - "confidence": "high", - "reason": f"All strings are exactly {str_min} characters", - }) - - # --- Date format (with detected pattern) --- - detected_fmt = getattr(profile, "detected_date_format", None) - is_string_dtype = ( - profile.dtype in ("object", "str") - or profile.dtype.startswith("string") - ) - if detected_fmt: - # Format detected from values — works for both string columns - # and native datetime/PyArrow timestamp columns - suggestions.append({ - "rule": "date_format", - "params": detected_fmt, - "confidence": "high", - "reason": f"Detected date format: {detected_fmt}", - }) - elif ( - "date" in profile.name.lower() or "time" in profile.name.lower() - ) and is_string_dtype: - suggestions.append({ - "rule": "date_format", - "confidence": "medium", - "reason": "Column name suggests date/time values", - }) - elif ( - inferred == "datetime" - and is_string_dtype - and not detected_fmt - ): - suggestions.append({ - "rule": "date_format", - "confidence": "high", - "reason": "Column values detected as datetime strings", - }) - - # --- Temporal rules --- - if profile.min_date is not None and profile.max_date is not None: - # timestamp_range — add 1-day margin on each side so edge - # values don't fail due to profiling-time rounding - min_date_str = profile.min_date.split(" ")[0] - max_date_str = profile.max_date.split(" ")[0] - try: - min_dt = datetime.fromisoformat(min_date_str) - max_dt = datetime.fromisoformat(max_date_str) - min_date_str = (min_dt 
- timedelta(days=1)).strftime("%Y-%m-%d") - max_date_str = (max_dt + timedelta(days=1)).strftime("%Y-%m-%d") - except (ValueError, TypeError): - pass # Keep original strings if parsing fails - suggestions.append({ - "rule": "timestamp_range", - "params": { - "min": min_date_str, - "max": max_date_str, - }, - "confidence": "medium", - "reason": f"Dates range from {profile.min_date} to {profile.max_date}", - }) - - # no_future_timestamps - try: - max_dt = datetime.fromisoformat( - str(profile.max_date).replace("Z", "+00:00") - ) - if max_dt <= datetime.now(max_dt.tzinfo): - suggestions.append({ - "rule": "no_future_timestamps", - "confidence": "high", - "reason": "No future dates detected in data", - }) - except (ValueError, TypeError): - logger.debug("Failed to parse max_date for no_future_timestamps check") - - # business_days_only - weekday_only = getattr(profile, "weekday_only", None) - if weekday_only is True and profile.inferred_type == "datetime": - suggestions.append({ - "rule": "business_days_only", - "confidence": "medium", - "reason": "All dates fall on weekdays (Mon-Fri)", - }) - - # --- Name-based semantic rules --- - col_lower = profile.name.lower() - existing_rules = {s["rule"] for s in suggestions} - - for rule_def in _NAME_BASED_RULES: - if rule_def["rule"] in existing_rules: - continue - if any(kw in col_lower for kw in rule_def["keywords"]): - suggestions.append({ - "rule": rule_def["rule"], - "confidence": rule_def["confidence"], - "reason": rule_def["reason"], - }) - existing_rules.add(rule_def["rule"]) - - # --- Regex pattern detection (before value-based, so phone/email - # detection can skip when a regex pattern is already matched) --- - sample = getattr(profile, "sample_values", []) - if sample and inferred == "categorical": - # Guard: if values look like dates, suggest date_format - # instead of a regex pattern - if _looks_like_dates(sample): - from datacheck.profiling.profiler import DataProfiler - - detected_fmt = DataProfiler._detect_date_format( - [str(v) for v in sample[:20] if v is not None] - ) - if detected_fmt: - suggestions.append({ - "rule": "date_format", - "params": detected_fmt, - "confidence": "high", - "reason": f"Detected date format: {detected_fmt}", - }) - else: - suggestions.append({ - "rule": "date_format", - "confidence": "medium", - "reason": "Column values detected as date strings", - }) - else: - _suggest_regex_patterns(suggestions, sample) - - # --- Value-based semantic detection --- - if sample and inferred == "categorical": - _suggest_from_values(suggestions, sample) - - # --- JSON detection --- - if sample and inferred == "categorical": - json_count = 0 - for v in sample[:20]: - s = str(v).strip() - if s.startswith("{") or s.startswith("["): - try: - json.loads(s) - json_count += 1 - except (json.JSONDecodeError, TypeError): - continue # Not valid JSON - if len(sample[:20]) > 0 and json_count >= len(sample[:20]) * 0.8: - suggestions.append({ - "rule": "json_valid", - "confidence": "high", - "reason": ( - f"Values are valid JSON " - f"({json_count}/{len(sample[:20])} samples)" - ), - }) - - return suggestions - - @staticmethod - def suggest_config(profile: ColumnProfile) -> dict[str, Any]: - """ - Generate suggested validation config for a column. 
- - Args: - profile: ColumnProfile to analyze - - Returns: - Suggested validation config dict - """ - suggestions = RuleSuggester.suggest_rules(profile) - - # Only include high-confidence suggestions - high_conf = [s for s in suggestions if s.get("confidence") == "high"] - - config: dict[str, Any] = { - "column": profile.name, - "rules": {}, - } - - for sugg in high_conf: - rule = sugg["rule"] - if "params" in sugg: - config["rules"][rule] = sugg["params"] - else: - config["rules"][rule] = True - - return config - - -# Value-based semantic detectors. -# Each entry defines a rule to suggest when sample values match a regex. -# ``skip_if_rules``: suppress this detector when any of these rules are -# already present (prevents e.g. IPv4 addresses triggering phone_valid). -_VALUE_DETECTORS: list[dict[str, Any]] = [ - { - "rule": "email_valid", - "pattern": r"^[^@\s]+@[^@\s]+\.[^@\s]+$", - "confidence": "high", - "reason_template": "Values match email format ({matches}/{total} samples)", - "skip_if_rules": {"email_valid"}, - }, - { - "rule": "url_valid", - "pattern": r"^https?://[^\s]+", - "confidence": "high", - "reason_template": "Values match URL format ({matches}/{total} samples)", - "skip_if_rules": {"url_valid"}, - }, - { - "rule": "phone_valid", - "pattern": r"^[+0-9][0-9\s\-().]{6,}$", - "confidence": "medium", - "reason_template": "Values match phone format ({matches}/{total} samples)", - "skip_if_rules": {"phone_valid", "regex"}, - }, -] - - -def _looks_like_dates(sample: list, threshold: float = 0.8) -> bool: - """Check if sample values look like date strings. - - Uses pd.to_datetime with format="mixed" to detect date-like values. - Filters out version-like strings first. - """ - if not sample: - return False - - str_values = [str(v).strip() for v in sample[:20] if v is not None] - if not str_values: - return False - - try: - parsed = pd.Series(pd.to_datetime(str_values, errors="coerce", format="mixed")) - success_rate = parsed.notna().sum() / len(str_values) - return bool(success_rate >= threshold) - except Exception: - return False - - -def _suggest_from_values( - suggestions: list[dict[str, Any]], sample: list -) -> None: - """Detect email/phone/URL from actual sample values. - - Iterates over ``_VALUE_DETECTORS`` and suggests a rule when >=80% of - sample values match the detector's regex. Skips detectors whose rule - (or a conflicting rule) is already present in *suggestions*. 
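As an illustration of the 80% threshold described above, with invented sample values:

    # Hypothetical: all 20 sampled values look like "alice@example.com", so they
    # match the email detector's regex and this suggestion is appended:
    # {"rule": "email_valid", "confidence": "high",
    #  "reason": "Values match email format (20/20 samples)"}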
- """ - if not sample: - return - - str_values = [str(v) for v in sample[:20] if v is not None] - if not str_values: - return - - total = len(str_values) - threshold = 0.8 - existing_rules = {s["rule"] for s in suggestions} - - for detector in _VALUE_DETECTORS: - if detector["skip_if_rules"] & existing_rules: - continue - - compiled = re.compile(detector["pattern"]) - matches = sum(1 for v in str_values if compiled.match(v)) - if matches >= total * threshold: - suggestions.append({ - "rule": detector["rule"], - "confidence": detector["confidence"], - "reason": detector["reason_template"].format( - matches=matches, total=total - ), - }) - existing_rules.add(detector["rule"]) - - -# Known regex patterns to detect from sample values -_KNOWN_PATTERNS: list[tuple[str, str, str, str]] = [ - # (name, regex_pattern, confidence, description) - ( - "UUID", - r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$", - "high", - "UUID format", - ), - ( - "IPv4", - r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$", - "high", - "IPv4 address format", - ), - ( - "Hex color", - r"^#[0-9a-fA-F]{6}$", - "high", - "Hex color code format", - ), - ( - "US zip code", - r"^[0-9]{5}(-[0-9]{4})?$", - "medium", - "US zip code format", - ), - ( - "Credit card", - r"^[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{4}$", - "medium", - "Credit card number format", - ), - ( - "SSN-like", - r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$", - "medium", - "SSN-like format (XXX-XX-XXXX)", - ), - ( - "MAC address", - r"^[0-9a-fA-F]{2}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}$", - "high", - "MAC address format", - ), -] - - -def _suggest_regex_patterns( - suggestions: list[dict[str, Any]], sample: list -) -> None: - """Detect known regex patterns from sample values. - - Tests sample values against common patterns (UUID, IPv4, zip code, etc.) - and suggests a regex rule if >=80% of values match. - """ - if not sample: - return - - # Skip if regex rule already suggested - if any(s["rule"] == "regex" for s in suggestions): - return - - str_values = [str(v).strip() for v in sample[:20] if v is not None] - if not str_values: - return - - total = len(str_values) - threshold = 0.8 - - for _name, pattern, confidence, description in _KNOWN_PATTERNS: - compiled = re.compile(pattern) - matches = sum(1 for v in str_values if compiled.match(v)) - if matches >= total * threshold: - suggestions.append({ - "rule": "regex", - "params": pattern, - "confidence": confidence, - "reason": ( - f"Values match {description} " - f"({matches}/{total} samples)" - ), - }) - return # Only suggest one regex pattern per column - - # No known pattern matched — try auto-inference from value structure - result = _infer_custom_pattern(str_values) - if result is not None: - pattern, description = result - # Verify match count for the reason string - compiled = re.compile(pattern) - matches = sum(1 for v in str_values if compiled.match(v)) - suggestions.append({ - "rule": "regex", - "params": pattern, - "confidence": "medium", - "reason": ( - f"Values match {description} " - f"({matches}/{total} samples)" - ), - }) - - -# --- Separator token regex for splitting structured values --- -_SEP_RE = re.compile(r"([-:_./])") - - -def _infer_custom_pattern( - str_values: list[str], -) -> tuple[str, str] | None: - """Infer a regex pattern from structured string values. - - Splits values by common separators (-, :, _, ., /) and classifies - each segment to build a regex pattern. 
Works well for ID-like values - such as ``SENS-12345678`` or ``7F:42:7D:4B:C3:D6``. - - Returns ``(pattern, description)`` or ``None``. - """ - if len(str_values) < 5: - return None - - all_tokenized = [_SEP_RE.split(v) for v in str_values] - - # Need >=80% of values to have the same token count - token_counts: dict[int, int] = {} - for tokens in all_tokenized: - n = len(tokens) - token_counts[n] = token_counts.get(n, 0) + 1 - - most_common_count = max(token_counts, key=token_counts.get) # type: ignore[arg-type] - if token_counts[most_common_count] < len(str_values) * 0.8: - return None - - # Must have at least one separator (3+ tokens: seg-sep-seg) - if most_common_count < 3: - return None - - matching = [t for t in all_tokenized if len(t) == most_common_count] - - # Build pattern for each token position - pattern_parts: list[str] = [] - desc_parts: list[str] = [] - for pos in range(most_common_count): - segment_values = [t[pos] for t in matching] - - if pos % 2 == 1: - # Separator position — must be the same literal char - if len(set(segment_values)) == 1: - pattern_parts.append(re.escape(segment_values[0])) - else: - return None - else: - seg_pattern = _classify_segment(segment_values) - if seg_pattern is None: - return None - pattern_parts.append(seg_pattern) - - # Description helper: note literal prefixes - if len(set(segment_values)) == 1: - desc_parts.append(f"'{segment_values[0]}'") - - pattern = "^" + "".join(pattern_parts) + "$" - - # Verify against ALL original values - compiled = re.compile(pattern) - matches = sum(1 for v in str_values if compiled.match(v)) - if matches < len(str_values) * 0.8: - return None - - # Build human-readable description - if desc_parts: - description = f"{' + '.join(desc_parts)} prefix pattern" - else: - sep_char = matching[0][1] - n_segments = (most_common_count + 1) // 2 - description = f"structured pattern ({n_segments} segments, '{sep_char}' separator)" - - return pattern, description - - -# Character class definitions for segment classification. -# Each entry: (per-char test function, regex class string). -# Checked in priority order — first match wins. -_CHAR_CLASSES: list[tuple[Callable[[str], bool], str]] = [ - (str.isdigit, "[0-9]"), - (str.isupper, "[A-Z]"), - (str.islower, "[a-z]"), - (lambda c: c in "0123456789ABCDEF", "[0-9A-F]"), - (lambda c: c in "0123456789abcdefABCDEF", "[0-9a-fA-F]"), - (lambda c: c.isupper() or c.isdigit(), "[A-Z0-9]"), - (lambda c: c.islower() or c.isdigit(), "[a-z0-9]"), - (str.isalnum, "[A-Za-z0-9]"), -] - - -def _classify_segment(values: list[str]) -> str | None: - """Classify a content segment (between separators) as a regex fragment. - - Uses the ``_CHAR_CLASSES`` table to find the most specific character - class that covers every character across all segment values. - - Returns a regex fragment like ``\\d{8}``, ``[A-Z]{3}``, ``[0-9A-F]{12}``, - or ``None`` if the segment cannot be classified. 
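A small usage sketch of the pattern inference above; the inputs are invented and the expected output is stated approximately:

    values = ["SENS-12345678", "SENS-00000042", "SENS-99999999",
              "SENS-31415926", "SENS-27182818"]
    pattern, description = _infer_custom_pattern(values)
    # pattern is roughly ^SENS\-[0-9]{8}$: the constant "SENS" segment becomes a
    # literal, "-" is an escaped separator, and the varying 8-digit segment
    # classifies as [0-9]{8}.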
- """ - if not values: - return None - - # Literal — all values are identical - if len(set(values)) == 1: - return re.escape(values[0]) - - # Length analysis - lengths = [len(v) for v in values] - min_len, max_len = min(lengths), max(lengths) - - all_chars = "".join(values) - if not all_chars: - return None - - # Build quantifier - if min_len == max_len: - quant = f"{{{min_len}}}" if min_len > 1 else "" - else: - quant = f"{{{min_len},{max_len}}}" - - # Walk the priority table — first class that covers all chars wins - for test_fn, regex_class in _CHAR_CLASSES: - if all(test_fn(c) for c in all_chars): - return f"{regex_class}{quant}" - - return None diff --git a/datacheck/reporting/__init__.py b/datacheck/reporting/__init__.py index 1471ee4..9a9aa95 100644 --- a/datacheck/reporting/__init__.py +++ b/datacheck/reporting/__init__.py @@ -3,15 +3,18 @@ This module provides enhanced reporting capabilities including: - Rich terminal output with suggestions - CSV export for failure details +- SARIF 2.1.0 export for GitHub Code Scanning - Suggestion engine for actionable recommendations """ from datacheck.reporting.csv_exporter import CsvExporter +from datacheck.reporting.sarif_exporter import SarifExporter from datacheck.reporting.suggestion_engine import SuggestionEngine from datacheck.reporting.terminal_reporter import TerminalReporter __all__ = [ "CsvExporter", + "SarifExporter", "SuggestionEngine", "TerminalReporter", ] diff --git a/datacheck/reporting/csv_exporter.py b/datacheck/reporting/csv_exporter.py index 06329bb..05e0858 100644 --- a/datacheck/reporting/csv_exporter.py +++ b/datacheck/reporting/csv_exporter.py @@ -51,10 +51,10 @@ def export_failures( # Base row data base_row = { - "rule_name": result.rule_name, "check_name": result.check_name or result.rule_name, "column": result.column, "rule_type": result.rule_type or "", + "severity": result.severity, "status": "PASS" if result.passed else ("ERROR" if result.has_error else "FAIL"), "total_rows": result.total_rows, "failed_rows": result.failed_rows, @@ -121,10 +121,10 @@ def export_summary( for result in summary.results: rows.append({ - "rule_name": result.rule_name, "check_name": result.check_name or result.rule_name, "column": result.column, "rule_type": result.rule_type or "", + "severity": result.severity, "status": "PASS" if result.passed else ("ERROR" if result.has_error else "FAIL"), "total_rows": result.total_rows, "failed_rows": result.failed_rows, @@ -255,28 +255,7 @@ def _get_suggestion_for_value(value: Any, rule_type: str) -> str: if value is None: return "Replace NULL with default value" - value_str = str(value) - - if rule_type == "email_valid": - if "@" not in value_str: - return f"Add domain: {value_str}@example.com" - return "Fix email format" - - elif rule_type == "phone_valid": - digits = "".join(c for c in value_str if c.isdigit()) - if len(digits) >= 10: - return f"Standardize: +1-{digits[:3]}-{digits[3:6]}-{digits[6:10]}" - return "Add missing digits" - - elif rule_type == "url_valid": - if not value_str.startswith(("http://", "https://")): - return f"Add protocol: https://{value_str}" - return "Fix URL format" - - elif rule_type == "json_valid": - return "Fix JSON syntax" - - elif rule_type == "not_null": + if rule_type == "not_null": return "Replace with default value" elif rule_type == "unique": diff --git a/datacheck/reporting/json_reporter.py b/datacheck/reporting/json_reporter.py index 07945bd..52d4798 100644 --- a/datacheck/reporting/json_reporter.py +++ b/datacheck/reporting/json_reporter.py @@ -52,18 
+52,22 @@ def generate_report( self, summary: ValidationSummary, df: pd.DataFrame | None = None, + source_info: str | None = None, + elapsed: float | None = None, ) -> dict[str, Any]: """Generate comprehensive JSON report. Args: summary: ValidationSummary to report df: Optional DataFrame for distribution analysis + source_info: Human-readable description of the data source + elapsed: Validation elapsed time in seconds Returns: Dictionary containing full report data """ report: dict[str, Any] = { - "metadata": self._generate_metadata(), + "metadata": self._generate_metadata(source_info, elapsed), "summary": self._generate_summary(summary), "results": self._generate_results(summary), } @@ -85,6 +89,8 @@ def export( summary: ValidationSummary, output_path: str | Path | None = None, df: pd.DataFrame | None = None, + source_info: str | None = None, + elapsed: float | None = None, ) -> str: """Export validation results to JSON format. @@ -92,11 +98,13 @@ def export( summary: ValidationSummary to export output_path: Optional file path to write JSON df: Optional DataFrame for distribution analysis + source_info: Human-readable description of the data source + elapsed: Validation elapsed time in seconds Returns: JSON string representation of report """ - report = self.generate_report(summary, df) + report = self.generate_report(summary, df, source_info=source_info, elapsed=elapsed) indent = 2 if self.pretty else None json_str = json.dumps(report, indent=indent, default=str) @@ -108,42 +116,50 @@ def export( return json_str - def _generate_metadata(self) -> dict[str, Any]: - """Generate report metadata. - - Returns: - Dictionary containing metadata - """ - return { + def _generate_metadata( + self, + source_info: str | None = None, + elapsed: float | None = None, + ) -> dict[str, Any]: + """Generate report metadata.""" + meta: dict[str, Any] = { "generated_at": datetime.now(timezone.utc).isoformat(), "report_version": "1.0", "includes_suggestions": self.include_suggestions, "includes_distributions": self.include_distributions, } + if source_info: + meta["source"] = source_info + if elapsed is not None: + meta["elapsed_seconds"] = round(elapsed, 3) + return meta def _generate_summary(self, summary: ValidationSummary) -> dict[str, Any]: - """Generate summary statistics. 
+ """Generate summary statistics.""" + if summary.all_passed and summary.has_failures: + status = "PASSED_WITH_WARNINGS" + elif summary.all_passed: + status = "PASSED" + else: + status = "FAILED" - Args: - summary: ValidationSummary to summarize - - Returns: - Dictionary containing summary statistics - """ return { - "status": "PASSED" if summary.all_passed else "FAILED", + "status": status, + "total_rows": summary.total_rows, + "total_columns": summary.total_columns, "total_rules": summary.total_rules, "passed_rules": summary.passed_rules, "failed_rules": summary.failed_rules, - "error_rules": summary.error_rules, + "failed_errors": summary.failed_errors, + "failed_warnings": summary.failed_warnings, + "failed_info": summary.failed_info, + "execution_errors": summary.error_rules, "pass_rate": round( (summary.passed_rules / summary.total_rules * 100) if summary.total_rules > 0 else 0, 2, ), - "has_failures": summary.has_failures, - "has_errors": summary.has_errors, } def _generate_results(self, summary: ValidationSummary) -> list[dict[str, Any]]: @@ -158,12 +174,23 @@ def _generate_results(self, summary: ValidationSummary) -> list[dict[str, Any]]: results: list[dict[str, Any]] = [] for result in summary.results: + if result.has_error: + status = "ERROR" + elif result.passed: + status = "PASS" + elif result.severity == "warning": + status = "WARNING" + elif result.severity == "info": + status = "INFO" + else: + status = "FAIL" + result_dict: dict[str, Any] = { - "rule_name": result.rule_name, "check_name": result.check_name or result.rule_name, "column": result.column, "rule_type": result.rule_type or "", - "status": "PASS" if result.passed else ("ERROR" if result.has_error else "FAIL"), + "severity": result.severity, + "status": status, "total_rows": result.total_rows, "failed_rows": result.failed_rows, "success_rate": round(result.success_rate, 2), diff --git a/datacheck/reporting/sarif_exporter.py b/datacheck/reporting/sarif_exporter.py new file mode 100644 index 0000000..131d1f3 --- /dev/null +++ b/datacheck/reporting/sarif_exporter.py @@ -0,0 +1,203 @@ +"""SARIF 2.1.0 exporter for validation results. + +SARIF (Static Analysis Results Interchange Format) is the standard consumed +by GitHub Code Scanning. Exporting results as SARIF allows data quality +failures to appear in the GitHub Security tab alongside code analysis results. + +Since DataCheck failures are column-level aggregates (not tied to a specific +source code line), results use ``logicalLocations`` rather than +``physicalLocation``. This means failures appear in the Security tab, not +as inline PR annotations. + +Reference: https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html +""" + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from datacheck.results import ValidationSummary + + +_SARIF_SCHEMA = "https://schemastore.azurewebsites.net/schemas/json/sarif-2.1.0.json" +_DATACHECK_VERSION = "2.1.0" +_DATACHECK_INFO_URI = "https://github.com/squrtech/datacheck" + +# DataCheck severity → SARIF level mapping +_SEVERITY_MAP: dict[str, str] = { + "error": "error", + "warning": "warning", + "info": "note", +} + + +class SarifExporter: + """Exporter for SARIF 2.1.0 output of validation results. + + Produces a valid SARIF 2.1.0 JSON document that GitHub Code Scanning + can consume. Only failed rules (and rules with execution errors) are + included — passed rules are omitted per the SARIF convention. 
+ """ + + @staticmethod + def export( + summary: ValidationSummary, + output_path: str | Path | None = None, + elapsed: float | None = None, + source_info: str | None = None, + ) -> str: + """Export validation results to SARIF 2.1.0 JSON format. + + Args: + summary: ValidationSummary to export + output_path: Optional file path to write the SARIF JSON + elapsed: Validation elapsed time in seconds + source_info: Human-readable description of the data source + + Returns: + SARIF JSON string + """ + sarif = SarifExporter._build_sarif(summary, elapsed=elapsed, source_info=source_info) + sarif_json = json.dumps(sarif, indent=2) + + if output_path: + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(sarif_json, encoding="utf-8") + + return sarif_json + + @staticmethod + def _build_sarif( + summary: ValidationSummary, + elapsed: float | None = None, + source_info: str | None = None, + ) -> dict[str, Any]: + """Build the SARIF 2.1.0 document structure.""" + rules = SarifExporter._build_rules(summary) + results = SarifExporter._build_results(summary) + + end_time = datetime.now(timezone.utc) + end_time_str = end_time.strftime("%Y-%m-%dT%H:%M:%SZ") + + invocation: dict[str, Any] = { + "executionSuccessful": not summary.has_errors, + "endTimeUtc": end_time_str, + } + if elapsed is not None: + from datetime import timedelta + start_time = end_time - timedelta(seconds=elapsed) + invocation["startTimeUtc"] = start_time.strftime("%Y-%m-%dT%H:%M:%SZ") + + run: dict[str, Any] = { + "tool": { + "driver": { + "name": "DataCheck", + "version": _DATACHECK_VERSION, + "informationUri": _DATACHECK_INFO_URI, + "rules": rules, + } + }, + "results": results, + "invocations": [invocation], + } + if source_info: + run["automationDetails"] = {"description": {"text": source_info}} + + return { + "$schema": _SARIF_SCHEMA, + "version": "2.1.0", + "runs": [run], + } + + @staticmethod + def _build_rules(summary: ValidationSummary) -> list[dict[str, Any]]: + """Build the SARIF rules list from all results (not just failures). + + Each unique rule_name becomes one SARIF rule entry. Passed rules are + included so that the rule registry is complete. + + Args: + summary: ValidationSummary containing all rule results + + Returns: + List of SARIF rule descriptor objects + """ + seen: set[str] = set() + rules: list[dict[str, Any]] = [] + + for result in summary.results: + if result.rule_name in seen: + continue + seen.add(result.rule_name) + + level = _SEVERITY_MAP.get(result.severity, "error") + rule_label = result.rule_type or result.rule_name + description = f"{rule_label} check on column '{result.column}'" + + rules.append({ + "id": result.rule_name, + "shortDescription": {"text": description}, + "defaultConfiguration": {"level": level}, + "helpUri": _DATACHECK_INFO_URI, + }) + + return rules + + @staticmethod + def _build_results(summary: ValidationSummary) -> list[dict[str, Any]]: + """Build the SARIF results list (failures and execution errors only). + + Passed rules are skipped — SARIF only lists findings, not passes. 
+ + Args: + summary: ValidationSummary containing all rule results + + Returns: + List of SARIF result objects + """ + sarif_results: list[dict[str, Any]] = [] + + for result in summary.results: + if result.passed and not result.has_error: + continue + + level = _SEVERITY_MAP.get(result.severity, "error") + + if result.has_error: + msg = ( + f"Column '{result.column}': rule execution error" + + (f" — {result.error}" if result.error else "") + ) + level = "error" + else: + rate = ( + result.failed_rows / result.total_rows * 100 + if result.total_rows > 0 + else 0.0 + ) + rule_label = result.rule_type or result.rule_name + msg = ( + f"Column '{result.column}': {result.failed_rows:,} of " + f"{result.total_rows:,} rows failed {rule_label} check " + f"({rate:.2f}%)" + ) + + sarif_results.append({ + "ruleId": result.rule_name, + "level": level, + "message": {"text": msg}, + "locations": [ + { + "logicalLocations": [ + {"name": result.column, "kind": "column"} + ] + } + ], + }) + + return sarif_results + + +__all__ = ["SarifExporter"] diff --git a/datacheck/reporting/suggestion_engine.py b/datacheck/reporting/suggestion_engine.py index c3f8f9e..bc93fdd 100644 --- a/datacheck/reporting/suggestion_engine.py +++ b/datacheck/reporting/suggestion_engine.py @@ -94,34 +94,6 @@ class SuggestionEngine: "message": "String length outside acceptable range", "action": "Add length validation at data entry or implement truncation in ETL", }, - "email_valid": { - "message": "Invalid email addresses detected", - "action": "Implement email validation at form submission or clean existing data", - }, - "phone_valid": { - "message": "Invalid phone numbers detected", - "action": "Standardize phone format at entry points or use phone parsing library", - }, - "url_valid": { - "message": "Invalid URLs detected", - "action": "Validate URLs at data entry or implement URL sanitization", - }, - "json_valid": { - "message": "Invalid JSON detected in string column", - "action": "Fix JSON encoding issues at source or add JSON validation middleware", - }, - "mean_between": { - "message": "Column mean outside expected range", - "action": "Investigate outliers affecting mean or adjust expected range", - }, - "std_dev_less_than": { - "message": "Column standard deviation exceeds threshold", - "action": "Investigate high variance in data or normalize values", - }, - "z_score_outliers": { - "message": "Statistical outliers detected", - "action": "Review outlier values and implement outlier handling strategy", - }, "max_age": { "message": "Data exceeds maximum age threshold", "action": "Check data pipeline freshness or increase refresh frequency", @@ -138,10 +110,6 @@ class SuggestionEngine: "message": "Invalid date formats detected", "action": "Standardize dates to match the expected format at the source or add date parsing logic", }, - "business_days_only": { - "message": "Records found on non-business days", - "action": "Review business logic for weekend/holiday handling", - }, "foreign_key_exists": { "message": "Orphan records detected (missing foreign key references)", "action": "Add referential integrity constraints or clean orphan records", @@ -287,10 +255,6 @@ def _infer_rule_type(self, rule_name: str) -> str: "min": ["min_", "minimum"], "max": ["max_", "maximum"], "regex": ["regex", "pattern"], - "email_valid": ["email"], - "phone_valid": ["phone"], - "url_valid": ["url"], - "json_valid": ["json"], } for rule_type, patterns in type_patterns.items(): @@ -356,28 +320,7 @@ def _suggest_fix_for_value( value_str = str(value) - if 
rule_type == "email_valid": - # Try to suggest email fix - if "@" not in value_str: - return f"Add '@' domain: {value_str}@example.com" - return "Fix email format or remove invalid characters" - - elif rule_type == "phone_valid": - # Suggest phone format - digits = "".join(c for c in value_str if c.isdigit()) - if len(digits) >= 10: - return f"Standardize format: +1-{digits[:3]}-{digits[3:6]}-{digits[6:10]}" - return "Add missing digits or use international format" - - elif rule_type == "url_valid": - if not value_str.startswith(("http://", "https://")): - return f"Add protocol: https://{value_str}" - return "Fix URL structure or encoding" - - elif rule_type == "json_valid": - return "Validate JSON syntax, check for unquoted strings or trailing commas" - - elif rule_type in ("min", "max"): + if rule_type in ("min", "max"): return f"Adjust value or investigate data source for '{value_str}'" elif rule_type == "not_null": diff --git a/datacheck/reporting/terminal_reporter.py b/datacheck/reporting/terminal_reporter.py index d033d2d..906e635 100644 --- a/datacheck/reporting/terminal_reporter.py +++ b/datacheck/reporting/terminal_reporter.py @@ -1,15 +1,9 @@ -"""Enhanced terminal reporter with Rich formatting and suggestions. - -Provides production-grade terminal output for validation results including: -- Color-coded status display -- Actionable suggestions for failures -- Summary statistics with progress bars -""" +"""Terminal reporter with table-based output.""" import sys +from rich import box from rich.console import Console -from rich.panel import Panel from rich.table import Table from rich.text import Text @@ -23,24 +17,23 @@ def _safe_encoding() -> bool: return encoding.lower().replace("-", "") in ("utf8", "utf16", "utf32", "utf16le", "utf16be") -# Symbols that degrade gracefully on non-UTF-8 terminals (e.g. Windows cp1252) -_TICK = "✓" if _safe_encoding() else "v" -_CROSS = "✗" if _safe_encoding() else "x" -_WARN = "⚠" if _safe_encoding() else "!" -_BAR_FILLED = "█" if _safe_encoding() else "#" -_BAR_EMPTY = "░" if _safe_encoding() else "-" -_HLINE = "─" if _safe_encoding() else "-" -_ARROW = "→" if _safe_encoding() else "->" _BULLET = "•" if _safe_encoding() else "*" +_ARROW = "→" if _safe_encoding() else "->" class TerminalReporter: - """Enhanced terminal reporter with Rich formatting. - - Provides rich, informative terminal output including: - - Color-coded validation status - - Detailed failure statistics - - Actionable suggestions for fixing issues + """Table-based terminal reporter modelled on datacontract CLI output. + + Renders every rule as a row in a rounded Rich table: + + ╭─────────┬─────────────────────────┬──────────┬──────────────────────────────────╮ + │ Result │ Check │ Column │ Details │ + ├─────────┼─────────────────────────┼──────────┼──────────────────────────────────┤ + │ passed │ id_check · not_null │ id │ │ + │ failed │ amount_check · min │ amount │ 42/1,000 rows (4.2%) — e.g. -5 │ + │ warning │ email_check · regex │ email │ 5/1,000 rows (0.5%) │ + ╰─────────┴─────────────────────────┴──────────┴──────────────────────────────────╯ + 🔴 Validation failed. Ran 10 checks on 1,000 rows — 8 passed, 1 failed, 1 warning. """ def __init__( @@ -48,226 +41,185 @@ def __init__( console: Console | None = None, show_suggestions: bool = True, ) -> None: - """Initialize terminal reporter. 
- - Args: - console: Rich Console instance (creates new one if None) - show_suggestions: Whether to show fix suggestions (default: True) - """ self.console = console or Console() self.show_suggestions = show_suggestions - - # Initialize analyzers self._suggestion_engine = SuggestionEngine() - def report(self, summary: ValidationSummary) -> None: - """Generate and print comprehensive validation report. + def report( + self, + summary: ValidationSummary, + elapsed: float | None = None, + source_info: str | None = None, + ) -> None: + """Print the full validation report. Args: summary: ValidationSummary to report + elapsed: Optional elapsed time in seconds to display in the footer + source_info: Human-readable description of the data source (e.g. "orders.csv" or "production_db → orders") """ - # Print header - self._print_header() - - # Print overall status - self._print_status(summary) - - # Print statistics - self._print_statistics(summary) + if source_info: + self._print_source_header(source_info) + self._print_table(summary) + self._print_footer(summary, elapsed) - # Print detailed failures - if summary.has_failures or summary.has_errors: - self._print_failures(summary) + # Print full error messages for execution errors (too long to fit in the table) + error_results = summary.get_error_results() + if error_results: + self._print_execution_errors(error_results) - # Print suggestions if enabled if self.show_suggestions and (summary.has_failures or summary.has_errors): suggestions = self._suggestion_engine.analyze(summary) if suggestions: self._print_suggestions(suggestions) - # Print summary footer - self._print_footer(summary) - - def _print_header(self) -> None: - """Print report header.""" - self.console.print() - self.console.print( - Panel.fit( - "[bold]DataCheck Validation Report[/bold]", - border_style="blue", - ) + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + _ERROR_DETAIL_MAX = 60 # chars shown inline in the table for execution errors + + def _print_source_header(self, source_info: str) -> None: + """Print a dim header line showing what is being validated.""" + self.console.print(f"[dim]Validating[/dim] [bold]{source_info}[/bold]") + + def _result_cell(self, result: RuleResult) -> Text: + """Return a styled Text object for the Result column.""" + if result.has_error: + return Text("error", style="bold red") + if result.passed: + return Text("passed", style="bold green") + if result.severity == "warning": + return Text("warning", style="bold yellow") + if result.severity == "info": + return Text("info", style="bold blue") + return Text("failed", style="bold red") + + def _details_cell(self, result: RuleResult) -> str: + """Return the Details column string for a rule result.""" + if result.has_error: + err = str(result.error) + if len(err) > self._ERROR_DETAIL_MAX: + return err[: self._ERROR_DETAIL_MAX] + "… (see below)" + return err + if result.passed: + return "" + failure_rate = ( + result.failed_rows / result.total_rows * 100 + if result.total_rows > 0 + else 0.0 ) + detail = f"{result.failed_rows:,}/{result.total_rows:,} rows failed ({failure_rate:.1f}%)" + if result.failure_details and result.failure_details.sample_values: + samples = [ + str(v) + for v in result.failure_details.sample_values[:3] + if v is not None + ] + if samples: + detail += f" — e.g. 
{', '.join(samples)}" + return detail + + def _print_table(self, summary: ValidationSummary) -> None: + """Render all rule results as a single Rich table.""" self.console.print() - - def _print_status(self, summary: ValidationSummary) -> None: - """Print overall validation status. - - Args: - summary: ValidationSummary containing status - """ - if summary.all_passed: - status = Text("ALL CHECKS PASSED", style="bold green") - icon = f"[green]{_TICK}[/green]" - elif summary.error_rules > 0 and summary.failed_rules == 0: - status = Text("VALIDATION ERRORS", style="bold yellow") - icon = f"[yellow]{_WARN}[/yellow]" - else: - status = Text("VALIDATION FAILED", style="bold red") - icon = f"[red]{_CROSS}[/red]" - - self.console.print(f"{icon} {status}") - self.console.print() - - def _print_statistics(self, summary: ValidationSummary) -> None: - """Print summary statistics table. - - Args: - summary: ValidationSummary containing statistics - """ table = Table( show_header=True, - header_style="bold cyan", - box=None, - padding=(0, 2), - ) - table.add_column("Metric", style="cyan") - table.add_column("Value", justify="right") - table.add_column("", width=20) - - # Dataset size - if summary.total_rows > 0: - table.add_row("Records", f"{summary.total_rows:,}", "") - if summary.total_columns > 0: - table.add_row("Columns", f"{summary.total_columns:,}", "") - - # Total rules - table.add_row("Total Rules", str(summary.total_rules), "") - - # Passed rules with bar - pass_pct = (summary.passed_rules / summary.total_rules * 100) if summary.total_rules > 0 else 0 - pass_bar = self._create_progress_bar(pass_pct, "green") - table.add_row( - "Passed", - f"[green]{summary.passed_rules}[/green]", - pass_bar, - ) - - # Failed rules with bar - fail_pct = (summary.failed_rules / summary.total_rules * 100) if summary.total_rules > 0 else 0 - fail_bar = self._create_progress_bar(fail_pct, "red") - table.add_row( - "Failed", - f"[red]{summary.failed_rules}[/red]" if summary.failed_rules > 0 else "0", - fail_bar if summary.failed_rules > 0 else "", - ) - - # Error rules with bar - error_pct = (summary.error_rules / summary.total_rules * 100) if summary.total_rules > 0 else 0 - error_bar = self._create_progress_bar(error_pct, "yellow") - table.add_row( - "Errors", - f"[yellow]{summary.error_rules}[/yellow]" if summary.error_rules > 0 else "0", - error_bar if summary.error_rules > 0 else "", + header_style="bold", + box=box.ROUNDED, + padding=(0, 1), + show_lines=False, ) + table.add_column("Result", width=9, no_wrap=True) + table.add_column("Check") + table.add_column("Column", style="cyan", no_wrap=True) + table.add_column("Details") + + for result in summary.results: + check_label = result.check_name or result.rule_name + rule_type = result.rule_type or "" + check_display = f"{check_label} · {rule_type}" if rule_type else check_label + + table.add_row( + self._result_cell(result), + check_display, + result.column, + self._details_cell(result), + ) self.console.print(table) self.console.print() - def _create_progress_bar(self, percentage: float, color: str, width: int = 15) -> str: - """Create a simple progress bar string. 
- - Args: - percentage: Percentage (0-100) - color: Color name for the bar - width: Width of the bar in characters - - Returns: - Formatted progress bar string - """ - filled = int((percentage / 100) * width) - empty = width - filled - bar = _BAR_FILLED * filled + _BAR_EMPTY * empty - return f"[{color}]{bar}[/{color}] {percentage:.0f}%" - - def _print_failures(self, summary: ValidationSummary) -> None: - """Print detailed failure information. - - Args: - summary: ValidationSummary containing failures - """ - # Print failed rules - failed_results = summary.get_failed_results() - if failed_results: - self.console.print("[bold red]Failed Rules:[/bold red]") - self.console.print() - - for result in failed_results: - self._print_rule_failure(result) - - # Print error rules - error_results = summary.get_error_results() - if error_results: - self.console.print("[bold yellow]Rules with Errors:[/bold yellow]") - self.console.print() - - for result in error_results: - self._print_rule_error(result) - - def _print_rule_failure(self, result: RuleResult) -> None: - """Print detailed failure information for a single rule. - - Args: - result: Failed RuleResult - """ - # Rule header - check_name = result.check_name if result.check_name else result.rule_name - rule_type = result.rule_type if result.rule_type else "unknown" - - self.console.print( - f"[red]{_CROSS}[/red] [bold]{check_name}[/bold] " - f"([cyan]{result.column}[/cyan] · {rule_type})" - ) - - # Failure statistics - failure_rate = (result.failed_rows / result.total_rows * 100) if result.total_rows > 0 else 0 - self.console.print( - f" Failed: {result.failed_rows:,}/{result.total_rows:,} rows ({failure_rate:.1f}%)" - ) + def _print_footer(self, summary: ValidationSummary, elapsed: float | None) -> None: + """Print the one-line summary footer.""" + if summary.all_passed and not summary.has_failures: + icon = "🟢" if _safe_encoding() else "[OK]" + status = "[green]All checks passed.[/green]" + elif summary.all_passed: + # Only warnings/info — pipeline not blocked + icon = "🟡" if _safe_encoding() else "[WARN]" + status = "[yellow]Passed with warnings.[/yellow]" + else: + icon = "🔴" if _safe_encoding() else "[FAIL]" + status = "[red]Validation failed.[/red]" + # Build the run summary + if summary.total_rows > 0: + run_info = f"Ran {summary.total_rules} checks on {summary.total_rows:,} rows" + else: + run_info = f"Ran {summary.total_rules} checks" + + counts = [f"[green]{summary.passed_rules} passed[/green]"] + if summary.failed_errors > 0: + counts.append(f"[red]{summary.failed_errors} failed[/red]") + if summary.failed_warnings > 0: + counts.append(f"[yellow]{summary.failed_warnings} warning{'s' if summary.failed_warnings != 1 else ''}[/yellow]") + if summary.failed_info > 0: + counts.append(f"[blue]{summary.failed_info} info[/blue]") + if summary.error_rules > 0: + counts.append(f"[red]{summary.error_rules} execution error{'s' if summary.error_rules != 1 else ''}[/red]") + + line = f"{icon} {status} {run_info} — {', '.join(counts)}." + if elapsed is not None: + line += f" Took {elapsed:.2f}s." + + self.console.print(line) self.console.print() - def _print_rule_error(self, result: RuleResult) -> None: - """Print error information for a rule that failed to execute. 
- - Args: - result: Error RuleResult - """ - check_name = result.check_name if result.check_name else result.rule_name + def _print_execution_errors(self, error_results: list[RuleResult]) -> None: + """Print full error messages for rules that had execution errors.""" + from rich.panel import Panel self.console.print( - f"[yellow]{_WARN}[/yellow] [bold]{check_name}[/bold] " - f"([cyan]{result.column}[/cyan])" + Panel.fit( + "[bold red]Execution Errors[/bold red]", + border_style="red", + ) ) - self.console.print(f" Error: {result.error}", style="yellow") self.console.print() + for result in error_results: + check_label = result.check_name or result.rule_name + rule_type = result.rule_type or "" + check_display = f"{check_label} · {rule_type}" if rule_type else check_label + self.console.print( + f"[red]error[/red] [bold]{check_display}[/bold] ([cyan]{result.column}[/cyan])" + ) + self.console.print(f" {result.error}") + self.console.print() def _print_suggestions(self, suggestions: list[Suggestion]) -> None: - """Print actionable suggestions for fixing failures. + """Print actionable suggestions below the table.""" + from rich.panel import Panel - Args: - suggestions: List of Suggestion objects - """ - self.console.print() self.console.print( Panel.fit( - "[bold]Suggestions for Fixing Data Quality Issues[/bold]", + "[bold]Suggestions[/bold]", border_style="cyan", ) ) self.console.print() for i, suggestion in enumerate(suggestions, 1): - # Severity indicator severity_styles = { "high": "[red]HIGH[/red]", "medium": "[yellow]MEDIUM[/yellow]", @@ -285,7 +237,6 @@ def _print_suggestions(self, suggestions: list[Suggestion]) -> None: if suggestion.impact: self.console.print(f" [dim]Impact:[/dim] {suggestion.impact}") - # Show sample fixes if suggestion.sample_fixes: self.console.print(" [dim]Sample Fixes:[/dim]") for fix in suggestion.sample_fixes[:3]: @@ -295,31 +246,6 @@ def _print_suggestions(self, suggestions: list[Suggestion]) -> None: self.console.print() - def _print_footer(self, summary: ValidationSummary) -> None: - """Print report footer with summary. 
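A minimal usage sketch for the reworked terminal reporter, assuming `summary` is a ValidationSummary from a completed run; the variable name, source label, and timing value are illustrative assumptions:

# Sketch only: `summary` and the values below are illustrative assumptions.
from datacheck.reporting.terminal_reporter import TerminalReporter

reporter = TerminalReporter(show_suggestions=True)
reporter.report(summary, elapsed=0.87, source_info="orders.csv")
# Prints the rounded results table, the one-line status footer, full messages
# for any execution errors, and fix suggestions when there are failures.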
- - Args: - summary: ValidationSummary for footer - """ - self.console.print(_HLINE * 60) - - if summary.all_passed: - self.console.print( - f"[green]{_TICK} All validation rules passed successfully.[/green]" - ) - else: - issues = [] - if summary.failed_rules > 0: - issues.append(f"{summary.failed_rules} failed") - if summary.error_rules > 0: - issues.append(f"{summary.error_rules} errors") - issue_str = ", ".join(issues) - self.console.print( - f"[yellow]{_WARN} Validation complete with issues: {issue_str}[/yellow]" - ) - - self.console.print() - __all__ = [ "TerminalReporter", diff --git a/datacheck/rules/__init__.py b/datacheck/rules/__init__.py index bc52160..41a2ffe 100644 --- a/datacheck/rules/__init__.py +++ b/datacheck/rules/__init__.py @@ -1,20 +1,16 @@ """Validation rules implementations.""" -from datacheck.rules.base import CustomRule, Rule +from datacheck.rules.base import Rule from datacheck.rules.factory import RuleFactory from datacheck.rules.null_rules import NotNullRule from datacheck.rules.numeric_rules import ( - DistributionTypeRule, MaxRule, MeanBetweenRule, MinMaxRule, MinRule, - PercentileRangeRule, RangeRule, StdDevLessThanRule, ZScoreOutliersRule, + MaxRule, MinMaxRule, MinRule, NonNegativeRule, PositiveRule, RangeRule, ) from datacheck.rules.string_rules import AllowedValuesRule, LengthRule, RegexRule from datacheck.rules.temporal_rules import ( - BusinessDaysOnlyRule, DateFormatValidRule, MaxAgeRule, + DateFormatValidRule, MaxAgeRule, NoFutureTimestampsRule, TimestampRangeRule, ) -from datacheck.rules.semantic_rules import ( - EmailValidRule, JsonValidRule, PhoneValidRule, UrlValidRule, -) from datacheck.rules.composite_rules import ( DataTypeRule, ForeignKeyExistsRule, SumEqualsRule, UniqueCombinationRule, UniqueRule, @@ -22,10 +18,9 @@ __all__ = [ "Rule", "NotNullRule", "MinMaxRule", "MinRule", "MaxRule", "RangeRule", + "NonNegativeRule", "PositiveRule", "UniqueRule", "RegexRule", "AllowedValuesRule", "DataTypeRule", "LengthRule", - "CustomRule", "MeanBetweenRule", "StdDevLessThanRule", "PercentileRangeRule", - "ZScoreOutliersRule", "DistributionTypeRule", "MaxAgeRule", "TimestampRangeRule", - "NoFutureTimestampsRule", "DateFormatValidRule", "BusinessDaysOnlyRule", - "EmailValidRule", "PhoneValidRule", "UrlValidRule", "JsonValidRule", + "MaxAgeRule", "TimestampRangeRule", "NoFutureTimestampsRule", + "DateFormatValidRule", "ForeignKeyExistsRule", "SumEqualsRule", "UniqueCombinationRule", "RuleFactory", ] diff --git a/datacheck/rules/base.py b/datacheck/rules/base.py index ea7718d..6e7bdc1 100644 --- a/datacheck/rules/base.py +++ b/datacheck/rules/base.py @@ -5,7 +5,7 @@ import pandas as pd -from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError +from datacheck.exceptions import ColumnNotFoundError from datacheck.results import FailureDetail, RuleResult @@ -98,117 +98,3 @@ def _create_failure_detail( sample_values=sample_values, sample_reasons=sample_reasons, ) - - -class CustomRule(Rule): - """Custom validation rule defined by user. - - Executes user-defined validation functions loaded from plugins. - - Example: - >>> rule = CustomRule("email", "is_business_email", - ... params={"allowed_domains": ["company.com"]}) - >>> result = rule.validate(df) - """ - - def __init__( - self, - column: str, - rule_func_name: str, - params: dict[str, Any] | None = None, - rule_name: str | None = None - ) -> None: - """Initialize custom rule. 
- - Args: - column: Column to validate - rule_func_name: Name of the custom rule function - params: Parameters to pass to the rule function - rule_name: Optional custom name for the rule - """ - super().__init__(rule_name or f"custom_{rule_func_name}", column) - self.rule_func_name = rule_func_name - self.params = params or {} - - # Get rule function from registry - from datacheck.plugins.registry import get_global_registry - self.registry = get_global_registry() - - if not self.registry.has_rule(rule_func_name): - raise RuleDefinitionError(f"Custom rule '{rule_func_name}' not found. Did you load the plugin?") - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Execute custom validation rule. - - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - self._check_column_exists(df) - - try: - # Execute custom rule - validation_result = self.registry.execute_rule( - self.rule_func_name, - df[self.column], - self.params - ) - - # Check result is boolean series - if not isinstance(validation_result, pd.Series): - raise RuleDefinitionError( - f"Custom rule '{self.rule_func_name}' must return a pandas Series" - ) - - # Find failures - failed_mask = ~validation_result - failed_indices = df[failed_mask].index - - passed = not failed_mask.any() - total_rows = len(df) - - if passed: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="custom", - check_name=self.name, - ) - - # Create failure detail with failed values - failed_values = df.loc[failed_mask, self.column] - reasons = [f"Custom rule '{self.rule_func_name}' failed"] * len(failed_indices) - - failure_detail = self._create_failure_detail( - failed_indices, - total_rows, - failed_values, - reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(failed_indices), - failure_details=failure_detail, - rule_type="custom", - check_name=self.name, - ) - - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Custom rule execution failed: {e}", - rule_type="custom", - check_name=self.name, - ) diff --git a/datacheck/rules/composite_rules.py b/datacheck/rules/composite_rules.py index 3959bdc..6f79cb8 100644 --- a/datacheck/rules/composite_rules.py +++ b/datacheck/rules/composite_rules.py @@ -1,7 +1,6 @@ """Composite validation rules.""" -from typing import Any - +import numpy as np import pandas as pd from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError @@ -224,26 +223,18 @@ def _check_type(self, data: pd.Series) -> pd.Series: if pd.api.types.is_float_dtype(data) or "double" in dtype_str: # Check if floats are actually whole numbers return data != data.round(0) - # For object dtype, try to check each value - def is_int(v: Any) -> bool: - """Check whether a single value is an integer type.""" - if isinstance(v, bool): - return False - if isinstance(v, int): - return True - if isinstance(v, float): - return v == int(v) - return False - return ~data.apply(is_int) + # For object dtype: coerce to numeric, reject bools and non-whole floats + coerced = pd.to_numeric(data, errors="coerce") + bool_mask = data.astype(str).isin({"True", "False"}) + return coerced.isna() | bool_mask | (coerced != coerced.round(0)) elif self.expected_type == "float": # Fast-path: any numeric dtype (Arrow or NumPy) passes if pd.api.types.is_numeric_dtype(data) or "double" 
in dtype_str: return self._all_false(data) - def is_numeric(v: Any) -> bool: - """Check whether a single value is a numeric type (int or float).""" - return isinstance(v, (int, float)) and not isinstance(v, bool) - return ~data.apply(is_numeric) + coerced = pd.to_numeric(data, errors="coerce") + bool_mask = data.astype(str).isin({"True", "False"}) + return coerced.isna() | bool_mask elif self.expected_type == "string": # Fast-path: Arrow string dtype — all values are strings by definition @@ -251,31 +242,23 @@ def is_numeric(v: Any) -> bool: return self._all_false(data) if pd.api.types.is_string_dtype(data) or data.dtype == object: # Object dtype may have mixed types, check each value - def is_str(v: Any) -> bool: - """Check whether a single value is a string type.""" - return isinstance(v, str) - return ~data.apply(is_str) + check_fn = np.frompyfunc(lambda v: isinstance(v, str), 1, 1) + return ~pd.Series(check_fn(data.values).astype(bool), index=data.index) return self._all_true(data) elif self.expected_type == "bool": # Fast-path: Arrow or NumPy bool dtype if pd.api.types.is_bool_dtype(data): return self._all_false(data) - def is_bool(v: Any) -> bool: - """Check whether a single value is a boolean type.""" - return isinstance(v, bool) - return ~data.apply(is_bool) + return ~data.astype(str).isin({"True", "False"}) elif self.expected_type == "date": # Fast-path: datetime64 or Arrow timestamp dtype if pd.api.types.is_datetime64_any_dtype(data) or "timestamp" in dtype_str: return self._all_false(data) - # Check for datetime objects - def is_date(v: Any) -> bool: - """Check whether a single value is a date or datetime type.""" - import datetime - return isinstance(v, (datetime.date, datetime.datetime, pd.Timestamp)) - return ~data.apply(is_date) + # Check for datetime objects via coercion + converted = pd.to_datetime(data, errors="coerce", format="mixed") + return converted.isna() # Should not reach here due to validation in __init__ return self._all_true(data) @@ -661,3 +644,52 @@ def validate(self, df: pd.DataFrame) -> RuleResult: rule_type="unique_combination", check_name=self.name, ) + + +class BooleanRule(Rule): + """Rule to validate that a column contains only boolean (True/False) values.""" + + def validate(self, df: pd.DataFrame) -> RuleResult: + try: + self._check_column_exists(df) + total_rows = len(df) + + # Bool dtype — all values are inherently boolean + if pd.api.types.is_bool_dtype(df[self.column]): + return RuleResult( + rule_name=self.name, column=self.column, passed=True, + total_rows=total_rows, failed_rows=0, + rule_type="boolean", check_name=self.name, + ) + + non_null = df[self.column].dropna() + valid_strs = {"True", "False", "true", "false", "1", "0"} + violations_mask = ~non_null.astype(str).isin(valid_strs) + violation_indices = non_null.index[violations_mask] + + if len(violation_indices) == 0: + return RuleResult( + rule_name=self.name, column=self.column, passed=True, + total_rows=total_rows, failed_rows=0, + rule_type="boolean", check_name=self.name, + ) + + failed_values = non_null.loc[violation_indices] + reasons = [f"Value '{v}' is not boolean" for v in failed_values.iloc[:100]] + failure_detail = self._create_failure_detail( + violation_indices, total_rows, failed_values, reasons + ) + return RuleResult( + rule_name=self.name, column=self.column, passed=False, + total_rows=total_rows, failed_rows=len(violation_indices), + failure_details=failure_detail, rule_type="boolean", check_name=self.name, + ) + + except ColumnNotFoundError: + raise + except Exception 
as e: + return RuleResult( + rule_name=self.name, column=self.column, passed=False, + total_rows=len(df), rule_type="boolean", check_name=self.name, + error=f"Error executing boolean rule: {e}", + ) diff --git a/datacheck/rules/factory.py b/datacheck/rules/factory.py index 07925e3..c660c03 100644 --- a/datacheck/rules/factory.py +++ b/datacheck/rules/factory.py @@ -21,47 +21,30 @@ def create_rules(rule_config: RuleConfig) -> list: RuleDefinitionError: If rule configuration is invalid """ # Lazy imports to avoid circular imports - from datacheck.rules.base import CustomRule from datacheck.rules.null_rules import NotNullRule from datacheck.rules.numeric_rules import ( - DistributionTypeRule, MeanBetweenRule, MinMaxRule, - PercentileRangeRule, StdDevLessThanRule, ZScoreOutliersRule, + MinMaxRule, NonNegativeRule, PositiveRule, RangeRule, ) from datacheck.rules.string_rules import AllowedValuesRule, LengthRule, RegexRule from datacheck.rules.temporal_rules import ( - BusinessDaysOnlyRule, DateFormatValidRule, MaxAgeRule, + DateFormatValidRule, MaxAgeRule, NoFutureTimestampsRule, TimestampRangeRule, ) - from datacheck.rules.semantic_rules import ( - EmailValidRule, JsonValidRule, PhoneValidRule, UrlValidRule, - ) from datacheck.rules.composite_rules import ( - DataTypeRule, ForeignKeyExistsRule, SumEqualsRule, + BooleanRule, DataTypeRule, ForeignKeyExistsRule, SumEqualsRule, UniqueCombinationRule, UniqueRule, ) rules: list = [] - - # Check for custom rules first - if "custom" in rule_config.rules: - custom_config = rule_config.rules["custom"] - if not isinstance(custom_config, dict): - raise RuleDefinitionError("Custom rule must be a dictionary with 'rule' and optional 'params'") - - rule_func_name = custom_config.get("rule") - params = custom_config.get("params", {}) - - if not rule_func_name: - raise RuleDefinitionError("Custom rule must specify 'rule' parameter") - - rules.append(CustomRule(rule_config.column, rule_func_name, params, rule_config.name)) - return rules # Custom rules are exclusive + explicitly_disabled = False # set when a rule is knowingly skipped (rule: false) for rule_type, rule_params in rule_config.rules.items(): try: if rule_type == "not_null": if rule_params: rules.append(NotNullRule(rule_config.name, rule_config.column)) + else: + explicitly_disabled = True elif rule_type == "min": rules.append( @@ -84,6 +67,8 @@ def create_rules(rule_config: RuleConfig) -> list: elif rule_type == "unique": if rule_params: rules.append(UniqueRule(rule_config.name, rule_config.column)) + else: + explicitly_disabled = True elif rule_type == "regex": rules.append( @@ -104,20 +89,6 @@ def create_rules(rule_config: RuleConfig) -> list: ) ) - elif rule_type == "length": - if not isinstance(rule_params, dict): - raise RuleDefinitionError( - "length rule must be a dictionary with 'min' and/or 'max'" - ) - rules.append( - LengthRule( - rule_config.name, - rule_config.column, - min_length=rule_params.get("min"), - max_length=rule_params.get("max"), - ) - ) - elif rule_type == "min_length": rules.append( LengthRule( @@ -136,65 +107,6 @@ def create_rules(rule_config: RuleConfig) -> list: ) ) - # Statistical rules - elif rule_type == "mean_between": - if not isinstance(rule_params, dict): - raise RuleDefinitionError( - "mean_between rule must be a dictionary with 'min' and 'max'" - ) - rules.append( - MeanBetweenRule( - rule_config.name, - rule_config.column, - min_value=rule_params["min"], - max_value=rule_params["max"], - ) - ) - - elif rule_type == "std_dev_less_than": - rules.append( - 
StdDevLessThanRule( - rule_config.name, - rule_config.column, - threshold=rule_params, - ) - ) - - elif rule_type == "percentile_range": - if not isinstance(rule_params, dict): - raise RuleDefinitionError( - "percentile_range rule must be a dictionary" - ) - rules.append( - PercentileRangeRule( - rule_config.name, - rule_config.column, - p25_min=rule_params["p25_min"], - p25_max=rule_params["p25_max"], - p75_min=rule_params["p75_min"], - p75_max=rule_params["p75_max"], - ) - ) - - elif rule_type == "z_score_outliers": - threshold = rule_params if isinstance(rule_params, (int, float)) and not isinstance(rule_params, bool) else 3.0 - rules.append( - ZScoreOutliersRule( - rule_config.name, - rule_config.column, - threshold=threshold, - ) - ) - - elif rule_type == "distribution_type": - rules.append( - DistributionTypeRule( - rule_config.name, - rule_config.column, - expected_type=rule_params, - ) - ) - # Freshness rules elif rule_type == "max_age": rules.append( @@ -224,6 +136,8 @@ def create_rules(rule_config: RuleConfig) -> list: rules.append( NoFutureTimestampsRule(rule_config.name, rule_config.column) ) + else: + explicitly_disabled = True elif rule_type == "date_format_valid": rules.append( @@ -248,64 +162,6 @@ def create_rules(rule_config: RuleConfig) -> list: ) ) - elif rule_type == "business_days_only": - if isinstance(rule_params, dict): - country_code = rule_params.get("country_code", "US") - elif isinstance(rule_params, str): - country_code = rule_params - else: - country_code = "US" - rules.append( - BusinessDaysOnlyRule( - rule_config.name, - rule_config.column, - country_code=country_code, - ) - ) - - # Format rules - elif rule_type == "email_valid": - if rule_params: - rules.append( - EmailValidRule(rule_config.name, rule_config.column) - ) - - elif rule_type == "phone_valid": - if isinstance(rule_params, dict): - country_code = rule_params.get("country_code") - elif isinstance(rule_params, str): - country_code = rule_params - else: - country_code = None - rules.append( - PhoneValidRule( - rule_config.name, - rule_config.column, - country_code=country_code, - ) - ) - - elif rule_type == "url_valid": - if isinstance(rule_params, dict): - schemes = rule_params.get("schemes", ["http", "https"]) - elif isinstance(rule_params, list): - schemes = rule_params - else: - schemes = ["http", "https"] - rules.append( - UrlValidRule( - rule_config.name, - rule_config.column, - schemes=schemes, - ) - ) - - elif rule_type == "json_valid": - if rule_params: - rules.append( - JsonValidRule(rule_config.name, rule_config.column) - ) - # Relationship rules elif rule_type == "unique_combination": if not isinstance(rule_params, list): @@ -365,12 +221,45 @@ def create_rules(rule_config: RuleConfig) -> list: ) ) + elif rule_type == "positive": + if rule_params: + rules.append(PositiveRule(rule_config.name, rule_config.column)) + else: + explicitly_disabled = True + + elif rule_type == "non_negative": + if rule_params: + rules.append(NonNegativeRule(rule_config.name, rule_config.column)) + else: + explicitly_disabled = True + + elif rule_type == "range": + if not isinstance(rule_params, dict): + raise RuleDefinitionError( + "range rule must be a dictionary with 'min' and 'max'" + ) + min_val = rule_params.get("min") + max_val = rule_params.get("max") + if min_val is None or max_val is None: + raise RuleDefinitionError( + "range rule requires both 'min' and 'max'" + ) + rules.append( + RangeRule(rule_config.name, rule_config.column, min_value=min_val, max_value=max_val) + ) + + elif rule_type == 
"boolean": + if rule_params: + rules.append(BooleanRule(rule_config.name, rule_config.column)) + else: + explicitly_disabled = True + except (RuleDefinitionError, TypeError, ValueError) as e: raise RuleDefinitionError( f"Error creating {rule_type} rule for '{rule_config.name}': {e}" ) from e - if not rules: + if not rules and not explicitly_disabled: raise RuleDefinitionError( f"No valid rules created for check '{rule_config.name}'" ) diff --git a/datacheck/rules/numeric_rules.py b/datacheck/rules/numeric_rules.py index d7f241a..09e1c89 100644 --- a/datacheck/rules/numeric_rules.py +++ b/datacheck/rules/numeric_rules.py @@ -1,13 +1,43 @@ """Numeric validation rules.""" -import numpy as np import pandas as pd from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError -from datacheck.results import FailureDetail, RuleResult +from datacheck.results import RuleResult from datacheck.rules.base import Rule +def _ensure_numeric(series: pd.Series) -> pd.Series: + """Cast decimal columns to float64 so numeric checks work. + + Handles two cases: + - Arrow-backed decimal128 (pd.ArrowDtype): ``is_numeric_dtype()`` returns + False, and numpy arithmetic raises ArrowTypeError. + - Object-dtype with Python ``decimal.Decimal`` values: produced by plain + ``pd.read_parquet()`` for Parquet decimal128 columns. ``is_numeric_dtype()`` + returns False, and numpy ops fail on Decimal/float mixing. + """ + try: + import pyarrow as pa + + if isinstance(series.dtype, pd.ArrowDtype) and pa.types.is_decimal( + series.dtype.pyarrow_dtype + ): + return series.astype("float64") + except Exception: + pass + # Handle object dtype containing Python decimal.Decimal objects + if series.dtype == object: + try: + import decimal + first_valid = series.dropna() + if len(first_valid) > 0 and isinstance(first_valid.iloc[0], decimal.Decimal): + return pd.to_numeric(series, errors="coerce") + except Exception: + pass + return series + + class MinMaxRule(Rule): """Rule to check numeric values are within min/max bounds.""" @@ -47,7 +77,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: # Determine rule type and check name rule_type = "min_max" - check_name = self.name.replace("_min", "").replace("_max", "") + check_name = self.name.removesuffix("_min").removesuffix("_max") if self.name.endswith("_min"): rule_type = "min" elif self.name.endswith("_max"): @@ -55,7 +85,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: # Filter out null values (they should be caught by not_null rule) non_null_mask = df[self.column].notna() - data = df[self.column][non_null_mask] + data = _ensure_numeric(df[self.column][non_null_mask]) # Check if data is numeric if not pd.api.types.is_numeric_dtype(data): @@ -69,13 +99,33 @@ def validate(self, df: pd.DataFrame) -> RuleResult: check_name=check_name, ) - # Build condition for violations (direct vectorized comparison) - if self.min_value is not None and self.max_value is not None: - violations_mask = (data < self.min_value) | (data > self.max_value) - elif self.min_value is not None: - violations_mask = data < self.min_value - else: - violations_mask = data > self.max_value + # Build violation mask — use PyArrow compute for Arrow-backed columns + # (single fused call avoids intermediate boolean Series allocations) + try: + import pyarrow.compute as pc + + arr = data.array._pa_array # raises AttributeError for numpy-backed + if self.min_value is not None and self.max_value is not None: + violations_pa = pc.or_( + pc.less(arr, self.min_value), pc.greater(arr, self.max_value) + ) + elif 
self.min_value is not None: + violations_pa = pc.less(arr, self.min_value) + else: + violations_pa = pc.greater(arr, self.max_value) + # Use .values to get positional numpy array (avoids label-alignment + # issues when data.index is non-sequential, e.g. after sampling) + violations_mask = pd.Series( + violations_pa.to_pandas().values, index=data.index, dtype=bool + ) + except (AttributeError, TypeError, ImportError): + # Fallback for numpy-backed arrays + if self.min_value is not None and self.max_value is not None: + violations_mask = (data < self.min_value) | (data > self.max_value) + elif self.min_value is not None: + violations_mask = data < self.min_value + else: + violations_mask = data > self.max_value violation_indices = data.index[violations_mask] @@ -124,7 +174,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: except Exception as e: # Determine rule type and check name for error case rule_type = "min_max" - check_name = self.name.replace("_min", "").replace("_max", "") + check_name = self.name.removesuffix("_min").removesuffix("_max") if self.name.endswith("_min"): rule_type = "min" elif self.name.endswith("_max"): @@ -139,741 +189,99 @@ def validate(self, df: pd.DataFrame) -> RuleResult: rule_type=rule_type, check_name=check_name, ) +# Convenience classes for clearer API +class MinRule(MinMaxRule): + """Rule to check numeric values are above a minimum.""" - -class MeanBetweenRule(Rule): - """Rule to validate that column mean is within a specified range. - - This rule calculates the mean of numeric values in a column and validates - that it falls within the specified min/max bounds (inclusive). - """ - - def __init__(self, name: str, column: str, min_value: float, max_value: float) -> None: - """Initialize MeanBetweenRule. + def __init__(self, name: str, column: str, min_value: float) -> None: + """Initialize MinRule. Args: name: Name of the rule column: Column to validate - min_value: Minimum acceptable mean (inclusive) - max_value: Maximum acceptable mean (inclusive) - - Raises: - RuleDefinitionError: If min_value > max_value - """ - super().__init__(name, column) - if min_value > max_value: - raise RuleDefinitionError( - f"min_value ({min_value}) cannot be greater than max_value ({max_value})" - ) - self.min_value = min_value - self.max_value = max_value - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that column mean is within the specified range. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome + min_value: Minimum allowed value (inclusive) """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="mean_between", - check_name=self.name, - ) - - # Filter out null values and convert to numeric - non_null_mask = df[self.column].notna() - data = pd.to_numeric(df[self.column][non_null_mask], errors="coerce") - data = data.dropna() - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - error="No numeric values found in column", - rule_type="mean_between", - check_name=self.name, - ) - - actual_mean = float(np.mean(data)) - - if self.min_value <= actual_mean <= self.max_value: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="mean_between", - check_name=self.name, - ) - - # Mean is out of range - all rows are considered part of the failure - reason = ( - f"Mean {actual_mean:.4f} is below minimum {self.min_value}" - if actual_mean < self.min_value - else f"Mean {actual_mean:.4f} exceeds maximum {self.max_value}" - ) - - failure_detail = FailureDetail( - rule_name=self.name, - column=self.column, - failed_count=total_rows, - total_count=total_rows, - failure_rate=100.0, - sample_failures=[], - sample_values=[actual_mean], - sample_reasons=[reason], - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=total_rows, - failure_details=failure_detail, - rule_type="mean_between", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing mean_between rule: {e}", - rule_type="mean_between", - check_name=self.name, - ) - + super().__init__(name, column, min_value=min_value, max_value=None) -class StdDevLessThanRule(Rule): - """Rule to validate that column standard deviation is below a threshold. - This rule calculates the standard deviation of numeric values in a column - and validates that it is less than the specified threshold. - """ +class MaxRule(MinMaxRule): + """Rule to check numeric values are below a maximum.""" - def __init__(self, name: str, column: str, threshold: float) -> None: - """Initialize StdDevLessThanRule. + def __init__(self, name: str, column: str, max_value: float) -> None: + """Initialize MaxRule. Args: name: Name of the rule column: Column to validate - threshold: Maximum acceptable standard deviation (exclusive) - - Raises: - RuleDefinitionError: If threshold is negative - """ - super().__init__(name, column) - if threshold < 0: - raise RuleDefinitionError("threshold cannot be negative") - self.threshold = threshold - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that column standard deviation is below threshold. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome + max_value: Maximum allowed value (inclusive) """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="std_dev_less_than", - check_name=self.name, - ) - - # Filter out null values and convert to numeric - non_null_mask = df[self.column].notna() - data = pd.to_numeric(df[self.column][non_null_mask], errors="coerce") - data = data.dropna() - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - error="No numeric values found in column", - rule_type="std_dev_less_than", - check_name=self.name, - ) - - actual_std = float(np.std(data, ddof=0)) - - if actual_std < self.threshold: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="std_dev_less_than", - check_name=self.name, - ) - - failure_detail = FailureDetail( - rule_name=self.name, - column=self.column, - failed_count=total_rows, - total_count=total_rows, - failure_rate=100.0, - sample_failures=[], - sample_values=[actual_std], - sample_reasons=[ - f"Standard deviation {actual_std:.4f} >= threshold {self.threshold}" - ], - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=total_rows, - failure_details=failure_detail, - rule_type="std_dev_less_than", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing std_dev_less_than rule: {e}", - rule_type="std_dev_less_than", - check_name=self.name, - ) - + super().__init__(name, column, min_value=None, max_value=max_value) -class PercentileRangeRule(Rule): - """Rule to validate that 25th and 75th percentiles fall within ranges. - This rule calculates the 25th and 75th percentiles of numeric values - and validates they fall within their respective specified ranges. - """ +class RangeRule(MinMaxRule): + """Rule to check numeric values are within a range.""" - def __init__( - self, - name: str, - column: str, - p25_min: float, - p25_max: float, - p75_min: float, - p75_max: float, - ) -> None: - """Initialize PercentileRangeRule. + def __init__(self, name: str, column: str, min_value: float, max_value: float) -> None: + """Initialize RangeRule. Args: name: Name of the rule column: Column to validate - p25_min: Minimum acceptable 25th percentile (inclusive) - p25_max: Maximum acceptable 25th percentile (inclusive) - p75_min: Minimum acceptable 75th percentile (inclusive) - p75_max: Maximum acceptable 75th percentile (inclusive) - - Raises: - RuleDefinitionError: If min > max for either percentile range - """ - super().__init__(name, column) - if p25_min > p25_max: - raise RuleDefinitionError( - f"p25_min ({p25_min}) cannot be greater than p25_max ({p25_max})" - ) - if p75_min > p75_max: - raise RuleDefinitionError( - f"p75_min ({p75_min}) cannot be greater than p75_max ({p75_max})" - ) - self.p25_min = p25_min - self.p25_max = p25_max - self.p75_min = p75_min - self.p75_max = p75_max - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that percentiles fall within specified ranges. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome + min_value: Minimum allowed value (inclusive) + max_value: Maximum allowed value (inclusive) """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="percentile_range", - check_name=self.name, - ) - - # Filter out null values and convert to numeric - non_null_mask = df[self.column].notna() - data = pd.to_numeric(df[self.column][non_null_mask], errors="coerce") - data = data.dropna() - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - error="No numeric values found in column", - rule_type="percentile_range", - check_name=self.name, - ) - - p25 = float(np.percentile(data, 25)) - p75 = float(np.percentile(data, 75)) - - p25_valid = self.p25_min <= p25 <= self.p25_max - p75_valid = self.p75_min <= p75 <= self.p75_max - - if p25_valid and p75_valid: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="percentile_range", - check_name=self.name, - ) - - reasons = [] - if not p25_valid: - reasons.append( - f"25th percentile {p25:.4f} not in range [{self.p25_min}, {self.p25_max}]" - ) - if not p75_valid: - reasons.append( - f"75th percentile {p75:.4f} not in range [{self.p75_min}, {self.p75_max}]" - ) - - failure_detail = FailureDetail( - rule_name=self.name, - column=self.column, - failed_count=total_rows, - total_count=total_rows, - failure_rate=100.0, - sample_failures=[], - sample_values=[p25, p75], - sample_reasons=reasons, - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=total_rows, - failure_details=failure_detail, - rule_type="percentile_range", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing percentile_range rule: {e}", - rule_type="percentile_range", - check_name=self.name, - ) + super().__init__(name, column, min_value=min_value, max_value=max_value) -class ZScoreOutliersRule(Rule): - """Rule to detect outliers based on Z-score threshold. +class NonNegativeRule(MinMaxRule): + """Rule to check all numeric values are >= 0.""" - This rule calculates the Z-score for each value and fails if any value - has an absolute Z-score greater than the specified threshold. - """ + def __init__(self, name: str, column: str) -> None: + super().__init__(name, column, min_value=0) - def __init__(self, name: str, column: str, threshold: float = 3.0) -> None: - """Initialize ZScoreOutliersRule. - Args: - name: Name of the rule - column: Column to validate - threshold: Maximum acceptable absolute Z-score (default: 3.0) - - Raises: - RuleDefinitionError: If threshold is not positive - """ - super().__init__(name, column) - if threshold <= 0: - raise RuleDefinitionError("threshold must be positive") - self.threshold = threshold +class PositiveRule(Rule): + """Rule to check all numeric values are strictly > 0.""" def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that no values have Z-score above threshold. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ try: self._check_column_exists(df) - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="z_score_outliers", - check_name=self.name, - ) - - # Filter out null values and convert to numeric non_null_mask = df[self.column].notna() - data = pd.to_numeric(df[self.column][non_null_mask], errors="coerce") - data = data.dropna() - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - error="No numeric values found in column", - rule_type="z_score_outliers", - check_name=self.name, - ) - - mean = float(np.mean(data)) - std = float(np.std(data, ddof=0)) - - if std == 0: - # All values are the same, no outliers possible + data = _ensure_numeric(df[self.column][non_null_mask]) + if not pd.api.types.is_numeric_dtype(data): return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="z_score_outliers", - check_name=self.name, + rule_name=self.name, column=self.column, passed=False, + total_rows=total_rows, rule_type="positive", check_name=self.name, + error=f"Column '{self.column}' is not numeric", ) - - # Calculate Z-scores - z_scores = np.abs((data - mean) / std) - outlier_mask = z_scores > self.threshold - outlier_indices = data.index[outlier_mask] - - if len(outlier_indices) == 0: + violations_mask = data <= 0 + violation_indices = data.index[violations_mask] + if len(violation_indices) == 0: return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="z_score_outliers", - check_name=self.name, + rule_name=self.name, column=self.column, passed=True, + total_rows=total_rows, failed_rows=0, + rule_type="positive", check_name=self.name, ) - - failed_values = data.loc[outlier_indices] - failed_z_scores = z_scores.loc[outlier_indices] + failed_values = data.loc[violation_indices] reasons = [ - f"Z-score {z:.4f} exceeds threshold {self.threshold}" - for z in failed_z_scores.iloc[:100] + f"Value {v} is not positive (must be > 0)" for v in failed_values.iloc[:100] ] - failure_detail = self._create_failure_detail( - outlier_indices, total_rows, failed_values, reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(outlier_indices), - failure_details=failure_detail, - rule_type="z_score_outliers", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing z_score_outliers rule: {e}", - rule_type="z_score_outliers", - check_name=self.name, - ) - - -class DistributionTypeRule(Rule): - """Rule to validate that data follows an expected distribution type. - - Uses the Kolmogorov-Smirnov test to check if data follows - a normal or uniform distribution. - - Note: Requires scipy to be installed for full functionality. - """ - - VALID_TYPES = {"normal", "uniform"} - - def __init__(self, name: str, column: str, expected_type: str) -> None: - """Initialize DistributionTypeRule. 
- - Args: - name: Name of the rule - column: Column to validate - expected_type: Expected distribution type ("normal" or "uniform") - - Raises: - RuleDefinitionError: If expected_type is not valid - """ - super().__init__(name, column) - expected_type_lower = expected_type.lower() - if expected_type_lower not in self.VALID_TYPES: - raise RuleDefinitionError( - f"Invalid distribution_type '{expected_type}'. " - f"Must be one of: {', '.join(sorted(self.VALID_TYPES))}" - ) - self.expected_type = expected_type_lower - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that data follows the expected distribution. - - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="distribution_type", - check_name=self.name, - ) - - # Filter out null values and convert to numeric - non_null_mask = df[self.column].notna() - data = pd.to_numeric(df[self.column][non_null_mask], errors="coerce") - data = data.dropna() - - if len(data) < 8: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - error="Need at least 8 numeric values for distribution test", - rule_type="distribution_type", - check_name=self.name, - ) - - # Try to import scipy for KS test - try: - from scipy import stats as scipy_stats - - if self.expected_type == "normal": - # Normalize data for test - mean = float(np.mean(data)) - std = float(np.std(data, ddof=0)) - if std == 0: - # All values same - not normal unless single value - passed = False - p_value = 0.0 - else: - normalized = (data - mean) / std - statistic, p_value = scipy_stats.kstest(normalized, "norm") - passed = p_value > 0.05 - else: # uniform - min_val = float(data.min()) - max_val = float(data.max()) - if min_val == max_val: - passed = True - p_value = 1.0 - else: - normalized = (data - min_val) / (max_val - min_val) - statistic, p_value = scipy_stats.kstest(normalized, "uniform") - passed = p_value > 0.05 - - except ImportError: - # Fallback: use simple heuristics without scipy - if self.expected_type == "normal": - # Check skewness and kurtosis using simple estimates - mean = float(np.mean(data)) - std = float(np.std(data, ddof=0)) - if std == 0: - passed = False - p_value = 0.0 - else: - normalized = (data - mean) / std - skewness = float(np.mean(normalized**3)) - kurtosis = float(np.mean(normalized**4) - 3) - # Normal: skewness ~0, excess kurtosis ~0 - passed = abs(skewness) < 1.0 and abs(kurtosis) < 2.0 - p_value = 0.1 if passed else 0.01 - else: # uniform - # For uniform, check if values are spread evenly - min_val = float(data.min()) - max_val = float(data.max()) - if min_val == max_val: - passed = True - p_value = 1.0 - else: - # Check coefficient of variation (uniform has ~0.58 for [0,1]) - cv = float(np.std(data, ddof=0) / np.mean(data)) if np.mean(data) != 0 else 0 - passed = 0.3 < cv < 0.8 - p_value = 0.1 if passed else 0.01 - - if passed: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="distribution_type", - check_name=self.name, - ) - - failure_detail = FailureDetail( - rule_name=self.name, - column=self.column, - failed_count=total_rows, - total_count=total_rows, - failure_rate=100.0, - sample_failures=[], - sample_values=[p_value], - sample_reasons=[ - f"Data 
does not follow {self.expected_type} distribution (p-value: {p_value:.4f})" - ], + violation_indices, total_rows, failed_values, reasons ) - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=total_rows, - failure_details=failure_detail, - rule_type="distribution_type", - check_name=self.name, + rule_name=self.name, column=self.column, passed=False, + total_rows=total_rows, failed_rows=len(violation_indices), + failure_details=failure_detail, rule_type="positive", check_name=self.name, ) - except ColumnNotFoundError: raise except Exception as e: return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing distribution_type rule: {e}", - rule_type="distribution_type", - check_name=self.name, + rule_name=self.name, column=self.column, passed=False, + total_rows=len(df), rule_type="positive", check_name=self.name, + error=f"Error executing positive rule: {e}", ) -# Convenience classes for clearer API -class MinRule(MinMaxRule): - """Rule to check numeric values are above a minimum.""" - - def __init__(self, name: str, column: str, min_value: float) -> None: - """Initialize MinRule. - - Args: - name: Name of the rule - column: Column to validate - min_value: Minimum allowed value (inclusive) - """ - super().__init__(name, column, min_value=min_value, max_value=None) - - -class MaxRule(MinMaxRule): - """Rule to check numeric values are below a maximum.""" - - def __init__(self, name: str, column: str, max_value: float) -> None: - """Initialize MaxRule. - - Args: - name: Name of the rule - column: Column to validate - max_value: Maximum allowed value (inclusive) - """ - super().__init__(name, column, min_value=None, max_value=max_value) - - -class RangeRule(MinMaxRule): - """Rule to check numeric values are within a range.""" - - def __init__(self, name: str, column: str, min_value: float, max_value: float) -> None: - """Initialize RangeRule. - - Args: - name: Name of the rule - column: Column to validate - min_value: Minimum allowed value (inclusive) - max_value: Maximum allowed value (inclusive) - """ - super().__init__(name, column, min_value=min_value, max_value=max_value) diff --git a/datacheck/rules/semantic_rules.py b/datacheck/rules/semantic_rules.py deleted file mode 100644 index e2cfe14..0000000 --- a/datacheck/rules/semantic_rules.py +++ /dev/null @@ -1,522 +0,0 @@ -"""Semantic validation rules.""" - -import json -from typing import Any -from urllib.parse import urlparse - -import pandas as pd -from email_validator import EmailNotValidError, validate_email -import phonenumbers - -from datacheck.exceptions import ColumnNotFoundError -from datacheck.results import RuleResult -from datacheck.rules.base import Rule - - -class EmailValidRule(Rule): - """Rule to validate that values are valid email addresses. - - Uses the email-validator library for RFC 5322 compliance checking. - """ - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that all values are valid email addresses. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="email_valid", - check_name=self.name, - ) - - # Filter out null values - non_null_mask = df[self.column].notna() - data = df[self.column][non_null_mask] - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="email_valid", - check_name=self.name, - ) - - # Vectorized regex pre-filter: fast-reject values without basic email structure - data_str = data.astype(str) - pre_filter = data_str.str.match( - r"^[^@\s]+@[^@\s]+\.[^@\s]+$", na=False - ) - - # Only run expensive email-validator on candidates that pass pre-filter - candidates = data[pre_filter] - if len(candidates) > 0: - def is_valid_email(value: Any) -> bool: - """Check whether a single value is a valid email address.""" - try: - validate_email(str(value), check_deliverability=False) - return True - except EmailNotValidError: - return False - - detailed_mask = candidates.apply(is_valid_email) - valid_mask = pre_filter.copy() - valid_mask.loc[candidates.index] = detailed_mask - else: - valid_mask = pre_filter - - invalid_indices = data.index[~valid_mask] - - if len(invalid_indices) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="email_valid", - check_name=self.name, - ) - - failed_values = data.loc[invalid_indices] - reasons = [f"'{v}' is not a valid email address" for v in failed_values.iloc[:100]] - - failure_detail = self._create_failure_detail( - invalid_indices, total_rows, failed_values, reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(invalid_indices), - failure_details=failure_detail, - rule_type="email_valid", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing email_valid rule: {e}", - rule_type="email_valid", - check_name=self.name, - ) - - -class PhoneValidRule(Rule): - """Rule to validate that values are valid phone numbers. - - Uses the phonenumbers library for international phone number validation. - """ - - def __init__(self, name: str, column: str, country_code: str | None = None) -> None: - """Initialize PhoneValidRule. - - Args: - name: Name of the rule - column: Column to validate - country_code: Default country code (e.g., "US", "GB", "IN") - """ - super().__init__(name, column) - self.country_code = country_code - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that all values are valid phone numbers. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="phone_valid", - check_name=self.name, - ) - - # Filter out null values - non_null_mask = df[self.column].notna() - data = df[self.column][non_null_mask] - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="phone_valid", - check_name=self.name, - ) - - # When pandas/PyArrow loads purely numeric phone numbers, the - # column ends up as int64 or float64. Converting with plain - # .astype(str) produces values like "1234567890.0" which break - # phone-number parsing. Detect numeric dtypes and convert via - # integer casting first so the trailing ".0" is stripped. - _is_numeric_col = pd.api.types.is_numeric_dtype(data) - if _is_numeric_col: - data_str = data.astype("Int64").astype(str) - else: - data_str = data.astype(str) - - # Vectorized regex pre-filter: fast-reject values without phone-like characters - # Valid phones contain digits and may have +, -, (, ), spaces, dots - pre_filter = data_str.str.match( - r"^[+0-9][0-9\s\-().]{4,}$", na=False - ) - - # Only run expensive phonenumbers parsing on candidates - candidates = data_str[pre_filter] - if len(candidates) > 0: - def is_valid_phone(value: Any) -> bool: - """Check whether a single value is a valid phone number.""" - str_val = str(value) - try: - parsed = phonenumbers.parse(str_val, self.country_code) - if phonenumbers.is_valid_number(parsed): - return True - except phonenumbers.NumberParseException: - pass - # When CSV loaders (PyArrow) parse "+1234..." as a number, - # the "+" is lost. Retry with "+" prefix for digit-only - # values that could be international numbers (country code - # 1-3 digits + national number, minimum ~8 digits total). 
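# Illustrative aside, not part of the removed module: the ".0" pitfall the
# comments above describe, shown with a throwaway float64 Series. Casting
# through the nullable Int64 dtype first strips the trailing ".0":
#
#   >>> s = pd.Series([12125551234.0, None])
#   >>> s.astype(str).tolist()
#   ['12125551234.0', 'nan']
#   >>> s.astype("Int64").astype(str).tolist()
#   ['12125551234', '<NA>']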
- if _is_numeric_col and str_val.isdigit() and len(str_val) >= 8: - try: - parsed = phonenumbers.parse( - "+" + str_val, self.country_code - ) - return bool(phonenumbers.is_valid_number(parsed)) - except phonenumbers.NumberParseException: - pass - return False - - detailed_mask = candidates.apply(is_valid_phone) - valid_mask = pre_filter.copy() - valid_mask.loc[candidates.index] = detailed_mask - else: - valid_mask = pre_filter - - invalid_indices = data.index[~valid_mask] - - if len(invalid_indices) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="phone_valid", - check_name=self.name, - ) - - failed_values = data_str.loc[invalid_indices] - reasons = [f"'{v}' is not a valid phone number" for v in failed_values.iloc[:100]] - - failure_detail = self._create_failure_detail( - invalid_indices, total_rows, failed_values, reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(invalid_indices), - failure_details=failure_detail, - rule_type="phone_valid", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing phone_valid rule: {e}", - rule_type="phone_valid", - check_name=self.name, - ) - - -class UrlValidRule(Rule): - """Rule to validate that values are valid URLs. - - Validates URL format and scheme using urllib.parse. - """ - - def __init__( - self, name: str, column: str, schemes: list[str] | None = None - ) -> None: - """Initialize UrlValidRule. - - Args: - name: Name of the rule - column: Column to validate - schemes: Allowed URL schemes (default: ["http", "https"]) - """ - super().__init__(name, column) - self.schemes = schemes or ["http", "https"] - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that all values are valid URLs. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="url_valid", - check_name=self.name, - ) - - # Filter out null values - non_null_mask = df[self.column].notna() - data = df[self.column][non_null_mask] - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="url_valid", - check_name=self.name, - ) - - # Vectorized regex pre-filter: fast-reject values that clearly aren't URLs - schemes_pattern = "|".join(self.schemes) - pre_filter = data.astype(str).str.match( - rf"^({schemes_pattern})://[^\s]+", na=False - ) - - # For values that pass the pre-filter, do full urlparse validation - candidates = data[pre_filter] - if len(candidates) > 0: - def is_valid_url(value: Any) -> bool: - """Check whether a single value is a valid URL with an allowed scheme.""" - try: - result = urlparse(str(value)) - return result.scheme in self.schemes and bool(result.netloc) - except Exception: - return False - - detailed_mask = candidates.apply(is_valid_url) - # Combine: values that failed pre-filter are invalid, - # values that passed pre-filter use detailed result - valid_mask = pre_filter.copy() - valid_mask.loc[candidates.index] = detailed_mask - else: - valid_mask = pre_filter - - invalid_indices = data.index[~valid_mask] - - if len(invalid_indices) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="url_valid", - check_name=self.name, - ) - - failed_values = data.loc[invalid_indices] - schemes_str = ", ".join(self.schemes) - reasons = [ - f"'{v}' is not a valid URL (allowed schemes: {schemes_str})" - for v in failed_values.iloc[:100] - ] - - failure_detail = self._create_failure_detail( - invalid_indices, total_rows, failed_values, reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(invalid_indices), - failure_details=failure_detail, - rule_type="url_valid", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing url_valid rule: {e}", - rule_type="url_valid", - check_name=self.name, - ) - - -class JsonValidRule(Rule): - """Rule to validate that values are valid JSON strings.""" - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that all values are valid JSON. 
- - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="json_valid", - check_name=self.name, - ) - - # Filter out null values - non_null_mask = df[self.column].notna() - data = df[self.column][non_null_mask] - - if len(data) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="json_valid", - check_name=self.name, - ) - - # Vectorized pre-filter: valid JSON must start with {, [, ", digit, true, false, or null - data_str = data.astype(str).str.strip() - pre_filter = data_str.str.match( - r'^[\[{"tfn\d\-]', na=False - ) - - # Only run expensive json.loads on candidates that pass pre-filter - candidates = data[pre_filter] - if len(candidates) > 0: - def is_valid_json(value: Any) -> bool: - """Check whether a single value is valid JSON.""" - try: - json.loads(str(value)) - return True - except (json.JSONDecodeError, TypeError): - return False - - detailed_mask = candidates.apply(is_valid_json) - valid_mask = pre_filter.copy() - valid_mask.loc[candidates.index] = detailed_mask - else: - valid_mask = pre_filter - - invalid_indices = data.index[~valid_mask] - - if len(invalid_indices) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="json_valid", - check_name=self.name, - ) - - failed_values = data.loc[invalid_indices] - reasons = [f"'{v}' is not valid JSON" for v in failed_values.iloc[:100]] - - failure_detail = self._create_failure_detail( - invalid_indices, total_rows, failed_values, reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(invalid_indices), - failure_details=failure_detail, - rule_type="json_valid", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing json_valid rule: {e}", - rule_type="json_valid", - check_name=self.name, - ) diff --git a/datacheck/rules/string_rules.py b/datacheck/rules/string_rules.py index add1495..f5650e7 100644 --- a/datacheck/rules/string_rules.py +++ b/datacheck/rules/string_rules.py @@ -4,6 +4,7 @@ from functools import lru_cache from typing import Any +import numpy as np import pandas as pd from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError @@ -268,7 +269,8 @@ def validate(self, df: pd.DataFrame) -> RuleResult: # Check for non-string values (skip check if dtype is already string) is_string_col = pd.api.types.is_string_dtype(data) if not is_string_col: - non_string_mask = ~data.apply(lambda v: isinstance(v, str)) + check_fn = np.frompyfunc(lambda v: isinstance(v, str), 1, 1) + non_string_mask = ~pd.Series(check_fn(data.values).astype(bool), index=data.index) if non_string_mask.any(): non_string_indices = data.index[non_string_mask] failed_values = data.loc[non_string_indices] diff --git a/datacheck/rules/temporal_rules.py b/datacheck/rules/temporal_rules.py index c49fb06..5ab2ab2 100644 --- a/datacheck/rules/temporal_rules.py +++ b/datacheck/rules/temporal_rules.py @@ -9,6 +9,39 @@ from datacheck.rules.base import Rule +def _to_datetime_fast(series: 
pd.Series) -> pd.Series: + """Convert a Series to timestamps. + + Uses PyArrow's vectorized C++-level cast for Arrow-backed string columns, + avoiding the element-by-element Python iteration that ``pd.to_datetime`` + triggers on Arrow-backed arrays (which can be 10-50x slower on large data). + Falls back to ``pd.to_datetime`` for all other column types. + """ + try: + import pyarrow as pa + + if isinstance(series.dtype, pd.ArrowDtype): + pa_type = series.dtype.pyarrow_dtype + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + arr = series.array._pa_array + try: + # Vectorized ISO-8601 cast — stays in C++, no Python iteration. + # Handles "YYYY-MM-DD", "YYYY-MM-DD HH:MM:SS", and ISO variants. + ts_arr = arr.cast(pa.timestamp("us")) + # Convert to numpy datetime64 so all .dt accessor operations + # (dayofweek, tz, etc.) work without Arrow's tzdata dependency. + return pd.Series( + ts_arr.to_pandas(), + index=series.index, + name=series.name, + ) + except Exception: + pass + except Exception: + pass + return pd.to_datetime(series, errors="coerce", format="mixed") + + def _parse_duration(duration: str) -> timedelta: """Parse a duration string into a timedelta. @@ -100,7 +133,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: ) # Convert column to datetime - timestamps = pd.to_datetime(df[self.column], errors="coerce") + timestamps = _to_datetime_fast(df[self.column]) valid_timestamps = timestamps.dropna() if len(valid_timestamps) == 0: @@ -115,7 +148,8 @@ def validate(self, df: pd.DataFrame) -> RuleResult: ) max_timestamp = valid_timestamps.max() - now = pd.Timestamp.now() + col_tz = getattr(valid_timestamps.dt, "tz", None) + now = pd.Timestamp.now(tz=col_tz) if col_tz is not None else pd.Timestamp.now() cutoff = now - self.duration if max_timestamp >= cutoff: @@ -226,7 +260,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: ) # Convert column to datetime - timestamps = pd.to_datetime(df[self.column], errors="coerce") + timestamps = _to_datetime_fast(df[self.column]) non_null_mask = timestamps.notna() valid_timestamps = timestamps[non_null_mask] @@ -241,9 +275,17 @@ def validate(self, df: pd.DataFrame) -> RuleResult: check_name=self.name, ) + # Ensure comparison timestamps match tz-awareness of the column + col_tz = getattr(valid_timestamps.dt, "tz", None) + min_ts = self.min_timestamp + max_ts = self.max_timestamp + if col_tz is not None and min_ts.tzinfo is None: + min_ts = min_ts.tz_localize("UTC").tz_convert(col_tz) + max_ts = max_ts.tz_localize("UTC").tz_convert(col_tz) + # Find values outside range - below_min = valid_timestamps < self.min_timestamp - above_max = valid_timestamps > self.max_timestamp + below_min = valid_timestamps < min_ts + above_max = valid_timestamps > max_ts violation_mask = below_min | above_max violation_indices = valid_timestamps.index[violation_mask] @@ -261,10 +303,10 @@ def validate(self, df: pd.DataFrame) -> RuleResult: failed_values = valid_timestamps.loc[violation_indices] reasons = [] for ts in failed_values.iloc[:100]: - if ts < self.min_timestamp: - reasons.append(f"Timestamp {ts} is before {self.min_timestamp}") + if ts < min_ts: + reasons.append(f"Timestamp {ts} is before {min_ts}") else: - reasons.append(f"Timestamp {ts} is after {self.max_timestamp}") + reasons.append(f"Timestamp {ts} is after {max_ts}") failure_detail = self._create_failure_detail( violation_indices, total_rows, failed_values.astype(str), reasons @@ -327,7 +369,7 @@ def validate(self, df: pd.DataFrame) -> RuleResult: ) # Convert column to datetime - 
timestamps = pd.to_datetime(df[self.column], errors="coerce") + timestamps = _to_datetime_fast(df[self.column]) non_null_mask = timestamps.notna() valid_timestamps = timestamps[non_null_mask] @@ -342,7 +384,8 @@ def validate(self, df: pd.DataFrame) -> RuleResult: check_name=self.name, ) - now = pd.Timestamp.now() + col_tz = getattr(valid_timestamps.dt, "tz", None) + now = pd.Timestamp.now(tz=col_tz) if col_tz is not None else pd.Timestamp.now() future_mask = valid_timestamps > now future_indices = valid_timestamps.index[future_mask] @@ -449,25 +492,55 @@ def validate(self, df: pd.DataFrame) -> RuleResult: check_name=self.name, ) - # If the column is already datetime (numpy datetime64 or - # PyArrow timestamp), format it using the expected format - # string so the round-trip check works correctly regardless - # of the target format. + # Detect pre-parsed datetime columns (numpy datetime64 or PyArrow + # timestamp). The original string representation is lost once + # pandas has parsed the column, so we use a round-trip check: + # format the datetime with the user's format, parse it back, and + # verify the result equals the original value. If the format loses + # information (e.g. "%d/%m/%Y" drops the time component), the + # round-trip produces a different timestamp, correctly signalling + # a format mismatch. + # + # Special case: Arrow date32[day] columns. + # date32 values have no time component, so any date-only format + # round-trips to the same midnight value — the standard round-trip + # cannot detect format mismatches. Instead, we convert the dates + # to ISO strings (their natural representation) and try to parse + # those strings with the user's format. A wrong format (e.g. + # "%d/%m/%Y") will fail to parse "2024-01-15", correctly failing. + import pyarrow as pa # noqa: PLC0415 + _is_arrow_date = isinstance(data.dtype, pd.ArrowDtype) and pa.types.is_date( + data.dtype.pyarrow_dtype + ) _is_datetime = pd.api.types.is_datetime64_any_dtype(data) or ( isinstance(data.dtype, pd.ArrowDtype) and hasattr(data, "dt") ) - if _is_datetime: + if _is_arrow_date: + # Arrow date32 doesn't support dt.strftime directly. + # Cast to datetime64[ns] (midnight UTC) first, then format. + iso_strings = data.astype("datetime64[ns]").dt.strftime("%Y-%m-%d") + parsed = pd.to_datetime( + iso_strings, format=self.format_string, errors="coerce" + ) + valid_mask = parsed.notna() + elif _is_datetime: dt_series = data.astype("datetime64[ns]") str_data = dt_series.dt.strftime(self.format_string) + parsed = pd.to_datetime( + str_data, format=self.format_string, errors="coerce" + ) + # Round-trip must recover the original timestamp exactly. + # Formats that discard information (e.g. time-only format on + # a column with time values) will not match. + valid_mask = parsed.notna() & (parsed == dt_series) else: str_data = data.astype(str) + parsed = pd.to_datetime( + str_data, format=self.format_string, errors="coerce" + ) + valid_mask = parsed.notna() - # Vectorized date format validation via pd.to_datetime - parsed = pd.to_datetime( - str_data, format=self.format_string, errors="coerce" - ) - valid_mask = parsed.notna() invalid_indices = data.index[~valid_mask] if len(invalid_indices) == 0: @@ -516,114 +589,3 @@ def validate(self, df: pd.DataFrame) -> RuleResult: ) -class BusinessDaysOnlyRule(Rule): - """Rule to validate that all dates fall on business days (weekdays). - - All non-null dates must be Monday through Friday (not Saturday or Sunday). - Holiday checking is not implemented in the MVP version. 
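A minimal sketch of the two mechanisms the new temporal code above relies on, using throwaway values: PyArrow's vectorized string-to-timestamp cast (what _to_datetime_fast uses for Arrow-backed string columns), and the strftime/parse round-trip that the date-format rule now applies to already-parsed datetime columns.

import pandas as pd
import pyarrow as pa

# 1. String -> timestamp cast runs entirely in Arrow's C++ kernels.
arr = pa.array(["2024-01-15", "2024-01-16 10:30:00", None])
print(arr.cast(pa.timestamp("us")).to_pandas())

# 2. Round-trip check: a format that loses information (here the time part)
#    does not reproduce the original timestamp, so that row fails.
dt = pd.Series(pd.to_datetime(["2024-01-15 10:30:00", "2024-01-16 00:00:00"]))
fmt = "%d/%m/%Y"
parsed = pd.to_datetime(dt.dt.strftime(fmt), format=fmt, errors="coerce")
print((parsed.notna() & (parsed == dt)).tolist())  # [False, True]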
- """ - - def __init__(self, name: str, column: str, country_code: str = "US") -> None: - """Initialize BusinessDaysOnlyRule. - - Args: - name: Name of the rule - column: Column to validate - country_code: Country code for holidays (not used in MVP) - """ - super().__init__(name, column) - self.country_code = country_code - - def validate(self, df: pd.DataFrame) -> RuleResult: - """Validate that all dates are business days. - - Args: - df: DataFrame to validate - - Returns: - RuleResult with validation outcome - """ - try: - self._check_column_exists(df) - - total_rows = len(df) - - if total_rows == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=0, - failed_rows=0, - rule_type="business_days_only", - check_name=self.name, - ) - - # Convert column to datetime - timestamps = pd.to_datetime(df[self.column], errors="coerce") - non_null_mask = timestamps.notna() - valid_timestamps = timestamps[non_null_mask] - - if len(valid_timestamps) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="business_days_only", - check_name=self.name, - ) - - # Check for weekends (5 = Saturday, 6 = Sunday) - day_of_week = valid_timestamps.dt.dayofweek - weekend_mask = day_of_week >= 5 - weekend_indices = valid_timestamps.index[weekend_mask] - - if len(weekend_indices) == 0: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=True, - total_rows=total_rows, - failed_rows=0, - rule_type="business_days_only", - check_name=self.name, - ) - - failed_values = valid_timestamps.loc[weekend_indices] - day_names = failed_values.dt.day_name() - reasons = [ - f"Date {ts.date()} falls on {day_name} (weekend)" - for ts, day_name in zip( - failed_values.iloc[:100], day_names.iloc[:100], strict=False - ) - ] - - failure_detail = self._create_failure_detail( - weekend_indices, total_rows, failed_values.astype(str), reasons - ) - - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=total_rows, - failed_rows=len(weekend_indices), - failure_details=failure_detail, - rule_type="business_days_only", - check_name=self.name, - ) - - except ColumnNotFoundError: - raise - except Exception as e: - return RuleResult( - rule_name=self.name, - column=self.column, - passed=False, - total_rows=len(df), - error=f"Error executing business_days_only rule: {e}", - rule_type="business_days_only", - check_name=self.name, - ) diff --git a/datacheck/sampling/__init__.py b/datacheck/sampling/__init__.py deleted file mode 100644 index de3eb1e..0000000 --- a/datacheck/sampling/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Data sampling utilities for DataCheck.""" - -from datacheck.sampling.sampler import DataSampler -from datacheck.sampling.strategies import ( - SamplingStrategy, - BaseSampler, - RandomSampler, - StratifiedSampler, - TimeBasedSampler, - ErrorFocusedSampler, - AdaptiveSampler, - ReservoirSampler, - SamplerFactory, - smart_sample, -) - -__all__ = [ - "DataSampler", - "SamplingStrategy", - "BaseSampler", - "RandomSampler", - "StratifiedSampler", - "TimeBasedSampler", - "ErrorFocusedSampler", - "AdaptiveSampler", - "ReservoirSampler", - "SamplerFactory", - "smart_sample", -] diff --git a/datacheck/sampling/sampler.py b/datacheck/sampling/sampler.py deleted file mode 100644 index 88a2b28..0000000 --- a/datacheck/sampling/sampler.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Data sampling utilities for efficient validation.""" - - -import pandas as pd 
- -from datacheck.exceptions import DataLoadError - - -class DataSampler: - """Provides various sampling strategies for data validation. - - Sampling is useful for validating large datasets where checking every row - would be too slow. Different strategies serve different use cases. - - Example: - >>> sampler = DataSampler() - >>> sample = sampler.random_sample(df, rate=0.1, seed=42) - >>> # Validate 10% random sample - """ - - @staticmethod - def random_sample( - df: pd.DataFrame, - rate: float | None = None, - count: int | None = None, - seed: int | None = None - ) -> pd.DataFrame: - """Perform random sampling on DataFrame. - - Args: - df: DataFrame to sample - rate: Fraction of rows to sample (0.0 to 1.0) - count: Exact number of rows to sample - seed: Random seed for reproducibility - - Returns: - Sampled DataFrame - - Raises: - DataLoadError: If both rate and count are specified or neither - - Example: - >>> # Sample 10% of rows - >>> sample = DataSampler.random_sample(df, rate=0.1) - - >>> # Sample exact 1000 rows - >>> sample = DataSampler.random_sample(df, count=1000) - """ - if rate is not None and count is not None: - raise DataLoadError("Specify either 'rate' or 'count', not both") - - if rate is None and count is None: - raise DataLoadError("Must specify either 'rate' or 'count'") - - if rate is not None: - if not 0.0 < rate <= 1.0: - raise DataLoadError(f"Sample rate must be between 0 and 1, got {rate}") - return df.sample(frac=rate, random_state=seed) - - # count is guaranteed non-None here: both-None raises above, - # and rate-not-None returns above. - if count <= 0: # type: ignore[operator] - raise DataLoadError(f"Sample count must be positive, got {count}") - actual_count = min(count, len(df)) # type: ignore[type-var] - return df.sample(n=actual_count, random_state=seed) - - @staticmethod - def stratified_sample( - df: pd.DataFrame, - column: str, - count: int, - seed: int | None = None - ) -> pd.DataFrame: - """Perform stratified sampling based on a column. - - Samples a fixed number of rows from each unique value in the specified column. - Useful for ensuring representation from all categories. - - Args: - df: DataFrame to sample - column: Column to stratify by - count: Number of rows to sample from each stratum - seed: Random seed for reproducibility - - Returns: - Stratified sample DataFrame - - Raises: - DataLoadError: If column doesn't exist or count is invalid - - Example: - >>> # Sample 100 rows from each country - >>> sample = DataSampler.stratified_sample(df, "country", count=100) - """ - if column not in df.columns: - raise DataLoadError(f"Column '{column}' not found in DataFrame") - - if count <= 0: - raise DataLoadError(f"Sample count must be positive, got {count}") - - try: - # Sample from each group, preserving all columns - samples = [] - for _, group in df.groupby(column): - n = min(len(group), count) - samples.append(group.sample(n=n, random_state=seed)) - sampled = pd.concat(samples, ignore_index=True) - return sampled - except Exception as e: - raise DataLoadError(f"Error in stratified sampling: {e}") from e - - @staticmethod - def top_n(df: pd.DataFrame, n: int) -> pd.DataFrame: - """Return the first N rows. - - Simple head() operation, useful for quick validation of first rows. 
- - Args: - df: DataFrame to sample - n: Number of rows to return - - Returns: - First N rows of DataFrame - - Raises: - DataLoadError: If n is invalid - - Example: - >>> # Validate first 1000 rows - >>> sample = DataSampler.top_n(df, 1000) - """ - if n <= 0: - raise DataLoadError(f"n must be positive, got {n}") - - return df.head(n) - - @staticmethod - def systematic_sample( - df: pd.DataFrame, - interval: int, - start: int = 0 - ) -> pd.DataFrame: - """Perform systematic sampling (every Nth row). - - Args: - df: DataFrame to sample - interval: Sample every Nth row - start: Starting index (default 0) - - Returns: - Systematically sampled DataFrame - - Raises: - DataLoadError: If interval is invalid - - Example: - >>> # Sample every 10th row - >>> sample = DataSampler.systematic_sample(df, interval=10) - """ - if interval <= 0: - raise DataLoadError(f"Interval must be positive, got {interval}") - - if start < 0: - raise DataLoadError(f"Start index must be non-negative, got {start}") - - indices = range(start, len(df), interval) - return df.iloc[list(indices)] diff --git a/datacheck/sampling/strategies.py b/datacheck/sampling/strategies.py deleted file mode 100644 index 2801d2a..0000000 --- a/datacheck/sampling/strategies.py +++ /dev/null @@ -1,930 +0,0 @@ -"""Advanced sampling strategies for large dataset validation.""" - -from __future__ import annotations - -import logging -from abc import ABC, abstractmethod -from enum import Enum -from typing import Any - -import numpy as np -import pandas as pd - -from datacheck.exceptions import DataLoadError - -logger = logging.getLogger(__name__) - - -class SamplingStrategy(Enum): - """Available sampling strategies.""" - - RANDOM = "random" - STRATIFIED = "stratified" - TIME_BASED = "time_based" - ERROR_FOCUSED = "error_focused" - ADAPTIVE = "adaptive" - RESERVOIR = "reservoir" - SYSTEMATIC = "systematic" - TOP_N = "top_n" - - -class BaseSampler(ABC): - """Base class for sampling strategies.""" - - @abstractmethod - def sample(self, df: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: - """Sample DataFrame. - - Args: - df: DataFrame to sample - **kwargs: Strategy-specific options - - Returns: - Sampled DataFrame - """ - pass - - @property - @abstractmethod - def strategy(self) -> SamplingStrategy: - """Return the sampling strategy type.""" - pass - - -class RandomSampler(BaseSampler): - """Simple random sampling.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the random sampling strategy type.""" - return SamplingStrategy.RANDOM - - def sample( - self, - df: pd.DataFrame, - n: int | None = None, - sample_rate: float | None = None, - sample_count: int | None = None, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Random sample of DataFrame. 
- - Args: - df: DataFrame to sample - n: Alias for sample_count - sample_rate: Fraction to sample (0.0-1.0) - sample_count: Exact number of rows to sample - random_state: Random seed - seed: Alias for random_state - - Returns: - Sampled DataFrame - - Raises: - DataLoadError: If neither rate nor count specified - """ - # Handle aliases - effective_count = n if n is not None else sample_count - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - # If count >= total rows, return all - if effective_count is not None and effective_count >= len(df): - return df - - if sample_rate is None and effective_count is None: - raise DataLoadError("Must specify either 'sample_rate', 'sample_count', or 'n'") - - if sample_rate is not None: - if not 0.0 < sample_rate <= 1.0: - raise DataLoadError( - f"Sample rate must be between 0 and 1, got {sample_rate}" - ) - return df.sample(frac=sample_rate, random_state=effective_seed) - - # effective_count is guaranteed non-None here - actual_count = min(effective_count, len(df)) # type: ignore[type-var] - return df.sample(n=actual_count, random_state=effective_seed) - - -class StratifiedSampler(BaseSampler): - """Stratified sampling to preserve distributions.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the stratified sampling strategy type.""" - return SamplingStrategy.STRATIFIED - - def sample( - self, - df: pd.DataFrame, - stratify_column: str | None = None, - n: int | None = None, - sample_rate: float | None = None, - min_per_stratum: int = 1, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Stratified sampling based on a column. - - Preserves the distribution of values in stratify_column. 
- - Args: - df: DataFrame to sample - stratify_column: Column to stratify on - n: Target number of rows to sample (if >= total rows, returns all) - sample_rate: Fraction to sample from each stratum (used if n not provided) - min_per_stratum: Minimum rows to sample from each stratum - random_state: Random seed - seed: Alias for random_state - - Returns: - Stratified sample - - Raises: - DataLoadError: If column doesn't exist - - Example: - >>> sampler = StratifiedSampler() - >>> sample = sampler.sample(df, stratify_column='category', n=1000) - """ - if stratify_column is None: - raise DataLoadError("stratify_column is required for stratified sampling") - if stratify_column not in df.columns: - raise DataLoadError(f"Column '{stratify_column}' not found in DataFrame") - - # Handle seed alias - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - # If n >= total rows, return all rows - if n is not None and n >= len(df): - return df - - # Calculate effective sample rate from n if provided - if n is not None: - effective_rate = n / len(df) - elif sample_rate is not None: - effective_rate = sample_rate - else: - effective_rate = 0.1 - - def sample_group(group: pd.DataFrame) -> pd.DataFrame: - """Sample a single stratum group respecting minimum and rate constraints.""" - n_sample = max(min_per_stratum, int(len(group) * effective_rate)) - n_sample = min(n_sample, len(group)) - return group.sample(n=n_sample, random_state=effective_seed) - - samples = [] - for _, group in df.groupby(stratify_column): - samples.append(sample_group(group)) - sampled = pd.concat(samples, ignore_index=True) - return sampled - - def sample_proportional( - self, - df: pd.DataFrame, - stratify_column: str, - total_sample_size: int, - random_state: int = 42, - ) -> pd.DataFrame: - """Sample proportionally based on stratum sizes. - - Args: - df: DataFrame to sample - stratify_column: Column to stratify on - total_sample_size: Total number of rows to sample - random_state: Random seed - - Returns: - Proportionally stratified sample - """ - if stratify_column not in df.columns: - raise DataLoadError(f"Column '{stratify_column}' not found in DataFrame") - - # Calculate proportions - value_counts = df[stratify_column].value_counts() - proportions = value_counts / len(df) - - samples = [] - np.random.seed(random_state) - - for value, proportion in proportions.items(): - stratum = df[df[stratify_column] == value] - n_sample = max(1, int(total_sample_size * proportion)) - n_sample = min(n_sample, len(stratum)) - samples.append(stratum.sample(n=n_sample, random_state=random_state)) - - return pd.concat(samples, ignore_index=True) - - -class TimeBasedSampler(BaseSampler): - """Time-based sampling for temporal data.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the time-based sampling strategy type.""" - return SamplingStrategy.TIME_BASED - - def sample( - self, - df: pd.DataFrame, - time_column: str | None = None, - start_date: str | None = None, - end_date: str | None = None, - n: int | None = None, - sample_rate: float | None = None, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Sample based on time range. 
- - Args: - df: DataFrame to sample - time_column: Column with datetime values - start_date: Start date (ISO format, e.g., '2024-01-01') - end_date: End date (ISO format) - n: Target number of rows to sample - sample_rate: Fraction to sample from filtered range - random_state: Random seed - seed: Alias for random_state - - Returns: - Time-filtered and sampled DataFrame - - Raises: - DataLoadError: If time_column doesn't exist - """ - # Handle seed alias - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - if time_column is None: - raise DataLoadError("time_column is required for time-based sampling") - if time_column not in df.columns: - raise DataLoadError(f"Column '{time_column}' not found in DataFrame") - - df_work = df.assign(**{time_column: pd.to_datetime(df[time_column], errors="coerce")}) - - # Filter by date range - if start_date: - df_work = df_work[df_work[time_column] >= pd.to_datetime(start_date)] - if end_date: - df_work = df_work[df_work[time_column] <= pd.to_datetime(end_date)] - - if len(df_work) == 0: - return df_work - - # If n >= filtered rows, return all filtered data - if n is not None and n >= len(df_work): - return df_work - - # Sample from filtered data - if n is not None: - actual_count = min(n, len(df_work)) - return df_work.sample(n=actual_count, random_state=effective_seed) - elif sample_rate is not None: - return df_work.sample(frac=sample_rate, random_state=effective_seed) - else: - # Default to 10% if neither specified - return df_work.sample(frac=0.1, random_state=effective_seed) - - def sample_recent( - self, - df: pd.DataFrame, - time_column: str, - days: int = 30, - sample_rate: float = 1.0, - random_state: int = 42, - ) -> pd.DataFrame: - """Sample from recent time period. - - Args: - df: DataFrame to sample - time_column: Column with datetime values - days: Number of recent days to include - sample_rate: Fraction to sample - random_state: Random seed - - Returns: - Sample from recent data - """ - if time_column not in df.columns: - raise DataLoadError(f"Column '{time_column}' not found in DataFrame") - - df_work = df.assign(**{time_column: pd.to_datetime(df[time_column], errors="coerce")}) - - cutoff = pd.Timestamp.now() - pd.Timedelta(days=days) - recent = df_work[df_work[time_column] >= cutoff] - - if len(recent) == 0: - return recent - - if sample_rate < 1.0: - return recent.sample(frac=sample_rate, random_state=random_state) - return recent - - def sample_by_period( - self, - df: pd.DataFrame, - time_column: str, - period: str = "M", - samples_per_period: int = 100, - random_state: int = 42, - ) -> pd.DataFrame: - """Sample evenly across time periods. 
- - Args: - df: DataFrame to sample - time_column: Column with datetime values - period: Period frequency ('D', 'W', 'M', 'Q', 'Y') - samples_per_period: Rows to sample from each period - random_state: Random seed - - Returns: - Sample distributed across time periods - """ - if time_column not in df.columns: - raise DataLoadError(f"Column '{time_column}' not found in DataFrame") - - parsed_col = pd.to_datetime(df[time_column], errors="coerce") - assign_kwargs: dict[str, pd.Series] = { - time_column: parsed_col, - "_period": parsed_col.dt.to_period(period), - } - df_work = df.assign(**assign_kwargs) - - def sample_period(group: pd.DataFrame) -> pd.DataFrame: - """Sample rows from a single time period group.""" - n = min(samples_per_period, len(group)) - return group.sample(n=n, random_state=random_state) - - samples = [] - for _, group in df_work.groupby("_period"): - samples.append(sample_period(group)) - sampled = pd.concat(samples, ignore_index=True) - return sampled.drop(columns=["_period"], errors="ignore") - - -class ErrorFocusedSampler(BaseSampler): - """Sample with bias toward rows likely to have errors.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the error-focused sampling strategy type.""" - return SamplingStrategy.ERROR_FOCUSED - - def sample( - self, - df: pd.DataFrame, - n: int | None = None, - error_indicators: list[str] | None = None, - null_columns: list[str] | None = None, - sample_rate: float | None = None, - error_oversample: float = 3.0, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Oversample rows with potential errors. - - Args: - df: DataFrame to sample - n: Target number of rows to sample (if >= total rows, returns all) - error_indicators: List of pandas query conditions - e.g., ["age < 0", "price > 10000"] - null_columns: Columns where nulls indicate potential errors - sample_rate: Base sample rate for normal rows (used if n not provided) - error_oversample: Multiplier for error rows (3.0 = sample 3x more) - random_state: Random seed - seed: Alias for random_state - - Returns: - Sample with oversampled error rows - """ - # Handle seed alias - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - np.random.seed(effective_seed) - - # Validate error indicators reference existing columns - if error_indicators: - invalid_columns = self._validate_indicator_columns(df, error_indicators) - if invalid_columns: - raise DataLoadError( - f"Error indicator(s) reference non-existent column(s): {', '.join(sorted(invalid_columns))}. 
" - f"Available columns: {', '.join(sorted(df.columns))}" - ) - - # If n >= total rows, return all rows - if n is not None and n >= len(df): - return df - - # Build error mask - error_mask = pd.Series(False, index=df.index) - valid_indicators_count = 0 - - # Check error indicator conditions - if error_indicators: - for condition in error_indicators: - try: - mask = df.eval(condition) - if isinstance(mask, pd.Series): - error_mask = error_mask | mask.astype(bool) - else: - # Handle scalar or array results - error_mask = error_mask | pd.Series(mask, index=df.index).astype(bool) - valid_indicators_count += 1 - except Exception as exc: - logger.warning("Skipping invalid error indicator condition %r: %s", condition, exc) - - # Check null columns - valid_null_columns = False - if null_columns: - for col in null_columns: - if col in df.columns: - error_mask |= df[col].isna() - valid_null_columns = True - - # Auto-detect potential errors if no valid indicators/columns provided - # This also triggers if ALL provided indicators were invalid - if (not error_indicators and not null_columns) or \ - (error_indicators and valid_indicators_count == 0 and not valid_null_columns): - logger.info("No valid error indicators found, using auto-detection") - error_mask = self._auto_detect_errors(df) - - # Separate error and normal rows - error_rows = df[error_mask] - normal_rows = df[~error_mask] - - # Calculate effective sample rate from n if provided - if n is not None: - effective_rate = n / len(df) - elif sample_rate is not None: - effective_rate = sample_rate - else: - effective_rate = 0.1 - - samples = [] - - # Sample error rows at higher rate (always include at least 1 if any exist) - if len(error_rows) > 0: - error_rate = min(effective_rate * error_oversample, 1.0) - n_error = max(1, int(len(error_rows) * error_rate)) - n_error = min(n_error, len(error_rows)) - error_sample = error_rows.sample(n=n_error, random_state=effective_seed) - samples.append(error_sample) - - # Sample normal rows at base rate - if len(normal_rows) > 0: - n_normal = max(1, int(len(normal_rows) * effective_rate)) - n_normal = min(n_normal, len(normal_rows)) - normal_sample = normal_rows.sample(n=n_normal, random_state=effective_seed) - samples.append(normal_sample) - - if not samples: - return df.head(0) # Empty DataFrame with same schema - - # Combine and shuffle - combined = pd.concat(samples, ignore_index=True) - return combined.sample(frac=1.0, random_state=effective_seed) - - def _validate_indicator_columns( - self, df: pd.DataFrame, indicators: list[str] - ) -> set[str]: - """Validate that error indicator conditions reference existing columns. 
- - Args: - df: DataFrame to validate against - indicators: List of condition strings like "age < 0", "price > 10000" - - Returns: - Set of column names that don't exist in the DataFrame - """ - import re - - # Extract potential column names from conditions - # Matches word characters that could be column names (excludes operators and numbers) - invalid_columns = set() - available_columns = set(df.columns) - - for condition in indicators: - # Extract identifiers (potential column names) - # This regex finds word tokens that aren't pure numbers - tokens = re.findall(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b', condition) - for token in tokens: - # Skip common keywords/operators - if token.lower() in {'and', 'or', 'not', 'in', 'is', 'true', 'false', 'none', 'null', 'nan'}: - continue - # Check if token is a column name - if token not in available_columns: - invalid_columns.add(token) - - return invalid_columns - - def _auto_detect_errors(self, df: pd.DataFrame) -> pd.Series: - """Auto-detect rows with potential errors. - - Looks for: - - Null values in typically non-null columns - - Extreme outliers in numeric columns - - Empty strings in string columns - - Args: - df: DataFrame to analyze - - Returns: - Boolean mask of potentially erroneous rows - """ - error_mask = pd.Series(False, index=df.index) - - for col in df.columns: - if df[col].dtype in ["int64", "float64"]: - # Detect outliers using IQR - q1 = df[col].quantile(0.25) - q3 = df[col].quantile(0.75) - iqr = q3 - q1 - lower = q1 - 3 * iqr # 3x IQR for extreme outliers - upper = q3 + 3 * iqr - error_mask |= (df[col] < lower) | (df[col] > upper) - - elif df[col].dtype == "object": - # Check for empty strings - error_mask |= df[col].fillna("").str.strip() == "" - - # Check for nulls if column is mostly non-null - null_rate = df[col].isna().mean() - if null_rate < 0.05: # Less than 5% nulls expected - error_mask |= df[col].isna() - - return error_mask - - -class AdaptiveSampler(BaseSampler): - """Automatically determine optimal sample size.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the adaptive sampling strategy type.""" - return SamplingStrategy.ADAPTIVE - - def sample( - self, - df: pd.DataFrame, - n: int | None = None, - target_rows: int = 100000, - min_sample_rate: float = 0.01, - max_sample_rate: float = 1.0, - confidence_level: float = 0.95, - margin_of_error: float = 0.01, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Adaptively sample based on dataset size. 
- - Args: - df: DataFrame to sample - n: Alias for target_rows - target_rows: Target number of rows (if possible) - min_sample_rate: Minimum sample rate - max_sample_rate: Maximum sample rate - confidence_level: Statistical confidence level (for statistical sizing) - margin_of_error: Acceptable margin of error - random_state: Random seed - seed: Alias for random_state - - Returns: - Adaptively sampled DataFrame - """ - # Handle aliases - effective_target = n if n is not None else target_rows - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - total_rows = len(df) - - if total_rows == 0: - return df - - if total_rows <= effective_target: - # Dataset is small enough, use it all - return df - - # Calculate statistically optimal sample size - statistical_size = self._calculate_sample_size( - total_rows, confidence_level, margin_of_error - ) - - # Choose between target and statistical size - optimal_size = min(effective_target, statistical_size) - - # Convert to rate and bound it - sample_rate = optimal_size / total_rows - sample_rate = max(min_sample_rate, min(sample_rate, max_sample_rate)) - - return df.sample(frac=sample_rate, random_state=effective_seed) - - def _calculate_sample_size( - self, - population: int, - confidence: float = 0.95, - margin: float = 0.01, - ) -> int: - """Calculate statistically valid sample size. - - Uses the formula for sample size with finite population correction. - - Args: - population: Population size - confidence: Confidence level - margin: Margin of error - - Returns: - Required sample size - """ - # Z-scores for common confidence levels - z_scores = { - 0.90: 1.645, - 0.95: 1.96, - 0.99: 2.576, - } - z = z_scores.get(confidence, 1.96) - - if population <= 0: - return 0 - - if margin <= 0: - return population - - # Assume p=0.5 (maximum variability) - p = 0.5 - - # Sample size for infinite population - n_0 = (z**2 * p * (1 - p)) / (margin**2) - - # Finite population correction - n = n_0 / (1 + (n_0 - 1) / population) - - return int(np.ceil(n)) - - def sample_for_validation( - self, - df: pd.DataFrame, - expected_error_rate: float = 0.01, - min_errors_to_detect: int = 100, - random_state: int = 42, - ) -> pd.DataFrame: - """Sample size sufficient to detect expected errors. - - Args: - df: DataFrame to sample - expected_error_rate: Expected rate of errors - min_errors_to_detect: Minimum errors we want in sample - random_state: Random seed - - Returns: - Sample sized to detect expected errors - """ - if expected_error_rate <= 0: - expected_error_rate = 0.01 - - # Calculate required sample to get min_errors_to_detect - required_sample = int(min_errors_to_detect / expected_error_rate) - required_sample = min(required_sample, len(df)) - - if required_sample >= len(df): - return df - - return df.sample(n=required_sample, random_state=random_state) - - -class ReservoirSampler(BaseSampler): - """Reservoir sampling for streaming large files.""" - - @property - def strategy(self) -> SamplingStrategy: - """Return the reservoir sampling strategy type.""" - return SamplingStrategy.RESERVOIR - - def sample( - self, - df: pd.DataFrame, - k: int | None = None, - sample_size: int = 10000, - random_state: int | None = None, - seed: int | None = None, - **kwargs: Any, - ) -> pd.DataFrame: - """Reservoir sampling (Algorithm R). - - This algorithm is particularly useful for streaming data where - the total size is unknown. For DataFrames, it's equivalent to - random sampling but uses the reservoir algorithm. 
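As a worked instance of the sample-size formula in the removed AdaptiveSampler above (numbers chosen purely for illustration): at 95% confidence (z = 1.96), p = 0.5, a 1% margin of error, and a population of one million rows, the infinite-population size is 9,604 and the finite-population correction reduces it to 9,513.

import math

z, p, margin, population = 1.96, 0.5, 0.01, 1_000_000
n_0 = (z**2 * p * (1 - p)) / margin**2      # 9604.0
n = n_0 / (1 + (n_0 - 1) / population)      # finite population correction
print(math.ceil(n))                          # 9513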
- - Args: - df: DataFrame to sample - k: Alias for sample_size - sample_size: Number of rows to sample - random_state: Random seed - seed: Alias for random_state - - Returns: - Sample of specified size - """ - # Handle aliases - effective_size = k if k is not None else sample_size - effective_seed = random_state if random_state is not None else seed - if effective_seed is None: - effective_seed = 42 - - np.random.seed(effective_seed) - - total_rows = len(df) - if total_rows <= effective_size: - return df - - # Initialize reservoir with first k elements - reservoir_indices = list(range(effective_size)) - - # Replace elements with decreasing probability - for i in range(effective_size, total_rows): - j = np.random.randint(0, i + 1) - if j < effective_size: - reservoir_indices[j] = i - - return df.iloc[sorted(reservoir_indices)].reset_index(drop=True) - - def sample_weighted( - self, - df: pd.DataFrame, - sample_size: int, - weight_column: str, - random_state: int = 42, - ) -> pd.DataFrame: - """Weighted reservoir sampling (Algorithm A-Res). - - Args: - df: DataFrame to sample - sample_size: Number of rows to sample - weight_column: Column with sampling weights - random_state: Random seed - - Returns: - Weighted sample - """ - if weight_column not in df.columns: - raise DataLoadError(f"Column '{weight_column}' not found in DataFrame") - - np.random.seed(random_state) - - n = len(df) - if n <= sample_size: - return df - - weights = np.asarray(df[weight_column].fillna(1).values, dtype=np.float64) - - # Calculate keys for weighted reservoir sampling - # Key = random^(1/weight) for each element - random_values = np.random.random(n) - keys = np.power(random_values, 1.0 / np.maximum(weights, 1e-10)) - - # Select top k by key - top_indices = np.argpartition(keys, -sample_size)[-sample_size:] - top_indices = top_indices[np.argsort(keys[top_indices])] - - return df.iloc[top_indices].reset_index(drop=True) - - -class SamplerFactory: - """Factory for creating samplers.""" - - _samplers: dict[SamplingStrategy, type[BaseSampler]] = { - SamplingStrategy.RANDOM: RandomSampler, - SamplingStrategy.STRATIFIED: StratifiedSampler, - SamplingStrategy.TIME_BASED: TimeBasedSampler, - SamplingStrategy.ERROR_FOCUSED: ErrorFocusedSampler, - SamplingStrategy.ADAPTIVE: AdaptiveSampler, - SamplingStrategy.RESERVOIR: ReservoirSampler, - } - - @classmethod - def create(cls, strategy: SamplingStrategy | str) -> BaseSampler: - """Create sampler instance. - - Args: - strategy: Sampling strategy (enum or string) - - Returns: - Sampler instance - - Raises: - DataLoadError: If strategy is unknown - """ - if isinstance(strategy, str): - try: - strategy = SamplingStrategy(strategy.lower()) - except ValueError: - valid = [s.value for s in SamplingStrategy] - raise DataLoadError( - f"Unknown sampling strategy '{strategy}'. " - f"Valid strategies: {', '.join(valid)}" - ) - - if strategy not in cls._samplers: - raise DataLoadError(f"No sampler for strategy: {strategy}") - - return cls._samplers[strategy]() - - @classmethod - def list_strategies(cls) -> list[str]: - """List available sampling strategies.""" - return [s.value for s in SamplingStrategy] - - -def smart_sample( - df: pd.DataFrame, - target_rows: int = 100000, - stratify_column: str | None = None, - time_column: str | None = None, - error_indicators: list[str] | None = None, - random_state: int = 42, -) -> pd.DataFrame: - """Smart sampling that auto-selects the best strategy. 
- - Args: - df: DataFrame to sample - target_rows: Target sample size - stratify_column: Column for stratified sampling - time_column: Column for time-based sampling - error_indicators: Conditions for error-focused sampling - random_state: Random seed - - Returns: - Sampled DataFrame using most appropriate strategy - """ - n = len(df) - - # Small dataset - no sampling needed - if n <= target_rows: - return df - - # Error-focused if indicators provided - if error_indicators: - error_sampler = ErrorFocusedSampler() - return error_sampler.sample( - df, - error_indicators=error_indicators, - sample_rate=target_rows / n, - random_state=random_state, - ) - - # Stratified if column provided - if stratify_column and stratify_column in df.columns: - strat_sampler = StratifiedSampler() - result: pd.DataFrame = strat_sampler.sample_proportional( - df, - stratify_column=stratify_column, - total_sample_size=target_rows, - random_state=random_state, - ) - return result - - # Time-based for temporal data - if time_column and time_column in df.columns: - time_sampler = TimeBasedSampler() - result = time_sampler.sample_by_period( - df, - time_column=time_column, - samples_per_period=target_rows // 12, # Assume monthly - random_state=random_state, - ) - return result - - # Default to adaptive sampling - adaptive_sampler = AdaptiveSampler() - return adaptive_sampler.sample(df, target_rows=target_rows, random_state=random_state) - - -__all__ = [ - "SamplingStrategy", - "BaseSampler", - "RandomSampler", - "StratifiedSampler", - "TimeBasedSampler", - "ErrorFocusedSampler", - "AdaptiveSampler", - "ReservoirSampler", - "SamplerFactory", - "smart_sample", -] diff --git a/datacheck/schema/detector.py b/datacheck/schema/detector.py index 2dae1ec..d7df2a1 100644 --- a/datacheck/schema/detector.py +++ b/datacheck/schema/detector.py @@ -42,7 +42,11 @@ def detect( # Calculate statistics null_count = int(col_data.isnull().sum()) null_percentage = null_count / len(df) if len(df) > 0 else 0.0 - unique_count = int(col_data.nunique()) + try: + unique_count = int(col_data.nunique()) + except (TypeError, NotImplementedError, Exception): + # Complex Arrow types (list, struct, map) are not hashable + unique_count = -1 # Create column schema col_schema = ColumnSchema( @@ -158,8 +162,11 @@ def _infer_object_type(series: pd.Series) -> ColumnType: # Check for boolean-like values bool_values = {True, False, "true", "false", "True", "False", "1", "0"} - if all(v in bool_values for v in sample): - return ColumnType.BOOLEAN + try: + if all(v in bool_values for v in sample): + return ColumnType.BOOLEAN + except TypeError: + pass # unhashable values (e.g. 
dicts from JSONB) cannot be booleans # Check for numeric strings try: diff --git a/datacheck/security/validators.py b/datacheck/security/validators.py index 9d6f625..2b24f03 100644 --- a/datacheck/security/validators.py +++ b/datacheck/security/validators.py @@ -27,7 +27,7 @@ class PathValidator: # Default allowed file extensions for data files DEFAULT_EXTENSIONS = { - '.csv', '.parquet', '.json', '.jsonl', '.avro', + '.csv', '.parquet', '.json', '.jsonl', '.tsv', '.txt', '.xlsx', '.xls', '.yaml', '.yml' } diff --git a/datacheck/sql_pushdown/__init__.py b/datacheck/sql_pushdown/__init__.py new file mode 100644 index 0000000..7dac5b6 --- /dev/null +++ b/datacheck/sql_pushdown/__init__.py @@ -0,0 +1,5 @@ +"""SQL aggregate pushdown for database validation.""" + +from datacheck.sql_pushdown.dialects import get_dialect, PUSHDOWN_CAPABLE_TYPES + +__all__ = ["get_dialect", "PUSHDOWN_CAPABLE_TYPES"] diff --git a/datacheck/sql_pushdown/builder.py b/datacheck/sql_pushdown/builder.py new file mode 100644 index 0000000..33c6f53 --- /dev/null +++ b/datacheck/sql_pushdown/builder.py @@ -0,0 +1,389 @@ +"""SQL aggregate pushdown for database validation. + +Generates a single aggregate SQL query from a set of rule configs, +executes it via the connector, and converts the scalar result row into +RuleResult objects. Zero rows transferred — all computation happens +inside the database engine. + +Supports all database types listed in ``PUSHDOWN_CAPABLE_TYPES`` +(PostgreSQL, Redshift, MySQL, SQL Server, Snowflake, BigQuery). +Each database uses its own :class:`~datacheck.sql_pushdown.dialects.Dialect` +subclass to generate the correct SQL syntax. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from datacheck.results import FailureDetail, RuleResult + +if TYPE_CHECKING: + from datacheck.sql_pushdown.dialects import Dialect + +# Maximum set of rules that CAN be pushed down to SQL across all dialects. +# Individual dialects may support a subset — use dialect.pushable_rules for +# the actual set available for a given database connection. +PUSHABLE_RULES: frozenset[str] = frozenset( + { + "not_null", + "boolean", + "min", + "max", + "range", + "positive", + "non_negative", + "allowed_values", + "min_length", + "max_length", + "unique", + "unique_combination", + "regex", + "max_age", + "sum_equals", + "no_future_timestamps", + "timestamp_range", + "date_range", + } +) + + +def _lit(value: Any) -> str: + """Convert a scalar to a safe SQL literal (numeric or single-quoted string).""" + if isinstance(value, bool): + return "TRUE" if value else "FALSE" + if isinstance(value, (int, float)): + return repr(value) # e.g. 3.14, -1, 100 + s = str(value).replace("'", "''") + return f"'{s}'" + + +class SqlAggregateBuilder: + """Build and parse SQL aggregate queries for pushdown validation.""" + + def __init__(self) -> None: + # Populated by build_query(); consumed by parse_results() + self._items: list[tuple[str, Any, str, Any]] = [] + # alias → (check, rule_type, params) + + # ── Public API ────────────────────────────────────────────────────────── + + def partition_checks( + self, checks: list[Any], dialect: Dialect + ) -> tuple[list[Any], list[Any]]: + """Split checks into (pushable, non_pushable) for the given *dialect*. + + A check is pushable if every rule_type in ``check.rules`` is in + ``dialect.pushable_rules``. Disabled rules (params == False) are + still considered pushable (they are silently skipped in SQL generation). 
+ """ + pushable_rules = dialect.pushable_rules + pushable: list[Any] = [] + non_pushable: list[Any] = [] + for check in checks: + if all(rt in pushable_rules for rt in check.rules): + pushable.append(check) + else: + non_pushable.append(check) + return pushable, non_pushable + + def build_query( + self, + table: str, + where: str | None, + pushable_checks: list[Any], + dialect: Dialect, + ) -> str: + """Build a single aggregate SELECT for all pushable checks. + + Internally stores alias→(check, rule_type, params) so that + parse_results() can reconstruct RuleResult objects. + """ + self._items = [] + exprs: list[str] = ["COUNT(*) AS _total_rows"] + + for i, check in enumerate(pushable_checks): + col = dialect.q(check.column) + for rule_type, params in check.rules.items(): + # params == False / None means the rule is disabled — skip + if params is False or params is None: + continue + alias_prefix = f"_c{i}_{rule_type}" + pairs = self._rule_to_sql(col, rule_type, params, alias_prefix, dialect) + for alias, expr in pairs: + exprs.append(f"{expr} AS {alias}") + self._items.append((alias, check, rule_type, params)) + + select_clause = ", ".join(exprs) + tbl = dialect.q(table) + sql = f"SELECT {select_clause} FROM {tbl}" + if where: + sql += f" WHERE {where}" + return sql + + def parse_results( + self, row: dict[str, Any], pushable_checks: list[Any] + ) -> list[RuleResult]: + """Convert the aggregate result row into a list of RuleResult objects.""" + total_rows = int(row.get("_total_rows") or 0) + results: list[RuleResult] = [] + + for alias, check, rule_type, params in self._items: + value = row.get(alias) + result = self._parse_single(check, rule_type, params, value, total_rows) + results.append(result) + + return results + + # ── SQL generation ────────────────────────────────────────────────────── + + def _rule_to_sql( + self, + col: str, + rule_type: str, + params: Any, + alias_prefix: str, + dialect: Dialect, + ) -> list[tuple[str, str]]: + """Return (alias, SQL_expression) pairs for one rule.""" + + if rule_type == "not_null": + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END)") + ] + + if rule_type == "boolean": + # Count non-null values whose text representation is not 'true' or 'false'. + # For native boolean columns the DB type guarantees 0 violations. + # For string columns this checks that every value is a boolean literal. 
+ text_col = dialect.cast_to_text(col) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND LOWER({text_col}) NOT IN ('true', 'false')" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "min": + v = _lit(params) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {col} < {v}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "max": + v = _lit(params) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {col} > {v}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "range": + lo = _lit(params["min"]) + hi = _lit(params["max"]) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND ({col} < {lo} OR {col} > {hi})" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "positive": + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {col} <= 0" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "non_negative": + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {col} < 0" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "allowed_values": + values_sql = ", ".join(_lit(v) for v in params) + # Cast to text so ENUM/typed columns compare safely against string literals. + text_col = dialect.cast_to_text(col) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND {text_col} NOT IN ({values_sql})" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "min_length": + n = int(params) + length_expr = dialect.str_length(dialect.cast_to_text(col)) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND {length_expr} < {n}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "max_length": + n = int(params) + length_expr = dialect.str_length(dialect.cast_to_text(col)) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND {length_expr} > {n}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "unique": + # COUNT(*) - COUNT(DISTINCT col) = number of "extra" duplicate rows. + return [ + (alias_prefix, + f"COUNT(*) - COUNT(DISTINCT {col})") + ] + + if rule_type == "unique_combination": + # params is a list of column names. + # Strategy: count total non-null combination rows minus distinct non-null + # combinations. Result > 0 means duplicate combinations exist. + # Uses CHR(1)/CHAR(1) as a separator that is not present in real data. 
+ cols: list[str] = params if isinstance(params, list) else [] + if not cols: + return [] + not_null_cond = " AND ".join(f"{dialect.q(c)} IS NOT NULL" for c in cols) + parts = [dialect.cast_to_text(dialect.q(c)) for c in cols] + sep = dialect.sep1() + concat_expr = f" || {sep} || ".join(parts) + return [ + (alias_prefix, + f"SUM(CASE WHEN {not_null_cond} THEN 1 ELSE 0 END)" + f" - COUNT(DISTINCT CASE WHEN {not_null_cond}" + f" THEN {concat_expr} ELSE NULL END)") + ] + + if rule_type == "regex": + expr = dialect.regex_violation_expr(col, str(params)) + if expr is None: + return [] # guarded by partition_checks — should not reach here + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {expr}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "max_age": + expr = dialect.age_violation_expr(col, str(params)) + if expr is None: + return [] + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL AND {expr}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "sum_equals": + col_a = dialect.q(str(params["column_a"])) + col_b = dialect.q(str(params["column_b"])) + tolerance = float(params.get("tolerance", 0.01)) + # col is the "total" column; col_a + col_b must equal it within tolerance + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND {col_a} IS NOT NULL AND {col_b} IS NOT NULL" + f" AND ABS({col_a} + {col_b} - {col}) > {tolerance}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type == "no_future_timestamps": + ts = dialect.current_timestamp() + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND {col} > {ts}" + f" THEN 1 ELSE 0 END)") + ] + + if rule_type in ("timestamp_range", "date_range"): + lo = _lit(params["min"]) + hi = _lit(params["max"]) + return [ + (alias_prefix, + f"SUM(CASE WHEN {col} IS NOT NULL" + f" AND ({col} < {lo} OR {col} > {hi})" + f" THEN 1 ELSE 0 END)") + ] + + return [] # Unknown rule — guarded by PUSHABLE_RULES before calling + + # ── Result parsing ────────────────────────────────────────────────────── + + def _parse_single( + self, + check: Any, + rule_type: str, + params: Any, + value: Any, + total_rows: int, + ) -> RuleResult: + rule_name = _rule_name(check.name, rule_type) + + # All rules: value is a violation count + violation_count = int(value or 0) + passed = violation_count == 0 + reasons = ( + [] + if passed + else [f"SQL aggregate: {violation_count:,} violations detected"] + ) + return self._make_result( + check, rule_type, rule_name, passed, total_rows, + violation_count, reasons, + ) + + def _make_result( + self, + check: Any, + rule_type: str, + rule_name: str, + passed: bool, + total_rows: int, + failed_rows: int, + reasons: list[str], + metric_value: Any = None, + ) -> RuleResult: + failure_details = None + if not passed and total_rows > 0: + sample_values = [metric_value] if metric_value is not None else [] + failure_details = FailureDetail( + rule_name=rule_name, + column=check.column, + failed_count=failed_rows, + total_count=total_rows, + failure_rate=failed_rows / total_rows * 100, + sample_failures=[], + sample_values=sample_values, + sample_reasons=reasons, + ) + return RuleResult( + rule_name=rule_name, + column=check.column, + passed=passed, + total_rows=total_rows, + failed_rows=failed_rows, + failure_details=failure_details, + rule_type=rule_type, + check_name=check.name, + severity=check.severity, + ) + + +def _rule_name(check_name: str, rule_type: str) -> str: + """Match the naming convention used by the rule factory.""" + if rule_type == "min": + return f"{check_name}_min" + if 
rule_type == "max": + return f"{check_name}_max" + return check_name + + +__all__ = ["SqlAggregateBuilder", "PUSHABLE_RULES"] diff --git a/datacheck/sql_pushdown/dialects.py b/datacheck/sql_pushdown/dialects.py new file mode 100644 index 0000000..f58d0e4 --- /dev/null +++ b/datacheck/sql_pushdown/dialects.py @@ -0,0 +1,366 @@ +"""SQL dialect definitions for multi-database aggregate pushdown. + +Each Dialect subclass encapsulates the SQL syntax differences for one database +engine: identifier quoting, type casts, string functions, aggregate functions, +temporal expressions, and the set of rules that can be pushed down. + +Usage:: + + from datacheck.sql_pushdown.dialects import get_dialect + + dialect = get_dialect("mysql") # → MySQLDialect or None if unsupported + if dialect: + sql = builder.build_query(table, where, checks, dialect) +""" + +from __future__ import annotations + + +# ── Base pushable-rule set (supported by every dialect) ─────────────────────── +# Rules that rely on dialect-specific functions (regex, percentile, max_age) +# are added per-dialect in their pushable_rules property. +_BASE_RULES: frozenset[str] = frozenset( + { + "not_null", + "boolean", + "min", + "max", + "range", + "positive", + "non_negative", + "allowed_values", + "unique", + "unique_combination", + "sum_equals", + "min_length", + "max_length", + "no_future_timestamps", + "timestamp_range", + "date_range", + } +) + + +class Dialect: + """Abstract SQL dialect. Subclasses override only what differs.""" + + name: str = "" + + # ── Identifier quoting ──────────────────────────────────────────────────── + + def q(self, name: str) -> str: + """Return a safely quoted database identifier.""" + return '"' + name.replace('"', '""') + '"' + + # ── Type casts ──────────────────────────────────────────────────────────── + + def cast_to_text(self, col: str) -> str: + """Expression that casts *col* (already quoted) to a text/string type.""" + return f"CAST({col} AS VARCHAR)" + + # ── String functions ────────────────────────────────────────────────────── + + def str_length(self, col: str) -> str: + """Character-length expression for *col*.""" + return f"LENGTH({col})" + + # ── Temporal expressions ────────────────────────────────────────────────── + + def current_timestamp(self) -> str: + """SQL expression for the current wall-clock timestamp.""" + return "CURRENT_TIMESTAMP" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + """Inner CASE condition that is TRUE when *col* is older than *duration*. + + Returns *None* if the dialect cannot express this in SQL (the rule then + falls back to the Python path). The base implementation uses the + standard ``INTERVAL '…'`` syntax supported by PostgreSQL, Redshift, and + Snowflake. + """ + interval = self._duration_to_interval_str(duration) + if interval is None: + return None + ts = self.current_timestamp() + return f"{col} < {ts} - INTERVAL '{interval}'" + + def _duration_to_interval_str(self, duration: str) -> str | None: + """Convert a duration token (e.g. ``'24h'``) to a standard interval string.""" + s = str(duration).strip().lower() + unit_map = {"m": "minutes", "h": "hours", "d": "days", "w": "weeks"} + if s and s[-1] in unit_map: + return f"{s[:-1]} {unit_map[s[-1]]}" + return None + + # ── Regex ────────────────────────────────────────────────────────────────── + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + """Inner CASE condition that is TRUE when *col* does NOT match *pattern*. 
+ + Returns *None* if the dialect has no native regex operator. + """ + return None # subclasses override + + # ── Concatenation helpers ───────────────────────────────────────────────── + + def sep1(self) -> str: + """SQL expression for the CHR(1) / CHAR(1) separator used in multi-column + uniqueness checks. It is a non-printable control character that is + effectively never present in real data values.""" + return "CHR(1)" + + # ── LIMIT / TOP ──────────────────────────────────────────────────────────── + + def top_clause(self, n: int | None) -> str: + """Token inserted after SELECT (SQL Server ``TOP n``). Empty for most DBs.""" + return "" + + def limit_clause(self, n: int | None) -> str: + """Trailing ``LIMIT n`` clause. Empty for SQL Server (uses TOP instead).""" + return f" LIMIT {n}" if n is not None else "" + + # ── Pushable rule set ────────────────────────────────────────────────────── + + @property + def pushable_rules(self) -> frozenset[str]: + """Set of rule types that this dialect can handle in SQL.""" + return _BASE_RULES + + +# ── Concrete dialect implementations ────────────────────────────────────────── + +class PostgreSQLDialect(Dialect): + """PostgreSQL (and any PostgreSQL-wire-compatible DB).""" + + name = "postgresql" + + def q(self, name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + def cast_to_text(self, col: str) -> str: + # PostgreSQL cast operator — also works for ENUM, UUID, etc. + return f"{col}::text" + + def str_length(self, col: str) -> str: + return f"LENGTH({col})" + + def current_timestamp(self) -> str: + return "NOW()" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + interval = self._duration_to_interval_str(duration) + if interval is None: + return None + return f"{col} < NOW() - INTERVAL '{interval}'" + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + # !~ is the case-sensitive "does not match regex" operator in PostgreSQL. + # Cast to text so non-text columns (enums, UUIDs) are handled correctly. + p = pattern.replace("'", "''") + return f"{col}::text !~ '{p}'" + + @property + def pushable_rules(self) -> frozenset[str]: + return _BASE_RULES | frozenset({"regex", "max_age"}) + + +class RedshiftDialect(PostgreSQLDialect): + """Amazon Redshift — fully PostgreSQL-compatible SQL dialect.""" + + name = "redshift" + # PERCENTILE_CONT syntax is identical to PostgreSQL in Redshift. + # All other methods inherited from PostgreSQLDialect without change. + + +class MySQLDialect(Dialect): + """MySQL 8.0+ / MariaDB.""" + + name = "mysql" + + def q(self, name: str) -> str: + return "`" + name.replace("`", "``") + "`" + + def cast_to_text(self, col: str) -> str: + return f"CAST({col} AS CHAR)" + + def str_length(self, col: str) -> str: + # CHAR_LENGTH counts Unicode code points; LENGTH counts bytes. + return f"CHAR_LENGTH({col})" + + def current_timestamp(self) -> str: + return "NOW()" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + # MySQL INTERVAL syntax: NOW() - INTERVAL 24 HOUR (no quotes, unit unquoted) + s = str(duration).strip().lower() + unit_map = {"m": "MINUTE", "h": "HOUR", "d": "DAY", "w": "WEEK"} + if s and s[-1] in unit_map: + return f"{col} < NOW() - INTERVAL {s[:-1]} {unit_map[s[-1]]}" + return None + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + # MySQL REGEXP operator performs case-insensitive matching by default. 
+ p = pattern.replace("'", "''") + return f"{col} NOT REGEXP '{p}'" + + def sep1(self) -> str: + return "CHAR(1)" + + @property + def pushable_rules(self) -> frozenset[str]: + return _BASE_RULES | frozenset({"regex", "max_age"}) + + +class MSSQLDialect(Dialect): + """Microsoft SQL Server (T-SQL).""" + + name = "mssql" + + def q(self, name: str) -> str: + return "[" + name.replace("]", "]]") + "]" + + def cast_to_text(self, col: str) -> str: + return f"CAST({col} AS NVARCHAR(MAX))" + + def str_length(self, col: str) -> str: + # SQL Server uses LEN(), not LENGTH(). + return f"LEN({col})" + + def sep1(self) -> str: + return "CHAR(1)" + + def current_timestamp(self) -> str: + return "GETDATE()" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + # T-SQL: DATEADD(unit, -n, GETDATE()) + s = str(duration).strip().lower() + unit_map = {"m": "minute", "h": "hour", "d": "day", "w": "week"} + if s and s[-1] in unit_map: + return f"{col} < DATEADD({unit_map[s[-1]]}, -{s[:-1]}, GETDATE())" + return None + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + # SQL Server has no native regex operator. + return None + + def top_clause(self, n: int | None) -> str: + # SQL Server uses SELECT TOP N instead of LIMIT. + return f"TOP {n} " if n is not None else "" + + def limit_clause(self, n: int | None) -> str: + # No LIMIT in T-SQL — rows are bounded by TOP in the SELECT clause. + return "" + + @property + def pushable_rules(self) -> frozenset[str]: + # No regex (no native operator), no percentile (window function only). + return _BASE_RULES | frozenset({"max_age"}) + + +class SnowflakeDialect(Dialect): + """Snowflake Data Cloud.""" + + name = "snowflake" + + def q(self, name: str) -> str: + # Snowflake uses double-quotes for case-sensitive identifiers. + return '"' + name.replace('"', '""') + '"' + + def cast_to_text(self, col: str) -> str: + return f"TO_VARCHAR({col})" + + def str_length(self, col: str) -> str: + return f"LENGTH({col})" + + def current_timestamp(self) -> str: + return "CURRENT_TIMESTAMP()" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + # Snowflake supports standard INTERVAL '…' syntax. + interval = self._duration_to_interval_str(duration) + if interval is None: + return None + return f"{col} < CURRENT_TIMESTAMP() - INTERVAL '{interval}'" + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + # Snowflake REGEXP_LIKE(subject, pattern) — negate for violations. + p = pattern.replace("'", "''") + return f"NOT REGEXP_LIKE({col}, '{p}')" + + @property + def pushable_rules(self) -> frozenset[str]: + return _BASE_RULES | frozenset({"regex", "max_age"}) + + +class BigQueryDialect(Dialect): + """Google BigQuery (Standard SQL).""" + + name = "bigquery" + + def q(self, name: str) -> str: + # BigQuery uses backtick-quoted identifiers; escape embedded backticks. 
+ return "`" + name.replace("\\", "\\\\").replace("`", "\\`") + "`" + + def cast_to_text(self, col: str) -> str: + return f"CAST({col} AS STRING)" + + def str_length(self, col: str) -> str: + return f"LENGTH({col})" + + def current_timestamp(self) -> str: + return "CURRENT_TIMESTAMP()" + + def age_violation_expr(self, col: str, duration: str) -> str | None: + # BigQuery: TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL n UNIT) + s = str(duration).strip().lower() + unit_map = {"m": "MINUTE", "h": "HOUR", "d": "DAY", "w": "WEEK"} + if s and s[-1] in unit_map: + return ( + f"{col} < TIMESTAMP_SUB(CURRENT_TIMESTAMP()," + f" INTERVAL {s[:-1]} {unit_map[s[-1]]})" + ) + return None + + def regex_violation_expr(self, col: str, pattern: str) -> str | None: + # BigQuery REGEXP_CONTAINS(value, regexp) — negate for violations. + # The r'' prefix is cosmetic in the generated SQL string. + p = pattern.replace("'", "''") + return f"NOT REGEXP_CONTAINS({col}, r'{p}')" + + @property + def pushable_rules(self) -> frozenset[str]: + return _BASE_RULES | frozenset({"regex", "max_age"}) + + +# ── Dialect registry ────────────────────────────────────────────────────────── + +_DIALECT_MAP: dict[str, Dialect] = { + "postgresql": PostgreSQLDialect(), + "redshift": RedshiftDialect(), + "mysql": MySQLDialect(), + "mssql": MSSQLDialect(), + "snowflake": SnowflakeDialect(), + "bigquery": BigQueryDialect(), +} + +# Source types for which SQL pushdown is available. +PUSHDOWN_CAPABLE_TYPES: frozenset[str] = frozenset(_DIALECT_MAP) + + +def get_dialect(source_type: str) -> Dialect | None: + """Return the SQL dialect for *source_type*, or ``None`` if pushdown is not supported.""" + return _DIALECT_MAP.get(source_type) + + +__all__ = [ + "Dialect", + "PostgreSQLDialect", + "RedshiftDialect", + "MySQLDialect", + "MSSQLDialect", + "SnowflakeDialect", + "BigQueryDialect", + "PUSHDOWN_CAPABLE_TYPES", + "get_dialect", +] diff --git a/datacheck/validation/__init__.py b/datacheck/validation/__init__.py index a69ab5a..c0b5b4d 100644 --- a/datacheck/validation/__init__.py +++ b/datacheck/validation/__init__.py @@ -13,7 +13,7 @@ - ``validate()`` returns ``list[RuleResult]`` vs a single ``RuleResult`` - Has ``Severity`` enum (ERROR, WARNING, INFO) for fine-grained control - Provides ``Validator`` class with builder-pattern API -- All 27+ engine rules available through the Python API +- All engine rules available through the Python API """ from datacheck.validation.rules import ( @@ -26,11 +26,6 @@ UniqueRule, # Numeric RangeRule, - MeanBetweenRule, - StdDevLessThanRule, - PercentileRangeRule, - ZScoreOutliersRule, - DistributionTypeRule, # String / Pattern RegexRule, EnumRule, @@ -42,18 +37,10 @@ TimestampRangeRule, NoFutureTimestampsRule, DateFormatValidRule, - BusinessDaysOnlyRule, - # Semantic - EmailValidRule, - PhoneValidRule, - UrlValidRule, - JsonValidRule, # Relationship / Composite ForeignKeyExistsRule, SumEqualsRule, UniqueCombinationRule, - # Custom - CustomRule, ) from datacheck.validation.validator import Validator, ValidationReport from datacheck.validation.config import load_config, RuleConfig @@ -70,11 +57,6 @@ "UniqueRule", # Numeric "RangeRule", - "MeanBetweenRule", - "StdDevLessThanRule", - "PercentileRangeRule", - "ZScoreOutliersRule", - "DistributionTypeRule", # String / Pattern "RegexRule", "EnumRule", @@ -86,18 +68,10 @@ "TimestampRangeRule", "NoFutureTimestampsRule", "DateFormatValidRule", - "BusinessDaysOnlyRule", - # Semantic - "EmailValidRule", - "PhoneValidRule", - "UrlValidRule", - "JsonValidRule", # 
Relationship / Composite "ForeignKeyExistsRule", "SumEqualsRule", "UniqueCombinationRule", - # Custom - "CustomRule", # Config "load_config", "RuleConfig", diff --git a/datacheck/validation/config.py b/datacheck/validation/config.py index f11a08a..01c9bdc 100644 --- a/datacheck/validation/config.py +++ b/datacheck/validation/config.py @@ -13,11 +13,6 @@ NotNullRule, UniqueRule, RangeRule, - MeanBetweenRule, - StdDevLessThanRule, - PercentileRangeRule, - ZScoreOutliersRule, - DistributionTypeRule, RegexRule, EnumRule, LengthRule, @@ -26,11 +21,6 @@ TimestampRangeRule, NoFutureTimestampsRule, DateFormatValidRule, - BusinessDaysOnlyRule, - EmailValidRule, - PhoneValidRule, - UrlValidRule, - JsonValidRule, ForeignKeyExistsRule, SumEqualsRule, UniqueCombinationRule, @@ -252,51 +242,6 @@ def create_rule_from_config(rule_config: RuleConfig) -> Rule: name=rule_config.name or "timestamp_range", ) - # Numeric (advanced) - elif rule_type in ("meanbetween", "mean"): - return MeanBetweenRule( - columns=rule_config.columns, - min_value=rule_config.params.get("min", 0.0), - max_value=rule_config.params.get("max", 100.0), - severity=severity, - name=rule_config.name or "mean_between", - ) - - elif rule_type in ("stddevlessthan", "stddev"): - return StdDevLessThanRule( - columns=rule_config.columns, - threshold=rule_config.params.get("threshold", 1.0), - severity=severity, - name=rule_config.name or "std_dev_less_than", - ) - - elif rule_type in ("percentilerange", "percentile"): - return PercentileRangeRule( - columns=rule_config.columns, - p25_min=rule_config.params.get("p25_min", 0.0), - p25_max=rule_config.params.get("p25_max", 100.0), - p75_min=rule_config.params.get("p75_min", 0.0), - p75_max=rule_config.params.get("p75_max", 100.0), - severity=severity, - name=rule_config.name or "percentile_range", - ) - - elif rule_type in ("zscoreoutliers", "zscore", "outliers"): - return ZScoreOutliersRule( - columns=rule_config.columns, - threshold=rule_config.params.get("threshold", 3.0), - severity=severity, - name=rule_config.name or "z_score_outliers", - ) - - elif rule_type in ("distributiontype", "distribution"): - return DistributionTypeRule( - columns=rule_config.columns, - expected_type=rule_config.params.get("expected", "normal"), - severity=severity, - name=rule_config.name or "distribution_type", - ) - # Temporal elif rule_type in ("maxage", "freshness"): return MaxAgeRule( @@ -330,45 +275,6 @@ def create_rule_from_config(rule_config: RuleConfig) -> Rule: name=rule_config.name or "date_format_valid", ) - elif rule_type in ("businessdaysonly", "businessdays"): - return BusinessDaysOnlyRule( - columns=rule_config.columns, - country_code=rule_config.params.get("country_code", "US"), - severity=severity, - name=rule_config.name or "business_days_only", - ) - - # Semantic - elif rule_type in ("emailvalid", "email"): - return EmailValidRule( - columns=rule_config.columns, - severity=severity, - name=rule_config.name or "email_valid", - ) - - elif rule_type in ("phonevalid", "phone"): - return PhoneValidRule( - columns=rule_config.columns, - country_code=rule_config.params.get("country_code"), - severity=severity, - name=rule_config.name or "phone_valid", - ) - - elif rule_type in ("urlvalid", "url"): - return UrlValidRule( - columns=rule_config.columns, - schemes=rule_config.params.get("schemes"), - severity=severity, - name=rule_config.name or "url_valid", - ) - - elif rule_type in ("jsonvalid", "json"): - return JsonValidRule( - columns=rule_config.columns, - severity=severity, - name=rule_config.name 
or "json_valid", - ) - # Relationship / Composite elif rule_type in ("foreignkeyexists", "foreignkey", "fk"): ref_data = rule_config.params.get("reference_data") @@ -469,9 +375,6 @@ def _convert_cli_check_to_rules(check: dict[str, Any]) -> list[Rule]: scalar_param_map = { "regex": "pattern", "type": "expected", - "std_dev_less_than": "threshold", - "z_score_outliers": "threshold", - "distribution_type": "expected", "max_age": "duration", "date_format_valid": "format", "allowed_values": "values", diff --git a/datacheck/validation/rules.py b/datacheck/validation/rules.py index adf20f4..2a50b9b 100644 --- a/datacheck/validation/rules.py +++ b/datacheck/validation/rules.py @@ -10,7 +10,6 @@ """ import re from abc import ABC, abstractmethod -from collections.abc import Callable from dataclasses import dataclass, field from enum import Enum from typing import Any @@ -30,11 +29,6 @@ ) from datacheck.rules.numeric_rules import ( MinMaxRule as _EngineMinMaxRule, - MeanBetweenRule as _EngineMeanBetweenRule, - StdDevLessThanRule as _EngineStdDevLessThanRule, - PercentileRangeRule as _EnginePercentileRangeRule, - ZScoreOutliersRule as _EngineZScoreOutliersRule, - DistributionTypeRule as _EngineDistributionTypeRule, ) from datacheck.rules.string_rules import ( AllowedValuesRule as _EngineAllowedValuesRule, @@ -46,13 +40,6 @@ TimestampRangeRule as _EngineTimestampRangeRule, NoFutureTimestampsRule as _EngineNoFutureTimestampsRule, DateFormatValidRule as _EngineDateFormatValidRule, - BusinessDaysOnlyRule as _EngineBusinessDaysOnlyRule, -) -from datacheck.rules.semantic_rules import ( - EmailValidRule as _EngineEmailValidRule, - PhoneValidRule as _EnginePhoneValidRule, - UrlValidRule as _EngineUrlValidRule, - JsonValidRule as _EngineJsonValidRule, ) @@ -276,180 +263,6 @@ def validate(self, df: pd.DataFrame) -> list[RuleResult]: return results -class MeanBetweenRule(Rule): - """Rule to check that column mean is within a range. - - Delegates to ``datacheck.rules.numeric_rules.MeanBetweenRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - min_value: float = 0.0, - max_value: float = 100.0, - severity: Severity = Severity.ERROR, - name: str = "mean_between", - ): - self.min_value = min_value - self.max_value = max_value - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return f"Check that mean is between {self.min_value} and {self.max_value}" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineMeanBetweenRule( - name=self.name, column=col, - min_value=self.min_value, max_value=self.max_value, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "mean_between") - ) - return results - - -class StdDevLessThanRule(Rule): - """Rule to check that standard deviation is below a threshold. - - Delegates to ``datacheck.rules.numeric_rules.StdDevLessThanRule``. 
- """ - - def __init__( - self, - columns: list[str] | None = None, - threshold: float = 1.0, - severity: Severity = Severity.ERROR, - name: str = "std_dev_less_than", - ): - self.threshold = threshold - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return f"Check that std dev is less than {self.threshold}" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineStdDevLessThanRule( - name=self.name, column=col, threshold=self.threshold, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "std_dev_less_than") - ) - return results - - -class PercentileRangeRule(Rule): - """Rule to check that percentiles are within bounds. - - Delegates to ``datacheck.rules.numeric_rules.PercentileRangeRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - p25_min: float = 0.0, - p25_max: float = 100.0, - p75_min: float = 0.0, - p75_max: float = 100.0, - severity: Severity = Severity.ERROR, - name: str = "percentile_range", - ): - self.p25_min = p25_min - self.p25_max = p25_max - self.p75_min = p75_min - self.p75_max = p75_max - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return ( - f"Check P25 in [{self.p25_min}, {self.p25_max}] " - f"and P75 in [{self.p75_min}, {self.p75_max}]" - ) - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EnginePercentileRangeRule( - name=self.name, column=col, - p25_min=self.p25_min, p25_max=self.p25_max, - p75_min=self.p75_min, p75_max=self.p75_max, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "percentile_range") - ) - return results - - -class ZScoreOutliersRule(Rule): - """Rule to detect outliers based on Z-score threshold. - - Delegates to ``datacheck.rules.numeric_rules.ZScoreOutliersRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - threshold: float = 3.0, - severity: Severity = Severity.ERROR, - name: str = "z_score_outliers", - ): - self.threshold = threshold - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return f"Check for outliers with Z-score > {self.threshold}" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineZScoreOutliersRule( - name=self.name, column=col, threshold=self.threshold, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "z_score_outliers") - ) - return results - - -class DistributionTypeRule(Rule): - """Rule to check data follows an expected distribution. - - Delegates to ``datacheck.rules.numeric_rules.DistributionTypeRule``. 
- """ - - def __init__( - self, - columns: list[str] | None = None, - expected_type: str = "normal", - severity: Severity = Severity.ERROR, - name: str = "distribution_type", - ): - self.expected_type = expected_type - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return f"Check that data follows {self.expected_type} distribution" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineDistributionTypeRule( - name=self.name, column=col, expected_type=self.expected_type, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "distribution_type") - ) - return results - - # --------------------------------------------------------------------------- # String / Pattern # --------------------------------------------------------------------------- @@ -746,162 +559,6 @@ def validate(self, df: pd.DataFrame) -> list[RuleResult]: return results -class BusinessDaysOnlyRule(Rule): - """Rule to check dates fall on business days. - - Delegates to ``datacheck.rules.temporal_rules.BusinessDaysOnlyRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - country_code: str = "US", - severity: Severity = Severity.ERROR, - name: str = "business_days_only", - ): - self.country_code = country_code - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return "Check that dates are business days" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineBusinessDaysOnlyRule( - name=self.name, column=col, country_code=self.country_code, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "business_days_only") - ) - return results - - -# --------------------------------------------------------------------------- -# Semantic -# --------------------------------------------------------------------------- - -class EmailValidRule(Rule): - """Rule to check email address validity. - - Delegates to ``datacheck.rules.semantic_rules.EmailValidRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - severity: Severity = Severity.ERROR, - name: str = "email_valid", - ): - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return "Check that values are valid email addresses" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineEmailValidRule(name=self.name, column=col) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "email_valid") - ) - return results - - -class PhoneValidRule(Rule): - """Rule to check phone number validity. - - Delegates to ``datacheck.rules.semantic_rules.PhoneValidRule``. 
- """ - - def __init__( - self, - columns: list[str] | None = None, - country_code: str | None = None, - severity: Severity = Severity.ERROR, - name: str = "phone_valid", - ): - self.country_code = country_code - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return "Check that values are valid phone numbers" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EnginePhoneValidRule( - name=self.name, column=col, country_code=self.country_code, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "phone_valid") - ) - return results - - -class UrlValidRule(Rule): - """Rule to check URL validity. - - Delegates to ``datacheck.rules.semantic_rules.UrlValidRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - schemes: list[str] | None = None, - severity: Severity = Severity.ERROR, - name: str = "url_valid", - ): - self.schemes = schemes or ["http", "https"] - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return "Check that values are valid URLs" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineUrlValidRule( - name=self.name, column=col, schemes=self.schemes, - ) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "url_valid") - ) - return results - - -class JsonValidRule(Rule): - """Rule to check JSON validity. - - Delegates to ``datacheck.rules.semantic_rules.JsonValidRule``. - """ - - def __init__( - self, - columns: list[str] | None = None, - severity: Severity = Severity.ERROR, - name: str = "json_valid", - ): - super().__init__(name=name, columns=columns, severity=severity) - - def _default_description(self) -> str: - return "Check that values are valid JSON" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - engine_rule = _EngineJsonValidRule(name=self.name, column=col) - engine_result = engine_rule.validate(df) - results.append( - _engine_to_api_result(engine_result, self.severity, "json_valid") - ) - return results - - # --------------------------------------------------------------------------- # Relationship / Composite # --------------------------------------------------------------------------- @@ -1008,67 +665,3 @@ def validate(self, df: pd.DataFrame) -> list[RuleResult]: engine_result = engine_rule.validate(df) return [_engine_to_api_result(engine_result, self.severity, "unique_combination")] - -# --------------------------------------------------------------------------- -# Custom -# --------------------------------------------------------------------------- - -class CustomRule(Rule): - """Rule using a custom validation function. - - This rule is Python-API-only and does not delegate to the engine. 
- """ - - def __init__( - self, - columns: list[str] | None = None, - func: Callable[[pd.Series], pd.Series] | None = None, - severity: Severity = Severity.ERROR, - name: str = "custom", - description: str | None = None, - ): - self.func = func or (lambda x: pd.Series([True] * len(x), index=x.index)) - super().__init__(name=name, columns=columns, severity=severity, description=description) - - def _default_description(self) -> str: - return "Custom validation rule" - - def validate(self, df: pd.DataFrame) -> list[RuleResult]: - results = [] - for col in self._get_columns(df): - try: - valid_mask = self.func(df[col]) - mask = ~valid_mask - mask &= df[col].notna() - - failed_count = int(mask.sum()) - total_count = int(df[col].notna().sum()) - failed_rows = df.index[mask].tolist()[:100] - failed_values = df.loc[mask, col].tolist()[:20] - - results.append(RuleResult( - rule_name=self.name, - column=col, - passed=failed_count == 0, - severity=self.severity, - message=( - f"Column '{col}' has {failed_count} values failing custom validation" - if failed_count > 0 - else f"Column '{col}' passed custom validation" - ), - failed_count=failed_count, - total_count=total_count, - failed_rows=failed_rows, - failed_values=failed_values, - )) - except Exception as e: - results.append(RuleResult( - rule_name=self.name, - column=col, - passed=False, - severity=self.severity, - message=f"Column '{col}' custom validation error: {e}", - failed_count=len(df), - total_count=len(df), - )) - return results diff --git a/docs/index.md b/docs/index.md index a76b925..99c277b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,54 +1,67 @@ # DataCheck -**Data validation engine for data engineers.** Define validation rules in YAML, run checks on files, databases, and cloud warehouses from your terminal. +DataCheck is a deterministic validation engine for data pipelines. Define rules in YAML, run validation in CI, and enforce data quality contracts through POSIX exit codes. No servers, no dashboards, no anomaly detection. -```bash -pip install datacheck-cli ``` +data source → [validation rules] → exit 0: pipeline continues + → exit 1: pipeline fails +``` + +DataCheck executes rules in one pass over the data and produces a binary pass/fail signal. Rules are config-driven, deterministic, and reproducible — the same input always produces the same result. For database sources, eligible rules compile into a single aggregate SQL query and execute entirely inside the database engine. + +--- + +## 1. Overview + +DataCheck runs as a CLI command or Python library. It reads a YAML config that defines validation checks, loads data from a file or database connection, evaluates every active rule, and exits with a POSIX code that encodes the outcome. -DataCheck provides the `datacheck` CLI and a Python API to validate data, profile quality, and detect schema changes. Run it locally during development, embed it in pipelines (Airflow, Dagster, Prefect), or integrate it into CI/CD workflows. +**Where it runs**: Inside existing pipeline compute. It has no server component, no background process, and no external dependency at runtime. It runs wherever Python 3.10+ and the relevant database drivers are installed. + +**How it integrates**: CI/CD systems, Airflow operators, pre-commit hooks, and deployment gates all consume POSIX exit codes. DataCheck uses exit 0 for pass and exit 1 for error-severity failures. Any non-zero exit code aborts the pipeline step in standard CI environments. 
+ +**What it does not do**: DataCheck does not perform statistical anomaly detection, machine learning inference, or schema drift suggestion. It enforces rules that were explicitly defined. A rule either passes or fails. --- -## Installation +## 2. Installation ### Requirements -- **Python 3.10, 3.11, or 3.12** -- **pip 21.0 or greater** +- Python `>=3.10, <4.0` +- pip 21.0 or greater -### Install +### Base install ```bash pip install datacheck-cli ``` -### Install with extras +The base install supports CSV and Parquet validation. No database connectivity is included. + +### Extras Install only the connectors you need: ```bash -# Databases -pip install datacheck-cli[postgresql] -pip install datacheck-cli[mysql] -pip install datacheck-cli[mssql] +# Individual database connectors +pip install datacheck-cli[postgresql] # psycopg2-binary + SQLAlchemy +pip install datacheck-cli[mysql] # mysql-connector-python + SQLAlchemy +pip install datacheck-cli[mssql] # pyodbc + SQLAlchemy -# Cloud warehouses -pip install datacheck-cli[snowflake] -pip install datacheck-cli[bigquery] -pip install datacheck-cli[redshift] -pip install datacheck-cli[warehouses] # All three warehouses +# All three databases +pip install datacheck-cli[databases] -# Cloud storage -pip install datacheck-cli[cloud] # S3, GCS, Azure Blob +# Cloud warehouses +pip install datacheck-cli[snowflake] # snowflake-connector-python +pip install datacheck-cli[bigquery] # google-cloud-bigquery + google-auth +pip install datacheck-cli[redshift] # boto3 + psycopg2-binary + SQLAlchemy -# File formats -pip install datacheck-cli[deltalake] -pip install datacheck-cli[avro] -pip install datacheck-cli[duckdb] +# All three warehouses +pip install datacheck-cli[warehouses] -# Statistical rules -pip install datacheck-cli[statistical] +# Cloud storage +pip install datacheck-cli[s3] # boto3 +pip install datacheck-cli[cloud] # alias for s3 # Everything pip install datacheck-cli[all] @@ -62,287 +75,341 @@ datacheck version --- -## Quickstart +## 3. Core Concepts -### 1. Generate a config with sample data +### Rules -```bash -datacheck config init --with-sample-data +A rule is a single constraint applied to a column. Rules are identified by type (`not_null`, `min`, `regex`, etc.) and parameterized inline. Each rule produces one `RuleResult`: pass, fail, or execution error. + +```yaml +rules: + not_null: true + min: 0 + regex: '^[A-Z]{2}[0-9]{4}$' ``` -This creates a `datacheck.yaml` config file and a sample CSV file. Use `--template` to pick an industry template: +### Checks -```bash -datacheck config init --template ecommerce --with-sample-data +A check groups one or more rules targeting a single column. Every check has a `name`, `column`, and `rules` map. Optional fields: `severity`, `enabled`, `description`, `source`, `table`. + +```yaml +checks: + - name: order_id + column: id + rules: + not_null: true + unique: true + severity: error ``` -### 2. Run validation +When a check defines multiple rules, each rule is evaluated independently and produces its own result entry. -```bash -datacheck validate -``` +### Severity levels -DataCheck auto-discovers config files in this order: `.datacheck.yaml` → `.datacheck.yml` → `datacheck.yaml` → `datacheck.yml`. 
To specify a config explicitly: +| Level | Default | Effect on exit code | +|-------|---------|---------------------| +| `error` | Yes | Failure causes exit 1 | +| `warning` | No | Failure reported; exit code unaffected | +| `info` | No | Failure reported; exit code unaffected | -```bash -datacheck validate --config checks.yaml +Only `error`-severity rule failures cause a non-zero exit code. Warning and info violations appear in output but do not block the pipeline. + +### Exit codes + +| Code | Condition | +|------|-----------| +| `0` | All `error`-severity rules passed (warnings and info violations are allowed) | +| `1` | At least one `error`-severity rule failed | +| `2` | Configuration error (invalid YAML, unknown rule type, missing required field) | +| `3` | Data load error (file not found, connection refused, bad credentials) | +| `4` | Execution error (rule threw an exception, unexpected runtime failure) | + +Exit codes are stable contracts. They do not change between patch releases. + +### SQL pushdown model + +For database sources that support it, DataCheck compiles all eligible rules into a single aggregate `SELECT` statement. This query executes inside the database engine and returns only violation counts — no rows are transferred to the client. + +```sql +-- Generated by DataCheck for a PostgreSQL source +SELECT + COUNT(*) AS _total_rows, + SUM(CASE WHEN "amount" IS NULL THEN 1 ELSE 0 END) AS _c0_not_null, + SUM(CASE WHEN "amount" IS NOT NULL AND "amount" < 0 THEN 1 ELSE 0 END) AS _c0_min, + SUM(CASE WHEN "status" IS NOT NULL AND "status"::text NOT IN ('active','inactive') THEN 1 ELSE 0 END) AS _c1_allowed_values +FROM "orders" +WHERE created_at > '2026-01-01' ``` -### 3. Minimal config example +Rules that cannot be expressed as SQL aggregates (e.g., `date_format_valid`, `foreign_key_exists`) fall back to the in-process Python path, which requires loading the relevant rows. -```yaml -# .datacheck.yaml +### Fail-fast behavior -data_source: - type: csv - path: ./data/orders.csv +DataCheck does not fail fast on individual rule failures. All rules run to completion and results are aggregated. The exit code reflects the combined outcome. This behavior is intentional: a single validation run reports all failures simultaneously. -checks: - - name: id_check - column: id - rules: - not_null: true - unique: true +Data load errors and configuration errors abort immediately before any rules execute. - - name: amount_check - column: amount - rules: - not_null: true - min: 0 - max: 10000 +### Enforcement boundary model - - name: email_check - column: email - rules: - email_valid: true -``` +DataCheck enforces rules at the point where it is invoked. It does not monitor, poll, or persist state between runs. Each invocation is stateless and self-contained. Schema baselines (see Section 9) are the only persistent artifact; they are plain JSON files under version control. --- -## Configuration +## 4. Configuration -### Config file structure +### Config file discovery + +When no `--config` flag is provided, DataCheck searches the current working directory for config files in this order: -A `.datacheck.yaml` file can contain: +1. `.datacheck.yaml` +2. `.datacheck.yml` +3. `datacheck.yaml` +4. `datacheck.yml` + +The first match wins. If none is found and no `--config` flag was supplied, DataCheck exits with code 2. 
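+
+The precedence above is simple enough to mirror in a few lines. The following is an illustrative sketch of the documented lookup order, not DataCheck's actual implementation; it assumes the search happens in the current working directory:
+
+```python
+from pathlib import Path
+
+# Documented discovery order; the first existing file wins.
+_CONFIG_CANDIDATES = [
+    ".datacheck.yaml",
+    ".datacheck.yml",
+    "datacheck.yaml",
+    "datacheck.yml",
+]
+
+
+def discover_config(directory: Path = Path(".")) -> Path | None:
+    """Return the first config file found, or None (the exit-code-2 case)."""
+    for name in _CONFIG_CANDIDATES:
+        candidate = directory / name
+        if candidate.is_file():
+            return candidate
+    return None
+```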
+ +### Config file structure ```yaml -# Data source (inline, for file-based sources) +# Optional: schema version +version: "1.0" + +# Optional: metadata (informational only) +metadata: + author: "data-eng-team" + description: "Order pipeline validation" + tags: ["orders", "production"] + +# Optional: inherit checks from a base config +extends: base.yaml + +# Inline file-based data source data_source: - type: csv + type: csv # csv | parquet path: ./data/orders.csv - options: + options: # passed to pandas loader delimiter: "," encoding: utf-8 -# Or reference named sources +# OR reference named sources from a separate file sources_file: sources.yaml -source: production_db -table: orders +source: production_db # default source name +table: orders # default table -# Validation checks +# Required: validation checks checks: - - name: id_check + - name: order_id column: id rules: not_null: true unique: true - severity: error # error (default), warning, info - enabled: true # default: true + severity: error + enabled: true + description: "Primary key must be present and unique" -# Custom rule plugins -plugins: - - ./custom_rules.py + - name: order_amount + column: amount + rules: + not_null: true + min: 0 + max: 1000000 + severity: error -# Config inheritance -extends: base.yaml + - name: status_values + column: status + rules: + allowed_values: + - pending + - confirmed + - shipped + - cancelled + severity: warning -# Reporting +# Optional: output and reporting reporting: - output_path: ./reports - export_failures: true - failures_file: failures.csv + output_path: ./reports # directory for output files + export_failures: true # auto-export failures to CSV + failures_file: failures.csv # explicit failures CSV path -# Notifications +# Optional: Slack notifications notifications: slack_webhook: "${SLACK_WEBHOOK}" - mention_on_failure: true - -# Sampling -sampling: - strategy: random - params: - sample_rate: 0.1 + mention_on_failure: false ``` -### Checks definition +### data_source definition -Each check targets a column and applies one or more rules: +The `data_source` block is used for file-based sources embedded in the config. For database sources, use `sources_file` + `source` instead. -```yaml -checks: - - name: order_amount # Rule identifier - column: amount # Target column - rules: - not_null: true # Rule type → parameters - min: 0 - max: 100000 - severity: error # error (default), warning, info - enabled: true # Toggle check on/off - - - name: warehouse_orders - column: total - source: snowflake_wh # Override source for this check - table: orders # Override table for this check - rules: - min: 0 -``` +| Field | Required | Description | +|-------|----------|-------------| +| `type` | Yes | `csv` or `parquet` | +| `path` | Yes | Path to file (relative to config dir) | +| `options` | No | Loader keyword args passed to pandas | -### Severity levels +### checks definition -| Severity | Effect | -|----------|--------| -| `error` (default) | Causes exit code 1 on failure | -| `warning` | Reported but does not fail the run | -| `info` | Informational only | +Each item in `checks` requires `name`, `column`, and `rules`. All other fields are optional. -Only `error`-severity failures cause a non-zero exit code. 
+| Field | Required | Default | Description | +|-------|----------|---------|-------------| +| `name` | Yes | — | Unique identifier for this check | +| `column` | Yes | — | Column name in the dataset | +| `rules` | Yes | — | Map of rule type → parameters | +| `severity` | No | `error` | `error`, `warning`, or `info` | +| `enabled` | No | `true` | Set to `false` to skip this check | +| `description` | No | — | Human-readable description | +| `source` | No | — | Override default named source for this check | +| `table` | No | — | Override default table for this check | -### Environment variables +### Rules syntax -Config files support environment variable substitution: +Rules are expressed as a map under each check's `rules` key. The key is the rule type; the value is the parameter (or `true` for boolean rules): + +```yaml +rules: + not_null: true # boolean flag + unique: true + min: 0 # scalar + max: 10000 + range: {min: 0, max: 10000} # dict + regex: '^[A-Z0-9]{8,16}$' # string + allowed_values: [active, inactive, pending] # list + min_length: 2 + max_length: 64 + max_age: '24h' + timestamp_range: {min: "2020-01-01", max: "2030-12-31"} + type: string +``` + +### Environment variable substitution + +Config files support shell-style variable substitution. Applies to all string values in both `datacheck.yaml` and `sources.yaml`. ```yaml sources: - production_db: + production: type: postgresql - host: ${DB_HOST} # Required — fails if not set - port: ${DB_PORT:-5432} # Optional — uses default 5432 + host: ${DB_HOST} # required — empty string if unset + port: ${DB_PORT:-5432} # optional — falls back to 5432 database: ${DB_NAME} user: ${DB_USER} password: ${DB_PASSWORD} ``` -Use `datacheck config env` to list all variables referenced in a config and their current values: - -```bash -datacheck config env datacheck.yaml -``` +| Syntax | Behavior | +|--------|----------| +| `${VAR}` | Substituted with the env var value; empty string if unset | +| `${VAR:-default}` | Uses `default` if `VAR` is unset or empty | ### Config inheritance -Use `extends` to inherit rules from a base config and override or add checks per environment: +Use `extends` to inherit checks from a base config and add or override checks per environment: ```yaml -# base.yaml — shared rules +# base.yaml data_source: - type: csv - path: ./data/orders.csv + type: parquet + path: ./data/transactions.parquet checks: - - name: id_check - column: id + - name: tx_id + column: transaction_id rules: not_null: true unique: true ``` ```yaml -# production.yaml — inherits base, adds stricter rules +# production.yaml extends: base.yaml checks: - - name: amount_check + - name: tx_amount column: amount rules: - min: 0 - max: 50000 + positive: true + max: 500000 severity: error ``` -### Config validation - -Check config for errors before running: - -```bash -datacheck config validate -datacheck config validate datacheck.yaml --strict # Fail on warnings too -``` +Checks in the child config are merged with the parent. The child takes precedence on name conflicts. -### Auto-generate config from data +--- -Analyze a data file and generate validation rules automatically: +## 5. 
Supported Rules -```bash -datacheck config generate data.csv -datacheck config generate data.csv --confidence high -datacheck config generate data.csv -o custom.yaml -``` +### Null and uniqueness -Options: +| Rule | Parameter | Description | +|------|-----------|-------------| +| `not_null` | `true` | Fails if any value is null or missing | +| `unique` | `true` | Fails if any duplicate values exist; nulls are excluded from uniqueness check | +| `unique_combination` | `[col1, col2, ...]` | Fails if any combination of the listed columns is duplicated; null rows are excluded | -| Flag | Description | -|------|-------------| -| `--confidence / -c` | Minimum confidence threshold: `low`, `medium` (default), `high` | -| `--output / -o` | Output config file path (default: `datacheck.yaml`) | -| `--name / -n` | Dataset name (default: derived from filename) | -| `--force / -f` | Overwrite existing config file | +### Numeric -The generated config includes: +| Rule | Parameter | Description | +|------|-----------|-------------| +| `min` | number | Fails if any non-null value is less than the threshold | +| `max` | number | Fails if any non-null value is greater than the threshold | +| `range` | `{min: N, max: N}` | Fails if any non-null value falls outside the inclusive range | +| `positive` | `true` | Fails if any non-null value is `<= 0` | +| `non_negative` | `true` | Fails if any non-null value is `< 0` | -- **Type inference**: Correctly distinguishes `int`, `numeric`, `bool`, `date`, and `string` types -- **Regex patterns**: Auto-detected patterns for IDs, URLs, dates, etc. using `[0-9]` character classes (not `\d`) for cross-language compatibility -- **Statistical rules**: `mean_between`, `std_dev_less_than`, `percentile_range` with thresholds derived from data -- **Semantic rules**: `email_valid`, `phone_valid`, `url_valid`, `json_valid` based on column name detection -- **Cross-column rules**: `sum_equals` auto-detected when two numeric columns sum to a third -- **Temporal rules**: `timestamp_range` with 1-day margin, `no_future_timestamps`, `date_format` with detected format string -- **Reporting block**: Includes `output_path` and `export_failures` settings -- **Data source block**: Includes file type, path, and `options` (delimiter, encoding, etc.) +Null values are always skipped by numeric rules. To enforce non-null numeric bounds, combine with `not_null: true`. -### Config validation error reporting +### String and pattern -`datacheck config validate` reports **all** errors at once instead of stopping at the first one. 
This includes schema errors, missing fields (`name`, `column`, `rules`), and invalid rule definitions: +| Rule | Parameter | Description | +|------|-----------|-------------| +| `regex` | regex string | Fails if any non-null value does not match the pattern | +| `allowed_values` | list | Fails if any non-null value is not in the list | +| `min_length` | integer | Fails if any non-null string has fewer characters than the threshold | +| `max_length` | integer | Fails if any non-null string has more characters than the threshold | +| `type` | type string | Fails if the column's detected type does not match; accepts `int` (or `integer`), `float` (or `numeric`), `string`, `bool`, `date`, `datetime` | +| `boolean` | `true` | Fails if any non-null value is not a boolean representation (`True`/`False`, `1`/`0`, `true`/`false`) | -```bash -datacheck config validate checks.yaml -# Configuration has errors: -# - Check #2: Missing required field 'column' -# - Check #5: Missing required field 'rules' -# - Schema validation failed at 'checks.3.rules.min': -1 is not valid -``` +`regex` is applied per-value. Null values are skipped. Pattern matching is case-sensitive unless the pattern includes inline flags. -### Show resolved config +### Temporal -Display the fully resolved configuration with env vars and inheritance applied: +| Rule | Parameter | Description | +|------|-----------|-------------| +| `max_age` | duration string | Fails if any non-null timestamp is older than the specified duration from now | +| `timestamp_range` | `{min: "ISO8601", max: "ISO8601"}` | Fails if any non-null timestamp falls outside the inclusive range | +| `date_range` | `{min: "ISO8601", max: "ISO8601"}` | Equivalent to `timestamp_range`; use for date-only columns | +| `no_future_timestamps` | `true` | Fails if any non-null timestamp is greater than the current time at execution | +| `date_format_valid` | strftime string | Fails if any non-null value cannot be parsed with the given format | +| `date_format` | `{format: strftime string}` | Alternate dict form of `date_format_valid` | -```bash -datacheck config show -datacheck config show datacheck.yaml --format json -datacheck config show --no-resolve-env -datacheck config show --no-resolve-extends -``` +**Duration syntax for `max_age`**: A numeric value followed by a unit suffix. -### Merge configs +| Suffix | Unit | +|--------|------| +| `m` | minutes | +| `h` | hours | +| `d` | days | +| `w` | weeks | -Merge multiple configuration files. Later files override values from earlier files: +Examples: `'15m'`, `'24h'`, `'7d'`, `'4w'` -```bash -datacheck config merge base.yaml production.yaml -datacheck config merge base.yaml prod.yaml -o merged.yaml -``` +### Relationship -### List templates +| Rule | Parameter | Description | +|------|-----------|-------------| +| `foreign_key_exists` | Python API only | Fails if any value in the column does not exist in a reference dataset | +| `sum_equals` | `{column_a: col, column_b: col, tolerance: float}` | Fails if any row where `column_a + column_b ≠ column` (within tolerance, default `0.01`) | -Show all available templates with descriptions: +`foreign_key_exists` requires a reference dataset passed via the Python API. It is not usable from CLI config alone. -```bash -datacheck config templates -``` +`sum_equals` is applied row-by-row: for each row, the check is whether `column_a + column_b` equals the value in the target column. --- -## Data Sources +## 6. 
Data Sources -### File sources (inline in config) +### File-based sources **CSV** @@ -363,1177 +430,851 @@ data_source: path: ./data/orders.parquet ``` -**Avro** (requires `pip install datacheck-cli[avro]`) - -```yaml -data_source: - type: avro - path: ./data/orders.avro -``` - -**Delta Lake** (requires `pip install datacheck-cli[deltalake]`) +DataCheck uses column pruning for both CSV and Parquet: only columns referenced by checks are loaded. For large files, this reduces memory usage proportionally. -```yaml -data_source: - type: delta - path: ./data/delta-table -``` - -Delta Lake supports time travel: - -```bash -datacheck validate --delta-version 5 -datacheck validate --delta-timestamp "2026-01-15T10:00:00" -datacheck validate --storage-options '{"AWS_ACCESS_KEY_ID": "..."}' -``` - -**SQLite** - -```yaml -data_source: - type: sqlite - path: ./data/analytics.db -``` - -**DuckDB** (requires `pip install datacheck-cli[duckdb]`) - -```yaml -data_source: - type: duckdb - path: ./data/analytics.duckdb -``` - -### Database sources (named sources) +### Database sources via sources.yaml -For databases, define named sources in a `sources.yaml` file: +Database connections are defined in a separate `sources.yaml` file and referenced by name. This separates credentials from validation logic. ```yaml # sources.yaml sources: production_db: type: postgresql - host: ${DB_HOST} - port: ${DB_PORT:-5432} - database: ${DB_NAME} - user: ${DB_USER} - password: ${DB_PASSWORD} + host: ${PG_HOST} + port: ${PG_PORT:-5432} + database: ${PG_DATABASE} + user: ${PG_USER} + password: ${PG_PASSWORD} schema: public - mysql_db: - type: mysql - host: ${MYSQL_HOST} - port: ${MYSQL_PORT:-3306} - database: ${MYSQL_DB} - user: ${MYSQL_USER} - password: ${MYSQL_PASSWORD} - - mssql_db: - type: mssql - host: ${MSSQL_HOST} - port: ${MSSQL_PORT:-1433} - database: ${MSSQL_DB} - user: ${MSSQL_USER} - password: ${MSSQL_PASSWORD} -``` - -### Cloud warehouse sources - -```yaml -# sources.yaml -sources: - snowflake_wh: + warehouse: type: snowflake account: ${SF_ACCOUNT} user: ${SF_USER} password: ${SF_PASSWORD} - warehouse: ${SF_WAREHOUSE:-COMPUTE_WH} database: ${SF_DATABASE} schema: ${SF_SCHEMA:-PUBLIC} + warehouse: ${SF_WAREHOUSE} role: ${SF_ROLE} - # SSO: authenticator: externalbrowser - # Key pair: private_key_path: /path/to/key.p8 - bigquery_ds: + analytics: type: bigquery project_id: ${GCP_PROJECT} dataset_id: ${GCP_DATASET} - credentials_path: /path/to/service-account.json + credentials_path: /secrets/bq-service-account.json location: US + mysql_db: + type: mysql + host: ${MYSQL_HOST} + port: ${MYSQL_PORT:-3306} + database: ${MYSQL_DATABASE} + user: ${MYSQL_USER} + password: ${MYSQL_PASSWORD} + + sqlserver_db: + type: mssql + host: ${MSSQL_HOST} + port: ${MSSQL_PORT:-1433} + database: ${MSSQL_DATABASE} + user: ${MSSQL_USER} + password: ${MSSQL_PASSWORD} + redshift_db: type: redshift - host: ${REDSHIFT_HOST} - port: ${REDSHIFT_PORT:-5439} - database: ${REDSHIFT_DB} - user: ${REDSHIFT_USER} - password: ${REDSHIFT_PASSWORD} + host: ${RS_HOST} + port: ${RS_PORT:-5439} + database: ${RS_DATABASE} + user: ${RS_USER} + password: ${RS_PASSWORD} schema: public - # IAM auth: cluster_identifier, region, iam_auth: true + # IAM auth (optional): + # cluster_identifier: ${RS_CLUSTER} + # region: ${AWS_REGION} + # iam_auth: true ``` -Snowflake, BigQuery, and Redshift support **server-side filtering and sampling** — WHERE clauses, LIMIT, and TABLESAMPLE execute on the warehouse to minimize data transfer before validation runs locally. 
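Named sources can also be used programmatically through the Python API described in Section 10. A minimal sketch, assuming a `checks.yaml` whose `sources_file` points at the `sources.yaml` above (file and source names are illustrative):

```python
from datacheck import ValidationEngine

# checks.yaml and sources.yaml as in the examples above
engine = ValidationEngine(config_path="checks.yaml", sources_file="sources.yaml")
summary = engine.validate_sources(source_name="production_db", table="orders")
print(f"{summary.passed_rules}/{summary.total_rules} rules passed")
```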
+Reference in config: + +```yaml +# datacheck.yaml +sources_file: sources.yaml +source: production_db +table: orders + +checks: + - name: order_id + column: id + rules: + not_null: true + unique: true +``` -### Cloud storage sources +### Cloud storage ```yaml -# sources.yaml sources: - s3_data: + s3_source: type: s3 - bucket: my-bucket - path: data/orders.csv - region: us-east-1 + bucket: ${S3_BUCKET} + path: data/orders.csv # key within the bucket + region: ${AWS_REGION:-us-east-1} access_key: ${AWS_ACCESS_KEY_ID} secret_key: ${AWS_SECRET_ACCESS_KEY} - - gcs_data: - type: gcs - bucket: my-bucket - path: data/orders.csv - credentials_path: /path/to/service-account.json - - azure_data: - type: azure - container: my-container - path: data/orders.csv - connection_string: ${AZURE_STORAGE_CONNECTION_STRING} - # Or: account_name + account_key -``` - -### Connection strings - -You can also pass connection strings directly to the CLI: - -```bash -datacheck validate postgresql://user:pass@host:5432/db --table orders -datacheck validate mysql://user:pass@host:3306/db --table orders -datacheck validate mssql://user:pass@host:1433/database --table orders -datacheck validate snowflake://account/database/schema --table orders -datacheck validate bigquery://project/dataset --table orders -datacheck validate redshift://user:pass@host:5439/database/schema --table orders ``` -### Named sources and per-check overrides +### Named sources -Reference a named source in your config: +Reference a named source in config to use it as the default: ```yaml -# .datacheck.yaml sources_file: sources.yaml source: production_db table: orders +``` + +Override at runtime without modifying the config: + +```bash +datacheck validate --source staging_db --table orders --config checks.yaml +datacheck validate --source analytics --table transactions --config checks.yaml +``` + +Per-check source overrides allow validating columns from different sources in a single run: +```yaml checks: - - name: customer_email - column: email + - name: local_id + column: id + source: production_db + table: orders rules: not_null: true - - name: order_total - column: total - source: snowflake_wh # Override source for this check - table: orders + - name: warehouse_total + column: revenue + source: warehouse + table: daily_revenue rules: - min: 0 + positive: true ``` -Switch sources at runtime: +### Switching sources at runtime ```bash -datacheck validate --source snowflake_wh --config checks.yaml -datacheck validate --source s3_data --sources-file sources.yaml -``` +# Override source +datacheck validate --source production_db --table orders -### Connection pre-validation +# Override table +datacheck validate --source production_db --table refunds -When validating against database sources, DataCheck tests connectivity for **all** referenced sources before running any validation rules. If multiple sources are unreachable, all connection errors are reported together: +# Override with WHERE clause +datacheck validate --source production_db --table orders --where "status = 'pending'" -``` -Source connectivity check failed: - - Source 'production_db' (postgresql): Connection failed — could not connect to server - - Source 'analytics_wh' (snowflake): Connection failed — invalid credentials +# Override with custom query +datacheck validate --source production_db --query "SELECT * FROM orders WHERE created_at > '2026-01-01'" ``` -For file-based sources, DataCheck verifies the file exists before validation begins. 
+When `--query` is specified, SQL pushdown is disabled for that run regardless of source type. -### SQL filtering +--- -Use `--table`, `--where`, and `--query` for server-side filtering: +## 7. CLI Reference + +### `datacheck validate` + +Run validation against a data source. The primary command. -```bash -datacheck validate --source production_db --table orders --where "status = 'active'" -datacheck validate --source production_db --query "SELECT * FROM orders WHERE created_at > '2026-01-01'" +``` +datacheck validate [DATA_SOURCE] [OPTIONS] ``` ---- +`DATA_SOURCE` is an optional positional argument: a file path or connection string. If omitted, the source is resolved from the config's `data_source`, `source`, or `sources_file` fields. -## Validation Rules +**Data source options** -### Null and uniqueness +| Flag | Short | Description | +|------|-------|-------------| +| `--config` | `-c` | Path to validation config YAML. Auto-discovered if omitted. | +| `--source` | | Named source from sources.yaml | +| `--sources-file` | | Path to sources YAML (overrides config `sources_file`) | +| `--table` | `-t` | Database table name | +| `--where` | `-w` | SQL WHERE clause | +| `--query` | `-q` | Custom SQL query; disables SQL pushdown | +| `--schema` | `-s` | Schema or dataset name | -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `not_null` | `not_null: true` | No null or missing values | -| `unique` | `unique: true` | No duplicate values (nulls ignored) | -| `unique_combination` | `unique_combination: [col1, col2]` | Composite uniqueness across columns | +**Warehouse-specific options** -### Numeric +| Flag | Description | +|------|-------------| +| `--warehouse` | Snowflake warehouse name | +| `--credentials` | Path to credentials file (BigQuery service account JSON) | +| `--region` | AWS region for Redshift IAM auth | +| `--cluster` | Cluster identifier for Redshift IAM auth | +| `--iam-auth` | Use IAM authentication for Redshift | -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `min` | `min: 0` | Column >= value | -| `max` | `max: 10000` | Column <= value | -| `mean_between` | `mean_between: {min: 10, max: 50}` | Column mean within range | -| `std_dev_less_than` | `std_dev_less_than: 5.0` | Standard deviation below threshold | -| `percentile_range` | `percentile_range: {p25_min: 10, p25_max: 20, p75_min: 80, p75_max: 90}` | 25th and 75th percentile bounds | -| `z_score_outliers` | `z_score_outliers: 3.0` | Detect outliers by z-score (default threshold: 3.0) | -| `distribution_type` | `distribution_type: 'normal'` | Validate distribution shape — `normal` or `uniform` (uses KS test) | +**Execution options** -### String and pattern +| Flag | Default | Description | +|------|---------|-------------| +| `--parallel` | off | Enable multi-threaded parallel execution | +| `--workers` | CPU count | Worker thread count (used with `--parallel`) | +| `--chunk-size` | 100000 | Rows per chunk for parallel execution | +| `--progress / --no-progress` | on | Show/hide terminal progress indicator | -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `regex` | `regex: '^[A-Z]{2}[0-9]{4}$'` | Match regex pattern | -| `allowed_values` | `allowed_values: [active, inactive, pending]` | Value in allowed set | -| `type` | `type: 'string'` | Data type check (`int`, `numeric`, `string`, `bool`, `date`, `datetime`) | -| `length` | `length: {min: 1, max: 100}` | String length constraints | -| `min_length` | `min_length: 1` | Minimum string 
length | -| `max_length` | `max_length: 255` | Maximum string length | +**Output options** -### Temporal +| Flag | Short | Default | Description | +|------|-------|---------|-------------| +| `--output` | `-o` | — | Save results to file | +| `--format` | `-f` | `json` | Output format for `--output`: `json`, `sarif`, `markdown`, `csv` | +| `--csv-export` | | — | Export failure details to a CSV file | +| `--suggestions / --no-suggestions` | | on | Show actionable suggestions for failures | +| `--slack-webhook` | | — | Slack webhook URL for result notifications | -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `max_age` | `max_age: '24h'` | Data freshness — supports `h` (hours), `d` (days), `w` (weeks), `m` (minutes) | -| `timestamp_range` | `timestamp_range: {min: "2025-01-01", max: "2026-12-31"}` | Timestamps within range (ISO format) | -| `date_range` | `date_range: {min: "2025-01-01", max: "2026-12-31"}` | Alias for `timestamp_range` | -| `no_future_timestamps` | `no_future_timestamps: true` | No timestamps beyond current time | -| `date_format_valid` | `date_format_valid: '%Y-%m-%d'` | Validates date format (Python strftime) | -| `date_format` | `date_format: {format: '%Y-%m-%d'}` | Alias for `date_format_valid` (dict form) | -| `business_days_only` | `business_days_only: 'US'` | Weekdays only — pass country code (e.g., `'US'`, `'GB'`) or `true` for default | +**Logging options** -### Semantic and format +| Flag | Default | Description | +|------|---------|-------------| +| `--log-level` | `WARNING` | `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | +| `--log-format` | `console` | `console` (human-readable) or `json` (structured) | +| `--log-file` | — | Write logs to file with automatic rotation | +| `--verbose` / `-v` | off | Shorthand for `--log-level DEBUG` | -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `email_valid` | `email_valid: true` | RFC 5322 email format (two-stage: regex pre-filter + email-validator library) | -| `phone_valid` | `phone_valid: 'US'` | Phone number format (phonenumbers library, supports all countries; pass country code or `true`) | -| `url_valid` | `url_valid: true` | URL structure validation | -| `json_valid` | `json_valid: true` | Valid JSON parsing | +### `datacheck schema` -### Cross-column and relationships +Schema contract commands. See Section 9. -| Rule | YAML Syntax | Description | -|------|------------|-------------| -| `unique_combination` | `unique_combination: [col1, col2]` | Composite uniqueness across multiple columns | -| `foreign_key_exists` | Python API | Foreign key validation against a reference DataFrame (use Python API to pass live data) | -| `sum_equals` | `sum_equals: {column_a: col1, column_b: col2}` | Verify column equals sum of two other columns (with optional `tolerance`) | +```bash +datacheck schema capture [DATA_SOURCE] [OPTIONS] +datacheck schema compare [DATA_SOURCE] [OPTIONS] +datacheck schema show [OPTIONS] +datacheck schema list [OPTIONS] +datacheck schema history [OPTIONS] +``` -### Example: complete config with rules +### `datacheck config` -```yaml -data_source: - type: csv - path: ./data/orders.csv +Configuration management commands. 
-checks: - - name: id_not_null - column: id - rules: - not_null: true - unique: true +```bash +datacheck config init [OPTIONS] # Generate config from template +datacheck config validate [CONFIG_PATH] # Validate config syntax +datacheck config show [CONFIG_PATH] # Show resolved config +datacheck config env [CONFIG_PATH] # Show environment variables referenced +datacheck config merge [FILES...] -o out # Merge multiple configs +datacheck config templates # List available templates +``` - - name: amount_range - column: amount - rules: - not_null: true - min: 0 - max: 100000 - z_score_outliers: - threshold: 3.0 - severity: error +**`config init` options** - - name: email_format - column: email - rules: - email_valid: true - severity: warning +| Flag | Short | Default | Description | +|------|-------|---------|-------------| +| `--output` | `-o` | `datacheck.yaml` | Output config file path | +| `--template` | `-t` | `basic` | Template: `basic`, `ecommerce`, `healthcare`, `finance`, `saas`, `iot` | +| `--with-sample-data` | `-s` | off | Generate matching sample CSV file | +| `--sample-rows` | | 100 | Rows in generated sample CSV | +| `--force` | `-f` | off | Overwrite existing config file | - - name: order_date - column: created_at - rules: - no_future_timestamps: true - max_age: '30d' - date_format_valid: '%Y-%m-%d %H:%M:%S' - - - name: status_values - column: status - rules: - allowed_values: - - pending - - confirmed - - shipped - - delivered - - cancelled -``` - ---- - -## Custom Rules - -### Creating custom rules - -Create a Python file with functions decorated with `@custom_rule`. Each function receives a `pd.Series` and optional parameters, and returns a boolean `pd.Series` where `True` means valid: - -```python -# custom_rules.py -from datacheck.plugins.decorators import custom_rule -import pandas as pd - -@custom_rule -def is_business_email(column: pd.Series, allowed_domains: list) -> pd.Series: - """Validate that emails use approved business domains.""" - domains = column.dropna().str.split("@").str[1] - return domains.isin(allowed_domains) +### `datacheck version` -@custom_rule -def is_positive_margin(column: pd.Series, min_margin: float = 0.0) -> pd.Series: - """Validate profit margin is above threshold.""" - return column.dropna() >= min_margin +```bash +datacheck version ``` -### Referencing plugins in config +### Output formats -```yaml -plugins: - - ./custom_rules.py +| Format | Used with | Description | +|--------|-----------|-------------| +| Terminal | Always | Rich-formatted table output; cannot be suppressed | +| `json` | `--output file.json` | Full results with all rule outcomes, failure details, and execution stats | +| `sarif` | `--output file.sarif` | SARIF 2.1.0 for GitHub Security tab and SARIF-aware tools | +| `markdown` | `--output report.md` | Human-readable report with results table and failure details | +| `csv` | `--output failures.csv --format csv` or `--csv-export failures.csv` | Failure rows only: check name, column, severity, failed count, sample values | -checks: - - name: email_domain - column: email - rules: - custom: - rule: is_business_email - params: - allowed_domains: ["company.com", "corp.com"] +Terminal output is always shown regardless of `--output`. The `--format` flag controls only the file output format. 
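For scripted runs, the exit code can be combined with the file outputs above. A minimal wrapper sketch (file names are illustrative; exit codes are listed in the table below):

```python
import json
import subprocess

proc = subprocess.run(
    ["datacheck", "validate", "--config", "checks.yaml",
     "--output", "results.json", "--csv-export", "failures.csv"]
)

if proc.returncode == 0:
    print("all error-severity rules passed")
elif proc.returncode == 1:
    # results.json holds the full rule outcomes; failures.csv has the failing rows
    with open("results.json") as fh:
        results = json.load(fh)
    print("validation failed, see failures.csv; result keys:", sorted(results))
else:
    # 2 = config error, 3 = data load error, 4 = execution error
    print(f"datacheck did not complete, exit code {proc.returncode}")
```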
- - name: margin_check - column: profit_margin - rules: - custom: - rule: is_positive_margin - params: - min_margin: 0.05 -``` +### Exit codes table -### Plugin registry - -- `load_from_file()` imports the Python module and registers all `@custom_rule` decorated functions -- Registered rules become available through the `RuleFactory` alongside built-in rules -- The global registry tracks all loaded custom rules +| Code | Meaning | Common causes | +|------|---------|---------------| +| 0 | Pass | All `error`-severity rules passed | +| 1 | Validation failure | One or more `error`-severity rules failed | +| 2 | Configuration error | Invalid YAML, unknown rule type, no data source defined | +| 3 | Data load error | File not found, connection refused, authentication failure | +| 4 | Execution error | Exception during rule evaluation; unexpected runtime failure | --- -## Data Profiling +## 8. CI/CD Integration -### Running profiling +### Generic CI usage ```bash -# Direct file path -datacheck profile data.csv - -# Auto-discover config -datacheck profile - -# Explicit config file -datacheck profile --config checks.yaml - -# Named source -datacheck profile --source production_db --sources-file sources.yaml - -# Named source with table -datacheck profile --source production_db --table orders +pip install datacheck-cli +datacheck validate --config checks.yaml --output results.json ``` -### Profile options +DataCheck exits non-zero on any failure. Most CI systems treat non-zero exit codes as build failures automatically. -| Flag | Description | -|------|-------------| -| `--format / -f` | Output format: `terminal` (default), `json`, `markdown` | -| `--output / -o` | Write output to file | -| `--outlier-method` | Outlier detection method: `zscore` (default) or `iqr` | -| `--suggestions / --no-suggestions` | Show rule suggestions (default: enabled) | -| `--correlations / --no-correlations` | Show correlation matrix | +### GitHub Actions — basic -```bash -datacheck profile data.csv --format json -o profile.json -datacheck profile --outlier-method iqr --correlations -datacheck profile --format markdown -o report.md -``` +```yaml +name: data-quality +on: [push, pull_request] -### What profiling computes +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install datacheck-cli + - run: datacheck validate --config .datacheck.yaml +``` -- **Basic counts**: total rows, null count, unique count, duplicate count, completeness percentage -- **Numeric statistics**: min, max, mean, median, standard deviation, 25th/50th/75th percentiles -- **Value distributions**: top N values with counts -- **Outlier detection**: Z-score method (|z| > 3.0) or IQR method (values outside Q1-1.5\*IQR to Q3+1.5\*IQR) -- **Correlation matrix**: Pearson correlation between all numeric columns -- **Quality scoring**: 0-100 score per column and per dataset +### GitHub Actions — SARIF upload -### Quality scoring +SARIF output integrates with the GitHub Security tab. Findings appear as code scanning alerts on pull requests. 
-Each column receives a 0-100 quality score based on: +```yaml +name: data-quality +on: [push, pull_request] -| Factor | What it measures | -|--------|-----------------| -| **Completeness** | Penalizes null/missing values | -| **Uniqueness** | Penalizes duplicate values | -| **Validity** | Type consistency across the column | -| **Consistency** | Low variance in categorical columns | +permissions: + contents: read + security-events: write -The dataset score is a weighted average of all column scores. +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install datacheck-cli + - name: Run validation + run: | + datacheck validate \ + --config .datacheck.yaml \ + --format sarif \ + --output results.sarif + continue-on-error: true # let the upload step run even on failure + - uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: results.sarif +``` -### Rule suggestions +**SARIF output**: DataCheck generates SARIF 2.1.0. Each failed rule produces a result entry with `ruleId` (rule type), `level` (mapped from severity), and `message` (failure description). The tool identifier is `datacheck`. -The profiler automatically suggests validation rules based on data patterns: +### GitLab CI -- **Numeric columns**: range rules, outlier thresholds, distribution checks, type (`int` vs `numeric`) -- **String columns**: length constraints, regex patterns, allowed value sets -- **Temporal columns**: date format detection, timestamp ranges (with margin), `no_future_timestamps` -- **Semantic columns**: `email_valid`, `phone_valid`, `url_valid`, `json_valid` inferred from column names and content -- **Cross-column**: `sum_equals` auto-detected when two numeric columns sum to a third -- **All columns**: null checks, uniqueness rules +```yaml +validate_data: + image: python:3.12 + script: + - pip install datacheck-cli + - datacheck validate --config .datacheck.yaml --output results.json + artifacts: + paths: + - results.json + when: always +``` ---- +### Database sources in CI -## Schema Detection and Evolution +Store credentials as CI secrets and substitute via environment variables: -### Commands +```yaml +# .datacheck.yaml +sources_file: sources.yaml +source: production_db +table: orders -```bash -datacheck schema capture # Save current schema as baseline -datacheck schema compare # Compare current data against baseline -datacheck schema show # Display detected schema -datacheck schema list # List all saved baselines -datacheck schema history # View capture history +checks: + - name: order_amount + column: amount + rules: + not_null: true + positive: true ``` -### Schema capture - -```bash -datacheck schema capture data.csv -datacheck schema capture --source production_db --sources-file sources.yaml -datacheck schema capture --name v2-baseline -datacheck schema capture --baseline-dir ./schemas -datacheck schema capture --no-history +```yaml +# sources.yaml +sources: + production_db: + type: postgresql + host: ${DB_HOST} + port: ${DB_PORT:-5432} + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} ``` -| Flag | Description | -|------|-------------| -| `--name / -n` | Baseline name (default: `baseline`) | -| `--baseline-dir` | Storage directory (default: `.datacheck/schemas/`) | -| `--save-history / --no-history` | Save to history (default: enabled) | - -### Schema compare - -```bash -datacheck schema compare data.csv -datacheck schema compare --baseline v2-baseline -datacheck 
schema compare --fail-on-breaking -datacheck schema compare --rename-threshold 0.9 -datacheck schema compare --format json +```yaml +# GitHub Actions job env block +env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} ``` -| Flag | Description | -|------|-------------| -| `--baseline / -b` | Baseline name to compare against (default: `baseline`) | -| `--rename-threshold` | Similarity threshold for rename detection (0.0-1.0, default: 0.8) | -| `--fail-on-breaking` | Exit with code 1 on breaking changes | -| `--format / -f` | Output format: `terminal` (default) or `json` | +### Airflow integration -### Schema compare exit codes +See Section 8 of this document (Airflow operators) and Section 10 (Python API) for programmatic use. -| Code | Meaning | -|------|---------| -| 0 | Compatible — no breaking changes | -| 1 | Breaking changes detected (with `--fail-on-breaking`) | -| 2 | Baseline not found | -| 3 | Data load error | -| 4 | Unexpected error | - -### What schema tracks - -For each column: name, data type, nullable status, position, unique value count, null percentage. For the dataset: row count, source identifier, capture timestamp. +**BashOperator pattern** (simplest): -### Change types detected - -| Change | Compatibility Level | -|--------|-------------------| -| Column added | COMPATIBLE | -| Column removed | BREAKING | -| Column renamed | WARNING | -| Nullable changed | WARNING | -| Order changed | COMPATIBLE | - -### Type change compatibility +```python +from airflow.operators.bash import BashOperator -**Compatible changes** (widening): `int→float`, `int→string`, `float→string`, `bool→string`, `date→datetime`, `date→string`, `datetime→string` +validate = BashOperator( + task_id="validate_orders", + bash_command=( + "datacheck validate " + "--config /opt/airflow/dags/checks/orders.yaml " + "--output /tmp/results_{{ ds }}.json" + ), + env={ + "DB_HOST": "{{ var.value.db_host }}", + "DB_PASSWORD": "{{ var.value.db_password }}", + }, +) +``` -**Breaking changes** (narrowing): `float→int`, `string→int`, `string→float`, `string→bool`, `datetime→date`, `string→datetime`, `string→date` +**DataCheckOperator**: -### Baseline storage +```python +from datacheck.airflow.operators import DataCheckOperator -- Baselines are stored as JSON files in `.datacheck/schemas/` -- History entries are stored in `.datacheck/schemas/history/` with timestamps (e.g. `schema_20260212_143000.json`) -- Use `datacheck schema list` to see all baselines -- Use `datacheck schema history --limit 20` to see recent history +validate_orders = DataCheckOperator( + task_id="validate_orders", + config_path="/opt/airflow/dags/checks/orders.yaml", + source_name="production_db", + sources_file="/opt/airflow/dags/sources.yaml", + table="orders", + where="created_at >= '{{ ds }}'", + fail_on_error=True, + push_results=True, + min_pass_rate=None, # None = disabled; set to e.g. 
95.0 to require 95% pass rate +) +``` ---- +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `config_path` | str | required | Path to validation config YAML | +| `file_path` | str | None | Path to data file (CSV or Parquet) | +| `source_name` | str | None | Named source from sources.yaml | +| `sources_file` | str | None | Path to sources YAML | +| `table` | str | None | Database table | +| `where` | str | None | SQL WHERE clause | +| `query` | str | None | Custom SQL query | +| `parallel` | bool | False | Enable parallel execution | +| `workers` | int | None | Worker thread count | +| `min_pass_rate` | float | None | Minimum rule pass rate 0–100; None disables | +| `fail_on_error` | bool | True | Fail Airflow task on validation failure | +| `push_results` | bool | True | Push results dict to XCom | -## Sampling Strategies +Template fields (Jinja-rendered): `config_path`, `file_path`, `source_name`, `sources_file`, `table`, `where`, `query`. -### Available strategies +`DataCheckOperator` raises `AirflowException` when `fail_on_error=True` and the validation exits non-zero. -| Strategy | Description | Key Parameters | -|----------|-------------|----------------| -| `random` | Simple random sampling | `sample_rate` or `sample_count`, `seed` | -| `stratified` | Preserve value distributions across groups | `stratify_column`, `min_per_stratum` | -| `time_based` | Sample within a time window | `time_column`, `start_date`, `end_date` | -| `error_focused` | Prioritize rows matching error conditions | `error_conditions` (e.g. `['age<0', 'price>10000']`) | -| `adaptive` | Adjust sample size based on data characteristics | `target_quality`, `initial_size` | -| `reservoir` | Single-pass sampling for streaming data | `sample_count` | -| `systematic` | Every Nth row | `sample_rate` | -| `top_n` | First N rows | `--top N` | +**DataCheckSchemaOperator**: -### CLI sampling flags +```python +from datacheck.airflow.operators import DataCheckSchemaOperator -```bash -# Random sampling -datacheck validate --sample-rate 0.1 # 10% of rows -datacheck validate --sample-count 1000 # Exactly 1000 rows -datacheck validate --sample-count 1000 --seed 42 # Reproducible - -# First N rows -datacheck validate --top 500 - -# Strategy-based -datacheck validate --sample-strategy stratified --stratify region -datacheck validate --sample-strategy time_based --time-column created_at --start-date 2026-01-01 --end-date 2026-02-01 -datacheck validate --sample-strategy error_focused --error-indicators "age<0,price>10000" +check_schema = DataCheckSchemaOperator( + task_id="check_schema", + file_path="/data/orders_{{ ds }}.parquet", + baseline_name="orders-baseline", + baseline_dir="/opt/airflow/schemas", + fail_on_breaking=True, + push_results=True, +) ``` -| Flag | Description | -|------|-------------| -| `--sample-rate` | Fraction to sample (0.0-1.0) | -| `--sample-count` | Exact number of rows to sample | -| `--top` | First N rows only | -| `--sample-strategy` | Strategy name: `random`, `stratified`, `time_based`, `error_focused`, `adaptive`, `reservoir` | -| `--stratify` | Column for stratified sampling | -| `--seed` | Random seed for reproducibility | -| `--time-column` | Column for time-based sampling | -| `--start-date` | Start date (ISO format) | -| `--end-date` | End date (ISO format) | -| `--error-indicators` | Comma-separated error conditions | +Auto-captures a new baseline if none exists at `baseline_name`. --- -## CLI Command Reference +## 9. 
Schema Contracts -### `datacheck validate` +Schema contracts capture the structural definition of a dataset (column names, types, nullability) as a baseline and fail if the current data deviates in a breaking way. -Run validation against data files or databases. +### `schema capture` -**Data source flags:** +Captures the current schema and saves it as a named baseline. -| Flag | Description | -|------|-------------| -| `data_source` (positional) | File path or connection string | -| `--config / -c` | Path to validation config YAML | -| `--source` | Named source from sources.yaml | -| `--sources-file` | Path to sources YAML file | -| `--table / -t` | Database table name | -| `--where / -w` | SQL WHERE clause for filtering | -| `--query / -q` | Custom SQL query | -| `--schema / -s` | Schema/dataset name | - -**Warehouse-specific flags:** - -| Flag | Description | -|------|-------------| -| `--warehouse` | Snowflake warehouse name | -| `--credentials` | Path to credentials file (BigQuery service account) | -| `--region` | AWS region (Redshift IAM auth) | -| `--cluster` | Cluster identifier (Redshift IAM auth) | -| `--iam-auth` | Use IAM authentication (Redshift) | - -**Delta Lake flags:** - -| Flag | Description | -|------|-------------| -| `--delta-version` | Delta Lake version to load (time travel) | -| `--delta-timestamp` | Timestamp to load data as of (ISO 8601) | -| `--storage-options` | JSON string of storage options for cloud access | - -**Sampling flags:** See [Sampling Strategies](#cli-sampling-flags). - -**Execution flags:** - -| Flag | Description | -|------|-------------| -| `--parallel` | Enable multi-core parallel execution | -| `--workers` | Number of worker processes (default: CPU count) | -| `--chunk-size` | Rows per chunk for parallel processing (default: 10,000) | -| `--progress / --no-progress` | Show/hide progress bar | - -**Output flags:** - -| Flag | Description | -|------|-------------| -| `--output / -o` | Save results to a JSON file | -| `--csv-export` | Export failure details as CSV | -| `--suggestions / --no-suggestions` | Show improvement suggestions (default: enabled) | -| `--slack-webhook` | Slack webhook URL for notifications | - -**Logging flags:** +```bash +datacheck schema capture data.parquet +datacheck schema capture --source production_db --sources-file sources.yaml --table orders +datacheck schema capture data.csv --name v2-baseline --baseline-dir ./schemas +datacheck schema capture data.csv --no-history # skip history entry +``` -| Flag | Description | -|------|-------------| -| `--log-level` | Log level: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | -| `--log-format` | Log format: `console` (human-readable) or `json` (machine-parseable) | -| `--log-file` | Path to log file (with automatic rotation) | -| `--verbose / -v` | Shortcut for `--log-level DEBUG` | +| Flag | Short | Default | Description | +|------|-------|---------|-------------| +| `--name` | `-n` | `baseline` | Baseline identifier | +| `--baseline-dir` | | `.datacheck/schemas` | Storage directory | +| `--save-history / --no-history` | | on | Append to schema history | +| `--config` | `-c` | auto | Config file | +| `--source` | | — | Named source | +| `--sources-file` | | — | Sources YAML path | +| `--table` | `-t` | — | Table name | +| `--query` | `-q` | — | Custom SQL query | -### `datacheck profile` +### `schema compare` -Generate data quality profiles with statistics, quality scores, and rule suggestions. +Compares the current data schema against a saved baseline. 
-Same data source flags as `validate`, plus: +```bash +datacheck schema compare data.parquet +datacheck schema compare data.parquet --baseline v2-baseline +datacheck schema compare data.csv --fail-on-breaking +datacheck schema compare data.parquet --rename-threshold 0.9 --format json +``` -| Flag | Description | -|------|-------------| -| `--format / -f` | Output format: `terminal` (default), `json`, `markdown` | -| `--output / -o` | Write output to file | -| `--outlier-method` | Detection method: `zscore` (default) or `iqr` | -| `--suggestions / --no-suggestions` | Show rule suggestions | -| `--correlations / --no-correlations` | Show correlation matrix | +| Flag | Short | Default | Description | +|------|-------|---------|-------------| +| `--baseline` | `-b` | `baseline` | Baseline name to compare against | +| `--fail-on-breaking` | | off | Exit 1 on breaking changes | +| `--rename-threshold` | | 0.8 | Similarity threshold for rename detection (0.0–1.0) | +| `--format` | `-f` | `terminal` | Output format: `terminal` or `json` | -### `datacheck config` +### Baseline storage model -Configuration management commands. +Baselines are stored as JSON files: -| Subcommand | Description | -|------------|-------------| -| `config init` | Generate config from template | -| `config init --template ` | Use specific template (`basic`, `ecommerce`, `healthcare`, `finance`, `saas`, `iot`, `rules-reference`, `sources`) | -| `config init --with-sample-data` | Also generate a sample CSV file | -| `config init --sample-rows N` | Number of sample rows to generate (default: 100) | -| `config init --force` | Overwrite existing config file | -| `config validate ` | Validate config file syntax and rule definitions | -| `config validate --strict` | Fail on warnings too | -| `config show ` | Show fully resolved config (env vars + inheritance applied) | -| `config show --format yaml/json` | Output format | -| `config show --no-resolve-env` | Skip environment variable resolution | -| `config show --no-resolve-extends` | Skip config inheritance resolution | -| `config merge ` | Merge multiple configs (later files override earlier) | -| `config merge -o output.yaml` | Write merged result to file | -| `config generate ` | Auto-generate rules from data analysis | -| `config generate --confidence` | Minimum confidence: `low`, `medium` (default), `high` | -| `config templates` | List available templates with descriptions | -| `config env ` | Show environment variables referenced in config | +- Active baselines: `.datacheck/schemas/.json` +- History entries: `.datacheck/schemas/history/schema_.json` -### `datacheck schema` +The `.datacheck/schemas/` directory should be committed to version control to track schema evolution alongside code changes. -Schema evolution detection and management. 
+### Failure semantics -| Subcommand | Description | -|------------|-------------| -| `schema capture` | Save current schema as baseline | -| `schema compare` | Compare current data against baseline | -| `schema show` | Display detected schema (columns, types, nullable, stats) | -| `schema list` | List all saved baseline schemas | -| `schema history` | View capture history (newest first) | +`schema compare` reports each change with a compatibility level: -### `datacheck version` +| Change type | Compatibility | +|-------------|--------------| +| Column added | COMPATIBLE | +| Column order changed | COMPATIBLE | +| Column removed | BREAKING | +| Column renamed (inferred) | WARNING | +| Type narrowed (e.g., `string → int`) | BREAKING | +| Type widened (e.g., `int → string`) | COMPATIBLE | +| Nullable changed (non-null → nullable) | WARNING | +| Nullable changed (nullable → non-null) | BREAKING | -Display version information. +Without `--fail-on-breaking`, `schema compare` exits 0 regardless of changes detected. With `--fail-on-breaking`, any BREAKING-level change causes exit 1. -### Exit codes +Exit codes for `schema compare`: -| Code | Meaning | -|------|---------| -| 0 | All rules passed (or only warning/info severity failures) | -| 1 | Some error-severity rules failed | -| 2 | Configuration error | -| 3 | Data loading error | +| Code | Condition | +|------|-----------| +| 0 | Compatible (no breaking changes), or breaking changes without `--fail-on-breaking` | +| 1 | Breaking changes detected and `--fail-on-breaking` set | +| 2 | Baseline not found | +| 3 | Data load error | | 4 | Unexpected error | --- -## Output and Reporting - -### Terminal output +## 10. Python API -DataCheck uses Rich-formatted terminal output with color-coded results: - -- **Green**: Passed rules -- **Red**: Failed rules -- **Yellow**: Errors during rule execution - -Output includes a statistics table (records, columns, rules, pass/fail counts), detailed failure tables (check name, column, failure count, sample values), and actionable improvement suggestions. - -### JSON export +### ValidationEngine -```bash -datacheck validate --output results.json +```python +from datacheck import ValidationEngine ``` -Exports full validation results in machine-readable JSON format, including all rule results, failure details, and summary statistics. Use this for automation and CI/CD integration. - -### CSV export +**Constructor**: -```bash -datacheck validate --csv-export failures.csv +```python +engine = ValidationEngine( + config=None, # ValidationConfig object (mutually exclusive with config_path) + config_path=None, # str | Path to YAML file; auto-discovered if both are None + parallel=False, # bool + workers=None, # int | None — defaults to CPU count when parallel=True + chunk_size=None, # int | None — defaults to 100000 + show_progress=True, # bool + notifier=None, # optional notifier (e.g. SlackNotifier) + sources_file=None, # str | Path — overrides sources_file from config +) ``` -Exports failure details as CSV with columns: check_name, column, severity, failed_rows, reason, suggestion. +When both `config` and `config_path` are `None`, the constructor searches the current working directory for a config file in the standard discovery order. It raises `ConfigurationError` if none is found. -### Markdown reports +**Methods**: -```bash -datacheck profile --format markdown -o report.md -``` - -Generates markdown-formatted profile reports with tables, statistics, and quality scores. 
- -### Slack notifications +```python +# Validate a file (CSV or Parquet) +summary = engine.validate_file("data.parquet") +summary = engine.validate_file("data.csv", delimiter="|", encoding="latin-1") -Configure the webhook in your config file so you don't need to pass it every time: +# Validate a pre-loaded DataFrame +import pandas as pd +df = pd.read_parquet("data.parquet") +summary = engine.validate_dataframe(df) + +# Validate against a named source +summary = engine.validate_sources( + source_name="production_db", # None = use config default + table="orders", + where="status = 'pending'", + query=None, # use query or table, not both +) -```yaml -notifications: - slack_webhook: "${SLACK_WEBHOOK}" - mention_on_failure: true # @channel on failures (default: false) +# validate() is a lower-level method — requires file_path or df explicitly +summary = engine.validate(file_path="data.parquet") +summary = engine.validate(df=df) ``` -Or pass it via the CLI (overrides the config value): - -```bash -datacheck validate --slack-webhook https://hooks.slack.com/services/... -``` +### ValidationSummary -Sends validation results to Slack with: -- Color-coded messages (green for pass, red for fail) -- Summary statistics and failed rules -- Optional `@channel` mention on failures (via `mention_on_failure`) -- Up to 5 failed rule details with row counts +`validate_*` methods return a `ValidationSummary` object. ---- +**Properties**: -## Parallel Execution and Performance +| Property | Type | Description | +|----------|------|-------------| +| `total_rules` | int | Total rules evaluated | +| `passed_rules` | int | Rules that passed | +| `failed_rules` | int | Rules that failed (any severity) | +| `failed_errors` | int | Failed rules with `error` severity | +| `failed_warnings` | int | Failed rules with `warning` severity | +| `failed_info` | int | Failed rules with `info` severity | +| `error_rules` | int | Rules that threw an exception | +| `all_passed` | bool | True if `failed_errors == 0` and `error_rules == 0` | +| `has_failures` | bool | True if any rules failed | +| `has_errors` | bool | True if any rules threw exceptions | +| `results` | list[RuleResult] | All rule results | +| `total_rows` | int | Rows in the dataset | +| `total_columns` | int | Columns in the dataset | -### Enabling parallel mode +**Methods**: -```bash -datacheck validate --parallel -datacheck validate --parallel --workers 4 -datacheck validate --parallel --chunk-size 50000 -datacheck validate --parallel --progress +```python +summary.get_passed_results() # list[RuleResult] +summary.get_failed_results() # list[RuleResult] — any severity +summary.get_failed_by_severity("error") # list[RuleResult] +summary.get_error_results() # list[RuleResult] — execution errors +summary.to_dict() # dict ``` -| Flag | Description | -|------|-------------| -| `--parallel` | Enable multi-core parallel execution | -| `--workers` | Number of worker processes (default: CPU count) | -| `--chunk-size` | Rows per chunk (default: 10,000) | -| `--progress / --no-progress` | Show/hide progress bar | - -### How parallel execution works - -1. Splits the DataFrame into chunks based on `--chunk-size` -2. Processes chunks in parallel using `multiprocessing.Pool` -3. Aggregates results across chunks (combines pass/fail counts, merges failure details) -4. Automatically falls back to sequential execution for small datasets -5. 
Shows a Rich progress bar with spinner, elapsed time, and remaining time - -### Performance features - -- **PyArrow backend**: Vectorized operations for faster validation (e.g. fast null count via Arrow) -- **Lazy loading**: Cloud connectors are loaded only when needed — no unnecessary dependencies -- **Memory optimization**: Memory-aware chunk sizing, worker auto-scaling, and large file handling -- **Caching**: Regex compilation caching (`@lru_cache`) and compute-once patterns for expensive operations -- **Vectorized rules**: NumPy/Pandas vectorized operations — no Python loops in hot paths - ---- +### RuleResult -## Logging +| Property | Type | Description | +|----------|------|-------------| +| `rule_name` | str | Rule identifier (e.g., `order_id_min`) | +| `check_name` | str | Check name from config | +| `rule_type` | str | Rule type (e.g., `min`, `not_null`) | +| `column` | str | Column name | +| `passed` | bool | Pass/fail | +| `total_rows` | int | Total rows checked | +| `failed_rows` | int | Rows that violated the rule | +| `severity` | str | `error`, `warning`, or `info` | +| `error` | str \| None | Exception message if rule errored | +| `has_error` | bool | True if `error` is not None | +| `failure_details` | FailureDetail \| None | Detailed failure info including sample values | -### Log configuration +### Failure iteration -```bash -datacheck validate --verbose # DEBUG level -datacheck validate --log-level WARNING # Specific level -datacheck validate --log-format json # Machine-parseable JSON logs -datacheck validate --log-file validation.log # Log to file (with rotation) -datacheck validate --log-level DEBUG --log-format json --log-file debug.log +```python +engine = ValidationEngine(config_path="checks.yaml") +summary = engine.validate_file("orders.parquet") + +if not summary.all_passed: + for result in summary.get_failed_results(): + rate = result.failed_rows / result.total_rows * 100 if result.total_rows else 0 + print( + f"FAIL [{result.severity}] {result.rule_name} " + f"on {result.column}: " + f"{result.failed_rows}/{result.total_rows} rows ({rate:.1f}%)" + ) ``` -| Flag | Description | -|------|-------------| -| `--log-level` | `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | -| `--log-format` | `console` (human-readable, default) or `json` (machine-parseable) | -| `--log-file` | Path to log file (automatic rotation) | -| `--verbose / -v` | Shortcut for `--log-level DEBUG` | +### Raising exceptions -### Logging features +`ValidationEngine` does not raise on validation failure by default — it returns a `ValidationSummary`. 
To raise on failure: -- **Structured logging**: Console and JSON formatters for different use cases -- **Sensitive data masking**: Automatically masks credentials and passwords in log output -- **Trace IDs**: Unique trace ID per validation run for log correlation across systems -- **File rotation**: Automatic log file rotation to prevent unbounded growth - ---- +```python +from datacheck import ValidationEngine +from datacheck.exceptions import DataCheckError, ConfigurationError, DataLoadError -## Security +engine = ValidationEngine(config_path="checks.yaml") -### Credential handling +try: + summary = engine.validate_file("data.parquet") +except ConfigurationError as e: + raise SystemExit(2) from e +except DataLoadError as e: + raise SystemExit(3) from e +except DataCheckError as e: + raise SystemExit(4) from e -- **Environment variables**: Use `${VAR}` and `${VAR:-default}` syntax in config files — never hardcode credentials -- **Credential files**: Load credentials from external files -- **Password masking**: Credentials are automatically masked in logs and terminal output -- **Config env audit**: Use `datacheck config env` to verify all required variables are set +if not summary.all_passed: + raise SystemExit(1) +``` -### Connection security +### Exception hierarchy -- Connection string validation before attempting connections -- SQL injection prevention: table name validation, WHERE clause scanning, parameterized queries -- Path traversal prevention with null byte and symlink detection -- SSL/TLS enforcement for warehouse connections +| Exception | Exit code analog | When raised | +|-----------|-----------------|-------------| +| `DataCheckError` | 4 | Base class for all DataCheck exceptions | +| `ConfigurationError` | 2 | Invalid config, missing required fields | +| `DataLoadError` | 3 | File not found, connection failure, bad credentials | +| `ValidationError` | 4 | Unexpected failure during rule execution | +| `RuleDefinitionError` | 2 | Invalid rule parameters | +| `ColumnNotFoundError` | 4 | Column referenced in rule does not exist in data | --- -## Airflow Integration +## 11. Performance Model -DataCheck provides two Airflow operators for use in DAGs, plus a simpler BashOperator pattern. +### SQL pushdown -### DataCheckOperator +For database sources (PostgreSQL, Redshift, MySQL, SQL Server, Snowflake, BigQuery), DataCheck compiles all eligible rules into a single aggregate `SELECT` per table. The query returns one row of violation counts. No data rows are transferred to the Python process. -Run data validation inside Airflow DAGs: +**Mechanism**: The `SqlAggregateBuilder` partitions checks into pushable and non-pushable sets using each dialect's `pushable_rules` property. It then generates one query with a `CASE WHEN … THEN 1 ELSE 0 END` expression per rule, wrapped in `SUM()`. A single database round-trip produces all violation counts. 
-```python -from datacheck.airflow.operators import DataCheckOperator - -validate_orders = DataCheckOperator( - task_id="validate_orders", - config_path="/path/to/datacheck.yaml", - file_path="/data/orders.csv", - fail_on_error=True, - push_results=True, - min_pass_rate=95.0, -) +```sql +SELECT + COUNT(*) AS _total_rows, + SUM(CASE WHEN "id" IS NULL THEN 1 ELSE 0 END) AS _c0_not_null, + SUM(CASE WHEN "amount" IS NOT NULL AND "amount" < 0 THEN 1 ELSE 0 END) AS _c1_min, + SUM(CASE WHEN "status" IS NOT NULL AND "status"::text NOT IN ('a','b') + THEN 1 ELSE 0 END) AS _c2_allowed_values, + COUNT(*) - COUNT(DISTINCT "id") AS _c3_unique +FROM "orders" +WHERE created_at > '2026-01-01' ``` -**Parameters:** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `config_path` | str | required | Path to validation config YAML | -| `file_path` | str | None | Path to data file (CSV, Parquet, Avro, Delta, etc.) | -| `sources_file` | str | None | Path to sources YAML (overrides config) | -| `source_name` | str | None | Named source from sources.yaml | -| `table` | str | None | Database table name | -| `where` | str | None | SQL WHERE clause | -| `query` | str | None | Custom SQL query | -| `sample_rate` | float | None | Random sample fraction (0.0-1.0) | -| `parallel` | bool | False | Enable multi-core validation | -| `workers` | int | None | Number of worker processes | -| `min_pass_rate` | float | 0 | Minimum rule pass rate (0-100, 0=disabled) | -| `min_quality_score` | float | 0 | Minimum quality score (0-100, 0=disabled) | -| `fail_on_error` | bool | True | Fail Airflow task on validation failure | -| `push_results` | bool | True | Push results to XCom | - -**Template fields**: `config_path`, `file_path`, `sources_file`, `source_name`, `table`, `where`, `query` (supports `.yaml` and `.yml` extensions) +SQL pushdown activates automatically when all conditions hold: +- Source type is a supported database (not CSV, Parquet, or S3) +- No `--query` argument (custom queries disable pushdown; use `--where` instead) +- The dialect supports the rule types in the check -**XCom output:** -- `validation_results`: Full results dictionary -- `passed`: Boolean pass/fail result -- `pass_rate`: Percentage of rules passed +### Rule pushdown support by dialect -**Data source resolution order:** -1. `file_path` — file-based validation -2. `source_name` + `sources_file` — named source validation -3. Config default (`source` or `data_source` from config) +All six database dialects support the base rule set. `regex` and `max_age` require dialect-specific functions and are supported on a per-dialect basis. 
-### DataCheckSchemaOperator +**Base rules (all dialects):** +`not_null`, `boolean`, `min`, `max`, `range`, `positive`, `non_negative`, `allowed_values`, `unique`, `unique_combination`, `sum_equals`, `min_length`, `max_length`, `no_future_timestamps`, `timestamp_range`, `date_range` -Detect schema changes inside Airflow DAGs: +| Rule | PostgreSQL | Redshift | MySQL | SQL Server | Snowflake | BigQuery | +|------|:---:|:---:|:---:|:---:|:---:|:---:| +| Base rules (16) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| `max_age` | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| `regex` | ✓ | ✓ | ✓ | — | ✓ | ✓ | -```python -from datacheck.airflow.operators import DataCheckSchemaOperator - -check_schema = DataCheckSchemaOperator( - task_id="check_schema", - config_path="/path/to/datacheck.yaml", - file_path="/data/orders.csv", - baseline_name="orders-v2", - fail_on_breaking=True, - push_results=True, -) -``` +Rules not in this table (`type`, `date_format_valid`, `date_format`, `foreign_key_exists`) always execute in Python. -**Parameters:** +When a check contains any non-pushable rule, the entire check falls back to the Python path. Other checks in the same run that are fully pushable still execute via SQL. -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `config_path` | str | required | Path to validation config YAML | -| `file_path` | str | None | Path to data file | -| `sources_file` | str | None | Path to sources YAML | -| `source_name` | str | None | Named source from sources.yaml | -| `table` | str | None | Database table name | -| `baseline_name` | str | `"baseline"` | Baseline identifier | -| `baseline_dir` | str | `".datacheck/schemas"` | Baseline storage directory | -| `fail_on_breaking` | bool | True | Fail Airflow task on breaking schema changes | -| `push_results` | bool | True | Push results to XCom | +### No row extraction -**XCom output:** -- `schema_results`: Schema comparison results dictionary -- `schema_compatible`: Boolean compatibility flag +The SQL path transfers zero data rows from the database. The only data movement is the single aggregate result row (one integer per rule). For a table with 100M rows and 20 pushable rules, the network payload is roughly 20 integers. -Auto-captures a new baseline if none exists yet. +### Memory characteristics for file sources -### BashOperator pattern +For CSV and Parquet sources, DataCheck loads only the columns referenced by active checks (column pruning). If a file has 50 columns but only 10 are referenced, only those 10 columns are loaded into memory. Memory usage scales with: `(referenced columns) × (row count) × (average cell size)`. -For simpler integration, use Airflow's `BashOperator` directly: +### Parallel execution -```python -from airflow.operators.bash import BashOperator +`--parallel` splits the loaded DataFrame into chunks (default 100,000 rows) and processes each chunk with a thread pool. This is beneficial for large in-memory datasets where rule evaluation is CPU-bound. It does not apply to the SQL pushdown path; the database handles parallelism internally. -validate = BashOperator( - task_id="validate_data", - bash_command="datacheck validate --config /path/to/config.yaml --output /tmp/results.json", -) +```bash +datacheck validate --parallel --workers 8 --chunk-size 50000 ``` -Exit codes work directly with Airflow task status — exit code 0 means success, any non-zero code fails the task. - ---- - -## CI/CD Integration +### Scaling considerations -DataCheck uses standard exit codes for automation. 
Any non-zero exit code fails the pipeline. +| Scenario | Recommended approach | +|----------|----------------------| +| Large database table (10M+ rows) | Use SQL pushdown (omit `--query`); add `--where` to filter if needed | +| Large Parquet file (multi-GB) | Use `--parallel`; ensure only needed columns are referenced | +| Many small files | Script sequential calls; aggregate exit codes externally | +| Wide tables (100+ columns) | Reference only needed columns in checks; column pruning applies automatically | -| Code | Meaning | CI/CD Effect | -|------|---------|-------------| -| 0 | All rules passed | Pipeline continues | -| 1 | Error-severity failures | Pipeline fails (blocks deploy) | -| 2 | Configuration error | Pipeline fails | -| 3 | Data loading error | Pipeline fails | -| 4 | Unexpected error | Pipeline fails | - -### GitHub Actions +--- -```yaml -name: Data Quality Check -on: [push] +## 12. Troubleshooting -jobs: - validate: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - name: Install DataCheck - run: pip install datacheck-cli - - name: Validate Data - run: datacheck validate --output results.json - - name: Upload Results - if: always() - uses: actions/upload-artifact@v4 - with: - name: validation-results - path: results.json -``` - -### GitLab CI +### Configuration errors (exit 2) -```yaml -validate_data: - image: python:3.12 - script: - - pip install datacheck-cli - - datacheck validate --output results.json - artifacts: - paths: - - results.json - when: always -``` +**`Configuration Error: No configuration provided and no config file found`** -### Jenkins - -```groovy -pipeline { - agent any - stages { - stage('Data Validation') { - steps { - sh 'pip install datacheck-cli' - sh 'datacheck validate --output results.json' - } - post { - always { - archiveArtifacts artifacts: 'results.json', allowEmptyArchive: true - } - } - } - } -} -``` +DataCheck searched for `.datacheck.yaml`, `.datacheck.yml`, `datacheck.yaml`, `datacheck.yml` in the current directory and found none. Either pass `--config path/to/file.yaml` or create a config file in the working directory. ---- +**`Configuration Error: Unknown rule type: 'xyz'`** -## Python API +The rule type `xyz` is not in the supported rule set. Check the rule name against Section 5. Rule names are case-sensitive and use underscores (e.g., `not_null`, `max_age`). -### ValidationEngine +**`Configuration Error: 'checks' is a required field`** -```python -from datacheck import ValidationEngine +The config file exists but does not contain a `checks` key. A minimal valid config requires at least an empty `checks` list. -engine = ValidationEngine(config_path=".datacheck.yaml") -summary = engine.validate() +**Config parses but rules don't run** -print(f"Records: {summary.total_rows:,} rows, {summary.total_columns} columns") -print(f"Passed: {summary.passed_rules}/{summary.total_rules}") +Check whether `enabled: false` is set on the check. Also verify the column name matches exactly (case-sensitive). 
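+As an illustration, neither check below validates the intended column as written: the first is explicitly disabled, and the second targets `Amount` while the data column is spelled `amount`. The check and column names are placeholders.
+
+```yaml
+checks:
+  - name: amount_positive
+    column: amount
+    enabled: false        # valid config, but the check is skipped
+    rules:
+      min: 0
+
+  - name: amount_positive_wrong_case
+    column: Amount        # case mismatch: the data column is "amount"
+    rules:
+      min: 0
+```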
-for result in summary.get_failed_results(): - print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)") -``` +### Data loading errors (exit 3) -**Constructor parameters:** +**`Data Load Error: File not found: data.csv`** -| Parameter | Description | -|-----------|-------------| -| `config` / `config_path` | Configuration object or path to YAML file | -| `parallel` | Enable parallel execution (bool) | -| `workers` | Number of worker processes (int) | -| `chunk_size` | Rows per chunk for parallel execution (int) | -| `show_progress` | Show progress bar (bool) | -| `notifier` | Optional notifier instance (e.g. `SlackNotifier`) | -| `sources_file` | Path to sources YAML (overrides config) | +The file path is relative to the current working directory (not the config file's directory, unless the `data_source.path` is defined in the config, in which case it is relative to the config file). Use an absolute path or ensure the working directory is correct. -**Methods:** +**`Data Load Error: Connection failed`** -| Method | Description | -|--------|-------------| -| `validate()` | Validate using config defaults | -| `validate_file(file_path, **kwargs)` | Validate a file (supports sampling, delta time travel) | -| `validate_sources(source_name, table, where, query, **kwargs)` | Validate a named source | -| `validate_dataframe(df)` | Validate a pre-loaded pandas DataFrame | +Check that the database host is reachable and the port is correct. Verify that environment variables are set: `echo $DB_HOST`. Use `--verbose` to see the connection attempt details. -### ValidationSummary +**`Data Load Error: Authentication failed`** -| Property | Type | Description | -|----------|------|-------------| -| `total_rules` | int | Total number of rules executed | -| `passed_rules` | int | Rules that passed | -| `failed_rules` | int | Rules that failed | -| `failed_errors` | int | Failed rules with `error` severity | -| `failed_warnings` | int | Failed rules with `warning` severity | -| `failed_info` | int | Failed rules with `info` severity | -| `error_rules` | int | Rules that encountered execution errors | -| `all_passed` | bool | Whether all rules passed | -| `has_errors` | bool | Whether any execution errors occurred | -| `results` | list | List of `RuleResult` objects | -| `total_rows` | int | Number of data rows | -| `total_columns` | int | Number of columns | -| `timestamp` | str | Execution timestamp | -| `duration` | float | Execution duration in milliseconds | -| `trace_id` | str | Unique run identifier for log correlation | - -| Method | Returns | Description | -|--------|---------|-------------| -| `get_passed_results()` | list | RuleResults that passed | -| `get_failed_results()` | list | RuleResults that failed | -| `get_error_results()` | list | RuleResults with execution errors | -| `to_dict()` | dict | Serialize to dictionary | +Verify credentials. For PostgreSQL, confirm `DB_USER` and `DB_PASSWORD` are correct and the user has `SELECT` access on the target table. For BigQuery, confirm the service account JSON path is correct and the account has `bigquery.dataViewer` role. 
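+A quick way to test the same credentials outside DataCheck, assuming the `psql` client is installed and the usual `DB_*` variables are exported (the table name `orders` is a placeholder):
+
+```bash
+# Confirm the credentials and network path work at all
+PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1"
+
+# Confirm the user can actually SELECT from the target table
+PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" \
+  -c "SELECT has_table_privilege(current_user, 'orders', 'SELECT')"
+```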
-### RuleResult +**`Data Load Error: Missing extra: postgresql`** -| Property | Type | Description | -|----------|------|-------------| -| `rule_name` | str | Rule identifier | -| `column` | str | Target column | -| `passed` | bool | Whether the rule passed | -| `total_rows` | int | Total rows checked | -| `failed_rows` | int | Rows that failed | -| `rule_type` | str | Rule type name | -| `check_name` | str | Check name from config | -| `severity` | str | `error`, `warning`, or `info` | -| `failure_details` | FailureDetail | Detailed failure information | -| `error` | str | Error message if rule errored | -| `execution_time` | float | Execution time in milliseconds | +The source type requires an optional dependency that is not installed. Install it: `pip install datacheck-cli[postgresql]`. -### DataProfiler +### Environment variable issues -```python -from datacheck.profiling import DataProfiler +Environment variables that use `${VAR}` syntax resolve to an empty string if unset — they do not raise an error. If a connection fails with unexpected values, confirm the variable is set in the shell where DataCheck runs: -profiler = DataProfiler(outlier_method="zscore") -profile = profiler.profile(df, name="orders") +```bash +printenv | grep DB_ ``` ---- - -## Industry Templates +Use `datacheck config env checks.yaml` to list all variables referenced in a config and their current values. -DataCheck ships with 8 config templates: +### Debugging with `--verbose` -| Template | Use Case | -|----------|----------| -| `basic` | Generic starter config for any data | -| `ecommerce` | Order data, product catalogs, customer records | -| `healthcare` | Patient data, HIPAA compliance, date formats | -| `finance` | Transaction data, SOX compliance, sum validations | -| `saas` | User activity, subscription data, engagement metrics | -| `iot` | Sensor data, time-series, device telemetry | -| `rules-reference` | Complete reference of all validation rules with examples | -| `sources` | Data source connection templates with environment variable support | +`--verbose` sets log level to DEBUG and prints detailed information about each step: ```bash -datacheck config init --template ecommerce --with-sample-data -datacheck config init --template healthcare --with-sample-data --sample-rows 500 -datacheck config templates # List all templates with descriptions +datacheck validate --config checks.yaml --verbose +datacheck validate --config checks.yaml --verbose --log-format json --log-file debug.log ``` ---- +Debug output includes: config file path, source resolution, columns loaded, rules evaluated per check, SQL query generated (for database sources), and timing per rule. -## Error Handling +### SQL pushdown not activating -### Exception hierarchy +If you expect SQL pushdown but validation is slow, check: -| Exception | When | -|-----------|------| -| `DataCheckError` | Base exception for all DataCheck errors | -| `ConfigurationError` | Invalid config structure, missing required fields | -| `ValidationError` | Rule execution failures | -| `DataLoadError` | File not found, encoding issues, connection failures | -| `RuleDefinitionError` | Invalid rule parameters or missing required arguments | -| `UnsupportedFormatError` | Unknown file format or missing optional library | -| `ColumnNotFoundError` | Column not found in DataFrame | -| `EmptyDatasetError` | No rows in loaded dataset | +1. The source type is a supported database (not CSV/Parquet/S3) +2. 
`--query` is not specified (custom queries disable pushdown; use `--where` instead) +3. The rules in the failing check are all in the pushable set for that dialect -All exceptions inherit from `DataCheckError`, so you can catch them broadly: +Use `--verbose` to confirm whether pushdown is active — the generated SQL query is logged at DEBUG level. -```python -from datacheck.exceptions import DataCheckError, ConfigurationError, DataLoadError +### Arrow/type conversion errors -try: - engine = ValidationEngine(config_path="config.yaml") - summary = engine.validate() -except ConfigurationError as e: - print(f"Config error: {e}") -except DataLoadError as e: - print(f"Data load error: {e}") -except DataCheckError as e: - print(f"DataCheck error: {e}") -``` +If a rule throws an execution error on an Arrow-backed column or a Parquet decimal column, this indicates a type conversion issue in the rule implementation. Report the issue at [https://github.com/squrtech/datacheck/issues](https://github.com/squrtech/datacheck/issues) with the column dtype, rule type, and error message. + +As a workaround, adding a `type: numeric` check on the same column before the failing numeric rule will force type validation and surface the root cause. diff --git a/docs/philosophy.md b/docs/philosophy.md new file mode 100644 index 0000000..69b2c14 --- /dev/null +++ b/docs/philosophy.md @@ -0,0 +1,102 @@ +# DataCheck Philosophy + +DataCheck is opinionated by design. This page explains the thinking behind those opinions. + +--- + +## Detection vs Enforcement + +Most data quality tools are detection tools. They tell you that something went wrong - after it already went wrong. You get a dashboard showing null rates over time, an alert that a column drifted, a report that arrived in your inbox on Tuesday about data that was already in production on Monday. + +Detection is useful. Enforcement is different. + +**DataCheck enforces rules at the point of ingestion.** A rule either passes or fails right now, and if it fails, the pipeline stops. Bad data doesn't reach downstream systems. Dashboards don't break. Consumers don't see corrupted rows. The failure is loud, immediate, and at the gate - not silent and discovered later. + +This is the same distinction as a linter vs a code review. A linter enforces standards before code merges. A code review detects problems after the fact. Both have a role. DataCheck is the linter for your data. + +--- + +## Deterministic vs Statistical Validation + +Statistical anomaly detection asks: "Is this value unusual compared to historical patterns?" That's a valid question - but it's the wrong question for a pipeline gate. + +The answer to "is this unusual?" is probabilistic, tunable, and debatable. The answer to "is this null when it shouldn't be?" is binary. DataCheck only asks binary questions: + +- Is this column null? → yes or no +- Is this value in the allowed set? → yes or no +- Does this timestamp fall within the valid range? → yes or no +- Does this regex match? → yes or no + +Binary rules are predictable. They don't false-positive because the data distribution shifted. They don't false-negative because the anomaly model wasn't trained on this edge case. They behave exactly the same in every environment, on every run, at any scale. + +Determinism is not a limitation - it's the feature. A gate you can't trust is not a gate. + +--- + +## Why SQL Pushdown Matters + +When a data quality tool validates a database table, it typically does one of two things: + +1. 
Loads the entire table into memory on the validation host, then runs checks in Python +2. Runs a targeted query against the database that returns only the aggregate result + +DataCheck does the second. For a `not_null` check on a million-row table, DataCheck executes: + +```sql +SELECT COUNT(*) FROM orders WHERE order_id IS NULL +``` + +One row comes back. Not one million rows. The data stays in your warehouse. + +This matters for three reasons: + +**Performance.** Moving data is expensive - in time, in network, in memory. SQL pushdown runs at warehouse speed, not Python speed. + +**Cost.** Egress costs money on every major cloud provider. Loading a 50GB table to validate three columns costs real money. Running three aggregate queries costs pennies. + +**Security and compliance.** In regulated industries (finance, healthcare, PII-heavy environments), data leaving the warehouse is an audit event. SQL pushdown means the validator never sees the actual rows - only the aggregate result. The data never leaves. + +--- + +## Why Zero Infrastructure Matters + +Data quality tools that require a server, a database, or a cloud account create a dependency problem: you need the data quality infrastructure to be running before you can validate data. That infrastructure needs to be managed, upgraded, backed up, and secured. + +DataCheck has no server. No database. No cloud account. It's a Python package. You install it, you run it, you're done. The only thing it writes to disk is a YAML baseline file for schema comparison. + +This means: + +- It runs in CI with `pip install datacheck-cli` +- It runs in Airflow with no sidecar services +- It runs on a laptop for local development +- It runs in a Docker container with no volume mounts +- It runs in air-gapped environments + +Zero infrastructure isn't just convenient - it removes the possibility of the validation tool itself becoming a reliability dependency for your pipelines. + +--- + +## Why DataCheck is Opinionated + +DataCheck makes choices that some tools avoid: + +**Rules are binary.** There is no "warn if the null rate exceeds 5% of historical average." Rules pass or fail. If you want a warning that doesn't block the pipeline, use `severity: warning` - but the rule itself is still a binary check against an explicit threshold you wrote. + +**Config is YAML, not Python.** Rules are declarative, not code. This means non-engineers can read and modify validation configs. It means configs can be diffed, reviewed, and versioned like any other file. It means the behavior is inspectable without running anything. + +**The CLI is the primary interface.** DataCheck is designed to be invoked by CI systems, orchestrators, and shell scripts. The Python API is a first-class citizen, but the mental model is: write a config, run a command, check the exit code. + +**Fail hard by default.** The default severity is `error`. If a rule fails, exit code is `1`. If you want to let something through, you explicitly opt it down to `warning` or `info`. The default posture is strict. You loosen it deliberately, not accidentally. + +These choices make DataCheck narrower than a general-purpose data quality platform. That's intentional. A narrow tool that does one thing reliably is more valuable in a pipeline than a broad tool that requires configuration expertise to set up correctly. + +--- + +## What DataCheck Is Not + +- **Not a data observability platform.** DataCheck does not store historical runs, show trend graphs, or alert on drift over time. 
+- **Not a statistical anomaly detector.** DataCheck does not learn from your data or flag values that look unusual. +- **Not a data catalog.** DataCheck does not discover, classify, or document your data assets. +- **Not a SaaS product.** DataCheck has no cloud backend, no user accounts, no billing. + +If you need those things, there are excellent tools that provide them. DataCheck is designed to complement them - running at the gate, enforcing explicit rules, before data reaches the systems those tools monitor. diff --git a/github-action/.github/workflows/test.yml b/github-action/.github/workflows/test.yml new file mode 100644 index 0000000..a1902d7 --- /dev/null +++ b/github-action/.github/workflows/test.yml @@ -0,0 +1,192 @@ +name: Test Action + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + # ── Test 1: basic CSV validation (file-based source) ───────────────────────── + test-csv: + name: Test CSV validation + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + + steps: + - uses: actions/checkout@v4 + + # Create a minimal sample CSV for the test + - name: Create sample data + run: | + mkdir -p test-data + cat > test-data/sample.csv <<'EOF' + id,name,amount + 1,Alice,100.0 + 2,Bob,200.0 + 3,Carol,150.0 + EOF + + # Create a matching DataCheck config + - name: Create DataCheck config + run: | + cat > test-datacheck.yaml <<'EOF' + data_source: + type: csv + path: test-data/sample.csv + + checks: + - name: id_not_null + column: id + rules: + not_null: true + unique: true + + - name: amount_positive + column: amount + rules: + min: 0 + EOF + + - name: Run DataCheck action + uses: ./ + with: + config: test-datacheck.yaml + output-format: sarif + output-file: test-results.sarif + + - name: Verify SARIF file was created + run: | + if [ ! -f test-results.sarif ]; then + echo "ERROR: SARIF output file not created" + exit 1 + fi + echo "SARIF file created successfully" + cat test-results.sarif + + # ── Test 2: data-source input override ─────────────────────────────────────── + test-data-source-input: + name: Test data-source input override + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + + steps: + - uses: actions/checkout@v4 + + - name: Create sample data + run: | + mkdir -p test-data + cat > test-data/override.csv <<'EOF' + product_id,price + P001,9.99 + P002,19.99 + P003,4.99 + EOF + + - name: Create config (no data_source — will be overridden) + run: | + cat > test-override.yaml <<'EOF' + checks: + - name: price_positive + column: price + rules: + min: 0 + EOF + + - name: Run DataCheck action with data-source override + uses: ./ + with: + config: test-override.yaml + data-source: test-data/override.csv + output-format: json + output-file: test-override-results.json + upload-sarif: 'false' + + - name: Verify JSON output + run: | + if [ ! 
-f test-override-results.json ]; then + echo "ERROR: JSON output file not created" + exit 1 + fi + echo "JSON output created successfully" + + # ── Test 3: JSON output format ──────────────────────────────────────────────── + test-json-output: + name: Test JSON output format + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Create sample data + run: | + mkdir -p test-data + printf 'id,value\n1,10\n2,20\n' > test-data/simple.csv + + - name: Create DataCheck config + run: | + cat > test-json.yaml <<'EOF' + data_source: + type: csv + path: test-data/simple.csv + + checks: + - name: id_check + column: id + rules: + not_null: true + EOF + + - name: Run DataCheck action with JSON output + uses: ./ + with: + config: test-json.yaml + output-format: json + output-file: results.json + upload-sarif: 'false' + + - name: Upload results artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: datacheck-json-results + path: results.json + + # ── Test 4: version pinning ─────────────────────────────────────────────────── + test-version-pin: + name: Test version pinning + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Create sample data + run: | + printf 'col\nvalue\n' > test-data-pin.csv + + - name: Create minimal config + run: | + cat > test-pin.yaml <<'EOF' + data_source: + type: csv + path: test-data-pin.csv + + checks: + - name: col_check + column: col + rules: + not_null: true + EOF + + - name: Run DataCheck action with pinned version + uses: ./ + with: + config: test-pin.yaml + version: '2.1.0' + output-format: json + output-file: pin-results.json + upload-sarif: 'false' diff --git a/github-action/LICENSE b/github-action/LICENSE new file mode 100644 index 0000000..70173f1 --- /dev/null +++ b/github-action/LICENSE @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2026 Squrtech + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/github-action/README.md b/github-action/README.md new file mode 100644 index 0000000..adc74de --- /dev/null +++ b/github-action/README.md @@ -0,0 +1,237 @@ +

+<!-- Logo: DataCheck -->
+
+# DataCheck Action
+
+<!-- Badges: GitHub Marketplace | PyPI version | License -->

+ +Enforce deterministic validation rules in CI/CD with [DataCheck](https://github.com/squrtech/datacheck). +Define rules in YAML. Fail fast on bad data. Stop pipelines at the gate. +Results appear in the **GitHub Security tab** via SARIF upload. + +--- + +## Quickstart + +Add to `.github/workflows/data-quality.yml`: + +```yaml +name: Data Quality + +on: [push, pull_request] + +permissions: + contents: read + security-events: write # Required for SARIF upload + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml +``` + +That's it. DataCheck validates your data against the rules in `.datacheck.yaml` and posts +results to the GitHub Security tab. The job fails (exit 1) if any `error`-severity rules fail. + +--- + +## Inputs + +| Input | Required | Default | Description | +|-------|----------|---------|-------------| +| `config` | No | `.datacheck.yaml` | Path to your validation config | +| `data-source` | No | _(empty)_ | Path to a data file (CSV or Parquet) to validate. Overrides the source defined in the config — useful for validating a freshly generated file. | +| `sources-file` | No | _(empty)_ | Path to `sources.yaml` — only needed for database/cloud sources | +| `extras` | No | _(empty)_ | Connector extras to install: `postgresql`, `mysql`, `mssql`, `snowflake`, `bigquery`, `redshift`, `s3`, `databases`, `warehouses`, `all`. Comma-separated for multiple. | +| `output-format` | No | `sarif` | Output format: `sarif`, `json`, `markdown`, `csv` | +| `output-file` | No | `datacheck-results.sarif` | Path to save the results file | +| `upload-sarif` | No | `true` | Auto-upload SARIF to GitHub Security tab | +| `version` | No | _(latest)_ | Pin a specific DataCheck version, e.g. `"2.1.0"` | + +## Outputs + +| Output | Description | +|--------|-------------| +| `passed` | `"true"` if all error-severity rules passed | + +--- + +## Examples + +### File-based source (CSV, Parquet) + +```yaml +# .datacheck.yaml +data_source: + type: csv + path: ./data/orders.csv + +checks: + - name: id_not_null + column: id + rules: + not_null: true + unique: true + + - name: amount_range + column: amount + rules: + min: 0 + max: 100000 +``` + +```yaml +# workflow +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml +``` + +--- + +### Database source (PostgreSQL, Snowflake, BigQuery, etc.) 
+ +```yaml +# sources.yaml +sources: + production_db: + type: postgresql + host: ${DB_HOST} + port: ${DB_PORT:-5432} + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} +``` + +```yaml +# .datacheck.yaml +sources_file: sources.yaml +source: production_db +table: orders + +checks: + - name: order_id_check + column: order_id + rules: + not_null: true + unique: true +``` + +```yaml +# workflow +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml + sources-file: sources.yaml + extras: postgresql # installs the psycopg2 connector + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} +``` + +--- + +### Export results as JSON instead of SARIF + +```yaml +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml + output-format: json + output-file: datacheck-results.json + upload-sarif: 'false' + +- uses: actions/upload-artifact@v4 + if: always() + with: + name: datacheck-results + path: datacheck-results.json +``` + +--- + +### Use the `passed` output in subsequent steps + +```yaml +- id: datacheck + uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml + +- name: Post summary + if: always() + run: | + if [ "${{ steps.datacheck.outputs.passed }}" == "true" ]; then + echo "All validation rules passed!" + else + echo "Validation failed - see the Security tab for details." + fi +``` + +--- + +### Pin a specific DataCheck version + +```yaml +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml + version: '2.1.0' +``` + +--- + +## Permissions + +The `security-events: write` permission is required for SARIF upload (the default behavior). +If you set `upload-sarif: 'false'` or `output-format` to something other than `sarif`, +you do not need this permission. + +```yaml +permissions: + contents: read + security-events: write +``` + +--- + +## Exit Codes + +DataCheck uses standard exit codes for CI/CD gating: + +| Code | Meaning | +|------|---------| +| `0` | All rules passed (or only warning/info failures) | +| `1` | One or more `error`-severity rules failed | +| `2` | Configuration error | +| `3` | Data loading error | +| `4` | One or more rules encountered an execution error | + +Rules have `severity: error` (default), `severity: warning`, or `severity: info`. +Only `error`-severity failures cause exit code 1 and fail the job. + +--- + +## Links + +- [DataCheck on PyPI](https://pypi.org/project/datacheck-cli/) +- [DataCheck GitHub](https://github.com/squrtech/datacheck) +- [Documentation](https://squrtech.github.io/datacheck/) +- [Available Rules](https://squrtech.github.io/datacheck/#available-rules) +- [Report an Issue](https://github.com/squrtech/datacheck/issues) + +## License + +Apache License 2.0 — Copyright 2026 Squrtech diff --git a/github-action/action.yml b/github-action/action.yml new file mode 100644 index 0000000..67f8851 --- /dev/null +++ b/github-action/action.yml @@ -0,0 +1,131 @@ +name: 'DataCheck — Data Quality Validation' +description: 'Data quality validation for CI/CD. Define rules in YAML, catch bad data before it breaks pipelines. Supports CSV, databases, Snowflake, and BigQuery.' +author: 'Squrtech' +branding: + icon: 'check-circle' + color: 'green' + +inputs: + config: + description: > + Path to your .datacheck.yaml validation config file. + All rules, data sources (files), and check settings are defined here. 
+ required: false + default: '.datacheck.yaml' + + sources-file: + description: > + Path to your sources.yaml file. Only required when connecting to databases or cloud + storage (PostgreSQL, Snowflake, BigQuery, S3, etc.). Leave blank for file-only sources. + required: false + default: '' + + output-format: + description: 'Output format for the results file: sarif (default), json, markdown, csv' + required: false + default: 'sarif' + + output-file: + description: 'Path to save the results file. Defaults to datacheck-results.sarif.' + required: false + default: 'datacheck-results.sarif' + + upload-sarif: + description: > + Automatically upload SARIF results to the GitHub Security tab. + Requires the job to have: security-events: write permission. + Only applies when output-format is sarif. + required: false + default: 'true' + + extras: + description: > + Comma-separated list of connector extras to install alongside DataCheck. + Use this when validating databases or cloud storage. + Options: postgresql, mysql, snowflake, bigquery, redshift, s3, gcs, azure, cloud, + databases, warehouses, deltalake, avro, all + Example: "postgresql" or "snowflake,s3" or "all" + required: false + default: '' + + data-source: + description: > + Optional path to a data file (CSV, Parquet, JSON) to validate directly. + When provided, this overrides the data source defined in the config file. + Useful for validating freshly generated files without editing the config. + required: false + default: '' + + version: + description: > + DataCheck version to install (e.g. "2.0.2"). Defaults to the latest release. + required: false + default: '' + +outputs: + passed: + description: "'true' if all error-severity rules passed, 'false' otherwise" + value: ${{ steps.validate.outputs.passed }} + +runs: + using: 'composite' + + steps: + - name: Install DataCheck + shell: bash + run: | + # Build package specifier: datacheck-cli[extras]==version + PKG="datacheck-cli" + + if [ -n "${{ inputs.extras }}" ]; then + PKG="${PKG}[${{ inputs.extras }}]" + fi + + if [ -n "${{ inputs.version }}" ]; then + PKG="${PKG}==${{ inputs.version }}" + fi + + pip install -q "${PKG}" + + - name: Run validation + id: validate + shell: bash + run: | + CMD="datacheck validate -c ${{ inputs.config }} --no-progress" + + # Add data source override if provided (file path instead of config-defined source) + if [ -n "${{ inputs.data-source }}" ]; then + CMD="$CMD ${{ inputs.data-source }}" + fi + + # Add sources file if provided (databases / cloud storage) + if [ -n "${{ inputs.sources-file }}" ]; then + CMD="$CMD --sources-file ${{ inputs.sources-file }}" + fi + + # Add output file and format if specified + if [ -n "${{ inputs.output-file }}" ]; then + CMD="$CMD --format ${{ inputs.output-format }} --output ${{ inputs.output-file }}" + fi + + # Run validation — capture exit code without failing immediately so we + # can set the output and still upload SARIF before the step fails + set +e + $CMD + EXIT_CODE=$? 
+ set -e + + if [ "$EXIT_CODE" -eq 0 ]; then + echo "passed=true" >> "$GITHUB_OUTPUT" + else + echo "passed=false" >> "$GITHUB_OUTPUT" + fi + + exit $EXIT_CODE + + - name: Upload SARIF to GitHub Security tab + if: ${{ always() && inputs.upload-sarif == 'true' && inputs.output-format == 'sarif' }} + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: ${{ inputs.output-file }} + category: datacheck diff --git a/guides/cli-guide.md b/guides/cli-guide.md index b33422d..7862773 100644 --- a/guides/cli-guide.md +++ b/guides/cli-guide.md @@ -10,20 +10,15 @@ This guide covers every command, option, and feature available in the `datacheck - [Commands Overview](#commands-overview) - [Validate](#validate) - [Run Validation](#run-validation) - - [Sampling](#sampling) - [Parallel Execution](#parallel-execution) - [Slack Notifications](#slack-notifications) - [Output Formats](#output-formats) -- [Profile](#profile) - - [Data Source Resolution](#data-source-resolution) - - [Profiling Features](#profiling-features) - [Schema](#schema) - [Capture a Baseline](#capture-a-baseline) - [Compare Against Baseline](#compare-against-baseline) - [Show, List, and History](#show-list-and-history) - [Config](#config) - [Initialize a Config](#initialize-a-config) - - [Generate Config from Data](#generate-config-from-data) - [Validate a Config](#validate-a-config) - [Show Resolved Config](#show-resolved-config) - [Merge Configs](#merge-configs) @@ -34,17 +29,16 @@ This guide covers every command, option, and feature available in the `datacheck - [Data Source Configuration](#data-source-configuration) - [Named Sources](#named-sources) - [Environment Variables](#environment-variables) - - [Sampling Configuration](#sampling-configuration) - [Config Inheritance](#config-inheritance) - [Severity Levels](#severity-levels) + - → [Full Config Guide](config-guide.md) - [Rules Reference](#rules-reference) - [Null and Uniqueness](#null-and-uniqueness) - [Numeric](#numeric) - [String and Pattern](#string-and-pattern) - [Temporal](#temporal) - - [Semantic](#semantic) + - [Boolean](#boolean) - [Cross-Column](#cross-column) - - [Custom Rules](#custom-rules) - [Data Sources](#data-sources) - [Files](#files) - [Databases](#databases) @@ -70,10 +64,7 @@ pip install datacheck-cli[mysql] # MySQL pip install datacheck-cli[snowflake] # Snowflake pip install datacheck-cli[bigquery] # BigQuery pip install datacheck-cli[redshift] # Redshift -pip install datacheck-cli[cloud] # S3, GCS, Azure Blob -pip install datacheck-cli[deltalake] # Delta Lake -pip install datacheck-cli[avro] # Avro -pip install datacheck-cli[duckdb] # DuckDB +pip install datacheck-cli[cloud] # S3 pip install datacheck-cli[all] # All data sources ``` @@ -83,8 +74,7 @@ pip install datacheck-cli[all] # All data sources ``` datacheck validate Validate data against configured rules -datacheck profile Generate a data quality profile -datacheck schema Schema evolution detection commands +datacheck schema Enforce schema contracts against a baseline datacheck config Configuration management commands datacheck version Display version information ``` @@ -144,21 +134,9 @@ DataCheck auto-discovers config files in this order: `.datacheck.yaml` > `.datac | `--region` | | Cloud region (for Redshift IAM auth) | | `--cluster` | | Cluster identifier (for Redshift IAM auth) | | `--iam-auth` | | Use IAM authentication (for Redshift) | -| `--output` | `-o` | Save results to a JSON file (terminal output is always shown) | +| `--format` | `-f` | Output format: `sarif`, `json`, 
`markdown`, `csv` | +| `--output` | `-o` | Save results to file (path) | | `--csv-export` | | Export failure details as CSV | -| `--sample-rate` | | Random sample fraction (0.0–1.0) | -| `--sample-count` | | Fixed sample size | -| `--top` | | Validate first N rows only | -| `--stratify` | | Column for stratified sampling | -| `--seed` | | Random seed for reproducibility | -| `--sample-strategy` | | Strategy: `random`, `stratified`, `time_based`, `error_focused`, `adaptive`, `reservoir` | -| `--time-column` | | Column for time-based sampling | -| `--start-date` | | Start date (ISO format) for time-based sampling | -| `--end-date` | | End date (ISO format) for time-based sampling | -| `--error-indicators` | | Comma-separated conditions for error-focused sampling | -| `--delta-version` | | Delta Lake version to load (time travel) | -| `--delta-timestamp` | | Delta Lake timestamp (ISO 8601) for time travel | -| `--storage-options` | | JSON string of storage options for Delta Lake cloud access | | `--parallel` | | Enable multi-core execution | | `--workers` | | Number of worker processes (default: CPU count) | | `--chunk-size` | | Rows per chunk (default: 100000) | @@ -170,54 +148,6 @@ DataCheck auto-discovers config files in this order: `.datacheck.yaml` > `.datac | `--log-file` | | Path to log file (enables rotation) | | `--verbose` | `-v` | Set log level to DEBUG | -### Sampling - -Validate a subset of your data to save time on large datasets. Sampling can be configured via CLI flags or in your YAML config. - -**Random sampling** — select a percentage or fixed count: - -```bash -datacheck validate --sample-rate 0.1 # 10% of rows -datacheck validate --sample-count 5000 # exactly 5000 rows -datacheck validate --sample-count 5000 --seed 42 # reproducible -``` - -**Stratified sampling** — proportional representation by column: - -```bash -datacheck validate \ - --sample-strategy stratified \ - --stratify region \ - --sample-count 1000 \ - --seed 42 -``` - -**Time-based sampling** — filter by date range: - -```bash -datacheck validate \ - --sample-strategy time_based \ - --time-column created_at \ - --start-date "2024-01-01" \ - --end-date "2024-06-30" -``` - -**Error-focused sampling** — oversample rows likely to fail: - -```bash -datacheck validate \ - --sample-strategy error_focused \ - --error-indicators "age<0,price>100000" -``` - -**Top N rows:** - -```bash -datacheck validate --top 1000 -``` - -Sampling can also be configured in your YAML config (see [Sampling Configuration](#sampling-configuration)). - ### Parallel Execution Speed up validation on large datasets by distributing work across CPU cores: @@ -328,122 +258,9 @@ datacheck validate --csv-export failures.csv --- -## Profile +## Schema Contract Enforcement -Generate a data quality report with summary statistics for every column. The data source can be provided directly, read from your config, or loaded from a named source. - -Profile a data source using any of these methods: - -```bash -# Direct file path -datacheck profile data.csv -datacheck profile s3://bucket/data.parquet - -# Auto-discover config (looks for .datacheck.yaml, datacheck.yaml, etc.) 
-datacheck profile - -# Explicit config file -datacheck profile --config checks.yaml - -# Named source from sources file -datacheck profile --source production_db --sources-file sources.yaml - -# Named source with config (sources_file defined in config) -datacheck profile --source analytics_wh --config checks.yaml -``` - -**All options for `profile`:** - -| Option | Short | Description | -|--------|-------|-------------| -| `DATA_SOURCE` | | Data source: file path, connection string, or omit when using config/sources | -| `--config` | `-c` | Path to config file with data_source or sources_file defined | -| `--source` | | Named source from sources.yaml | -| `--sources-file` | | Path to sources YAML file | -| `--table` | `-t` | Database table name (for database sources) | -| `--query` | `-q` | Custom SQL query (alternative to --table) | -| `--delta-version` | | Delta Lake version to load (time travel) | -| `--delta-timestamp` | | Delta Lake timestamp (ISO 8601) for time travel | -| `--storage-options` | | JSON string of storage options for Delta Lake cloud access | -| `--format` | `-f` | Output format: `terminal`, `json`, or `markdown` | -| `--output` | `-o` | Write report to file | -| `--outlier-method` | | `zscore` (default) or `iqr` | -| `--suggestions` / `--no-suggestions` | | Show rule suggestions (default: on) | -| `--correlations` / `--no-correlations` | | Show correlation matrix (default: on) | -| `--verbose` | `-v` | Enable debug logging | -| `--log-level` | | Log level: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | -| `--log-format` | | Log format: `console` or `json` | -| `--log-file` | | Path to log file (enables rotation) | - -### Data Source Resolution - -The `profile` command resolves data sources in the following order: - -1. **Named source** (`--source`): If provided, loads from the specified source in `--sources-file` or the config's `sources_file` -2. **Config data_source**: If `--config` is provided (or auto-discovered) and contains `data_source`, uses that -3. **Config sources_file + source**: If config contains `sources_file` and `source`, uses the default source -4. 
**Direct argument**: If a data source path/connection string is provided as an argument, uses that - -Example config with inline data source: - -```yaml -# datacheck.yaml -data_source: - type: csv - path: ./data/orders.csv - -checks: - - name: id_check - column: id - rules: - not_null: true -``` - -Example config referencing named sources: - -```yaml -# datacheck.yaml -sources_file: sources.yaml -source: production_db # default source - -checks: - - name: id_check - column: id - rules: - not_null: true -``` - -### Profiling Features - -Every profile includes: - -- **Row and column counts**, memory usage, duplicate row detection -- **Null counts and percentages** per column -- **Unique value counts** and cardinality -- **Min, max, mean, median, standard deviation** for numeric columns -- **Top values and frequencies** for categorical columns -- **Date range** for datetime columns (including datetime strings in CSV files) -- **Quality score** (0–100) per column and overall -- **Outlier detection** using Z-score or IQR method -- **Correlation matrix** for numeric columns -- **Rule suggestions** — automatically recommended validation rules based on data patterns -- **Data quality issues** with severity levels - -```bash -datacheck profile -datacheck profile --outlier-method iqr -datacheck profile --format json --output profile.json -datacheck profile --format markdown --output profile.md - -# With named source -datacheck profile --source analytics_wh --sources-file sources.yaml --format json -``` - ---- - -## Schema - -Track schema changes over time. Capture a baseline, then compare future data to detect column additions, removals, type changes, renames, and nullable changes. The data source can be provided directly, read from your config, or loaded from a named source. +Capture a schema baseline, then enforce it - breaking changes (column additions, removals, type changes, nullable changes) fail the pipeline. The data source can be provided directly, read from your config, or loaded from a named source. 
### Capture a Baseline @@ -592,38 +409,6 @@ The generated sample data matches the column names and validation rules defined | `rules-reference` | Complete reference of all validation rules with examples | — | | `sources` | Data source connection templates with env var support | — | -### Generate Config from Data - -Auto-generate a config by analyzing your actual data: - -```bash -datacheck config generate data.csv -datacheck config generate data.csv --confidence high --output checks.yaml -datacheck config generate data.csv --confidence low --name "sales_data" --force -``` - -| Option | Short | Default | Description | -|--------|-------|---------|-------------| -| `--output` | `-o` | `datacheck.yaml` | Output file path | -| `--confidence` | `-c` | `medium` | Rule confidence: `low`, `medium`, `high` | -| `--name` | `-n` | filename | Dataset name | -| `--force` | `-f` | off | Overwrite existing file | - -Confidence levels control which rules are suggested: -- **low** — more rules, may include false positives -- **medium** — balanced (default) -- **high** — fewer rules, high confidence only - -The generated config includes: -- Type inference (`int` vs `numeric` vs `bool` vs `date` vs `string`) -- Regex patterns using `[0-9]` character classes for cross-language compatibility -- Statistical rules (`mean_between`, `std_dev_less_than`, `percentile_range`) with data-derived thresholds -- Semantic rules (`email_valid`, `phone_valid`, `url_valid`, `json_valid`) inferred from column names -- Cross-column rules (`sum_equals`) auto-detected when two numeric columns sum to a third -- Temporal rules (`timestamp_range`, `no_future_timestamps`, `date_format`) with detected format strings -- `data_source` block with file type, path, and options (delimiter, encoding) -- `reporting` block with `output_path` and `export_failures` - ### Validate a Config Check config syntax and rule definitions. All errors are reported at once: @@ -691,7 +476,7 @@ metadata: extends: base.yaml # Inherit from another config data_source: - type: csv # File types: csv, parquet, avro, delta, duckdb, sqlite + type: csv # File types: csv, parquet path: "${DATA_PATH}/orders.csv" options: delimiter: "," @@ -702,15 +487,6 @@ sources_file: sources.yaml source: production_db table: orders -plugins: - - ./custom_rules.py - - ./more_rules.py - -sampling: - method: random # none, random, stratified, top, systematic, - rate: 0.1 # time_based, error_focused, adaptive, reservoir - seed: 42 - checks: - name: order_id_check column: order_id @@ -736,11 +512,7 @@ reporting: ### Data Source Configuration -File-based data sources are defined under `data_source` in your config. For databases, use [Named Sources](#named-sources) instead. - -**Supported file types for inline `data_source`:** `csv`, `parquet`, `delta`, `avro`, `duckdb`, `sqlite` - -**CSV:** +**File-based sources** are defined under `data_source` in your config. Supported types: `csv`, `parquet`. ```yaml data_source: @@ -751,175 +523,9 @@ data_source: encoding: utf-8 ``` -**Parquet:** - -```yaml -data_source: - type: parquet - path: ./data/orders.parquet -``` - -**Avro:** - -```yaml -data_source: - type: avro - path: ./data/orders.avro -``` - -**DuckDB/SQLite:** - -```yaml -data_source: - type: duckdb - path: ./data/analytics.duckdb -``` - -**Delta Lake:** - -```yaml -data_source: - type: delta - path: ./data/delta-table -``` - -### Database Connections - -For databases (PostgreSQL, MySQL, SQL Server), use Named Sources. 
For cloud warehouses (Snowflake, BigQuery, Redshift), you can also pass connection strings directly to the CLI. - -**Via connection string (cloud warehouses only):** - -```bash -datacheck validate snowflake://account/database --table orders --warehouse COMPUTE_WH -datacheck validate bigquery://project/dataset --table orders --credentials /path/to/sa.json -datacheck validate redshift://user:pass@host:5439/db --table orders -``` - -**Via Named Sources (recommended for all databases):** - -See [Named Sources](#named-sources) for configuring database connections in `sources.yaml`. - -**PostgreSQL via sources.yaml:** - -```yaml -# sources.yaml -sources: - production_db: - type: postgresql - host: ${DB_HOST} - port: ${DB_PORT:-5432} - database: ${DB_NAME} - username: ${DB_USER} - password: ${DB_PASSWORD} - table: orders - schema: public -``` - -**MySQL (sources.yaml):** - -```yaml -# sources.yaml -sources: - mysql_db: - type: mysql - host: ${DB_HOST} - port: ${DB_PORT:-3306} - database: ${DB_NAME} - user: ${DB_USER} - password: ${DB_PASSWORD} -``` - -**SQL Server (sources.yaml):** - -```yaml -# sources.yaml -sources: - mssql_db: - type: mssql - host: ${DB_HOST} - port: ${DB_PORT:-1433} - database: ${DB_NAME} - user: ${DB_USER} - password: ${DB_PASSWORD} -``` - -**Snowflake (sources.yaml):** +**Database and cloud sources** (PostgreSQL, MySQL, SQL Server, Snowflake, BigQuery, Redshift, S3) require a `sources.yaml` file with named sources — see [Named Sources](#named-sources) below. -```yaml -# sources.yaml -sources: - snowflake_wh: - type: snowflake - account: ${SF_ACCOUNT} - user: ${SF_USER} - password: ${SF_PASSWORD} - warehouse: ${SF_WAREHOUSE:-COMPUTE_WH} - database: ${SF_DATABASE} - schema: ${SF_SCHEMA:-PUBLIC} - role: SYSADMIN # optional -``` - -**BigQuery (sources.yaml):** - -```yaml -# sources.yaml -sources: - bigquery_ds: - type: bigquery - project_id: ${GCP_PROJECT} - dataset_id: ${GCP_DATASET} - credentials_path: /path/to/service-account.json - location: US # optional -``` - -**Redshift (sources.yaml):** - -```yaml -# sources.yaml -sources: - redshift_db: - type: redshift - host: ${REDSHIFT_HOST} - port: ${REDSHIFT_PORT:-5439} - database: ${REDSHIFT_DB} - user: ${REDSHIFT_USER} - password: ${REDSHIFT_PASSWORD} - schema: public -``` - -**Cloud Storage (sources.yaml):** - -Cloud files (S3, GCS, Azure) are accessed via named sources. Define the cloud source in `sources.yaml`: - -```yaml -# sources.yaml -sources: - s3_data: - type: s3 - bucket: my-bucket - path: data/orders.csv - region: us-east-1 - access_key: ${AWS_ACCESS_KEY_ID} - secret_key: ${AWS_SECRET_ACCESS_KEY} - - gcs_data: - type: gcs - bucket: my-bucket - path: data/orders.parquet - project: ${GCP_PROJECT} - credentials_path: /path/to/service-account.json - - azure_data: - type: azure - container: my-container - path: data/orders.csv - account_name: ${AZURE_ACCOUNT} - account_key: ${AZURE_KEY} -``` - -```bash -datacheck validate --source s3_data --sources-file sources.yaml -``` +For the full YAML configuration for every source type, see the [Config File Guide](config-guide.md#the-sourcesyaml-file). ### Named Sources @@ -1030,103 +636,6 @@ sources: Use `datacheck config env` to list all variables referenced in a config and their current values. -### Sampling Configuration - -Configure sampling in your YAML config. All sampling methods available via CLI are also available in config. 
- -**Basic sampling methods:** - -```yaml -# Random sampling by rate (10%) -sampling: - method: random - rate: 0.1 - seed: 42 - -# Random sampling by count -sampling: - method: random - count: 5000 - seed: 42 - -# Top N rows -sampling: - method: top - count: 1000 - -# Systematic sampling (every Nth row) -sampling: - method: systematic - interval: 10 # Every 10th row - start: 0 # Starting index (default: 0) - -# Or calculate interval from rate -sampling: - method: systematic - rate: 0.1 # Calculates interval as 1/rate = 10 - -# Stratified sampling -sampling: - method: stratified - stratify_by: region - count: 100 # Rows per stratum - seed: 42 -``` - -**Advanced sampling methods:** - -```yaml -# Time-based sampling - filter by date range -sampling: - method: time_based - time_column: created_at - start_date: "2024-01-01" - end_date: "2024-12-31" - count: 5000 # Optional: limit results - seed: 42 - -# Error-focused sampling - oversample rows likely to fail -sampling: - method: error_focused - error_indicators: - - "age < 0" - - "price > 100000" - count: 5000 - seed: 42 - -# Adaptive sampling - dynamically adjust based on error rate -sampling: - method: adaptive - count: 5000 - error_indicators: # Optional - - "status = 'error'" - seed: 42 - -# Reservoir sampling - memory-efficient streaming sample -sampling: - method: reservoir - count: 5000 # Reservoir size - seed: 42 -``` - -**All sampling fields:** - -| Field | Type | Description | -|-------|------|-------------| -| `method` | string | Sampling method (see below) | -| `rate` | float | Fraction to sample (0.0–1.0) | -| `count` | int | Number of rows to sample | -| `seed` | int | Random seed for reproducibility | -| `stratify_by` | string | Column for stratified sampling | -| `time_column` | string | Column for time-based sampling | -| `start_date` | string | Start date (ISO format) | -| `end_date` | string | End date (ISO format) | -| `error_indicators` | list | Conditions for error-focused sampling | -| `interval` | int | Interval for systematic sampling | -| `start` | int | Start index for systematic sampling | - -**Available methods:** `none`, `random`, `stratified`, `top`, `systematic`, `time_based`, `error_focused`, `adaptive`, `reservoir`. - ### Config Inheritance Extend a base config and override specific fields: @@ -1215,7 +724,7 @@ rules: ### Numeric -**`min`** / **`max`** — Value bounds. +**`min`** / **`max`** — Value bounds (inclusive). ```yaml rules: @@ -1223,45 +732,36 @@ rules: max: 10000 ``` -**`mean_between`** — Column mean falls within a range. +**`range`** — Value must fall within an inclusive range. ```yaml rules: - mean_between: - min: 50 - max: 150 + range: + min: 0 + max: 100 ``` -**`std_dev_less_than`** — Standard deviation is below a threshold. +**`positive`** — Value must be strictly greater than zero. ```yaml rules: - std_dev_less_than: 15.5 + positive: true ``` -**`percentile_range`** — 25th and 75th percentile values are within bounds. +**`non_negative`** — Value must be zero or greater. ```yaml rules: - percentile_range: - p25_min: 20 - p25_max: 40 - p75_min: 60 - p75_max: 80 + non_negative: true ``` -**`z_score_outliers`** — Flag rows with Z-scores above a threshold. +### Boolean -```yaml -rules: - z_score_outliers: 3.0 -``` - -**`distribution_type`** — Data follows an expected distribution. +**`boolean`** — Column must contain only boolean values (`True`/`False`, `true`/`false`, `1`/`0`). 
```yaml rules: - distribution_type: normal # normal, uniform, exponential + boolean: true ``` ### String and Pattern @@ -1343,48 +843,6 @@ rules: format: "%Y-%m-%d" ``` -**`business_days_only`** — Dates are weekdays (not weekends or holidays). Pass a country code for holiday awareness. - -```yaml -rules: - business_days_only: "US" -``` - -### Semantic - -**`email_valid`** — Valid email addresses (RFC 5322). - -```yaml -rules: - email_valid: true -``` - -**`phone_valid`** — Valid phone numbers for a country. - -```yaml -rules: - phone_valid: "US" -``` - -**`url_valid`** — Valid URLs. Optionally restrict schemes. - -```yaml -rules: - url_valid: true - -# Or with allowed schemes: -rules: - url_valid: - schemes: [http, https] -``` - -**`json_valid`** — Valid JSON strings. - -```yaml -rules: - json_valid: true -``` - ### Cross-Column **`unique_combination`** — Combination of columns is unique across rows. @@ -1406,52 +864,6 @@ rules: > **Note:** `foreign_key_exists` is available via the [Python API](python-api.md) only. It validates column values against a reference DataFrame — for example, checking that every `customer_id` in orders exists in a live `customers` table. This requires passing a real DataFrame, which can't be practically expressed in YAML config. For small fixed sets of valid values, use `allowed_values` instead. -### Custom Rules - -Write custom validation logic using the `@custom_rule` decorator. The function receives a `pd.Series` and returns a boolean `pd.Series` (`True` = valid). - -**1. Create a plugin file:** - -```python -# custom_rules.py -from datacheck.plugins.decorators import custom_rule -import pandas as pd - -@custom_rule -def is_business_email(column: pd.Series, allowed_domains: list) -> pd.Series: - domains = column.dropna().str.split("@").str[1] - return domains.isin(allowed_domains) - -@custom_rule -def is_valid_age(column: pd.Series, min_age: int = 0, max_age: int = 150) -> pd.Series: - return (column >= min_age) & (column <= max_age) -``` - -**2. Reference the plugin and rule in your config:** - -```yaml -plugins: - - ./custom_rules.py - -checks: - - name: email_domain_check - column: email - rules: - custom: - rule: is_business_email - params: - allowed_domains: ["company.com", "company.co.uk"] - - - name: age_range_check - column: age - rules: - custom: - rule: is_valid_age - params: - min_age: 18 - max_age: 120 -``` - --- ## Data Sources @@ -1462,9 +874,6 @@ checks: |--------|-----------|---------------| | CSV | `.csv` | None (built-in) | | Parquet | `.parquet`, `.pq` | `pyarrow` (included) | -| SQLite | `.sqlite`, `.sqlite3`, `.db` | None (built-in) | -| Avro | `.avro` | `pip install datacheck-cli[avro]` | -| Delta Lake | directory | `pip install datacheck-cli[deltalake]` | ### Databases @@ -1473,7 +882,6 @@ checks: | PostgreSQL | `pip install datacheck-cli[postgresql]` | 5432 | | MySQL | `pip install datacheck-cli[mysql]` | 3306 | | SQL Server | `pip install datacheck-cli[mssql]` | 1433 | -| DuckDB | `pip install datacheck-cli[duckdb]` | — | ### Cloud Warehouses @@ -1488,8 +896,6 @@ checks: | Provider | Install Extra | |----------|---------------| | AWS S3 | `pip install datacheck-cli[cloud]` | -| Google Cloud Storage | `pip install datacheck-cli[cloud]` | -| Azure Blob Storage | `pip install datacheck-cli[cloud]` | See [Data Source Configuration](#data-source-configuration) for YAML config examples for each source type. 
@@ -1497,7 +903,7 @@ See [Data Source Configuration](#data-source-configuration) for YAML config exam ## Logging -Control log output with these options (available on `validate` and `profile`): +Control log output with these options (available on `validate`): ```bash # Verbose mode (sets level to DEBUG) @@ -1529,13 +935,25 @@ All log entries include a trace ID for correlating events within a single run. DataCheck returns standard exit codes that CI systems understand. A non-zero exit code fails the pipeline step. -**GitHub Actions:** +**GitHub Actions (native action — recommended):** + +```yaml +permissions: + contents: read + security-events: write + +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml +``` + +**GitHub Actions (CLI directly):** ```yaml - name: Validate Data run: | pip install datacheck-cli - datacheck validate + datacheck validate --format sarif --output results.sarif ``` **GitLab CI:** @@ -1577,7 +995,7 @@ validate = DataCheckOperator( file_path="/data/orders_{{ ds }}.parquet", ) -# Detect schema changes +# Enforce schema contracts schema_check = DataCheckSchemaOperator( task_id="schema_check", file_path="/data/orders_{{ ds }}.parquet", @@ -1606,15 +1024,6 @@ Both operators support Jinja templating, XCom result pushing, and database sourc **Note:** Only rules with `severity: error` (the default) affect the exit code. Rules with `severity: warning` or `severity: info` are reported but do not cause exit code 1. -### `profile` - -| Code | Meaning | -|------|---------| -| `0` | Profile generated successfully | -| `2` | Configuration or argument error | -| `3` | Data loading error | -| `4` | Unexpected error | - ### `config` | Code | Meaning | diff --git a/guides/config-guide.md b/guides/config-guide.md new file mode 100644 index 0000000..fa6bf6b --- /dev/null +++ b/guides/config-guide.md @@ -0,0 +1,1068 @@ +# DataCheck Config File Guide + +This guide explains how to write, place, and use DataCheck configuration files. By the end you will know how to define any data source, write validation rules for every use case, manage credentials securely, and run your config through CI/CD. 
+ +--- + +## Table of Contents + +- [What is the config file?](#what-is-the-config-file) +- [Where to place your config](#where-to-place-your-config) +- [Config file anatomy](#config-file-anatomy) +- [Data sources](#data-sources) + - [CSV](#csv) + - [Parquet](#parquet) + - [Databases and cloud warehouses](#databases-and-cloud-warehouses) +- [The sources.yaml file](#the-sourcesyaml-file) + - [PostgreSQL](#postgresql) + - [MySQL](#mysql) + - [SQL Server](#sql-server) + - [Snowflake](#snowflake) + - [BigQuery](#bigquery) + - [Redshift](#redshift) + - [Cloud storage (S3)](#cloud-storage-s3) +- [Checks — the core of your config](#checks--the-core-of-your-config) + - [Check fields](#check-fields) + - [Per-check source override](#per-check-source-override) +- [Rules reference](#rules-reference) + - [Null and uniqueness](#null-and-uniqueness) + - [Numeric](#numeric) + - [String and pattern](#string-and-pattern) + - [Temporal](#temporal) + - [Boolean](#boolean) + - [Cross-column](#cross-column) +- [Severity levels](#severity-levels) +- [Notifications](#notifications) +- [Reporting](#reporting) +- [Config inheritance](#config-inheritance) +- [Environment variables](#environment-variables) +- [Config management commands](#config-management-commands) +- [Common patterns](#common-patterns) + - [Per-environment configs](#per-environment-configs) + - [CI/CD setup](#cicd-setup) + - [Multiple tables in one config](#multiple-tables-in-one-config) +- [Troubleshooting](#troubleshooting) + +--- + +## What is the config file? + +A DataCheck config file is a YAML file that tells DataCheck two things: + +1. **Where is the data?** — a file path, database table, or cloud source. +2. **What rules apply?** — a list of checks, each targeting a column with one or more validation rules. + +Run validation against it with: + +```bash +datacheck validate +datacheck validate --config my-checks.yaml +``` + +--- + +## Where to place your config + +**Auto-discovery** — when you run `datacheck validate` without `--config`, DataCheck searches the current working directory in this order: + +``` +.datacheck.yaml ← searched first (recommended name) +.datacheck.yml +datacheck.yaml +datacheck.yml +``` + +The first file found is used. This means you can commit `.datacheck.yaml` to the root of your repo and run `datacheck validate` from anywhere in that repo without specifying a path. + +**Explicit path** — use `--config` to load any file regardless of name or location: + +```bash +datacheck validate --config configs/production-checks.yaml +datacheck validate -c ./checks/orders.yaml +``` + +**Typical project layout:** + +``` +my-project/ +├── .datacheck.yaml # Default config (auto-discovered) +├── sources.yaml # Database/cloud credentials +├── data/ +│ └── orders.csv +└── configs/ + ├── base.yaml # Shared rules + ├── staging.yaml # Staging-specific overrides + └── production.yaml # Production-specific overrides +``` + +--- + +## Config file anatomy + +A complete config with all supported top-level fields: + +```yaml +# .datacheck.yaml + +version: "1.0" # Optional — documents config schema version + +metadata: # Optional — for documentation purposes + description: "Daily order validation" + author: "data-engineering" + tags: ["production", "orders"] + +extends: base.yaml # Optional — inherit from another config + +# ── Data source ────────────────────────────────────────────────────────────── +data_source: # For file-based sources (csv, parquet, etc.) 
+ type: csv + path: ./data/orders.csv + +# Or for database/cloud sources, reference a named source: +# sources_file: sources.yaml +# source: production_db +# table: orders + +# ── Validation checks ───────────────────────────────────────────────────────── +checks: + - name: order_id_check + column: order_id + description: "Order IDs must be unique and non-null" + severity: error # error (default), warning, info + enabled: true # default: true + rules: + not_null: true + unique: true + + - name: amount_check + column: amount + rules: + not_null: true + min: 0 + max: 100000 + +# ── Reporting ───────────────────────────────────────────────────────────────── +reporting: + output_path: "./reports" # Directory for output files + export_failures: true + failures_file: "failures.csv" + +# ── Notifications ───────────────────────────────────────────────────────────── +notifications: + slack_webhook: "${SLACK_WEBHOOK}" + mention_on_failure: true +``` + +--- + +## Data sources + +### CSV + +```yaml +data_source: + type: csv + path: ./data/orders.csv + options: + delimiter: "," # Default: "," + encoding: utf-8 # Default: utf-8 +``` + +For tab-separated files: + +```yaml +data_source: + type: csv + path: ./data/export.tsv + options: + delimiter: "\t" +``` + +### Parquet + +```yaml +data_source: + type: parquet + path: ./data/orders.parquet +``` + +No options required — column types are read from the Parquet schema automatically. + +### Databases and cloud warehouses + +Databases cannot be defined under `data_source`. Instead, define them in a separate `sources.yaml` file and reference them by name: + +```yaml +# .datacheck.yaml +sources_file: sources.yaml +source: production_db +table: orders +``` + +See [The sources.yaml file](#the-sourcesyaml-file) for full configuration. + +--- + +## The sources.yaml file + +The `sources.yaml` file defines named data source connections. Keep it separate from your config so credentials stay out of version control. + +**Recommended setup:** + +``` +.gitignore ← add sources.yaml (or use only env vars in it) +sources.yaml ← connection definitions (env var references only) +.datacheck.yaml ← safe to commit, references sources.yaml by name +``` + +All connection values support `${VAR}` and `${VAR:-default}` environment variable substitution. + +### PostgreSQL + +```yaml +# sources.yaml +sources: + production_db: + type: postgresql + host: ${DB_HOST} + port: ${DB_PORT:-5432} + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} + schema: public # optional, default: public +``` + +Requires: `pip install datacheck-cli[postgresql]` + +### MySQL + +```yaml +sources: + mysql_db: + type: mysql + host: ${MYSQL_HOST} + port: ${MYSQL_PORT:-3306} + database: ${MYSQL_DB} + user: ${MYSQL_USER} + password: ${MYSQL_PASSWORD} +``` + +Requires: `pip install datacheck-cli[mysql]` + +### SQL Server + +```yaml +sources: + mssql_db: + type: mssql + host: ${MSSQL_HOST} + port: ${MSSQL_PORT:-1433} + database: ${MSSQL_DB} + user: ${MSSQL_USER} + password: ${MSSQL_PASSWORD} +``` + +Requires: `pip install datacheck-cli[mssql]` + +### Snowflake + +```yaml +sources: + snowflake_wh: + type: snowflake + account: ${SF_ACCOUNT} # e.g. 
myorg-myaccount + user: ${SF_USER} + password: ${SF_PASSWORD} + warehouse: ${SF_WAREHOUSE:-COMPUTE_WH} + database: ${SF_DATABASE} + schema: ${SF_SCHEMA:-PUBLIC} + role: ${SF_ROLE} # optional + + # SSO / browser auth (uncomment one): + # authenticator: externalbrowser + # authenticator: okta_https://mycompany.okta.com + + # Key-pair auth (uncomment): + # private_key_path: /path/to/rsa_key.p8 + # private_key_passphrase: ${SF_KEY_PASSPHRASE} +``` + +Requires: `pip install datacheck-cli[snowflake]` + +### BigQuery + +```yaml +sources: + bigquery_ds: + type: bigquery + project_id: ${GCP_PROJECT} + dataset_id: ${GCP_DATASET} + credentials_path: /path/to/service-account.json + location: US # optional, default: US +``` + +Requires: `pip install datacheck-cli[bigquery]` + +For Application Default Credentials (running on GCP or with `gcloud auth`), omit `credentials_path`. + +### Redshift + +```yaml +sources: + redshift_db: + type: redshift + host: ${REDSHIFT_HOST} + port: ${REDSHIFT_PORT:-5439} + database: ${REDSHIFT_DB} + user: ${REDSHIFT_USER} + password: ${REDSHIFT_PASSWORD} + schema: public + + # IAM auth (uncomment instead of user/password): + # iam_auth: true + # cluster_identifier: ${REDSHIFT_CLUSTER} + # region: ${AWS_REGION:-us-east-1} +``` + +Requires: `pip install datacheck-cli[redshift]` + +### Cloud storage (S3) + +Cloud files are accessed through named sources in `sources.yaml`. DataCheck downloads the file and validates it locally. + +**AWS S3:** + +```yaml +sources: + s3_data: + type: s3 + bucket: ${S3_BUCKET} + path: data/orders.csv # path inside the bucket + region: ${AWS_REGION:-us-east-1} + access_key: ${AWS_ACCESS_KEY_ID} + secret_key: ${AWS_SECRET_ACCESS_KEY} +``` + +**Reference a cloud source in your config:** + +```yaml +# .datacheck.yaml +sources_file: sources.yaml +source: s3_data + +checks: + - name: id_check + column: id + rules: + not_null: true +``` + +--- + +## Checks — the core of your config + +A `checks` list is the heart of your config. Each item targets one column and applies one or more rules. + +### Check fields + +```yaml +checks: + - name: order_id_check # Required. Unique identifier for this check. + column: order_id # Required. Column in the dataset to validate. + description: "Must be unique" # Optional. Human-readable note. + severity: error # Optional. error (default), warning, info. + enabled: true # Optional. Set false to skip without deleting. + rules: # Required. One or more rules (see Rules Reference). 
+ not_null: true + unique: true +``` + +**Multiple rules on the same column** — each rule generates its own result: + +```yaml + - name: amount_validation + column: amount + rules: + not_null: true # rule 1: no nulls + min: 0 # rule 2: no negatives + max: 100000 # rule 3: cap at 100K +``` + +**Disabling a check** — the check is skipped and does not affect the pass/fail result: + +```yaml + - name: legacy_check + column: old_field + enabled: false + rules: + not_null: true +``` + +### Per-check source override + +Individual checks can query a different source or table than the config default: + +```yaml +sources_file: sources.yaml +source: production_db +table: customers + +checks: + - name: customer_email + column: email + rules: + not_null: true + + - name: order_amount # This check hits a different source + column: total + source: snowflake_wh # Override source for this check only + table: orders # Override table for this check only + rules: + min: 0 +``` + +--- + +## Rules reference + +### Null and uniqueness + +**`not_null`** — Column must contain no null or missing values. + +```yaml +rules: + not_null: true +``` + +**`unique`** — Column must have no duplicate values. Null values are excluded from uniqueness checking. + +```yaml +rules: + unique: true +``` + +**`unique_combination`** — The combination of multiple columns must be unique across all rows. + +```yaml +rules: + unique_combination: [first_name, last_name, date_of_birth] +``` + +--- + +### Numeric + +**`min`** — Value must be greater than or equal to the threshold. + +```yaml +rules: + min: 0 +``` + +**`max`** — Value must be less than or equal to the threshold. + +```yaml +rules: + max: 10000 +``` + +**`range`** — Value must fall within an inclusive range (combines min and max in one rule). + +```yaml +rules: + range: + min: 0 + max: 100 +``` + +**`positive`** — Value must be strictly greater than zero. + +```yaml +rules: + positive: true +``` + +**`non_negative`** — Value must be zero or greater (allows zero, rejects negatives). + +```yaml +rules: + non_negative: true +``` + +--- + +### String and pattern + +**`regex`** — Column values must match a regular expression. Use single quotes to avoid YAML escape issues. + +```yaml +rules: + regex: '^[A-Z]{2}-[0-9]{5}$' +``` + +Common patterns: + +```yaml +rules: + regex: '^[0-9]{10}$' # 10-digit number string + regex: '^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$' # email + regex: '^(19|20)[0-9]{2}-[0-9]{2}-[0-9]{2}$' # YYYY-MM-DD + regex: '^[A-Z]{2}[0-9]{2}[A-Z0-9]{11,30}$' # IBAN +``` + +**`allowed_values`** — Value must belong to a fixed set. Case-sensitive. + +```yaml +rules: + allowed_values: [active, inactive, pending, archived] +``` + +Multi-line form (easier to read for long lists): + +```yaml +rules: + allowed_values: + - pending + - confirmed + - shipped + - delivered + - cancelled + - refunded +``` + +**`min_length`** / **`max_length`** — Enforce minimum and maximum string length. Leading/trailing whitespace is included. + +```yaml +rules: + min_length: 1 + max_length: 255 +``` + +**`type`** — Column values must be of the specified type. Accepts string representations of values (e.g., `"123"` passes `type: int`). + +```yaml +rules: + type: int # Accepts: int, integer + type: float # Accepts: float, numeric + type: string # Accepts: string, str + type: bool # Accepts: bool, boolean + type: date # Accepts: date + type: datetime # Accepts: datetime +``` + +--- + +### Temporal + +**`max_age`** — Data must not be older than a duration. 
Supports minutes (`m`), hours (`h`), days (`d`), weeks (`w`). + +```yaml +rules: + max_age: "24h" # Data must be less than 24 hours old + max_age: "7d" # Less than 7 days old + max_age: "30m" # Less than 30 minutes old + max_age: "2w" # Less than 2 weeks old +``` + +**`timestamp_range`** (alias: **`date_range`**) — Timestamps must fall within a min/max range. Use ISO 8601 format. + +```yaml +rules: + timestamp_range: + min: "2024-01-01T00:00:00" + max: "2024-12-31T23:59:59" +``` + +Date-only form: + +```yaml +rules: + date_range: + min: "2024-01-01" + max: "2024-12-31" +``` + +**`no_future_timestamps`** — No timestamps ahead of the current system time. + +```yaml +rules: + no_future_timestamps: true +``` + +**`date_format_valid`** (alias: **`date_format`**) — Dates must conform to a Python `strftime` format string. + +```yaml +rules: + date_format_valid: "%Y-%m-%d" + date_format_valid: "%Y-%m-%d %H:%M:%S" + date_format_valid: "%d/%m/%Y" +``` + +Dict form (using the alias): + +```yaml +rules: + date_format: + format: "%Y-%m-%d" +``` + +Common format codes: `%Y` = 4-digit year, `%m` = month 01-12, `%d` = day 01-31, `%H` = hour 00-23, `%M` = minute 00-59, `%S` = second 00-59. + +--- + +### Boolean + +**`boolean`** — Column must contain only boolean values. Accepts `True`/`False`, `true`/`false`, `1`/`0`. Null values are ignored. + +```yaml +rules: + boolean: true +``` + +--- + +### Cross-column + +**`sum_equals`** — A column's value must equal the sum of two other columns. Useful for verifying that `total = subtotal + tax`. + +```yaml +rules: + sum_equals: + column_a: subtotal + column_b: tax + tolerance: 0.01 # Optional. Allowed floating-point delta. Default: 0.01. +``` + +**`unique_combination`** — A multi-column composite key must be unique across all rows. Applies at the check level (not per-column). + +```yaml + - name: composite_key_check + column: order_id # Primary column (required by check structure) + rules: + unique_combination: [order_id, line_item_id] +``` + +**`foreign_key_exists`** — Validates that every value in a column exists in a reference dataset. Available via the **Python API only** — it requires a live DataFrame, which cannot be expressed in YAML. + +```python +from datacheck.rules import ForeignKeyRule + +rule = ForeignKeyRule( + name="customer_exists", + column="customer_id", + reference_data=customers_df, + reference_column="id", +) +``` + +For YAML-based validation against a fixed set of allowed values, use `allowed_values` instead. + +--- + +## Severity levels + +Every check has a `severity` field (default: `error`). Only `error`-severity failures cause exit code 1. + +| Severity | Exit code | Use case | +|----------|-----------|----------| +| `error` | 1 | Critical failures that must block the pipeline | +| `warning` | 0 | Soft violations worth logging but not blocking | +| `info` | 0 | Informational checks that log results without blocking | + +```yaml +checks: + - name: id_not_null + column: id + severity: error # Blocks the pipeline if id is null + rules: + not_null: true + + - name: description_length + column: description + severity: warning # Reports issue but does not fail + rules: + max_length: 500 + + - name: created_at_fresh + column: created_at + severity: info # Logged for monitoring only + rules: + max_age: "30d" +``` + +--- + +## Notifications + +DataCheck can send results to a Slack channel automatically. Configure the webhook in your config so it runs on every validation without extra CLI flags. 
+ +```yaml +notifications: + slack_webhook: "${SLACK_WEBHOOK}" # Use env var — never hardcode the URL + mention_on_failure: true # @channel on failures (default: false) +``` + +The Slack message includes: pass/fail status, summary counts, up to 5 failed rules with row counts, and success rate. + +Override the webhook via CLI (useful in CI): + +```bash +datacheck validate --slack-webhook https://hooks.slack.com/services/T.../B.../... +``` + +--- + +## Reporting + +Configure output file locations: + +```yaml +reporting: + output_path: "./reports" # Directory for auto-named JSON reports + export_failures: true # Export failure rows to CSV + failures_file: "failures.csv" # Path for failure CSV (default: failures.csv) +``` + +Override at runtime: + +```bash +datacheck validate --output results.json # JSON report +datacheck validate --format sarif --output results.sarif # SARIF (for GitHub Security tab) +datacheck validate --format markdown --output results.md # Markdown +datacheck validate --csv-export failures.csv # CSV of failing rows +``` + +--- + +## Config inheritance + +Use `extends` to inherit all settings from a base config and override specific fields. This is useful for managing dev / staging / production variants of the same rules. + +**base.yaml** — shared rules used across all environments: + +```yaml +data_source: + type: csv + path: ./data/orders.csv + +checks: + - name: id_check + column: id + rules: + not_null: true + unique: true + + - name: amount_check + column: amount + rules: + not_null: true + min: 0 +``` + +**production.yaml** — inherits base, switches to database, adds stricter checks: + +```yaml +extends: base.yaml + +# Override data source +sources_file: sources.yaml +source: production_db +table: orders + +# Add production-only checks +checks: + - name: created_at_freshness + column: created_at + severity: error + rules: + max_age: "48h" +``` + +**Run with the appropriate config:** + +```bash +datacheck validate --config production.yaml +``` + +Inheritance is single-level: `production.yaml` cannot extend a file that also uses `extends`. + +--- + +## Environment variables + +Config files support `${VAR}` and `${VAR:-default}` substitution. Variables are resolved from the current shell environment. + +```yaml +sources: + production_db: + type: postgresql + host: ${DB_HOST} # Required — error if not set + port: ${DB_PORT:-5432} # Optional — uses 5432 if DB_PORT not set + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} +``` + +**Best practice:** + +- Use `${VAR}` for required values (will fail clearly if unset) +- Use `${VAR:-default}` for values with sensible defaults (port, schema, region) +- Never hardcode passwords or API keys in config files + +**List all variables used in a config:** + +```bash +datacheck config env +datacheck config env sources.yaml +``` + +Output shows each variable, its current value (masked if it looks like a credential), and whether it is set. 
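**Setting variables locally:** the same variables just need to be present in your shell before you run DataCheck. A minimal sketch with placeholder values (substitute your real connection details or read them from a secret manager):

```bash
# Export the variables referenced in sources.yaml (placeholder values only)
export DB_HOST=db.internal.example.com
export DB_NAME=analytics
export DB_USER=readonly_user
export DB_PASSWORD='change-me'   # in practice, read this from a secret manager

# Confirm everything resolves, then run validation
datacheck config env sources.yaml
datacheck validate
```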
+ +**Setting variables in CI:** + +```yaml +# GitHub Actions +env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} +``` + +--- + +## Config management commands + +### Generate a starter config + +```bash +datacheck config init # Basic template → datacheck.yaml +datacheck config init --template ecommerce # E-commerce template +datacheck config init --template finance # Finance template +datacheck config init --with-sample-data # Also generate sample CSV +datacheck config init --output my-checks.yaml # Custom output path +datacheck config init --force # Overwrite existing file +``` + +Available templates: `basic`, `ecommerce`, `healthcare`, `finance`, `saas`, `iot`, `rules-reference`, `sources` + +Use `--with-sample-data` to get a ready-to-run config with generated CSV data — you can run `datacheck validate` immediately. + +### Validate config syntax + +Check your config for errors before running validation. Reports **all** errors at once: + +```bash +datacheck config validate +datacheck config validate my-checks.yaml +datacheck config validate my-checks.yaml --strict # Fail on warnings too +``` + +Example output when errors are present: + +``` +Configuration has errors: + - Check #2: Missing required field 'column' + - Check #5: Missing required field 'rules' + - Schema validation failed at 'checks.3.rules.min': -1 is not valid under the schema +``` + +### Show resolved config + +Display the fully resolved config with environment variables expanded and `extends` applied: + +```bash +datacheck config show +datacheck config show my-checks.yaml +datacheck config show my-checks.yaml --format json +datacheck config show --no-resolve-env # Show raw ${VAR} references +datacheck config show --no-resolve-extends # Show without inheritance applied +``` + +Use this to verify that env vars are resolving to the right values before running validation. + +### Merge configs + +Combine multiple configs. 
Later files override values from earlier files: + +```bash +datacheck config merge base.yaml production.yaml +datacheck config merge base.yaml env.yaml --output merged.yaml +``` + +### List templates + +```bash +datacheck config templates +``` + +--- + +## Common patterns + +### Per-environment configs + +Structure for managing dev / staging / production: + +``` +configs/ +├── base.yaml # All shared rules +├── development.yaml # Local dev: points to CSV fixture +├── staging.yaml # Staging: points to staging DB +└── production.yaml # Production: points to prod DB + stricter rules +``` + +```yaml +# configs/development.yaml +extends: ./base.yaml +data_source: + type: csv + path: ./fixtures/orders.csv +``` + +```yaml +# configs/staging.yaml +extends: ./base.yaml +sources_file: ./sources.yaml +source: staging_db +table: orders +``` + +```yaml +# configs/production.yaml +extends: ./base.yaml +sources_file: ./sources.yaml +source: production_db +table: orders + +checks: + - name: freshness_check + column: created_at + severity: error + rules: + max_age: "48h" +``` + +```bash +datacheck validate --config configs/production.yaml +``` + +### CI/CD setup + +Commit a `.datacheck.yaml` at your repo root, plus a `sources.yaml` with env var references only: + +```yaml +# .datacheck.yaml (committed to git) +sources_file: sources.yaml +source: production_db +table: orders + +checks: + - name: id_check + column: id + rules: + not_null: true + unique: true +``` + +```yaml +# sources.yaml (committed to git — no secrets, only env var references) +sources: + production_db: + type: postgresql + host: ${DB_HOST} + port: ${DB_PORT:-5432} + database: ${DB_NAME} + user: ${DB_USER} + password: ${DB_PASSWORD} +``` + +**GitHub Actions with the native action** (results appear in Security tab): + +```yaml +# .github/workflows/data-quality.yml +name: Data Quality +on: [push, pull_request] + +permissions: + contents: read + security-events: write + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} +``` + +**Plain CLI** (works in any CI system): + +```bash +pip install datacheck-cli[postgresql] +datacheck validate --config .datacheck.yaml --format sarif --output results.sarif +``` + +### Multiple tables in one config + +Validate several tables from the same source by running separate configs or using per-check source overrides: + +```yaml +# multi-table.yaml +sources_file: sources.yaml +source: production_db + +checks: + - name: orders_id_check + column: id + source: production_db + table: orders + rules: + not_null: true + unique: true + + - name: customers_email_check + column: email + source: production_db + table: customers + rules: + not_null: true + regex: '^[^@]+@[^@]+\.[^@]+$' + + - name: products_price_check + column: price + source: production_db + table: products + rules: + positive: true +``` + +--- + +## Troubleshooting + +**"No config file found"** — DataCheck searched for `.datacheck.yaml`, `.datacheck.yml`, `datacheck.yaml`, `datacheck.yml` in the current directory and found none. Either create one or use `--config path/to/config.yaml`. + +**"Configuration has errors"** — Run `datacheck config validate` to see all errors at once. 
Common causes: missing `name`, `column`, or `rules` fields; invalid rule parameters (e.g., `min: -1` where the rule requires a non-negative value). + +**"Source 'X' not found"** — Check that `sources_file` points to the correct file and the source name matches exactly (case-sensitive). + +**"Environment variable not set"** — Run `datacheck config env sources.yaml` to see which variables are missing. Set them in your shell or CI environment before running. + +**"Column 'X' not found in DataFrame"** — The column name in your check does not match the actual column in the data. Run `datacheck schema show` to see the exact column names. + +**Test your config first:** + +```bash +datacheck config validate # Check syntax +datacheck config env # Verify env vars +datacheck schema show # Inspect column names and types +datacheck validate --no-progress # Run validation +``` diff --git a/guides/guide-who-uses-datacheck.md b/guides/guide-who-uses-datacheck.md index 1024f24..12f2261 100644 --- a/guides/guide-who-uses-datacheck.md +++ b/guides/guide-who-uses-datacheck.md @@ -1,8 +1,8 @@ -# Who Uses DataCheck — A Complete Guide for Every Team +# Who Uses DataCheck - A Complete Guide for Every Team -**Catch bad data before it breaks your pipelines.** +**Enforce data quality at the pipeline boundary. Fail fast. Stop bad data before it moves.** -DataCheck is a CLI-first data validation tool built for data engineers. You define rules in YAML, connect to your data — local files via config, databases and cloud sources via `sources.yaml` — and DataCheck tells you what's wrong before it reaches production. +DataCheck is a linter for data pipelines. You define rules in YAML, connect to your data - local files via config, databases and cloud sources via `sources.yaml` - and DataCheck enforces those rules at the gate. If data fails, the pipeline stops. Exit code 1. No silent failures, no corrupted rows reaching downstream consumers. This guide walks through every team and person who benefits from DataCheck, with real-world scenarios, infrastructure setup, and step-by-step usage for each. @@ -65,23 +65,7 @@ pip install datacheck-cli[postgresql] # or snowflake, bigquery, etc. **Step 2: Create a config** -Option A — Generate rules automatically from your actual data: - -```bash -datacheck config generate ./staging/orders.csv -``` - -DataCheck analyzes your data — column types, null patterns, value ranges, uniqueness, string formats — and writes a config with appropriate rules. Control how aggressive the rules are with `--confidence`: - -```bash -datacheck config generate ./staging/orders.csv --confidence high # Fewer rules, high certainty -datacheck config generate ./staging/orders.csv --confidence medium # Balanced (default) -datacheck config generate ./staging/orders.csv --confidence low # More rules, may include false positives -``` - -This is the fastest way to go from zero to a working validation config. Review the generated rules, remove anything irrelevant, tighten thresholds, and you're done. - -Option B — Start from a domain template: +Option A — Start from a domain template: ```bash datacheck config init --template ecommerce --with-sample-data @@ -89,7 +73,7 @@ datacheck config init --template ecommerce --with-sample-data This creates `datacheck.yaml` and a sample `orders.csv` so you can test immediately. Available templates: `basic`, `ecommerce`, `healthcare`, `finance`, `saas`, `iot`. 
-Option C — Write rules by hand: +Option B — Write rules by hand: ```yaml # .datacheck.yaml @@ -116,12 +100,6 @@ checks: min: 0 max: 100000 - - name: email_format - column: email - severity: warning - rules: - email_valid: true - - name: no_future_orders column: order_date rules: @@ -229,10 +207,9 @@ DataCheck connects to PostgreSQL, loads the `orders` table, and runs the rules a | Feature | Why It Matters | |---------|----------------| -| 27+ built-in rules | Covers nulls, ranges, patterns, dates, emails, phones, URLs, JSON, cross-column checks — no custom code needed | -| Sampling | Validate 10% of a 100M row file in seconds: `--sample-rate 0.1` | +| 20+ built-in rules | Covers nulls, ranges, patterns, dates, email validation, cross-column checks — no custom code needed | | Parallel execution | Split work across CPU cores: `--parallel --workers 8` | -| Multiple data sources | CSV, Parquet, Avro, Delta Lake, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift | +| Multiple data sources | CSV, Parquet, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, S3 | | Exit codes | `0` = pass, `1` = fail, `2` = config error, `3` = data error — CI systems understand these | | JSON output | `--output results.json` for machine-readable results | | Severity levels | `error` (blocks pipeline), `warning` (logged but doesn't block), `info` (tracked only) | @@ -249,7 +226,7 @@ You maintain a `dim_customers` table that joins data from three sources. A schem ### How DataCheck Helps -DataCheck combines rule-based validation with schema evolution detection. You define rules for data quality and track schema changes separately, so you catch both content issues and structural changes. +DataCheck enforces both validation rules and schema contracts. You define rules for content correctness and baseline schema separately - both are gates that fail the pipeline when violated. ### Setup @@ -291,7 +268,7 @@ Compatibility: BREAKING (2 breaking changes) With `--fail-on-breaking`, exit code 1 stops your pipeline. -**Step 3: Validate data quality rules** +**Step 3: Enforce validation rules** ```yaml # dim_customers_checks.yaml @@ -306,11 +283,6 @@ checks: not_null: true unique: true - - name: email_valid - column: email - rules: - email_valid: true - - name: revenue_positive column: lifetime_revenue rules: @@ -334,38 +306,6 @@ checks: datacheck validate --config dim_customers_checks.yaml ``` -### Don't Write Rules by Hand — Generate Them - -Not sure what rules to write? Let DataCheck figure it out. - -**Option 1: Auto-generate a full config from your data** - -```bash -datacheck config generate dim_customers_export.csv --confidence medium --output dim_customers_checks.yaml -``` - -DataCheck analyzes every column — types, null patterns, value distributions, uniqueness, string formats, numeric ranges — and writes a complete validation config. You review, adjust thresholds, and you're done in minutes instead of hours. 
- -Confidence levels control how aggressive the rules are: -- `low` — catches more issues, may flag some false positives -- `medium` — balanced (default) -- `high` — only high-certainty rules, fewer false positives - -**Option 2: Profile first, then decide** - -```bash -datacheck profile --source production_db --sources-file sources.yaml --format json -o profile.json -``` - -Profiling gives you: -- Quality score per column (0-100) -- Null percentages and uniqueness counts -- Outlier detection (Z-score or IQR) -- Automatic rule suggestions based on patterns in your data -- Correlation matrix for numeric columns - -Use the profile output to understand your data, then use `config generate` to create the rules automatically — or write them by hand with the profile as your reference. - --- ## 3. The DevOps / Platform Engineer @@ -382,6 +322,33 @@ DataCheck integrates into CI/CD like a test suite. Add it as a step in your pipe ### GitHub Actions +**Option 1 — Native DataCheck Action (recommended for simple validation):** + +Results appear in the GitHub Security tab via SARIF: + +```yaml +# .github/workflows/data-validation.yml +name: Data Validation + +on: [push, pull_request] + +permissions: + contents: read + security-events: write + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml +``` + +**Option 2 — CLI directly (for full control, schema checks, etc.):** + ```yaml # .github/workflows/data-validation.yml name: Data Validation @@ -411,7 +378,7 @@ jobs: run: datacheck config validate --strict - name: Run validation - run: datacheck validate --output results.json + run: datacheck validate --format sarif --output results.sarif - name: Check for schema drift run: datacheck schema compare --fail-on-breaking @@ -421,7 +388,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: validation-results - path: results.json + path: results.sarif ``` ### GitLab CI @@ -479,8 +446,8 @@ Developer pushes PR [ CI Pipeline ] | +---> datacheck config validate (config syntax OK?) - +---> datacheck validate (data quality OK?) - +---> datacheck schema compare (schema unchanged?) + +---> datacheck validate (validation rules passed?) + +---> datacheck schema compare (schema contract enforced?) | all pass? / \ @@ -540,7 +507,7 @@ validate = DataCheckOperator( **Data source options:** - `source_name` + `sources_file` — validate a named database source (recommended) -- `file_path` — point at a file (CSV, Parquet, Avro, Delta Lake) +- `file_path` — point at a file (CSV or Parquet) - Config default — uses `data_source` or `source` from the config file **Threshold mode** — allow some failures without blocking the DAG: @@ -556,7 +523,7 @@ validate = DataCheckOperator( ) ``` -**Parallel + sampling for large datasets:** +**Parallel execution for large datasets:** ```python validate = DataCheckOperator( @@ -565,7 +532,6 @@ validate = DataCheckOperator( sources_file="/opt/airflow/config/sources.yaml", source_name="analytics_wh", table="events", - sample_rate=0.1, # Validate 10% of rows parallel=True, workers=4, ) @@ -619,7 +585,7 @@ with DAG( fail_on_breaking=True, ) - # 2. Validate data quality + # 2. Enforce validation rules validate = DataCheckOperator( task_id="validate_orders", config_path="/opt/airflow/config/order_checks.yaml", @@ -754,7 +720,7 @@ You pull a dataset for model training and spend two hours cleaning it before rea ### How DataCheck Helps -Run a quick validation or profile before starting analysis. 
DataCheck tells you exactly what's wrong with the data, suggests rules, and quantifies quality — in your notebook or terminal. +Run a quick validation before starting analysis. DataCheck tells you exactly what's wrong with the data, suggests rules, and quantifies quality — in your notebook or terminal. ### Setup @@ -762,37 +728,6 @@ Run a quick validation or profile before starting analysis. DataCheck tells you pip install datacheck-cli ``` -### Scenario: Auto-Generate Quality Rules in 10 Seconds - -Point DataCheck at your dataset and let it write the rules for you: - -```bash -datacheck config generate training_data.csv --output training_checks.yaml -``` - -DataCheck scans every column — detects types, null patterns, value ranges, uniqueness, string formats, numeric distributions — and writes a complete validation config. Open it, review, adjust anything that looks too strict or too loose, and you have a reusable quality gate for that dataset. - -```bash -# Now validate anytime with those rules -datacheck validate --config training_checks.yaml -``` - -This is especially useful when you receive a new dataset from another team. Instead of manually exploring and writing checks, generate them in one command and review. - -### Scenario: Profile Before Analysis - -```bash -datacheck profile training_data.csv --format json -o profile.json -``` - -The profile gives you: -- Row and column counts -- Null percentage per column -- Unique value counts -- Quality score (0-100) per column and overall -- Outlier detection (Z-score or IQR) -- Automatic rule suggestions - ### Scenario: Validate in a Jupyter Notebook Use the Python API directly: @@ -806,7 +741,6 @@ config = ValidationConfig( checks=[ RuleConfig(name="age_valid", column="age", rules={"not_null": True, "min": 0, "max": 120}), RuleConfig(name="target_present", column="churn", rules={"not_null": True, "allowed_values": [0, 1]}), - RuleConfig(name="email_format", column="email", rules={"email_valid": True}), RuleConfig(name="signup_date", column="signup_date", rules={"no_future_timestamps": True}), ] ) @@ -864,15 +798,7 @@ DataCheck is a single `pip install` with zero infrastructure. No servers, no dat pip install datacheck-cli ``` -The fastest way to start — point DataCheck at your data and let it generate the rules: - -```bash -datacheck config generate ./exports/weekly_orders.csv -``` - -DataCheck analyzes the file, detects column types, null patterns, value ranges, and string formats, and writes a complete `datacheck.yaml` config. Review it, remove anything unnecessary, and you're ready to validate. No YAML to write from scratch. 
- -Or start from a template if you don't have data yet: +Start from a template or write rules by hand: ```bash datacheck config init --template basic --with-sample-data ``` @@ -900,12 +826,6 @@ checks: min: 0 type: numeric - - name: customer_email - column: email - rules: - email_valid: true - severity: warning - - name: date_sane column: order_date rules: @@ -961,28 +881,13 @@ Add extras for your data sources: pip install datacheck-cli[postgresql] # PostgreSQL pip install datacheck-cli[snowflake] # Snowflake pip install datacheck-cli[bigquery] # BigQuery -pip install datacheck-cli[cloud] # S3, GCS, Azure +pip install datacheck-cli[cloud] # S3 pip install datacheck-cli[all] # Everything ``` -### Step 2: Generate a Config - -If you already have data, let DataCheck write the rules for you: - -```bash -datacheck config generate your_data.csv -``` - -DataCheck analyzes column types, null patterns, value ranges, uniqueness, and string formats, then writes a complete `datacheck.yaml` with appropriate rules. Review and adjust as needed. - -Control rule aggressiveness with `--confidence`: - -```bash -datacheck config generate your_data.csv --confidence high # Strict, fewer rules -datacheck config generate your_data.csv --confidence low # Broad, more rules -``` +### Step 2: Create a Config -Or start from a domain template (includes sample data to test with): +Start from a domain template (includes sample data to test with): ```bash datacheck config init --with-sample-data datacheck config init --template ecommerce --with-sample-data ``` @@ -995,26 +900,21 @@ datacheck validate ``` -### Step 4: Profile Your Data - -```bash -datacheck profile your_data.csv -``` - -### Step 5: Track Schema Changes +### Step 4: Track Schema Changes ```bash datacheck schema capture # First time: saves baseline datacheck schema compare # Every run after: compares against baseline ``` -### Step 6: Add to Your Pipeline +### Step 5: Add to Your Pipeline -**CI/CD (one line):** +**CI/CD (GitHub Actions native action):** ```yaml -# GitHub Actions -- run: pip install datacheck-cli && datacheck validate +- uses: squrtech/datacheck-action@v1 + with: + config: .datacheck.yaml ``` **Airflow (two operators):** @@ -1063,28 +963,24 @@ if not summary.all_passed: | Category | Rules | |----------|-------| | Null & Uniqueness | `not_null`, `unique`, `unique_combination` | -| Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` | +| Numeric | `min`, `max`, `range`, `positive`, `non_negative` | +| Boolean | `boolean` | | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` | -| Temporal | `max_age`, `timestamp_range`, `no_future_timestamps`, `date_format_valid`, `business_days_only` | -| Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` | +| Temporal | `max_age`, `timestamp_range`, `no_future_timestamps`, `date_format_valid` | | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` | -| Custom | Any Python function via `@custom_rule` decorator | ### Data Sources Supported | Source | Install Extra | |--------|---------------| -| CSV, Parquet, SQLite | Built-in (no extras) | +| CSV, Parquet | Built-in (no extras) | | PostgreSQL | `datacheck-cli[postgresql]` | | MySQL | `datacheck-cli[mysql]` | | SQL Server | `datacheck-cli[mssql]` | | Snowflake | `datacheck-cli[snowflake]` | | BigQuery | `datacheck-cli[bigquery]` | | Redshift | `datacheck-cli[redshift]` | -| S3, GCS, Azure | `datacheck-cli[cloud]` | -| Delta Lake |
`datacheck-cli[deltalake]` | -| Avro | `datacheck-cli[avro]` | -| DuckDB | `datacheck-cli[duckdb]` | +| S3 | `datacheck-cli[cloud]` | ### Exit Codes diff --git a/guides/python-api.md b/guides/python-api.md index dee94ce..20b0c24 100644 --- a/guides/python-api.md +++ b/guides/python-api.md @@ -1,6 +1,6 @@ # DataCheck Python API Guide -This guide covers the full Python API for DataCheck. Use it to embed data validation, profiling, and schema detection into your pipelines, notebooks, and applications. +This guide covers the full Python API for DataCheck. Use it to embed data validation and schema detection into your pipelines, notebooks, and applications. For CLI usage, see the [CLI Guide](cli-guide.md). For a quick overview, see the [README](../README.md). @@ -27,22 +27,12 @@ For CLI usage, see the [CLI Guide](cli-guide.md). For a quick overview, see the - [LoaderFactory](#loaderfactory) - [CSV Loader](#csv-loader) - [Parquet Loader](#parquet-loader) - - [Delta Lake Loader](#delta-lake-loader) - - [Avro Loader](#avro-loader) - [Database Loader](#database-loader) -- [Data Profiling](#data-profiling) - - [DataProfiler](#dataprofiler) - - [DatasetProfile](#datasetprofile) - - [ColumnProfile](#columnprofile) - [Schema Evolution](#schema-evolution) - [SchemaDetector](#schemadetector) - [SchemaComparator](#schemacomparator) - [BaselineManager](#baselinemanager) - [Schema Models](#schema-models) -- [Custom Rules](#custom-rules) - - [The @custom_rule Decorator](#the-custom_rule-decorator) - - [Plugin Loader](#plugin-loader) - - [Rule Registry](#rule-registry) - [Validation API (Multi-Column Rules)](#validation-api-multi-column-rules) - [Cross-Column Rules](#cross-column-rules) - [All Available Validation Rules](#all-available-validation-rules) @@ -53,9 +43,6 @@ For CLI usage, see the [CLI Guide](cli-guide.md). For a quick overview, see the - [Snowflake](#snowflake) - [BigQuery](#bigquery) - [Redshift](#redshift) -- [Sampling](#sampling) - - [DataSampler](#datasampler) - - [Advanced Samplers](#advanced-samplers) - [Notifications](#notifications) - [SlackNotifier](#slacknotifier) - [Airflow Integration](#airflow-integration) @@ -144,28 +131,6 @@ Load a file and validate it against configured rules. 
The data source path overr ```python summary = engine.validate_file("data.csv") summary = engine.validate_file("data.parquet") -summary = engine.validate_file("data.avro") -``` - -With sampling: - -```python -summary = engine.validate_file( - "data.csv", - sample_rate=0.1, # 10% random sample - seed=42 # Reproducible -) - -summary = engine.validate_file( - "data.csv", - sample_count=5000, # Exactly 5000 rows - stratify="region" # Stratified by region -) - -summary = engine.validate_file( - "data.csv", - top=1000 # First 1000 rows only -) ``` ### Validate a DataFrame @@ -192,7 +157,6 @@ ValidationEngine.validate_sources( table: str | None = None, where: str | None = None, query: str | None = None, - sample_rate: float | None = None ) -> ValidationSummary ``` @@ -391,7 +355,7 @@ data_source: **ValidationConfig:** ```python -from datacheck.config import ValidationConfig, RuleConfig, SamplingConfig +from datacheck.config import ValidationConfig, RuleConfig config = ValidationConfig( checks=[ @@ -406,8 +370,6 @@ config = ValidationConfig( rules={"min": 0, "max": 10000} ) ], - plugins=["./custom_rules.py"], - sampling=SamplingConfig(method="random", rate=0.1, seed=42), sources_file="sources.yaml", source="production_db", table="orders" @@ -426,16 +388,6 @@ engine = ValidationEngine(config=config) | `source` | `str \| None` | Named source override | | `table` | `str \| None` | Table override | -**SamplingConfig:** - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `method` | `str` | `"none"` | `none`, `random`, `stratified`, `top`, `systematic` | -| `rate` | `float \| None` | `None` | Fraction (0.0–1.0) | -| `count` | `int \| None` | `None` | Row count | -| `stratify_by` | `str \| None` | `None` | Column for stratified sampling | -| `seed` | `int \| None` | `None` | Random seed | - --- ## Data Loading @@ -449,8 +401,6 @@ from datacheck.loader import LoaderFactory df = LoaderFactory.load("data.csv") df = LoaderFactory.load("data.parquet") -df = LoaderFactory.load("data.avro") -df = LoaderFactory.load("analytics.duckdb", table_name="orders") ``` Create a specific loader instance: @@ -478,43 +428,6 @@ loader = ParquetLoader("data.parquet") df = loader.load() ``` -### Delta Lake Loader - -```python -from datacheck.loader import DeltaLakeLoader - -loader = DeltaLakeLoader( - "s3://bucket/delta-table", - version=10, # Time travel by version - timestamp="2024-06-15T12:00:00", # Or by timestamp - columns=["id", "amount", "email"], # Column selection - storage_options={ - "AWS_ACCESS_KEY_ID": "...", - "AWS_SECRET_ACCESS_KEY": "..." - } -) -df = loader.load() - -# Get table metadata -metadata = loader.load_metadata() -# Returns: {version, file_uris, schema, metadata, protocol} - -# Get version history -history = loader.history(limit=10) -``` - -### Avro Loader - -```python -from datacheck.loader import AvroLoader - -loader = AvroLoader("data.avro", reader_schema=None) -df = loader.load() - -schema = loader.load_schema() -is_valid = loader.validate_schema(expected_schema) -``` - ### Database Loader ```python @@ -531,129 +444,6 @@ df = loader.load() --- -## Data Profiling - -### DataProfiler - -Generate quality profiles with statistics, outlier detection, and rule suggestions. 
- -```python -from datacheck.profiling import DataProfiler - -profiler = DataProfiler( - outlier_method="zscore", # "zscore" or "iqr" - outlier_threshold=3.0, # For Z-score method - iqr_multiplier=1.5 # For IQR method -) - -profile = profiler.profile(df, name="customer_data") - -print(f"Rows: {profile.row_count}") -print(f"Columns: {profile.column_count}") -print(f"Memory: {profile.memory_usage_mb:.1f} MB") -print(f"Quality score: {profile.overall_quality_score}/100") -print(f"Completeness: {profile.completeness_percentage:.1f}%") -print(f"Duplicates: {profile.total_duplicates}") -``` - -### DatasetProfile - -Returned by `profiler.profile()`. - -| Attribute | Type | Description | -|-----------|------|-------------| -| `name` | `str` | Dataset name | -| `row_count` | `int` | Number of rows | -| `column_count` | `int` | Number of columns | -| `created_at` | `datetime` | Profile creation time | -| `columns` | `dict[str, ColumnProfile]` | Per-column profiles | -| `overall_quality_score` | `float` | 0–100 | -| `correlations` | `dict[str, dict[str, float]]` | Numeric column correlations | -| `total_nulls` | `int` | Total null cells across all columns | -| `total_duplicates` | `int` | Duplicate rows | -| `completeness_percentage` | `float` | Overall data completeness | -| `memory_usage_mb` | `float` | Memory usage | -| `cross_column_rules` | `list[dict]` | Auto-detected cross-column rules (`sum_equals`, `unique_combination`) | - -**Properties:** - -```python -profile.column_names -> list[str] -``` - -**Methods:** - -```python -profile.to_dict() -> dict[str, Any] -``` - -### ColumnProfile - -Per-column statistics within a `DatasetProfile`. - -| Attribute | Type | Description | -|-----------|------|-------------| -| `name` | `str` | Column name | -| `dtype` | `str` | Raw pandas data type | -| `inferred_type` | `str` | Inferred type: `integer`, `numeric`, `categorical`, `boolean`, or `datetime` | -| `column_type` | `str` | Display-friendly type (same as `inferred_type`) | -| `total_count` | `int` | Row count | -| `null_count` | `int` | Null values | -| `null_percentage` | `float` | Null percentage | -| `unique_count` | `int` | Unique values | -| `unique_percentage` | `float` | Uniqueness percentage | -| `completeness` | `float` | Non-null percentage | -| `quality_score` | `float` | 0–100 quality score | - -Numeric columns also have: - -| Attribute | Type | Description | -|-----------|------|-------------| -| `min_value` | `float` | Minimum | -| `max_value` | `float` | Maximum | -| `mean` | `float` | Mean | -| `median` | `float` | Median | -| `std_dev` | `float` | Standard deviation | -| `percentile_25` | `float` | 25th percentile | -| `percentile_75` | `float` | 75th percentile | -| `outlier_count` | `int` | Detected outliers | -| `outlier_percentage` | `float` | Outlier percentage | - -Datetime columns also have: - -| Attribute | Type | Description | -|-----------|------|-------------| -| `min_date` | `str` | Earliest date | -| `max_date` | `str` | Latest date | - -All columns have: - -| Attribute | Type | Description | -|-----------|------|-------------| -| `top_values` | `list[tuple]` | Most frequent values and counts | -| `issues` | `list[str]` | Detected quality issues | -| `suggestions` | `list[dict]` | Suggested validation rules | - -**Example:** - -```python -for col_name, col in profile.columns.items(): - print(f"\n{col_name} ({col.inferred_type}, {col.dtype})") - print(f" Nulls: {col.null_count} ({col.null_percentage:.1f}%)") - print(f" Unique: {col.unique_count}") - print(f" Quality: 
{col.quality_score}/100") - - if col.issues: - for issue in col.issues: - print(f" Issue: {issue}") - - if col.suggestions: - for suggestion in col.suggestions: - print(f" Suggestion: {suggestion['rule']}") -``` - ---- - ## Schema Evolution Capture schema baselines, compare against them, and detect breaking changes. @@ -779,103 +569,6 @@ col = ColumnSchema( --- -## Custom Rules - -### The @custom_rule Decorator - -Create custom validation functions. The function receives a `pd.Series` (the column data) and returns a `pd.Series` of booleans (`True` = valid). - -```python -from datacheck.plugins.decorators import custom_rule -import pandas as pd - -@custom_rule -def is_business_email(column: pd.Series, allowed_domains: list[str]) -> pd.Series: - domains = column.dropna().str.split("@").str[1] - return domains.isin(allowed_domains) - -@custom_rule -def positive_and_even(column: pd.Series) -> pd.Series: - return (column > 0) & (column % 2 == 0) - -@custom_rule -def is_valid_age(column: pd.Series, min_age: int = 0, max_age: int = 150) -> pd.Series: - return (column >= min_age) & (column <= max_age) -``` - -Reference in your YAML config: - -```yaml -plugins: - - ./custom_rules.py - -checks: - - name: email_check - column: email - rules: - custom: - rule: is_business_email - params: - allowed_domains: ["company.com", "partner.org"] - - - name: age_check - column: age - rules: - custom: - rule: is_valid_age - params: - min_age: 18 - max_age: 120 -``` - -### Plugin Loader - -Load custom rule files programmatically. - -```python -from datacheck.plugins.loader import PluginLoader - -loader = PluginLoader() - -# Load from a file -loaded = loader.load_from_file("my_rules.py") -# Returns: list of rule names loaded - -# Load all rules from a directory -loaded = loader.load_from_directory("rules/") -``` - -### Rule Registry - -Register and manage rules programmatically without files. - -```python -from datacheck.plugins.registry import get_global_registry - -registry = get_global_registry() - -# Register -registry.register("my_rule", my_rule_function) - -# Check -registry.has_rule("my_rule") - -# List all -registry.list_rules() - -# Execute -result = registry.execute_rule( - "my_rule", - df["column"], - params={"threshold": 100} -) - -# Clear all -registry.clear() -``` - ---- - ## Validation API (Multi-Column Rules) The `datacheck.validation` module provides a higher-level API that wraps engine rules with multi-column support and severity levels. Use this for programmatic validation in scripts and notebooks. 
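A minimal sketch of how these rule classes are typically combined, assuming constructor keywords such as `column=`, `min_value=`, `columns=`, and `severity=` — the guide only confirms the class names listed below and the `rule.validate(df)` call pattern, so treat the exact arguments as illustrative rather than the library's documented API:

```python
# Sketch only: class names and rule.validate(df) come from this guide;
# the constructor keywords below (column=, min_value=, max_value=,
# columns=, total_column=, severity=) are assumptions for illustration.
import pandas as pd

from datacheck.validation import NotNullRule, RangeRule, SumEqualsRule

df = pd.DataFrame(
    {
        "order_id": [1, 2, None],
        "amount": [100.0, 250.0, -5.0],
        "tax": [10.0, 25.0, 0.5],
        "total": [110.0, 275.0, -4.5],
    }
)

rules = [
    NotNullRule(column="order_id", severity="error"),                # assumed kwargs
    RangeRule(column="amount", min_value=0, max_value=10_000),       # assumed kwargs
    SumEqualsRule(columns=["amount", "tax"], total_column="total"),  # assumed kwargs
]

for rule in rules:
    results = rule.validate(df)  # documented call pattern
    print(type(rule).__name__, results)
```

If the real signatures differ, the pattern stays the same: construct one rule object per check, call `validate(df)`, and inspect the returned results.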
@@ -971,13 +664,11 @@ results = rule.validate(df) | Category | Rule Class | Engine Rule | |----------|-----------|-------------| | Null & Uniqueness | `NotNullRule`, `UniqueRule` | `not_null`, `unique` | -| Numeric | `RangeRule`, `MeanBetweenRule`, `StdDevLessThanRule`, `PercentileRangeRule`, `ZScoreOutliersRule`, `DistributionTypeRule` | `min`/`max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` | +| Numeric | `RangeRule`, `BooleanRule` | `min`/`max`, `range`, `boolean` | | String & Pattern | `RegexRule`, `EnumRule`, `LengthRule` | `regex`, `allowed_values`, `length` | | Type | `TypeRule` | `type` | -| Temporal | `MaxAgeRule`, `TimestampRangeRule`, `NoFutureTimestampsRule`, `DateFormatValidRule`, `BusinessDaysOnlyRule` | `max_age`, `timestamp_range`, `no_future_timestamps`, `date_format_valid`, `business_days_only` | -| Semantic | `EmailValidRule`, `PhoneValidRule`, `UrlValidRule`, `JsonValidRule` | `email_valid`, `phone_valid`, `url_valid`, `json_valid` | +| Temporal | `MaxAgeRule`, `TimestampRangeRule`, `NoFutureTimestampsRule`, `DateFormatValidRule` | `max_age`, `timestamp_range`, `no_future_timestamps`, `date_format_valid` | | Cross-Column | `ForeignKeyExistsRule`, `SumEqualsRule`, `UniqueCombinationRule` | `foreign_key_exists`, `sum_equals`, `unique_combination` | -| Custom | `CustomRule` | `custom` | --- @@ -1074,78 +765,6 @@ with connector: --- -## Sampling - -### DataSampler - -Simple sampling methods. - -```python -from datacheck.sampling import DataSampler - -# Random -sample = DataSampler.random_sample(df, rate=0.1, seed=42) -sample = DataSampler.random_sample(df, count=5000, seed=42) - -# Stratified -sample = DataSampler.stratified_sample(df, column="region", count=100, seed=42) - -# Top N -sample = DataSampler.top_n(df, n=1000) - -# Systematic (every Nth row) -sample = DataSampler.systematic_sample(df, interval=10, start=0) -``` - -### Advanced Samplers - -```python -from datacheck.sampling import ( - SamplerFactory, - SamplingStrategy, - RandomSampler, - StratifiedSampler, - TimeBasedSampler, - ErrorFocusedSampler, - AdaptiveSampler, - ReservoirSampler -) - -# Create by strategy enum -sampler = SamplerFactory.create(SamplingStrategy.RANDOM) - -# Time-based -time_sampler = TimeBasedSampler() -sample = time_sampler.sample( - df, - time_column="created_at", - start_date="2024-01-01", - end_date="2024-06-30" -) - -# Error-focused (oversamples likely failures) -error_sampler = ErrorFocusedSampler() -sample = error_sampler.sample( - df, - n=1000, - error_indicators=["age < 0", "price > 100000"] -) - -# Stratified with proportional allocation -strat_sampler = StratifiedSampler() -sample = strat_sampler.sample_proportional( - df, - stratify_column="region", - total_sample_size=1000 -) - -# Reservoir (fixed memory, streaming-friendly) -reservoir_sampler = ReservoirSampler() -sample = reservoir_sampler.sample(df, k=5000) -``` - ---- - ## Notifications ### SlackNotifier @@ -1215,17 +834,15 @@ Runs config-based validation rules against files or database sources. Uses `Vali DataCheckOperator( task_id: str, config_path: str, # Path to DataCheck YAML config (required) - file_path: str | None = None, # Data file path (CSV, Parquet, Avro, Delta, etc.) + file_path: str | None = None, # Data file path (CSV, Parquet, etc.) 
sources_file: str | None = None, # Path to sources.yaml source_name: str | None = None, # Named source from sources.yaml table: str | None = None, # Database table name where: str | None = None, # SQL WHERE clause query: str | None = None, # Custom SQL query (alternative to table) - sample_rate: float | None = None, # Random sample fraction (0.0-1.0) parallel: bool = False, # Enable multi-core validation workers: int | None = None, # Worker processes (default: CPU count) min_pass_rate: float = 0.0, # Minimum pass rate threshold (0-100) - min_quality_score: float = 0.0, # Minimum quality score threshold (0-100) fail_on_error: bool = True, # Raise AirflowException on failure push_results: bool = True, # Push results to XCom ) @@ -1241,8 +858,8 @@ DataCheckOperator( | Mode | Condition | Behavior | |------|-----------|----------| -| Strict (default) | `min_pass_rate` and `min_quality_score` are both `0` | Fails if **any** error-severity rule fails | -| Threshold | Either threshold is set `> 0` | Fails only if pass rate drops below the threshold | +| Strict (default) | `min_pass_rate` is `0` | Fails if **any** error-severity rule fails | +| Threshold | `min_pass_rate` is set `> 0` | Fails only if pass rate drops below the threshold | **Validate a file:** @@ -1278,14 +895,13 @@ validate_lenient = DataCheckOperator( ) ``` -**With sampling and parallel execution:** +**With parallel execution:** ```python validate_large = DataCheckOperator( task_id="validate_large_dataset", config_path="/opt/airflow/config/checks.yaml", file_path="/data/events.parquet", - sample_rate=0.1, parallel=True, workers=4, ) @@ -1319,7 +935,7 @@ validate = DataCheckOperator( ### DataCheckSchemaOperator -Detects schema changes by comparing current data against a saved baseline. If no baseline exists, it captures one automatically. Uses DataCheck's `SchemaDetector`, `SchemaComparator`, and `BaselineManager`. +Enforces schema contracts by comparing current data against a saved baseline - fails if breaking changes are detected. If no baseline exists, it captures one automatically. Uses DataCheck's `SchemaDetector`, `SchemaComparator`, and `BaselineManager`. 
```python DataCheckSchemaOperator( @@ -1387,8 +1003,8 @@ schema_check = DataCheckSchemaOperator( **Allow breaking changes (don't fail the task):** ```python -schema_monitor = DataCheckSchemaOperator( - task_id="schema_monitor", +schema_check = DataCheckSchemaOperator( + task_id="schema_check", file_path="/data/events.csv", baseline_name="events", fail_on_breaking=False, @@ -1477,7 +1093,7 @@ with DAG( fail_on_breaking=True, ) - # Step 2: Validate data quality + # Step 2: Enforce validation rules validate = DataCheckOperator( task_id="validate_orders", config_path="/opt/airflow/config/checks.yaml", @@ -1515,7 +1131,6 @@ with DAG( table="orders", where="updated_at >= '{{ data_interval_start }}'", min_pass_rate=95.0, - sample_rate=0.1, ) validate_customers = DataCheckOperator( @@ -1530,7 +1145,7 @@ with DAG( [validate_orders, validate_customers] ``` -**Schema evolution monitoring:** +**Schema evolution enforcement:** ```python from airflow import DAG @@ -1552,7 +1167,7 @@ def log_schema_changes(**context): print("No schema changes") with DAG( - "schema_monitoring", + "schema_enforcement", schedule_interval="@daily", start_date=days_ago(1), catchup=False, @@ -1632,7 +1247,7 @@ from datacheck.exceptions import ( ConfigurationError, # Invalid config, missing file, bad rule definitions ValidationError, # Validation execution errors DataLoadError, # File not found, connection failure, unsupported format - RuleDefinitionError, # Invalid rule config, bad custom rule signature + RuleDefinitionError, # Invalid rule config or parameters UnsupportedFormatError, # Unsupported file type ColumnNotFoundError, # Column missing from DataFrame EmptyDatasetError, # Dataset has no rows @@ -1655,11 +1270,10 @@ except ColumnNotFoundError as e: ## Complete Example -An end-to-end pipeline: profile data, capture schema, validate, check for schema drift, and notify on failure. +An end-to-end pipeline: capture schema, validate, check for schema drift, and notify on failure. ```python from datacheck.engine import ValidationEngine -from datacheck.profiling import DataProfiler from datacheck.schema import SchemaDetector, SchemaComparator, BaselineManager from datacheck.loader import LoaderFactory from datacheck.notifications.slack import SlackNotifier @@ -1667,17 +1281,7 @@ from datacheck.notifications.slack import SlackNotifier # 1. Load data df = LoaderFactory.load("data.csv") -# 2. Profile -profiler = DataProfiler() -profile = profiler.profile(df, name="daily_orders") -print(f"Quality score: {profile.overall_quality_score}/100") -print(f"Completeness: {profile.completeness_percentage:.1f}%") - -for col_name, col in profile.columns.items(): - if col.issues: - print(f" {col_name}: {', '.join(col.issues)}") - -# 3. Schema evolution check +# 2. Schema evolution check detector = SchemaDetector() current_schema = detector.detect(df, name="orders", source="data.csv") @@ -1695,7 +1299,7 @@ else: manager.save_baseline(current_schema, name="orders") print("Schema baseline saved") -# 4. Validate +# 3. Validate notifier = SlackNotifier("https://hooks.slack.com/services/...") engine = ValidationEngine( config_path=".datacheck.yaml", @@ -1706,7 +1310,7 @@ engine = ValidationEngine( summary = engine.validate_dataframe(df) -# 5. Report +# 4. 
Report print(f"\nResults: {summary.passed_rules}/{summary.total_rules} passed") if not summary.all_passed: @@ -1715,7 +1319,7 @@ if not summary.all_passed: print(f" FAIL: {result.rule_name} on {result.column}") print(f" {detail.failed_count}/{detail.total_count} rows ({detail.failure_rate:.1f}%)") -# 6. Exit with appropriate code +# 5. Exit with appropriate code import sys sys.exit(0 if summary.all_passed else 1) ``` diff --git a/poetry.lock b/poetry.lock index f95ddb4..c8eaf3f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,84 +1,5 @@ # This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. -[[package]] -name = "arro3-core" -version = "0.6.5" -description = "" -optional = false -python-versions = ">=3.9" -groups = ["main", "dev"] -files = [ - {file = "arro3_core-0.6.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:da193dc2fb8c2005d0b3887b09d1a90d42cec1f59f17a8a1a5791f0de90946ae"}, - {file = "arro3_core-0.6.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed1a760ec39fe19c65e98f45515582408002d0212df5db227a5959ffeb07ad4a"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6584a3d28007740afcef1e301332876e2b785bd8edd59a458a6bc9b051bce052"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8e0af4789618f02bead4a0cd4d0a54abd9c8aa4fcedf9872b4891d2e3e984161"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c73f212e549e9b6d11cfe3f14bbf3fba9d0891426afb5916688d16d0df724085"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f88f62e4e276a9e84f250722d2e5ffc078af9a3f67ac691f572a0e05dd6095"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:b2635e4c227f25ff8784dc8efb38cb7c1674646cfdc68ded53f2426289885f0e"}, - {file = "arro3_core-0.6.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a5f3e936686bcd8542fafc94c68fdb23ec42d1d51a4777967ae815c90aff7296"}, - {file = "arro3_core-0.6.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:705c32fec03dadc08f807d69ce557882005d43eb20ec62699f7036340f0d580f"}, - {file = "arro3_core-0.6.5-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:56d8166235a4c54e4f7ba082ec76890c820fa8c1b6c995ec59cead62a9698e59"}, - {file = "arro3_core-0.6.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1ba43ba9081c00767083195222b6be74913de668296f55599658c4b0bb7cd327"}, - {file = "arro3_core-0.6.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4f5df13c6742e3f0b494cfe9025dccdc8426a74cc9e3e5a1239311e07a4b24e0"}, - {file = "arro3_core-0.6.5-cp310-cp310-win_amd64.whl", hash = "sha256:34676b728178236df63c9ea10b21432392d4b5bb51e2030e77c68eed4dede2ad"}, - {file = "arro3_core-0.6.5-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9d5999506daec1ab31096b3deb1e3573041d6ecadb4ca99c96f7ab26720c592c"}, - {file = "arro3_core-0.6.5-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:bd3e251184c2dd6ade81c5613256b6d85ab3ddbd5af838b1de657e0ddec017f8"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7cadb29349960d3821b0515d9df80f2725cea155ad966c699f6084de32e313cb"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a922e560ed2ccee3293d51b39e013b51cc233895d25ddafcacfb83c540a19e6f"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash 
= "sha256:68fe6672bf51f039b12046a209cba0a9405e10ae44e5a0d557f091b356a62051"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c3ee95603e375401a58ff763ce2c8aa858e0c4f757c1fb719f48fb070f540b2"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:fbaf6b65213630007b798b565e0701c2092a330deeba16bd3d896d401f7e9f28"}, - {file = "arro3_core-0.6.5-cp311-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:20679f874558bb2113e96325522625ec64a72687000b7a9578031a4d082c6ef5"}, - {file = "arro3_core-0.6.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d82d6ec32d5c7c73057fb9c528390289fd5bc94b8d8f28fca9c56fc8e41c412c"}, - {file = "arro3_core-0.6.5-cp311-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:4cba4db0a4203a3ccf131c3fb7804d77f0740d6165ec9efa3aa3acbca87c43a3"}, - {file = "arro3_core-0.6.5-cp311-abi3-musllinux_1_2_i686.whl", hash = "sha256:e358affc4a0fe5c1b5dccf4f92c43a836aaa4c4eab0906c83b00b60275de3b6d"}, - {file = "arro3_core-0.6.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:324e43f07b7681846d00a8995b78bdc4b4a719047aa0d34426b462b8f208ee98"}, - {file = "arro3_core-0.6.5-cp311-abi3-win_amd64.whl", hash = "sha256:285f802c8a42fe29ecb84584d1700bc4c4f974552b75f805e1f4362d28b97080"}, - {file = "arro3_core-0.6.5-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:8c20e69c3b3411fd6ed56091f388e699072651e880e682be5bd14f3a392ed3e8"}, - {file = "arro3_core-0.6.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:92211f1d03221ff74d0b535a576b39601083d8e98e9d47228314573f9d4f9ae2"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:280d933b75f2649779d76e32a07f91d2352a952f2c97ddf7b320e267f440cd42"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfc3f6b93b924f43fb7985b06202343c30b43da6bd5055ba8b84eda431e494d4"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5963635eb698ebc7da689e641f68b3998864bab894cf0ca84bd058b8c60d97f"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac291b3e74b57e56e03373d57530540cbbbfd92e4219fe2778ea531006673fe9"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_24_aarch64.whl", hash = "sha256:5d3f4cc58a654037d61f61ba230419da2c8f88a0ac82b9d41fe307f7cf9fda97"}, - {file = "arro3_core-0.6.5-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:93cddac90238d64451f5e66c630ded89d0b5fd6d2c099bf3a5151dde2c1ddf1d"}, - {file = "arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1fa7ac10db5846c33f4e8b66a6eaa705d84998e38575a835acac9a6a6649933d"}, - {file = "arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:ca69f698a065cdbf845d59d412bc204e8f8af12f93737d82e6a18f3cff812349"}, - {file = "arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:de74a2512e2e2366d4b064c498c38672bf6ddea38acec8b1999b4e66182dd001"}, - {file = "arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:806ca8e20507675b2de68b3d009f76e898cc3c3e441c834ea5220866f68aac50"}, - {file = "arro3_core-0.6.5-cp313-cp313t-win_amd64.whl", hash = "sha256:8f6f0cc78877ade7ad6e678a4671b191406547e7b407bc9637436869c017ed47"}, - {file = "arro3_core-0.6.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:dfac7fac3c6a302399d94644d48682a19488a5b67bd1ccbdf6c560a7ffabde6d"}, - {file = 
"arro3_core-0.6.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fc70042e558d1cd5fbe917b58e8ef52701441e38ff30b1912858050f796a62c"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1583b29b2ba83927a33e5435e5d9d134114c45a6360a8bb4db4beda13dab4fd8"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a170fe53f18dda4a4647fd3b8b4a9373fc11ac42c41a4b65f55d79ad531a33e"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83047b4e6e18835c91c8d12c5494e6ababc7c185c5a772d3429e8f9b0c185894"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3d4393d281d1ef18927915a11187da27287d279f99d5325bc9afb417f76084f"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:f0c88d8babcf51affdd69390882e2f0ecb1890a1b8a5abfc087d003e7181eb6e"}, - {file = "arro3_core-0.6.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36424e1d62212466a5cacdc27d414e99bf0fdab1544cc2b7e5b81e41437e5970"}, - {file = "arro3_core-0.6.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4eb4d96f7db618f100758a8b7ec1b221c8737d543073701b7ffee74bc5019d46"}, - {file = "arro3_core-0.6.5-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2cfe9b4b1dd663d256754f1aa7aae783a1cddd3eb5698892b9caf381431f0af7"}, - {file = "arro3_core-0.6.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a3b2621505f97eb5ce80f1c6fa8c77d18d757ab48d1f11d33a805e9ccbcd6fb6"}, - {file = "arro3_core-0.6.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c1becbb96ceba0b20f3d4318dd35f3417ee9a49065813d99f52b0fa285fc569"}, - {file = "arro3_core-0.6.5-cp39-cp39-win_amd64.whl", hash = "sha256:5459e7bd39bb9dd8c57aa06856d2bebc5c1ca782cbccab0e186c6c89530e4ca9"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:26d5b50139f1a96727fa1760b4d70393acf5ee0fba45346ad2d4f69824d3bdc2"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b65b3d8d7f65f2f3c36002dc467380d7a31ea771132986dddc6341c5a9dc726f"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c3442a79a757ed3fbd7793de180019ae3201f04237537c2e2e3f1e3dd99b31c"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:def7b0065a684d6f903a658d2567da47e2fcecde716e0b34eff4d899c6468c8d"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbfe2f2d4d0d393833cd6a4bd9c15266a02307a3028f159155a1c536469c3ae7"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a191a3e4f72c34f7ace7724a94f2d90b06c804a6cbece4ae0f18d36325479cf3"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_24_aarch64.whl", hash = "sha256:e3f6ab4c6ea96c451eff72aa6c5b9835a0ea8a9847cfe3995c88cce0c7701fb5"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:27df5239835330299636a02977f2cb34d5c460cc03b2ae1d6ab6a03d28051b08"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:71dce89c0e91be4cfb42591f03809235bbc374c396e08acdf93c4d85b09e40f5"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:d380c28f85568ed99c1686fb9d64b5a811d76d569f367cbec8ef7e58f6e2fdf9"}, - {file = 
"arro3_core-0.6.5-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:8e359c0c4fe9992f5a863a4a31502ea58eb2f92988fc2e501850540b3eff0328"}, - {file = "arro3_core-0.6.5-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9a58acbc61480b533aa84d735db04b1e68fc7f6807ab694d606c03b5e694d83d"}, - {file = "arro3_core-0.6.5.tar.gz", hash = "sha256:768078887cd7ac82de4736f94bbd91f6d660f10779848bd5b019f511badd9d75"}, -] -markers = {main = "extra == \"deltalake\" or extra == \"formats\" or extra == \"all\""} - -[package.dependencies] -typing-extensions = {version = "*", markers = "python_full_version < \"3.12.0\""} - [[package]] name = "asn1crypto" version = "1.5.1" @@ -90,7 +11,7 @@ files = [ {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [[package]] name = "attrs" @@ -105,49 +26,6 @@ files = [ ] markers = {main = "extra == \"validation\" or extra == \"all\""} -[[package]] -name = "azure-core" -version = "1.38.0" -description = "Microsoft Azure Core Library for Python" -optional = false -python-versions = ">=3.9" -groups = ["main", "dev"] -files = [ - {file = "azure_core-1.38.0-py3-none-any.whl", hash = "sha256:ab0c9b2cd71fecb1842d52c965c95285d3cfb38902f6766e4a471f1cd8905335"}, - {file = "azure_core-1.38.0.tar.gz", hash = "sha256:8194d2682245a3e4e3151a667c686464c3786fed7918b394d035bdcd61bb5993"}, -] -markers = {main = "extra == \"azure\" or extra == \"cloud\" or extra == \"all\""} - -[package.dependencies] -requests = ">=2.21.0" -typing-extensions = ">=4.6.0" - -[package.extras] -aio = ["aiohttp (>=3.0)"] -tracing = ["opentelemetry-api (>=1.26,<2.0)"] - -[[package]] -name = "azure-storage-blob" -version = "12.28.0" -description = "Microsoft Azure Blob Storage Client Library for Python" -optional = false -python-versions = ">=3.9" -groups = ["main", "dev"] -files = [ - {file = "azure_storage_blob-12.28.0-py3-none-any.whl", hash = "sha256:00fb1db28bf6a7b7ecaa48e3b1d5c83bfadacc5a678b77826081304bd87d6461"}, - {file = "azure_storage_blob-12.28.0.tar.gz", hash = "sha256:e7d98ea108258d29aa0efbfd591b2e2075fa1722a2fae8699f0b3c9de11eff41"}, -] -markers = {main = "extra == \"azure\" or extra == \"cloud\" or extra == \"all\""} - -[package.dependencies] -azure-core = ">=1.30.0" -cryptography = ">=2.1.4" -isodate = ">=0.6.1" -typing-extensions = ">=4.6.0" - -[package.extras] -aio = ["azure-core[aio] (>=1.30.0)"] - [[package]] name = "boto3" version = "1.42.30" @@ -159,7 +37,7 @@ files = [ {file = "boto3-1.42.30-py3-none-any.whl", hash = "sha256:d7e548bea65e0ae2c465c77de937bc686b591aee6a352d5a19a16bc751e591c1"}, {file = "boto3-1.42.30.tar.gz", hash = "sha256:ba9cd2f7819637d15bfbeb63af4c567fcc8a7dcd7b93dd12734ec58601169538"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"s3\" or extra == \"cloud\" 
or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\") and (python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\")"} [package.dependencies] botocore = ">=1.42.30,<1.43.0" @@ -180,7 +58,7 @@ files = [ {file = "botocore-1.42.30-py3-none-any.whl", hash = "sha256:97070a438cac92430bb7b65f8ebd7075224f4a289719da4ee293d22d1e98db02"}, {file = "botocore-1.42.30.tar.gz", hash = "sha256:9bf1662b8273d5cc3828a49f71ca85abf4e021011c1f0a71f41a2ea5769a5116"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\") and (python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\")"} [package.dependencies] jmespath = ">=0.7.1,<2.0.0" @@ -201,7 +79,7 @@ files = [ {file = "certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c"}, {file = "certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\") and (extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\")"} [[package]] name = "cffi" @@ -279,7 +157,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "python_version <= \"3.13\" and (platform_python_implementation != \"PyPy\" or extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"azure\" or extra == \"cloud\" or extra == \"all\" or extra == \"snowflake\" or extra == \"warehouses\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [package.dependencies] pycparser = "*" @@ -290,7 +168,8 @@ version = "2.0.0" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.9" -groups = ["main", "dev"] +groups = ["dev"] +markers = "python_version >= \"3.14\" and platform_python_implementation != \"PyPy\"" files = [ {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, {file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"}, @@ -377,7 +256,6 @@ files = [ {file = "cffi-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9"}, {file = "cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529"}, ] -markers = {main = "python_version >= \"3.14\" and platform_python_implementation != \"PyPy\" and (extra == \"azure\" or extra == \"cloud\" or extra == \"all\")", dev = "python_version >= \"3.14\" and platform_python_implementation != \"PyPy\""} [package.dependencies] pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} @@ -516,7 +394,7 @@ files = [ {file = "charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f"}, {file = "charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\") and (extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\")"} [[package]] name = "click" @@ -778,7 +656,7 @@ files = [ {file = "cryptography-46.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bf1961037309ee0bdf874ccba9820b1c2f720c2016895c44d8eb2316226c1ad5"}, {file = "cryptography-46.0.0.tar.gz", hash = "sha256:99f64a6d15f19f3afd78720ad2978f6d8d4c68cd4eb600fab82ab1a7c2071dca"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"azure\" or extra == \"cloud\" or extra == \"all\") and (extra == \"azure\" or extra == \"cloud\" or extra == \"all\" or extra == \"snowflake\" or extra == \"warehouses\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\""} [package.dependencies] cffi = [ @@ -797,51 +675,6 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==46.0.0)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] -[[package]] -name = "deltalake" -version = "1.4.1" -description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" -optional = false -python-versions = ">=3.10" -groups = ["main", "dev"] -files = [ - {file = "deltalake-1.4.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:dc7b6b99bd8a8d4949645b8f6073d0ced9dd06109fa9669b7802ddf8207291e1"}, - {file = "deltalake-1.4.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:0d4c460a4fab802cf051ca66a49583d93a3490842eb849bd1aae7176b12b8030"}, - {file = "deltalake-1.4.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bcd312abe5928f0da3217901431f6f537da4d51162d23cd81fc3849559c5f5cc"}, - {file = "deltalake-1.4.1-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:cc98a04918d0acd0a425ecaf33b6fbbbe458d395da31139554b97b7a62a045f9"}, - {file = "deltalake-1.4.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9a3af28f14535122395b89ff146c5acca4c00db63191c59c39a0ae30356030b6"}, - {file = "deltalake-1.4.1-cp310-abi3-win_amd64.whl", hash = "sha256:4dd4648be88375b5dadd119cd2a45f481e3df6007da6a519d49646f202b036da"}, - {file = "deltalake-1.4.1.tar.gz", hash = "sha256:0a7e7f2f0f60edab087087f0144e539428c8d5a0e6f80f86fe49db82499a50ab"}, -] -markers = {main = "extra == \"deltalake\" or extra == \"formats\" or extra == \"all\""} - -[package.dependencies] -arro3-core = ">=0.5.0" -deprecated = ">=1.2.18" - -[package.extras] -pandas = ["pandas"] -pyarrow = ["pyarrow (>=16)"] - -[[package]] -name = "deprecated" -version = "1.3.1" -description = "Python @deprecated decorator to deprecate old python classes, functions or methods." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" -groups = ["main", "dev"] -files = [ - {file = "deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f"}, - {file = "deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223"}, -] -markers = {main = "extra == \"deltalake\" or extra == \"formats\" or extra == \"all\""} - -[package.dependencies] -wrapt = ">=1.10,<3" - -[package.extras] -dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "setuptools ; python_version >= \"3.12\"", "tox"] - [[package]] name = "distlib" version = "0.4.0" @@ -854,98 +687,6 @@ files = [ {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, ] -[[package]] -name = "dnspython" -version = "2.8.0" -description = "DNS toolkit" -optional = false -python-versions = ">=3.10" -groups = ["main"] -files = [ - {file = "dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af"}, - {file = "dnspython-2.8.0.tar.gz", hash = "sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f"}, -] - -[package.extras] -dev = ["black (>=25.1.0)", "coverage (>=7.0)", "flake8 (>=7)", "hypercorn (>=0.17.0)", "mypy (>=1.17)", "pylint (>=3)", "pytest (>=8.4)", "pytest-cov (>=6.2.0)", "quart-trio (>=0.12.0)", "sphinx (>=8.2.0)", "sphinx-rtd-theme (>=3.0.0)", "twine (>=6.1.0)", "wheel (>=0.45.0)"] -dnssec = ["cryptography (>=45)"] -doh = ["h2 (>=4.2.0)", "httpcore (>=1.0.0)", "httpx (>=0.28.0)"] -doq = ["aioquic (>=1.2.0)"] -idna = ["idna (>=3.10)"] -trio = ["trio (>=0.30)"] -wmi = ["wmi (>=1.5.1) ; platform_system == \"Windows\""] - -[[package]] -name = "duckdb" -version = "1.4.3" -description = "DuckDB in-process database" -optional = false -python-versions = ">=3.9.0" -groups = ["main", "dev"] -files = [ - {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa7f1191c59e34b688fcd4e588c1b903a4e4e1f4804945902cf0b20e08a9001"}, - {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4fef6a053a1c485292000bf0c338bba60f89d334f6a06fc76ba4085a5a322b76"}, - {file = "duckdb-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:702dabbc22b27dc5b73e7599c60deef3d8c59968527c36b391773efddd8f4cf1"}, - {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:854b79375fa618f6ffa8d84fb45cbc9db887f6c4834076ea10d20bc106f1fd90"}, - {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1bb8bd5a3dd205983726185b280a211eacc9f5bc0c4d4505bec8c87ac33a8ccb"}, - {file = "duckdb-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:d0ff08388ef8b1d1a4c95c321d6c5fa11201b241036b1ee740f9d841df3d6ba2"}, - {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:366bf607088053dce845c9d24c202c04d78022436cc5d8e4c9f0492de04afbe7"}, - {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8d080e8d1bf2d226423ec781f539c8f6b6ef3fd42a9a58a7160de0a00877a21f"}, - {file = "duckdb-1.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dc049ba7e906cb49ca2b6d4fbf7b6615ec3883193e8abb93f0bef2652e42dda"}, - {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b30245375ea94ab528c87c61fc3ab3e36331180b16af92ee3a37b810a745d24"}, - {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7c864df027da1ee95f0c32def67e15d02cd4a906c9c1cbae82c09c5112f526b"}, - {file = "duckdb-1.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:813f189039b46877b5517f1909c7b94a8fe01b4bde2640ab217537ea0fe9b59b"}, - {file = "duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d"}, - {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6302452e57aef29aae3977063810ed7b2927967b97912947b9cca45c1c21955f"}, - {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:deab351ac43b6282a3270e3d40e3d57b3b50f472d9fd8c30975d88a31be41231"}, - {file = "duckdb-1.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5634e40e1e2d972e4f75bced1fbdd9e9e90faa26445c1052b27de97ee546944a"}, - {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:274d4a31aba63115f23e7e7b401e3e3a937f3626dc9dea820a9c7d3073f450d2"}, - {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f868a7e6d9b37274a1aa34849ea92aa964e9bd59a5237d6c17e8540533a1e4f"}, - {file = "duckdb-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef7ef15347ce97201b1b5182a5697682679b04c3374d5a01ac10ba31cf791b95"}, - {file = "duckdb-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:1b9b445970fd18274d5ac07a0b24c032e228f967332fb5ebab3d7db27738c0e4"}, - {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16952ac05bd7e7b39946695452bf450db1ebbe387e1e7178e10f593f2ea7b9a8"}, - {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de984cd24a6cbefdd6d4a349f7b9a46e583ca3e58ce10d8def0b20a6e5fcbe78"}, - {file = "duckdb-1.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e5457dda91b67258aae30fb1a0df84183a9f6cd27abac1d5536c0d876c6dfa1"}, - {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:006aca6a6d6736c441b02ff5c7600b099bb8b7f4de094b8b062137efddce42df"}, - {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a2813f4635f4d6681cc3304020374c46aca82758c6740d7edbc237fe3aae2744"}, - {file = "duckdb-1.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:6db124f53a3edcb32b0a896ad3519e37477f7e67bf4811cb41ab60c1ef74e4c8"}, - {file = "duckdb-1.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:a8b0a8764e1b5dd043d168c8f749314f7a1252b5a260fa415adaa26fa3b958fd"}, - {file = 
"duckdb-1.4.3-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:316711a9e852bcfe1ed6241a5f654983f67e909e290495f3562cccdf43be8180"}, - {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9e625b2b4d52bafa1fd0ebdb0990c3961dac8bb00e30d327185de95b68202131"}, - {file = "duckdb-1.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:130c6760f6c573f9c9fe9aba56adba0fab48811a4871b7b8fd667318b4a3e8da"}, - {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20c88effaa557a11267706b01419c542fe42f893dee66e5a6daa5974ea2d4a46"}, - {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b35491db98ccd11d151165497c084a9d29d3dc42fc80abea2715a6c861ca43d"}, - {file = "duckdb-1.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:23b12854032c1a58d0452e2b212afa908d4ce64171862f3792ba9a596ba7c765"}, - {file = "duckdb-1.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:90f241f25cffe7241bf9f376754a5845c74775e00e1c5731119dc88cd71e0cb2"}, - {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:aa26a7406205bc1426cee28bdfdf084f669a5686977dafa4c3ec65873989593c"}, - {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:caa2164c91f7e91befb1ffb081b3cd97a137117533aef7abe1538b03ad72e3a9"}, - {file = "duckdb-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8d53b217698a76c4957e2c807dd9295d409146f9d3d7932f372883201ba9d25a"}, - {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8afba22c370f06b7314aa46bfed052509269e482bcfb3f7b1ea0fa17ae49ce42"}, - {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b195270ff1a661f22cbd547a215baff265b7d4469a76a215c8992b5994107c3"}, - {file = "duckdb-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:23a3a077821bed1768a84ac9cbf6b6487ead33e28e62cb118bda5fb8f9e53dea"}, - {file = "duckdb-1.4.3.tar.gz", hash = "sha256:fea43e03604c713e25a25211ada87d30cd2a044d8f27afab5deba26ac49e5268"}, -] -markers = {main = "(extra == \"duckdb\" or extra == \"databases\" or extra == \"formats\" or extra == \"all\") and platform_system != \"Windows\"", dev = "platform_system != \"Windows\""} - -[package.extras] -all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"] - -[[package]] -name = "email-validator" -version = "2.3.0" -description = "A robust email address syntax and deliverability validation library." 
-optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4"}, - {file = "email_validator-2.3.0.tar.gz", hash = "sha256:9fc05c37f2f6cf439ff414f8fc46d917929974a82244c20eb10231ba60c54426"}, -] - -[package.dependencies] -dnspython = ">=2.0.0" -idna = ">=2.0.0" - [[package]] name = "exceptiongroup" version = "1.3.1" @@ -965,69 +706,6 @@ typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} [package.extras] test = ["pytest (>=6)"] -[[package]] -name = "fastavro" -version = "1.12.1" -description = "Fast read/write of AVRO files" -optional = false -python-versions = ">=3.9" -groups = ["main", "dev"] -files = [ - {file = "fastavro-1.12.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:00650ca533907361edda22e6ffe8cf87ab2091c5d8aee5c8000b0f2dcdda7ed3"}, - {file = "fastavro-1.12.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac76d6d95f909c72ee70d314b460b7e711d928845771531d823eb96a10952d26"}, - {file = "fastavro-1.12.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f55eef18c41d4476bd32a82ed5dd86aabc3f614e1b66bdb09ffa291612e1670"}, - {file = "fastavro-1.12.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81563e1f93570e6565487cdb01ba241a36a00e58cff9c5a0614af819d1155d8f"}, - {file = "fastavro-1.12.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bec207360f76f0b3de540758a297193c5390e8e081c43c3317f610b1414d8c8f"}, - {file = "fastavro-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:c0390bfe4a9f8056a75ac6785fbbff8f5e317f5356481d2e29ec980877d2314b"}, - {file = "fastavro-1.12.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6b632b713bc5d03928a87d811fa4a11d5f25cd43e79c161e291c7d3f7aa740fd"}, - {file = "fastavro-1.12.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8"}, - {file = "fastavro-1.12.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:123fb221df3164abd93f2d042c82f538a1d5a43ce41375f12c91ce1355a9141e"}, - {file = "fastavro-1.12.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:632a4e3ff223f834ddb746baae0cc7cee1068eb12c32e4d982c2fee8a5b483d0"}, - {file = "fastavro-1.12.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:83e6caf4e7a8717d932a3b1ff31595ad169289bbe1128a216be070d3a8391671"}, - {file = "fastavro-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:b91a0fe5a173679a6c02d53ca22dcaad0a2c726b74507e0c1c2e71a7c3f79ef9"}, - {file = "fastavro-1.12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:509818cb24b98a804fc80be9c5fed90f660310ae3d59382fc811bfa187122167"}, - {file = "fastavro-1.12.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:089e155c0c76e0d418d7e79144ce000524dd345eab3bc1e9c5ae69d500f71b14"}, - {file = "fastavro-1.12.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44cbff7518901c91a82aab476fcab13d102e4999499df219d481b9e15f61af34"}, - {file = "fastavro-1.12.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a275e48df0b1701bb764b18a8a21900b24cf882263cb03d35ecdba636bbc830b"}, - {file = "fastavro-1.12.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:2de72d786eb38be6b16d556b27232b1bf1b2797ea09599507938cdb7a9fe3e7c"}, - {file = "fastavro-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:9090f0dee63fe022ee9cc5147483366cc4171c821644c22da020d6b48f576b4f"}, - {file = "fastavro-1.12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:78df838351e4dff9edd10a1c41d1324131ffecbadefb9c297d612ef5363c049a"}, - {file = "fastavro-1.12.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:780476c23175d2ae457c52f45b9ffa9d504593499a36cd3c1929662bf5b7b14b"}, - {file = "fastavro-1.12.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0714b285160fcd515eb0455540f40dd6dac93bdeacdb03f24e8eac3d8aa51f8d"}, - {file = "fastavro-1.12.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a8bc2dcec5843d499f2489bfe0747999108f78c5b29295d877379f1972a3d41a"}, - {file = "fastavro-1.12.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3b1921ac35f3d89090a5816b626cf46e67dbecf3f054131f84d56b4e70496f45"}, - {file = "fastavro-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:5aa777b8ee595b50aa084104cd70670bf25a7bbb9fd8bb5d07524b0785ee1699"}, - {file = "fastavro-1.12.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c3d67c47f177e486640404a56f2f50b165fe892cc343ac3a34673b80cc7f1dd6"}, - {file = "fastavro-1.12.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5217f773492bac43dae15ff2931432bce2d7a80be7039685a78d3fab7df910bd"}, - {file = "fastavro-1.12.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:469fecb25cba07f2e1bfa4c8d008477cd6b5b34a59d48715e1b1a73f6160097d"}, - {file = "fastavro-1.12.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d71c8aa841ef65cfab709a22bb887955f42934bced3ddb571e98fdbdade4c609"}, - {file = "fastavro-1.12.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b81fc04e85dfccf7c028e0580c606e33aa8472370b767ef058aae2c674a90746"}, - {file = "fastavro-1.12.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:9445da127751ba65975d8e4bdabf36bfcfdad70fc35b2d988e3950cce0ec0e7c"}, - {file = "fastavro-1.12.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6"}, - {file = "fastavro-1.12.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3616e2f0e1c9265e92954fa099db79c6e7817356d3ff34f4bcc92699ae99697c"}, - {file = "fastavro-1.12.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:cb0337b42fd3c047fcf0e9b7597bd6ad25868de719f29da81eabb6343f08d399"}, - {file = "fastavro-1.12.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:64961ab15b74b7c168717bbece5660e0f3d457837c3cc9d9145181d011199fa7"}, - {file = "fastavro-1.12.1-cp314-cp314-win_amd64.whl", hash = "sha256:792356d320f6e757e89f7ac9c22f481e546c886454a6709247f43c0dd7058004"}, - {file = "fastavro-1.12.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:120aaf82ac19d60a1016afe410935fe94728752d9c2d684e267e5b7f0e70f6d9"}, - {file = "fastavro-1.12.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6a3462934b20a74f9ece1daa49c2e4e749bd9a35fa2657b53bf62898fba80f5"}, - {file = "fastavro-1.12.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:1f81011d54dd47b12437b51dd93a70a9aa17b61307abf26542fc3c13efbc6c51"}, - {file = "fastavro-1.12.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:43ded16b3f4a9f1a42f5970c2aa618acb23ea59c4fcaa06680bdf470b255e5a8"}, - {file = "fastavro-1.12.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:02281432dcb11c78b3280da996eff61ee0eff39c5de06c6e0fbf19275093e6d4"}, - {file = "fastavro-1.12.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4128978b930aaf930332db4b3acc290783183f3be06a241ae4a482f3ed8ce892"}, - {file = "fastavro-1.12.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:546ffffda6610fca672f0ed41149808e106d8272bb246aa7539fa8bb6f117f17"}, - {file = "fastavro-1.12.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a7d840ccd9aacada3ddc80fbcc4ea079b658107fe62e9d289a0de9d54e95d366"}, - {file = "fastavro-1.12.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3100ad643e7fa658469a2a2db229981c1a000ff16b8037c0b58ce3ec4d2107e8"}, - {file = "fastavro-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a38607444281619eda3a9c1be9f5397634012d1b237142eee1540e810b30ac8b"}, - {file = "fastavro-1.12.1.tar.gz", hash = "sha256:2f285be49e45bc047ab2f6bed040bb349da85db3f3c87880e4b92595ea093b2b"}, -] -markers = {main = "extra == \"avro\" or extra == \"formats\" or extra == \"all\""} - -[package.extras] -codecs = ["cramjam", "lz4", "zstandard"] -lz4 = ["lz4"] -snappy = ["cramjam"] -zstandard = ["zstandard"] - [[package]] name = "filelock" version = "3.20.3" @@ -1039,7 +717,7 @@ files = [ {file = "filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1"}, {file = "filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\""} [[package]] name = "google-api-core" @@ -1052,7 +730,7 @@ files = [ {file = "google_api_core-2.29.0-py3-none-any.whl", hash = "sha256:d30bc60980daa36e314b5d5a3e5958b0200cb44ca8fa1be2b614e932b75a3ea9"}, {file = "google_api_core-2.29.0.tar.gz", hash = "sha256:84181be0f8e6b04006df75ddfe728f24489f0af57c96a529ff7cf45bc28797f7"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] google-auth = ">=2.14.1,<3.0.0" @@ -1091,7 +769,7 @@ files = [ {file = "google_auth-2.47.0-py3-none-any.whl", hash = "sha256:c516d68336bfde7cf0da26aab674a36fedcf04b37ac4edd59c597178760c3498"}, {file = "google_auth-2.47.0.tar.gz", hash = "sha256:833229070a9dfee1a353ae9877dcd2dec069a8281a4e72e72f77d4a70ff945da"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] pyasn1-modules = ">=0.2.1" @@ -1153,7 +831,7 @@ files = [ {file = "google_cloud_core-2.5.0-py3-none-any.whl", hash = "sha256:67d977b41ae6c7211ee830c7912e41003ea8194bff15ae7d72fd6f51e57acabc"}, {file = "google_cloud_core-2.5.0.tar.gz", hash = 
"sha256:7c1b7ef5c92311717bd05301aa1a91ffbc565673d3b0b4163a52d8413a186963"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0" @@ -1162,31 +840,6 @@ google-auth = ">=1.25.0,<3.0.0" [package.extras] grpc = ["grpcio (>=1.38.0,<2.0.0) ; python_version < \"3.14\"", "grpcio (>=1.75.1,<2.0.0) ; python_version >= \"3.14\"", "grpcio-status (>=1.38.0,<2.0.0)"] -[[package]] -name = "google-cloud-storage" -version = "2.19.0" -description = "Google Cloud Storage API client library" -optional = false -python-versions = ">=3.7" -groups = ["main", "dev"] -files = [ - {file = "google_cloud_storage-2.19.0-py2.py3-none-any.whl", hash = "sha256:aeb971b5c29cf8ab98445082cbfe7b161a1f48ed275822f59ed3f1524ea54fba"}, - {file = "google_cloud_storage-2.19.0.tar.gz", hash = "sha256:cd05e9e7191ba6cb68934d8eb76054d9be4562aa89dbc4236feee4d7d51342b2"}, -] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\""} - -[package.dependencies] -google-api-core = ">=2.15.0,<3.0.0.dev0" -google-auth = ">=2.26.1,<3.0.dev0" -google-cloud-core = ">=2.3.0,<3.0.dev0" -google-crc32c = ">=1.0,<2.0.dev0" -google-resumable-media = ">=2.7.2" -requests = ">=2.18.0,<3.0.0.dev0" - -[package.extras] -protobuf = ["protobuf (<6.0.0.dev0)"] -tracing = ["opentelemetry-api (>=1.1.0)"] - [[package]] name = "google-crc32c" version = "1.8.0" @@ -1229,7 +882,7 @@ files = [ {file = "google_crc32c-1.8.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c"}, {file = "google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [[package]] name = "google-resumable-media" @@ -1242,7 +895,7 @@ files = [ {file = "google_resumable_media-2.8.0-py3-none-any.whl", hash = "sha256:dd14a116af303845a8d932ddae161a26e86cc229645bc98b39f026f9b1717582"}, {file = "google_resumable_media-2.8.0.tar.gz", hash = "sha256:f1157ed8b46994d60a1bc432544db62352043113684d4e030ee02e77ebe9a1ae"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] google-crc32c = ">=1.0.0,<2.0.0" @@ -1262,7 +915,7 @@ files = [ {file = "googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038"}, {file = "googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5"}, ] -markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\" or extra == \"gcs\" or extra == \"cloud\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" @@ -1455,6 +1108,7 @@ files = [ {file = "idna-3.11-py3-none-any.whl", hash = 
"sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, ] +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\")"} [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] @@ -1471,19 +1125,6 @@ files = [ {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, ] -[[package]] -name = "isodate" -version = "0.7.2" -description = "An ISO 8601 date/time/duration parser and formatter" -optional = false -python-versions = ">=3.7" -groups = ["main", "dev"] -files = [ - {file = "isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15"}, - {file = "isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6"}, -] -markers = {main = "extra == \"azure\" or extra == \"cloud\" or extra == \"all\""} - [[package]] name = "jinja2" version = "3.1.6" @@ -1513,7 +1154,7 @@ files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\") and (python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\")"} [[package]] name = "jsonschema" @@ -2114,7 +1755,7 @@ files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\")"} [[package]] name = "pandas" @@ -2250,18 +1891,6 @@ optional = ["typing-extensions (>=4)"] re2 = ["google-re2 (>=1.1)"] tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] -[[package]] -name = "phonenumbers" -version = "9.0.22" -description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." 
-optional = false -python-versions = ">=2.5" -groups = ["main"] -files = [ - {file = "phonenumbers-9.0.22-py2.py3-none-any.whl", hash = "sha256:645e66cd9a136b3b257b5f941fa97d324124114d31ad3c9f2488682f47ad7ee1"}, - {file = "phonenumbers-9.0.22.tar.gz", hash = "sha256:eff985c65575749d1d54e07c56c3517d5243e03e08e4a6191761df9aab2278f2"}, -] - [[package]] name = "platformdirs" version = "4.5.1" @@ -2273,7 +1902,7 @@ files = [ {file = "platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31"}, {file = "platformdirs-4.5.1.tar.gz", hash = "sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\""} [package.extras] docs = ["furo (>=2025.9.25)", "proselint (>=0.14)", "sphinx (>=8.2.3)", "sphinx-autodoc-typehints (>=3.2)"] @@ -2326,7 +1955,7 @@ files = [ {file = "proto_plus-1.27.0-py3-none-any.whl", hash = "sha256:1baa7f81cf0f8acb8bc1f6d085008ba4171eaf669629d1b6d1673b21ed1c0a82"}, {file = "proto_plus-1.27.0.tar.gz", hash = "sha256:873af56dd0d7e91836aee871e5799e1c6f1bda86ac9a983e0bb9f0c266a568c4"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] protobuf = ">=3.19.0,<7.0.0" @@ -2353,7 +1982,7 @@ files = [ {file = "protobuf-6.33.2-py3-none-any.whl", hash = "sha256:7636aad9bb01768870266de5dc009de2d1b936771b38a793f73cbbf279c91c5c"}, {file = "protobuf-6.33.2.tar.gz", hash = "sha256:56dc370c91fbb8ac85bc13582c9e373569668a290aa2e66a590c2a0d35ddb9e4"}, ] -markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\" or extra == \"gcs\" or extra == \"cloud\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [[package]] name = "psycopg2-binary" @@ -2504,7 +2133,7 @@ files = [ {file = "pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf"}, {file = "pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [[package]] name = "pyasn1-modules" @@ -2517,7 +2146,7 @@ files = [ {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\""} [package.dependencies] pyasn1 = ">=0.6.1,<0.7.0" @@ -2533,7 +2162,7 @@ files = [ {file = "pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934"}, {file = "pycparser-2.23.tar.gz", hash = "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2"}, ] -markers = {main = "(python_version 
<= \"3.13\" or platform_python_implementation != \"PyPy\") and (python_version <= \"3.13\" or extra == \"azure\" or extra == \"cloud\" or extra == \"all\") and (python_version <= \"3.13\" or implementation_name != \"PyPy\") and (platform_python_implementation != \"PyPy\" or extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"azure\" or extra == \"cloud\" or extra == \"all\" or extra == \"snowflake\" or extra == \"warehouses\")", dev = "(python_version <= \"3.13\" or platform_python_implementation != \"PyPy\") and (python_version <= \"3.13\" or implementation_name != \"PyPy\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\" or platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\""} [[package]] name = "pygments" @@ -2561,7 +2190,7 @@ files = [ {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [package.extras] crypto = ["cryptography (>=3.4.0)"] @@ -2655,7 +2284,7 @@ files = [ {file = "pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6"}, {file = "pyopenssl-25.3.0.tar.gz", hash = "sha256:c981cb0a3fd84e8602d7afc209522773b94c1c2446a3c710a75b06fe1beae329"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [package.dependencies] cryptography = ">=45.0.7,<47" @@ -2882,7 +2511,7 @@ files = [ {file = "requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6"}, {file = "requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\") and (extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"azure\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"bigquery\" or extra == \"warehouses\" or extra == \"all\")"} [package.dependencies] certifi = ">=2017.4.17" @@ -3070,7 +2699,7 @@ files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, ] -markers = {main = "extra == \"gcs\" or extra == \"cloud\" or extra == \"all\" or extra == \"bigquery\" or extra == \"warehouses\""} +markers = {main = "extra == \"bigquery\" or extra == 
\"warehouses\" or extra == \"all\""} [package.dependencies] pyasn1 = ">=0.1.3" @@ -3115,7 +2744,7 @@ files = [ {file = "s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe"}, {file = "s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\") and (extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\")"} +markers = {main = "(extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\") and (python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\")"} [package.dependencies] botocore = ">=1.37.4,<2.0a0" @@ -3130,7 +2759,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.11" groups = ["main"] -markers = "(extra == \"statistical\" or extra == \"all\") and python_version >= \"3.11\"" +markers = "python_version >= \"3.11\" and (extra == \"statistical\" or extra == \"all\")" files = [ {file = "scipy-1.17.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd"}, {file = "scipy-1.17.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558"}, @@ -3262,7 +2891,7 @@ files = [ {file = "snowflake_connector_python-3.18.0-cp39-cp39-win_amd64.whl", hash = "sha256:a8c570edff5a4888840dbe1e9e65c5e4d77d55c5c800cd359fe0903a769201e0"}, {file = "snowflake_connector_python-3.18.0.tar.gz", hash = "sha256:41a46eb9824574c5f8068e3ed5c02a2dc0a733ed08ee81fa1fb3dd0ebe921728"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [package.dependencies] asn1crypto = ">0.24.0,<2.0.0" @@ -3300,7 +2929,7 @@ files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= \"3.13\"", dev = "python_version <= \"3.13\""} [[package]] name = "sqlalchemy" @@ -3458,7 +3087,7 @@ files = [ {file = "tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680"}, {file = "tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064"}, ] -markers = {main = "python_version <= \"3.13\" and (extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\")", dev = "python_version <= \"3.13\""} +markers = {main = "(extra == \"snowflake\" or extra == \"warehouses\" or extra == \"all\") and python_version <= 
\"3.13\"", dev = "python_version <= \"3.13\""} [[package]] name = "typer" @@ -3537,7 +3166,7 @@ files = [ {file = "urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4"}, {file = "urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed"}, ] -markers = {main = "(python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"gcs\" or extra == \"bigquery\" or extra == \"azure\") and (extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\" or extra == \"gcs\" or extra == \"bigquery\" or extra == \"azure\")"} +markers = {main = "(extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"snowflake\" or extra == \"bigquery\") and (python_version <= \"3.13\" or extra == \"s3\" or extra == \"cloud\" or extra == \"redshift\" or extra == \"warehouses\" or extra == \"all\" or extra == \"bigquery\")"} [package.extras] brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""] @@ -3585,127 +3214,6 @@ markupsafe = ">=2.1.1" [package.extras] watchdog = ["watchdog (>=2.3)"] -[[package]] -name = "wrapt" -version = "2.0.1" -description = "Module for decorators, wrappers and monkey patching." -optional = false -python-versions = ">=3.8" -groups = ["main", "dev"] -files = [ - {file = "wrapt-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:64b103acdaa53b7caf409e8d45d39a8442fe6dcfec6ba3f3d141e0cc2b5b4dbd"}, - {file = "wrapt-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91bcc576260a274b169c3098e9a3519fb01f2989f6d3d386ef9cbf8653de1374"}, - {file = "wrapt-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab594f346517010050126fcd822697b25a7031d815bb4fbc238ccbe568216489"}, - {file = "wrapt-2.0.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:36982b26f190f4d737f04a492a68accbfc6fa042c3f42326fdfbb6c5b7a20a31"}, - {file = "wrapt-2.0.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23097ed8bc4c93b7bf36fa2113c6c733c976316ce0ee2c816f64ca06102034ef"}, - {file = "wrapt-2.0.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bacfe6e001749a3b64db47bcf0341da757c95959f592823a93931a422395013"}, - {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8ec3303e8a81932171f455f792f8df500fc1a09f20069e5c16bd7049ab4e8e38"}, - {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:3f373a4ab5dbc528a94334f9fe444395b23c2f5332adab9ff4ea82f5a9e33bc1"}, - {file = "wrapt-2.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f49027b0b9503bf6c8cdc297ca55006b80c2f5dd36cecc72c6835ab6e10e8a25"}, - {file = "wrapt-2.0.1-cp310-cp310-win32.whl", hash = "sha256:8330b42d769965e96e01fa14034b28a2a7600fbf7e8f0cc90ebb36d492c993e4"}, - {file = "wrapt-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:1218573502a8235bb8a7ecaed12736213b22dcde9feab115fa2989d42b5ded45"}, - {file = "wrapt-2.0.1-cp310-cp310-win_arm64.whl", hash = "sha256:eda8e4ecd662d48c28bb86be9e837c13e45c58b8300e43ba3c9b4fa9900302f7"}, - {file = "wrapt-2.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:0e17283f533a0d24d6e5429a7d11f250a58d28b4ae5186f8f47853e3e70d2590"}, - {file = "wrapt-2.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:85df8d92158cb8f3965aecc27cf821461bb5f40b450b03facc5d9f0d4d6ddec6"}, - {file = "wrapt-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c1be685ac7700c966b8610ccc63c3187a72e33cab53526a27b2a285a662cd4f7"}, - {file = "wrapt-2.0.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:df0b6d3b95932809c5b3fecc18fda0f1e07452d05e2662a0b35548985f256e28"}, - {file = "wrapt-2.0.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4da7384b0e5d4cae05c97cd6f94faaf78cc8b0f791fc63af43436d98c4ab37bb"}, - {file = "wrapt-2.0.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ec65a78fbd9d6f083a15d7613b2800d5663dbb6bb96003899c834beaa68b242c"}, - {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7de3cc939be0e1174969f943f3b44e0d79b6f9a82198133a5b7fc6cc92882f16"}, - {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:fb1a5b72cbd751813adc02ef01ada0b0d05d3dcbc32976ce189a1279d80ad4a2"}, - {file = "wrapt-2.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3fa272ca34332581e00bf7773e993d4f632594eb2d1b0b162a9038df0fd971dd"}, - {file = "wrapt-2.0.1-cp311-cp311-win32.whl", hash = "sha256:fc007fdf480c77301ab1afdbb6ab22a5deee8885f3b1ed7afcb7e5e84a0e27be"}, - {file = "wrapt-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:47434236c396d04875180171ee1f3815ca1eada05e24a1ee99546320d54d1d1b"}, - {file = "wrapt-2.0.1-cp311-cp311-win_arm64.whl", hash = "sha256:837e31620e06b16030b1d126ed78e9383815cbac914693f54926d816d35d8edf"}, - {file = "wrapt-2.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1fdbb34da15450f2b1d735a0e969c24bdb8d8924892380126e2a293d9902078c"}, - {file = "wrapt-2.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3d32794fe940b7000f0519904e247f902f0149edbe6316c710a8562fb6738841"}, - {file = "wrapt-2.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:386fb54d9cd903ee0012c09291336469eb7b244f7183d40dc3e86a16a4bace62"}, - {file = "wrapt-2.0.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7b219cb2182f230676308cdcacd428fa837987b89e4b7c5c9025088b8a6c9faf"}, - {file = "wrapt-2.0.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:641e94e789b5f6b4822bb8d8ebbdfc10f4e4eae7756d648b717d980f657a9eb9"}, - {file = "wrapt-2.0.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe21b118b9f58859b5ebaa4b130dee18669df4bd111daad082b7beb8799ad16b"}, - {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:17fb85fa4abc26a5184d93b3efd2dcc14deb4b09edcdb3535a536ad34f0b4dba"}, - {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:b89ef9223d665ab255ae42cc282d27d69704d94be0deffc8b9d919179a609684"}, - {file = "wrapt-2.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a453257f19c31b31ba593c30d997d6e5be39e3b5ad9148c2af5a7314061c63eb"}, - {file = "wrapt-2.0.1-cp312-cp312-win32.whl", hash = "sha256:3e271346f01e9c8b1130a6a3b0e11908049fe5be2d365a5f402778049147e7e9"}, - {file = "wrapt-2.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:2da620b31a90cdefa9cd0c2b661882329e2e19d1d7b9b920189956b76c564d75"}, - {file = "wrapt-2.0.1-cp312-cp312-win_arm64.whl", hash = 
"sha256:aea9c7224c302bc8bfc892b908537f56c430802560e827b75ecbde81b604598b"}, - {file = "wrapt-2.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:47b0f8bafe90f7736151f61482c583c86b0693d80f075a58701dd1549b0010a9"}, - {file = "wrapt-2.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cbeb0971e13b4bd81d34169ed57a6dda017328d1a22b62fda45e1d21dd06148f"}, - {file = "wrapt-2.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb7cffe572ad0a141a7886a1d2efa5bef0bf7fe021deeea76b3ab334d2c38218"}, - {file = "wrapt-2.0.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c8d60527d1ecfc131426b10d93ab5d53e08a09c5fa0175f6b21b3252080c70a9"}, - {file = "wrapt-2.0.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c654eafb01afac55246053d67a4b9a984a3567c3808bb7df2f8de1c1caba2e1c"}, - {file = "wrapt-2.0.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:98d873ed6c8b4ee2418f7afce666751854d6d03e3c0ec2a399bb039cd2ae89db"}, - {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c9e850f5b7fc67af856ff054c71690d54fa940c3ef74209ad9f935b4f66a0233"}, - {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e505629359cb5f751e16e30cf3f91a1d3ddb4552480c205947da415d597f7ac2"}, - {file = "wrapt-2.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2879af909312d0baf35f08edeea918ee3af7ab57c37fe47cb6a373c9f2749c7b"}, - {file = "wrapt-2.0.1-cp313-cp313-win32.whl", hash = "sha256:d67956c676be5a24102c7407a71f4126d30de2a569a1c7871c9f3cabc94225d7"}, - {file = "wrapt-2.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:9ca66b38dd642bf90c59b6738af8070747b610115a39af2498535f62b5cdc1c3"}, - {file = "wrapt-2.0.1-cp313-cp313-win_arm64.whl", hash = "sha256:5a4939eae35db6b6cec8e7aa0e833dcca0acad8231672c26c2a9ab7a0f8ac9c8"}, - {file = "wrapt-2.0.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a52f93d95c8d38fed0669da2ebdb0b0376e895d84596a976c15a9eb45e3eccb3"}, - {file = "wrapt-2.0.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4e54bbf554ee29fcceee24fa41c4d091398b911da6e7f5d7bffda963c9aed2e1"}, - {file = "wrapt-2.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:908f8c6c71557f4deaa280f55d0728c3bca0960e8c3dd5ceeeafb3c19942719d"}, - {file = "wrapt-2.0.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e2f84e9af2060e3904a32cea9bb6db23ce3f91cfd90c6b426757cf7cc01c45c7"}, - {file = "wrapt-2.0.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3612dc06b436968dfb9142c62e5dfa9eb5924f91120b3c8ff501ad878f90eb3"}, - {file = "wrapt-2.0.1-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d2d947d266d99a1477cd005b23cbd09465276e302515e122df56bb9511aca1b"}, - {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:7d539241e87b650cbc4c3ac9f32c8d1ac8a54e510f6dca3f6ab60dcfd48c9b10"}, - {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:4811e15d88ee62dbf5c77f2c3ff3932b1e3ac92323ba3912f51fc4016ce81ecf"}, - {file = "wrapt-2.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c1c91405fcf1d501fa5d55df21e58ea49e6b879ae829f1039faaf7e5e509b41e"}, - {file = "wrapt-2.0.1-cp313-cp313t-win32.whl", hash = "sha256:e76e3f91f864e89db8b8d2a8311d57df93f01ad6bb1e9b9976d1f2e83e18315c"}, - {file = "wrapt-2.0.1-cp313-cp313t-win_amd64.whl", hash = 
"sha256:83ce30937f0ba0d28818807b303a412440c4b63e39d3d8fc036a94764b728c92"}, - {file = "wrapt-2.0.1-cp313-cp313t-win_arm64.whl", hash = "sha256:4b55cacc57e1dc2d0991dbe74c6419ffd415fb66474a02335cb10efd1aa3f84f"}, - {file = "wrapt-2.0.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5e53b428f65ece6d9dad23cb87e64506392b720a0b45076c05354d27a13351a1"}, - {file = "wrapt-2.0.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ad3ee9d0f254851c71780966eb417ef8e72117155cff04821ab9b60549694a55"}, - {file = "wrapt-2.0.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7b822c61ed04ee6ad64bc90d13368ad6eb094db54883b5dde2182f67a7f22c0"}, - {file = "wrapt-2.0.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7164a55f5e83a9a0b031d3ffab4d4e36bbec42e7025db560f225489fa929e509"}, - {file = "wrapt-2.0.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e60690ba71a57424c8d9ff28f8d006b7ad7772c22a4af432188572cd7fa004a1"}, - {file = "wrapt-2.0.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3cd1a4bd9a7a619922a8557e1318232e7269b5fb69d4ba97b04d20450a6bf970"}, - {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b4c2e3d777e38e913b8ce3a6257af72fb608f86a1df471cb1d4339755d0a807c"}, - {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:3d366aa598d69416b5afedf1faa539fac40c1d80a42f6b236c88c73a3c8f2d41"}, - {file = "wrapt-2.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c235095d6d090aa903f1db61f892fffb779c1eaeb2a50e566b52001f7a0f66ed"}, - {file = "wrapt-2.0.1-cp314-cp314-win32.whl", hash = "sha256:bfb5539005259f8127ea9c885bdc231978c06b7a980e63a8a61c8c4c979719d0"}, - {file = "wrapt-2.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:4ae879acc449caa9ed43fc36ba08392b9412ee67941748d31d94e3cedb36628c"}, - {file = "wrapt-2.0.1-cp314-cp314-win_arm64.whl", hash = "sha256:8639b843c9efd84675f1e100ed9e99538ebea7297b62c4b45a7042edb84db03e"}, - {file = "wrapt-2.0.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:9219a1d946a9b32bb23ccae66bdb61e35c62773ce7ca6509ceea70f344656b7b"}, - {file = "wrapt-2.0.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fa4184e74197af3adad3c889a1af95b53bb0466bced92ea99a0c014e48323eec"}, - {file = "wrapt-2.0.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c5ef2f2b8a53b7caee2f797ef166a390fef73979b15778a4a153e4b5fedce8fa"}, - {file = "wrapt-2.0.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e042d653a4745be832d5aa190ff80ee4f02c34b21f4b785745eceacd0907b815"}, - {file = "wrapt-2.0.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2afa23318136709c4b23d87d543b425c399887b4057936cd20386d5b1422b6fa"}, - {file = "wrapt-2.0.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6c72328f668cf4c503ffcf9434c2b71fdd624345ced7941bc6693e61bbe36bef"}, - {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3793ac154afb0e5b45d1233cb94d354ef7a983708cc3bb12563853b1d8d53747"}, - {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:fec0d993ecba3991645b4857837277469c8cc4c554a7e24d064d1ca291cfb81f"}, - {file = "wrapt-2.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:949520bccc1fa227274da7d03bf238be15389cd94e32e4297b92337df9b7a349"}, - {file = "wrapt-2.0.1-cp314-cp314t-win32.whl", hash = 
"sha256:be9e84e91d6497ba62594158d3d31ec0486c60055c49179edc51ee43d095f79c"}, - {file = "wrapt-2.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:61c4956171c7434634401db448371277d07032a81cc21c599c22953374781395"}, - {file = "wrapt-2.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:35cdbd478607036fee40273be8ed54a451f5f23121bd9d4be515158f9498f7ad"}, - {file = "wrapt-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:90897ea1cf0679763b62e79657958cd54eae5659f6360fc7d2ccc6f906342183"}, - {file = "wrapt-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:50844efc8cdf63b2d90cd3d62d4947a28311e6266ce5235a219d21b195b4ec2c"}, - {file = "wrapt-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49989061a9977a8cbd6d20f2efa813f24bf657c6990a42967019ce779a878dbf"}, - {file = "wrapt-2.0.1-cp38-cp38-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:09c7476ab884b74dce081ad9bfd07fe5822d8600abade571cb1f66d5fc915af6"}, - {file = "wrapt-2.0.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1a8a09a004ef100e614beec82862d11fc17d601092c3599afd22b1f36e4137e"}, - {file = "wrapt-2.0.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:89a82053b193837bf93c0f8a57ded6e4b6d88033a499dadff5067e912c2a41e9"}, - {file = "wrapt-2.0.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f26f8e2ca19564e2e1fdbb6a0e47f36e0efbab1acc31e15471fad88f828c75f6"}, - {file = "wrapt-2.0.1-cp38-cp38-win32.whl", hash = "sha256:115cae4beed3542e37866469a8a1f2b9ec549b4463572b000611e9946b86e6f6"}, - {file = "wrapt-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c4012a2bd37059d04f8209916aa771dfb564cccb86079072bdcd48a308b6a5c5"}, - {file = "wrapt-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:68424221a2dc00d634b54f92441914929c5ffb1c30b3b837343978343a3512a3"}, - {file = "wrapt-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6bd1a18f5a797fe740cb3d7a0e853a8ce6461cc62023b630caec80171a6b8097"}, - {file = "wrapt-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fb3a86e703868561c5cad155a15c36c716e1ab513b7065bd2ac8ed353c503333"}, - {file = "wrapt-2.0.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5dc1b852337c6792aa111ca8becff5bacf576bf4a0255b0f05eb749da6a1643e"}, - {file = "wrapt-2.0.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c046781d422f0830de6329fa4b16796096f28a92c8aef3850674442cdcb87b7f"}, - {file = "wrapt-2.0.1-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f73f9f7a0ebd0db139253d27e5fc8d2866ceaeef19c30ab5d69dcbe35e1a6981"}, - {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b667189cf8efe008f55bbda321890bef628a67ab4147ebf90d182f2dadc78790"}, - {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:a9a83618c4f0757557c077ef71d708ddd9847ed66b7cc63416632af70d3e2308"}, - {file = "wrapt-2.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e9b121e9aeb15df416c2c960b8255a49d44b4038016ee17af03975992d03931"}, - {file = "wrapt-2.0.1-cp39-cp39-win32.whl", hash = "sha256:1f186e26ea0a55f809f232e92cc8556a0977e00183c3ebda039a807a42be1494"}, - {file = "wrapt-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:bf4cb76f36be5de950ce13e22e7fdf462b35b04665a12b64f3ac5c1bbbcf3728"}, - {file = "wrapt-2.0.1-cp39-cp39-win_arm64.whl", hash = "sha256:d6cc985b9c8b235bd933990cdbf0f891f8e010b65a3911f7a55179cd7b0fc57b"}, - {file = "wrapt-2.0.1-py3-none-any.whl", hash = 
"sha256:4d2ce1bf1a48c5277d7969259232b57645aae5686dba1eaeade39442277afbca"}, - {file = "wrapt-2.0.1.tar.gz", hash = "sha256:9c9c635e78497cacb81e84f8b11b23e0aacac7a136e73b8e5b2109a1d9fc468f"}, -] -markers = {main = "extra == \"deltalake\" or extra == \"formats\" or extra == \"all\""} - -[package.extras] -dev = ["pytest", "setuptools"] - [[package]] name = "xmltodict" version = "1.0.2" @@ -3722,16 +3230,10 @@ files = [ test = ["pytest", "pytest-cov"] [extras] -all = ["azure-storage-blob", "boto3", "deltalake", "duckdb", "fastavro", "google-auth", "google-cloud-bigquery", "google-cloud-storage", "jsonschema", "mysql-connector-python", "psycopg2-binary", "pyodbc", "scipy", "snowflake-connector-python", "sqlalchemy"] -avro = ["fastavro"] -azure = ["azure-storage-blob"] +all = ["boto3", "google-auth", "google-cloud-bigquery", "jsonschema", "mysql-connector-python", "psycopg2-binary", "pyodbc", "scipy", "snowflake-connector-python", "sqlalchemy"] bigquery = ["google-auth", "google-cloud-bigquery"] -cloud = ["azure-storage-blob", "boto3", "google-auth", "google-cloud-storage"] -databases = ["duckdb", "mysql-connector-python", "psycopg2-binary", "pyodbc", "sqlalchemy"] -deltalake = ["deltalake"] -duckdb = ["duckdb"] -formats = ["deltalake", "duckdb", "fastavro"] -gcs = ["google-auth", "google-cloud-storage"] +cloud = ["boto3"] +databases = ["mysql-connector-python", "psycopg2-binary", "pyodbc", "sqlalchemy"] mssql = ["pyodbc", "sqlalchemy"] mysql = ["mysql-connector-python", "sqlalchemy"] postgres = ["psycopg2-binary", "sqlalchemy"] @@ -3746,4 +3248,4 @@ warehouses = ["boto3", "google-auth", "google-cloud-bigquery", "psycopg2-binary" [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "57f5ef0f9def33af10db85c6ef1e1473eaf563d6ebe5aec769b7e96cf91d1782" +content-hash = "3ec4ffd949a462e23313f979e5cfdbf09997f47dccaf83d0b30709bde33c1cf8" diff --git a/pyproject.toml b/pyproject.toml index ff81006..63896f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,37 @@ [tool.poetry] name = "datacheck-cli" -version = "2.0.2" -description = "CLI-first data validation tool for data engineers. Catch bad data before it breaks pipelines." +version = "2.1.0" +description = "A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond." 
authors = ["Squrtech "] readme = "README_PYPI.md" license = "Apache-2.0" homepage = "https://github.com/squrtech/datacheck" repository = "https://github.com/squrtech/datacheck" -keywords = ["data-validation", "cli", "data-engineering", "pipeline", "ci-cd", "data-quality", "yaml", "testing", "csv", "parquet", "postgres", "data-testing"] +keywords = [ + "data-validation", "data-linter", "cli", "data-engineering", + "pipeline", "ci-cd", "yaml", "testing", "csv", "parquet", + "postgres", "data-testing", "great-expectations-alternative", + "soda-alternative", "dbt-testing", "data-contracts", + "airflow", "snowflake", "bigquery", + "redshift", "schema-contracts", "schema-validation", + "data-pipeline", "etl-testing", +] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Quality Assurance", + "Topic :: Database :: Database Engines/Servers", + "Topic :: Scientific/Engineering :: Information Analysis", + "Environment :: Console", + "Operating System :: OS Independent", ] packages = [{include = "datacheck"}] @@ -30,31 +45,19 @@ numpy = ">=1.24.0,<3.0.0" pyarrow = ">=14.0.0,<24.0.0" click = ">=8.1.0,<9.0.0" -# Validation rule dependencies -email-validator = ">=2.1.0,<3.0.0" -phonenumbers = ">=8.13.0,<10.0.0" - # Database connectors (optional) sqlalchemy = { version = ">=2.0.23,<3.0.0", optional = true } psycopg2-binary = { version = ">=2.9.9,<3.0.0", optional = true } mysql-connector-python = { version = ">=8.2.0,<10.0.0", optional = true } pyodbc = { version = ">=5.0.1,<6.0.0", optional = true } -duckdb = { version = ">=0.8.1,<2.0.0", optional = true, markers = "platform_system != 'Windows'" } - # Cloud storage (optional) boto3 = { version = ">=1.34.0,<2.0.0", optional = true } -google-cloud-storage = { version = ">=2.14.0,<3.0.0", optional = true } -azure-storage-blob = { version = ">=12.19.0,<13.0.0", optional = true } # Cloud data warehouse connectors (optional) snowflake-connector-python = { version = ">=3.0.0,<4.0.0", optional = true } google-cloud-bigquery = { version = ">=3.0.0,<4.0.0", optional = true } google-auth = { version = ">=2.0.0,<3.0.0", optional = true } -# Data format dependencies (optional) -deltalake = { version = ">=1.4.1,<2.0.0", optional = true } -fastavro = { version = ">=1.12.1,<2.0.0", optional = true } - # Statistical / advanced features (optional) scipy = { version = ">=1.11.0,<2.0.0", optional = true, python = ">=3.11" } jsonschema = { version = ">=4.17.0,<5.0.0", optional = true } @@ -65,15 +68,12 @@ postgresql = ["psycopg2-binary", "sqlalchemy"] postgres = ["psycopg2-binary", "sqlalchemy"] mysql = ["mysql-connector-python", "sqlalchemy"] mssql = ["pyodbc", "sqlalchemy"] -duckdb = ["duckdb"] # All database connectors -databases = ["psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy", "duckdb"] +databases = ["psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy"] -# Individual cloud storage +# Cloud storage s3 = ["boto3"] -gcs = ["google-cloud-storage", "google-auth"] -azure = ["azure-storage-blob"] -cloud = ["boto3", "google-cloud-storage", "azure-storage-blob", "google-auth"] +cloud = ["boto3"] # Cloud data 
warehouse connectors snowflake = ["snowflake-connector-python"] @@ -81,21 +81,15 @@ bigquery = ["google-cloud-bigquery", "google-auth"] redshift = ["boto3", "psycopg2-binary", "sqlalchemy"] warehouses = ["snowflake-connector-python", "google-cloud-bigquery", "google-auth", "boto3", "psycopg2-binary", "sqlalchemy"] -# Data format extras -deltalake = ["deltalake"] -avro = ["fastavro"] -formats = ["deltalake", "fastavro", "duckdb"] - # Feature extras statistical = ["scipy"] validation = ["jsonschema"] # Everything all = [ - "psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy", "duckdb", - "boto3", "google-cloud-storage", "azure-storage-blob", + "psycopg2-binary", "mysql-connector-python", "pyodbc", "sqlalchemy", + "boto3", "snowflake-connector-python", "google-cloud-bigquery", "google-auth", - "deltalake", "fastavro", "scipy", "jsonschema", ] @@ -126,11 +120,6 @@ sqlalchemy = ">=2.0.23,<3.0.0" snowflake-connector-python = { version = ">=3.0.0,<4.0.0", python = "<3.14" } google-cloud-bigquery = ">=3.0.0,<4.0.0" google-auth = ">=2.0.0,<3.0.0" -google-cloud-storage = ">=2.14.0,<3.0.0" -azure-storage-blob = ">=12.19.0,<13.0.0" -deltalake = ">=1.4.1,<2.0.0" -fastavro = ">=1.12.1,<2.0.0" -duckdb = { version = ">=0.8.1,<2.0.0", markers = "platform_system != 'Windows'" } jsonschema = ">=4.17.0,<5.0.0" [tool.poetry.scripts] @@ -178,10 +167,6 @@ warn_unused_ignores = true warn_no_return = true strict_equality = true -[[tool.mypy.overrides]] -module = "duckdb.*" -ignore_missing_imports = true - [[tool.mypy.overrides]] module = "psycopg2.*" ignore_missing_imports = true @@ -214,22 +199,10 @@ ignore_missing_imports = true module = "google.cloud.*" ignore_missing_imports = true -[[tool.mypy.overrides]] -module = "azure.*" -ignore_missing_imports = true - [[tool.mypy.overrides]] module = "snowflake.*" ignore_missing_imports = true -[[tool.mypy.overrides]] -module = "deltalake.*" -ignore_missing_imports = true - -[[tool.mypy.overrides]] -module = "fastavro.*" -ignore_missing_imports = true - [[tool.mypy.overrides]] module = "airflow.*" ignore_missing_imports = true